In [9]:
import pickle
import random
import shutil
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import sklearn.decomposition
import torch
from apex import amp
from sklearn.preprocessing import StandardScaler

In [None]:
# Folder that holds the input CSV files and the cached intermediate artefacts.
ROOT = 'D:/data/stuff'
# Folder that receives the outputs of this run (checkpoints are written here).
OUT = f'{ROOT}/fav_1'

# Create OUT and OUT/images without shelling out. `exist_ok=True` makes the cell
# safe to re-run, and `images` is created even when OUT already exists (the
# previous shell version skipped both directories in that case).
Path(OUT, "images").mkdir(parents=True, exist_ok=True)

In [75]:
# Load the posts table from ROOT; later cells read its `id` and `tags` columns.
df = pd.read_csv(f"{ROOT}/updated_posts.csv")
# Bare expression: the notebook displays the number of rows as the cell output.
len(df)

1762920

In [23]:
import string

# Characters that are allowed to remain in a tag string.
printable = set(string.printable)

# Rebuild every tag string from only the characters found in `printable`.
df["tags"] = df["tags"].apply(
    lambda s: ''.join(ch for ch in s if ch in printable)
)

In [13]:
# True  -> rebuild the user -> favourite-posts grouping from favs.csv and cache it.
# False -> reload the grouping that was cached in favs_grouped.csv.
re_read_csv = False
if re_read_csv:
    # Only keep favourites whose post id is present in the posts table.
    all_posts = set(df.id.values)
    favs = pd.read_csv(f"{ROOT}/favs.csv")
    user_favs = defaultdict(set)
    for post_id, user_id in favs.values:
        if post_id not in all_posts:
            continue
        user_favs[user_id].add(post_id)
    del all_posts
    # One row per user: (user, set of post ids). The set is written as its repr.
    tmp = pd.DataFrame(list(user_favs.items()))
    tmp.to_csv(f"{ROOT}/favs_grouped.csv")
    del tmp
else:
    tmp = pd.read_csv(f"{ROOT}/favs_grouped.csv")
    # Each row is (row index, user, "{id, id, ...}"): strip the surrounding
    # braces and parse the comma-separated ids back into a set of ints.
    user_favs = {
        user_id: {int(token) for token in set_repr[1:-1].split(", ")}
        for _, user_id, set_repr in tmp.values
    }
    del tmp
len(user_favs)

148259

In [24]:
# Number of posts carrying each tag; tags that were never seen read as 0.
tags_count = defaultdict(int)
for tag in (t for tag_string in df["tags"] for t in tag_string.split()):
    tags_count[tag] += 1

In [26]:

# True  -> rank the tags from `tags_count` and write the ranking to common_tags.csv.
# False -> reload the ranking previously written to common_tags.csv.
re_read_tags = False
if re_read_tags:
    # Most frequent first; equal counts are ordered by tag name.
    common_tags = sorted(tags_count.items(), key=lambda item: (-item[1], item[0]))
    with open(f"{ROOT}/common_tags.csv", "w") as f:
        for tag, count in common_tags:
            f.write(f"{tag},{count}\n")
else:
    common_tags = pd.read_csv(f"{ROOT}/common_tags.csv", header=None).values

# Keep only the names of the first `n_tags` entries of the ranking.
n_tags = 3 * 1024
common_tags = [tag for tag, _count in common_tags[:n_tags]]
len(common_tags)

3072

In [27]:
# Per-tag weight log(number of posts / number of posts with the tag), in the
# same order as `common_tags`.
normalizer = np.log(
    np.array([len(df) / tags_count[tag] for tag in common_tags])
)
normalizer

array([0.25956624, 0.4489894 , 0.53869315, ..., 7.42023862, 7.42308357,
       7.42308357])

In [None]:
# Width of one encoded post: 5 pass-through columns + (reduced_components - 5)
# PCA components computed from the weighted tag indicators.
reduced_components = 256

# True  -> re-encode every post from data_all.csv, fit the PCA and cache the result.
# False -> reload the cached result from `data_preprocess_path`.
regen_data = True
data_preprocess_path = f"{ROOT}/data_preprocess"

if regen_data:

    posts_mapping = dict()   # post id -> row index into posts_encoded
    posts_encoded = []
    ratings = ['s', 'q', 'e']

    # The loop variable is deliberately NOT called `df`: that name holds the
    # posts table loaded earlier and must not be overwritten by the last chunk.
    for chunk in pd.read_csv(f"{ROOT}/data_all.csv", chunksize=4096):
        chunk = chunk.drop(columns=["Unnamed: 0", "Unnamed: 0.1", "author", "file_url", "sample_url"])
        chunk["tags"] = chunk["tags"].apply(lambda x: set(x.split()))
        # NOTE(review): the unpacking relies on exactly these five columns being
        # left, in this order, after the drop above; confirm against the CSV header.
        for fav, post_id, rating, score, post_tags in chunk.values:
            # 0/1 indicator per common tag, scaled by that tag's weight.
            tag_features = np.array([1 if t in post_tags else 0 for t in common_tags]) * normalizer
            row = [score, fav]
            row += [1 if r == rating else 0 for r in ratings]   # one-hot rating
            row += tag_features.tolist()
            posts_mapping[post_id] = len(posts_encoded)
            posts_encoded.append(np.array(row))
    all_ids = list(posts_mapping)

    # Convert the list of rows to a matrix once and reuse it for both slices.
    encoded = np.array(posts_encoded)

    # Columns 0-4 (score, fav, rating one-hot) are kept unchanged; only the tag
    # columns go through the whitened PCA.
    pca = sklearn.decomposition.PCA(n_components=reduced_components - 5, whiten=True)
    X = pca.fit_transform(np.array(encoded[:, 5:]))
    posts_encoded = np.concatenate([encoded[:, :5], X], axis=1)
    del encoded

    with open(data_preprocess_path, "wb") as f:
        pickle.dump([posts_mapping, posts_encoded, all_ids, pca], f, protocol=4)
else:
    # pickle can execute arbitrary code while loading: only load a cache file
    # that this notebook produced itself.
    with open(data_preprocess_path, "rb") as f:
        posts_mapping, posts_encoded, all_ids, pca = pickle.load(f)

In [None]:
def convert(post_id):
    """Return the encoded row of ``post_id``.

    The row index is looked up in ``posts_mapping`` and then used to index
    ``posts_encoded``; an id missing from ``posts_mapping`` raises the lookup
    error of that container.
    """
    row_index = posts_mapping[post_id]
    return posts_encoded[row_index]
def convert_list(l):
    """Lazily yield the encoded row of every post id in ``l`` that is known.

    Ids for which ``convert`` raises ``KeyError`` are skipped. Any other
    exception propagates to the caller.

    The ``yield`` sits outside the ``try`` on purpose: a handler wrapped around
    a ``yield`` also intercepts the ``GeneratorExit`` raised when the generator
    is closed, which makes closing a partially consumed generator fail.
    """
    for x in l:
        try:
            encoded = convert(x)
        except KeyError:
            continue
        yield encoded
def get_n_posts(n, user_id):
    """Return the encoded rows of ``n`` distinct favourites of ``user_id``.

    The favourites are drawn without replacement from ``user_favs[user_id]``.

    Raises:
        ValueError: if the user has fewer than ``n`` favourites
            (raised by ``random.sample``).
        Whatever ``convert`` raises for a post id it cannot resolve.
    """
    # random.sample() needs a sequence: passing a set is deprecated since
    # Python 3.9 and a TypeError from 3.11 on. tuple() is the conversion the
    # older implementation applied to sets internally.
    population = tuple(user_favs[user_id])
    return [convert(x) for x in random.sample(population, n)]

In [None]:
# Numbers of favourites sampled per user; one set of eligible users is kept per size.
group_sizes = [64, 128, 256, 512, 1024]

# True  -> recompute the eligible users for every group size and cache them.
# False -> reload the cached sets from valid_users.p.
regenerate_valid_users = False
if regenerate_valid_users:
    valid_users_dict = dict()
    for s in group_sizes:
        res = set()
        for u in user_favs:
            # get_n_posts raises ValueError when the user has fewer than `s`
            # favourites and KeyError when a sampled post has no encoded row.
            # Such users are simply not eligible for this size.
            try:
                n_drawn = len(get_n_posts(s, u))
            except (KeyError, ValueError):
                continue
            if n_drawn == s:
                res.add(u)
        valid_users_dict[s] = res
    with open(f"{ROOT}/valid_users.p", "wb") as f:
        pickle.dump(valid_users_dict, f)
else:
    # pickle can execute arbitrary code while loading: only load a cache file
    # that this notebook produced itself.
    with open(f"{ROOT}/valid_users.p", "rb") as f:
        valid_users_dict = pickle.load(f)

In [None]:
def save_all(model, opti, n_batch):
    """Write a training checkpoint and a rotating backup copy of it.

    The checkpoint is the list ``[model.state_dict(), opti.state_dict(), n_batch]``
    saved with ``torch.save`` to ``{OUT}/all_train_data``. The file is then
    copied to ``{OUT}/train_<k>`` with ``k = (n_batch // 5000) % 50``, so at
    most 50 backup slots are used and reused.

    Args:
        model: object exposing ``state_dict()``.
        opti: object exposing ``state_dict()``.
        n_batch: integer batch counter stored with the checkpoint.
    """
    out_path = f"{OUT}/all_train_data"
    out_path_2 = f"{OUT}/train_{(n_batch // 5000) % 50}"
    data = [model.state_dict(), opti.state_dict(), n_batch]
    # Close (and therefore flush) the file before it is copied.
    with open(out_path, "wb") as f:
        torch.save(data, f)
    # Plain-Python copy: works outside IPython and needs no `cp` executable.
    shutil.copyfile(out_path, out_path_2)
def load_all(model, opti):
    """Restore ``model`` and ``opti`` from ``{OUT}/all_train_data``.

    The file is expected to hold a list whose first two items are the model and
    optimizer state dicts; both objects are updated in place through
    ``load_state_dict``.

    Returns:
        list: ``[model, opti]`` followed by every remaining item of the stored
        list (``save_all`` stores the batch counter there).
    """
    in_path = f"{OUT}/all_train_data"
    # Context manager so the file handle is released as soon as loading is done.
    with open(in_path, "rb") as f:
        checkpoint = torch.load(f)
    model.load_state_dict(checkpoint[0])
    opti.load_state_dict(checkpoint[1])
    return [model, opti] + checkpoint[2:]

In [None]:
# Use the first CUDA device when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
device

In [None]:
def layer(in_depth, k):
    """Build a Conv1d -> BatchNorm1d -> LeakyReLU(0.2) block.

    Args:
        in_depth: number of input channels of the convolution.
        k: number of output channels (also the BatchNorm feature count).

    Returns:
        torch.nn.Sequential holding the three modules in that order. The
        convolution has kernel size 1, stride 1, no padding and no bias.
    """
    pointwise = torch.nn.Conv1d(in_depth, k, kernel_size=1, stride=1, padding=0, bias=False)
    norm = torch.nn.BatchNorm1d(k)
    activation = torch.nn.LeakyReLU(0.2)
    return torch.nn.Sequential(pointwise, norm, activation)
# Length of the per-user fingerprint vector produced by Network.
size_bottleneck = 64


class Network(torch.nn.Module):
    """Scores candidate posts against a fingerprint computed from a user's favourites.

    ``forward(favs, to_identify)`` takes two 3-D tensors with
    ``reduced_components`` channels on dim 1 and the posts laid out along dim 2.
    It returns ``(scores, fingerprint)``: ``scores`` has one sigmoid output per
    candidate post, ``fingerprint`` has ``size_bottleneck`` values per batch item.
    """

    def __init__(self):
        super().__init__()

        # Applied to every favourite independently (see `layer`).
        self.conv = torch.nn.ModuleList([
            layer(reduced_components, 1024),
            layer(1024, 512),
        ])

        # Input: mean and variance of the 512 conv features plus the mean of
        # the raw favourites, concatenated along the channel dimension.
        pooled_width = 512 * 2 + reduced_components
        self.features_extract = torch.nn.ModuleList([
            torch.nn.Linear(pooled_width, 1024),
            torch.nn.LeakyReLU(0.2),
            torch.nn.Linear(1024, 256),
            torch.nn.LeakyReLU(0.2),
            torch.nn.Linear(256, size_bottleneck),
        ])

        # Input: each candidate post concatenated with the fingerprint.
        self.is_fav = torch.nn.ModuleList([
            layer(reduced_components + size_bottleneck, 1024),
            layer(1024, 512),
            layer(512, 128),
            layer(128, 128),
            layer(128, 32),
            torch.nn.Conv1d(32, 1, kernel_size=1, stride=1, padding=0),
            torch.nn.Sigmoid(),
        ])

    @staticmethod
    def _run(stages, x):
        """Feed ``x`` through every module of ``stages`` in order."""
        for stage in stages:
            x = stage(x)
        return x

    def forward(self, favs, to_identify):
        encoded = self._run(self.conv, favs)

        # Pool over the post dimension so the summary has a fixed width.
        encoded_mean = encoded.mean(dim=2)
        encoded_var = encoded.var(dim=2)
        inputs_mean = favs.mean(dim=2)
        pooled = torch.cat([encoded_mean, encoded_var, inputs_mean], dim=1)

        fingerprint = self._run(self.features_extract, pooled)

        # Repeat the fingerprint once per candidate so it can be concatenated
        # channel-wise with `to_identify`.
        n_candidates = to_identify.shape[-1]
        user_extended = fingerprint.view(-1, size_bottleneck, 1).repeat(1, 1, n_candidates)
        scores = self._run(self.is_fav, torch.cat([to_identify, user_extended], dim=1))

        return scores.reshape(scores.shape[0], -1), fingerprint
# Instantiate the network and move its parameters to the selected device.
model = Network().to(device)



# Learning rate handed to Adam below.
lr = 0.000005

optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# Unweighted binary cross-entropy; the training cell rebinds `loss` to a
# weighted BCELoss on every step.
loss = torch.nn.BCELoss()
# Mixed-precision switch: when True, model and optimizer are wrapped by apex amp
# and the training cell scales the loss through amp before calling backward().
MP = False

if MP:
    model, optimizer = amp.initialize(
       model, optimizer, opt_level="O2"
    )

# Number of training steps done so far; overwritten when a checkpoint is loaded.
n_batch = 0
# One total-loss value is appended per training step.
losses = []

In [39]:
# Resume from the checkpoint stored under OUT: restores the model and optimizer
# state in place and replaces n_batch with the stored batch counter.
# Skip this cell to train from scratch.
model, optimizer, n_batch = load_all(model, optimizer)

In [50]:
# Number of consecutive steps whose gradients are summed before one optimizer update.
accumulate_steps = 1
model.train()

# Turn every set of eligible users into a list once, instead of rebuilding (and
# shuffling) the list on each step; random.sample below already draws uniformly.
valid_users_by_size = {size: list(users) for size, users in valid_users_dict.items()}

for epoch in range(100000000):
    # Each selected user contributes n2_samples favourites, split into two halves.
    n_samples = random.choice(group_sizes) // 2
    n2_samples = n_samples * 2
    # Keeps batch_size * n_samples constant (512 * 32) across group sizes.
    batch_size = int((512 / n_samples) * 32)
    valid_users = valid_users_by_size[n2_samples]

    n_batch += 1

    extract1 = []
    extract2 = []
    for u in random.sample(valid_users, batch_size):
        favs = np.array(get_n_posts(n2_samples, u))       # (n2_samples, features)
        extract1.append(favs[:n_samples].T)               # (features, n_samples)
        extract2.append(favs[n_samples:n2_samples].T)

    # Stack into a single ndarray first: building a tensor from a list of
    # ndarrays is much slower.
    extract1 = torch.tensor(np.stack(extract1), device=device, dtype=torch.float)
    extract2 = torch.tensor(np.stack(extract2), device=device, dtype=torch.float)

    # Candidates for each user: first the user's own held-out half (label 1),
    # then the half of the neighbouring user in the batch (label 0).
    to_test1 = torch.cat([extract2, extract2.roll(1, dims=[0])], dim=2)
    to_test2 = torch.cat([extract1, extract1.roll(1, dims=[0])], dim=2)
    targets = torch.cat([
        torch.ones(batch_size, n_samples, device=device, dtype=torch.float),
        torch.zeros(batch_size, n_samples, device=device, dtype=torch.float),
    ], dim=1)

    # Positives weigh three times as much as negatives; normalised to mean 1.
    weights = targets * 2 + 1
    weights = weights / weights.mean()
    loss = torch.nn.BCELoss(weight=weights)

    res1, fingerprint1 = model(extract1, to_test1)
    res2, fingerprint2 = model(extract2, to_test2)

    loss1 = loss(res1, targets)
    loss2 = loss(res2, targets)
    both_loss = loss1 + loss2
    # Mean squared distance between the fingerprints of the two halves.
    loss_diff = ((fingerprint1 - fingerprint2) ** 2).mean()
    # Only reported in the progress line, not part of the optimised loss.
    variance = fingerprint1.var(dim=0).mean() + fingerprint2.var(dim=0).mean()
    # NOTE(review): loss_diff is already a mean of squares and is squared again
    # here; confirm this is intended.
    total_loss = both_loss + loss_diff ** 2 * 0.002

    assert not torch.isnan(total_loss), "training loss became NaN"
    losses.append(total_loss.item())

    # Scale so that gradients summed over accumulate_steps steps average out.
    scaled = total_loss / accumulate_steps
    if MP:
        with amp.scale_loss(scaled, optimizer) as scaled_loss:
            scaled_loss.backward()
    else:
        scaled.backward()

    if n_batch % accumulate_steps == 0:
        optimizer.step()
        optimizer.zero_grad()

    if n_batch % 100 == 0:
        # Share of scores above 0.5 among the positive / negative candidates.
        mean_positive = (res1[:, :n_samples] > 0.5).float().mean().item()
        mean_negative = (res1[:, n_samples:] > 0.5).float().mean().item()
        print(f"{n_batch} : {both_loss.item()}, positive : {mean_positive}, negative : {mean_negative}, diff : {loss_diff.item()}, variance : {variance.item()}")
    if n_batch % 5000 == 0:
        save_all(model, optimizer, n_batch)

111900 : 0.6220861673355103, positive : 0.95782470703125, negative : 0.3804931640625, diff : 0.02625551074743271, variance : 2.7482106685638428


KeyboardInterrupt: 

In [None]:
# Write a checkpoint of the current model, optimizer and batch counter by hand,
# e.g. after the training cell above has been interrupted.
save_all(model, optimizer, n_batch)