In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

import numpy as np
import pandas as pd

import torch
from torch.nn import *
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler, default_collate
from torch.utils.tensorboard import SummaryWriter

import time
import os

In [None]:
# Seed every random number generator this notebook relies on so reruns are repeatable.
seed = 24

np.random.seed(seed)          # NumPy's global generator
torch.manual_seed(seed)       # torch default generator
torch.cuda.manual_seed(seed)  # torch generator for the current CUDA device

# Turn on cuDNN's deterministic flag.
torch.backends.cudnn.deterministic = True

In [None]:
class CustomDataset(Dataset):
    """Dataset that serves (x[index], y[index]) pairs from two index-aligned containers."""

    def __init__(self, x, y):
        """Keep references to the inputs ``x`` and the targets ``y``."""
        super().__init__()
        self.x, self.y = x, y

    def __len__(self):
        # The length is taken from the inputs only; ``y`` is not consulted.
        return len(self.x)

    def __getitem__(self, index):
        """Return the input and the target stored at ``index`` as a 2-tuple."""
        features = self.x[index]
        target = self.y[index]
        return features, target

In [None]:
class UserClickNetwork(Module):
    """Four fully connected layers: input -> hidden -> latent -> hidden -> id.

    ``forward`` returns the ReLU-activated latent vector together with the final output,
    which has already been passed through a softmax over the last dimension.

    NOTE(review): because the output is already normalised, pairing it with a loss that
    applies its own log-softmax (e.g. ``CrossEntropyLoss``) would normalise twice.
    """

    def __init__(self, input_dimension, hidden_dimension, latent_dimension, id_dimension):
        super().__init__()

        def build_linear(in_features, out_features, scale=1):
            # Create the layer, then overwrite its default initialisation:
            # Xavier-normal weights (gain=scale) and zero biases.
            layer = Linear(in_features=in_features, out_features=out_features)
            torch.nn.init.xavier_normal_(layer.weight, gain=scale)
            torch.nn.init.zeros_(layer.bias)
            return layer

        # Layers are built one at a time, in this order, so a seeded run draws the same
        # random numbers for each layer.
        self.fc1 = build_linear(input_dimension, hidden_dimension)
        self.fc2 = build_linear(hidden_dimension, latent_dimension)
        self.fc3 = build_linear(latent_dimension, hidden_dimension)
        self.fc4 = build_linear(hidden_dimension, id_dimension)

        self.relu = ReLU()
        self.softmax = Softmax(dim=-1)

    def forward(self, input_vector):
        """Return ``(latent, id_probabilities)`` for ``input_vector``."""
        hidden = self.relu(self.fc1(input_vector))
        latent = self.relu(self.fc2(hidden))
        # Softmax is used as the activation after fc3 as well as on the final output.
        decoded = self.softmax(self.fc3(latent))
        id_probabilities = self.softmax(self.fc4(decoded))

        return latent, id_probabilities

In [None]:
# Load the news table and the behaviour log, then derive two space-joined columns on
# `behaviors` from the first `number_of_candidates` "article-label" pairs of Impressions:
#   Target_News          - the article ids
#   Target_News_Clicked  - the labels that follow each '-'
number_of_candidates = 15


def _split_impressions(impressions, limit):
    """Split the first `limit` 'article-label' pairs of one Impressions string.

    Returns an ``(articles, labels)`` tuple of space-joined strings.

    Raises:
        ValueError: if the row holds fewer than `limit` pairs, or a pair does not split
            into exactly two parts on '-'.
    """
    pairs = impressions.split()[:limit]
    if len(pairs) < limit:
        raise ValueError(
            f'expected at least {limit} impressions per row, got {len(pairs)}: {impressions!r}'
        )

    articles, labels = [], []
    for pair in pairs:
        article, label = pair.split('-')
        articles.append(article)
        labels.append(label)
    return ' '.join(articles), ' '.join(labels)


# load news data
column_names = ['News_id', 'Category', 'Subcategory', 'Title', 'Abstract', 'URL', 'Title_Entities', 'Abstract_Entities']
news = pd.read_csv('../data/train_news.tsv', sep='\t', names=column_names)

# load behaviour data
column_names = ['Impression_id', 'User', 'Time', 'Clicked_News', 'Impressions']
behaviors = pd.read_csv('../data/train_behaviors.tsv', sep='\t', names=column_names, parse_dates=['Time'])

# Only the leading `number_of_candidates` pairs are ever used, so split just those
# instead of expanding every impression into its own DataFrame column.
split_rows = [_split_impressions(row, number_of_candidates) for row in behaviors['Impressions']]
behaviors['Target_News'] = [articles for articles, _ in split_rows]
behaviors['Target_News_Clicked'] = [labels for _, labels in split_rows]

del split_rows

print('data loaded')

In [None]:
# Optimisation settings.
number_of_epoch = 100
learning_rate = 0.005
batch_size = 128

# Layer widths.
embedding_dimension = 128
hidden_dimension = 256
latent_dimension = 256

# Number of distinct entries in the User column (a missing value counts as one entry).
output_dimension = behaviors['User'].nunique(dropna=False)

# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Build one input row per behaviour record: the mean embedding of that record's target
# articles, combined with a user embedding through np.dot.
news_embeddings = np.load('./data/news_embedding/news_embedding_vector.npy')
user_embeddings = np.load('./data/user_embedding/user_embedding_vector.npy')

# Resolve News_id -> row label of `news` once, instead of scanning the whole frame for
# every candidate article.  setdefault keeps the first row for a repeated id, which is the
# row the original `news[news['News_id'] == j].index[0]` lookup selected.
news_row_by_id = {}
for row_label, news_id in zip(news.index, news['News_id']):
    news_row_by_id.setdefault(news_id, row_label)

dot_vector = np.empty((behaviors.shape[0], latent_dimension), dtype=np.float32)
for row, target_news in enumerate(behaviors['Target_News']):
    # An id that is missing from `news` raises KeyError here.
    article_vectors = [news_embeddings[news_row_by_id[news_id]] for news_id in target_news.split()]
    # NOTE(review): user_embeddings is indexed by behaviour row, not by user id - confirm
    # the saved array is ordered that way.
    # NOTE(review): the np.dot result is broadcast into a row of width latent_dimension;
    # confirm the embedding shapes make this the intended product.
    dot_vector[row] = np.dot(np.mean(article_vectors, axis=0), user_embeddings[row])

dot_vector_tensor = torch.FloatTensor(dot_vector)

print('user mapped with news vector (input)')

In [None]:
# Encode the click labels of every behaviour record as one integer row.
# Each Target_News_Clicked entry is a space-separated string of integer labels.
click_rows = [[int(flag) for flag in flags.split()] for flags in behaviors['Target_News_Clicked']]

# Take the row width from the data itself; rows of unequal length make np.array raise
# instead of silently producing a malformed target.
target_dimension = len(click_rows[0]) if click_rows else 0

# Building the array from the complete list fills every row.  The previous version wrote
# into an np.empty buffer at a position that never advanced, so only row 0 was set and
# the remaining rows kept uninitialised memory.  The labels are integers, so the array is
# built as int64 directly.
target_clicked = np.array(click_rows, dtype=np.int64).reshape(len(click_rows), target_dimension)

target_clicked_tensor = torch.LongTensor(target_clicked)

print('user clicked target (target) encoded')

In [None]:
def _collate_on_device(samples):
    """Batch `samples` with default_collate and move each collated item to `device`."""
    return tuple(item.to(device) for item in default_collate(samples))


# Pair the input rows with their targets and serve them in shuffled mini-batches.
dot_dataset = CustomDataset(dot_vector_tensor, target_clicked_tensor)
train_loader = DataLoader(
    dot_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=_collate_on_device,
)

# Rebinds output_dimension to the number of distinct news categories
# (a missing category counts as one value).
output_dimension = news['Category'].nunique(dropna=False)

In [None]:
# Train UserClickNetwork on the dot-product features and log the loss and the
# accuracy / F1 / AUC metrics to TensorBoard once per epoch.
writer = SummaryWriter(log_dir=f'./runs/user_click_{time.strftime("%Y%m%d-%H%M%S")}')

# Size the first layer from the tensor that is actually fed to the network: its rows have
# latent_dimension columns, which differs from embedding_dimension.
network = UserClickNetwork(input_dimension=dot_vector_tensor.shape[1],
                           hidden_dimension=hidden_dimension,
                           latent_dimension=latent_dimension,
                           id_dimension=output_dimension).to(device)

optimizer = torch.optim.Adam(network.parameters(), lr=learning_rate)

# The network's output is already softmax-normalised, while CrossEntropyLoss expects raw
# logits and applies log-softmax itself.  Feeding log-probabilities to NLLLoss gives the
# same cross-entropy objective with the normalisation applied exactly once.
criterion = NLLLoss()
probability_floor = 1e-12  # keeps log() finite if a probability underflows to zero

# NOTE(review): target_clicked / target_clicked_tensor hold one label per candidate article
# in every row, whereas the loss and the sklearn metrics below are written for a single
# class index per row - confirm the intended target before relying on this cell.

# The evaluation inputs never change, so move them to the device once.
evaluation_inputs = dot_vector_tensor.to(device)

start_time = time.time()
try:
    for epoch in range(number_of_epoch):
        epoch_loss = 0.0
        batch_count = 0
        for x, y in train_loader:
            # Call the module rather than .forward() so registered hooks still run.
            _, probabilities = network(x)
            loss = criterion(torch.log(probabilities.clamp_min(probability_floor)), y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            batch_count += 1

        # Log the mean loss over the epoch instead of only the last mini-batch; skip the
        # point entirely when the loader produced no batches.
        if batch_count:
            writer.add_scalar('Loss', epoch_loss / batch_count, epoch)

        with torch.no_grad():
            _, probabilities = network(evaluation_inputs)
        probabilities = probabilities.cpu().numpy()

        predict = np.argmax(probabilities, axis=1)
        label = target_clicked
        accuracy = accuracy_score(label, predict)
        f1 = f1_score(label, predict, average='weighted')
        auc = roc_auc_score(label, probabilities, multi_class='ovr')

        writer.add_scalar('Accuracy', accuracy, epoch)
        writer.add_scalar('F1', f1, epoch)
        writer.add_scalar('AUC', auc, epoch)
finally:
    # Flush and release the event file even if training or a metric call fails.
    writer.close()

print(f'user embedding train time: {time.time() - start_time} seconds')