In [1]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

import numpy as np
import pandas as pd

import torch
from torch.nn import Module, Linear, ReLU, Softmax, CrossEntropyLoss
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

import time
import os
from tqdm import trange

In [2]:
class CustomDataset(Dataset):
    """Map-style dataset that pairs each input row with its target row.

    Args:
        x: Indexable collection of inputs; its length defines the dataset size.
        y: Indexable collection of targets, read with the same index as ``x``.
    """

    def __init__(self, x, y):
        super().__init__()
        self.x, self.y = x, y

    def __len__(self):
        # The size comes from the inputs only; ``y`` is not checked here.
        return len(self.x)

    def __getitem__(self, index):
        """Return the ``(input, target)`` pair stored at ``index``."""
        sample = self.x[index]
        target = self.y[index]
        return sample, target

In [3]:
class UserEmbeddingNetwork(Module):
    """Four-layer fully connected network producing a latent vector and class logits.

    The first two layers use ReLU, the third layer's output is normalized with
    a softmax over the last axis and returned as the latent vector, and the
    fourth layer maps that latent vector to one score per output class.

    Args:
        input_dimension: Number of features in each input vector.
        hidden_dimension: Width of the two hidden layers.
        latent_dimension: Width of the latent vector (output of ``fc3``).
        output_dimension: Number of class scores produced by ``fc4``.
    """

    def __init__(self, input_dimension, hidden_dimension, latent_dimension, output_dimension):
        super().__init__()

        def init_parameters(layer, scale=1):
            # Xavier-normal weights (gain=scale) and zero biases.
            torch.nn.init.xavier_normal_(layer.weight, gain=scale)
            torch.nn.init.zeros_(layer.bias)

        self.fc1 = Linear(in_features=input_dimension, out_features=hidden_dimension)
        init_parameters(self.fc1)
        self.fc2 = Linear(in_features=hidden_dimension, out_features=hidden_dimension)
        init_parameters(self.fc2)
        self.fc3 = Linear(in_features=hidden_dimension, out_features=latent_dimension)
        init_parameters(self.fc3)
        self.fc4 = Linear(in_features=latent_dimension, out_features=output_dimension)
        init_parameters(self.fc4)

        self.relu = ReLU()
        self.softmax = Softmax(dim=-1)

    def forward(self, input_vector):
        """Compute the latent vector and the class logits for ``input_vector``.

        Args:
            input_vector: Tensor whose last axis has ``input_dimension`` entries.

        Returns:
            tuple: ``(latent, output)``. ``latent`` is the softmax-normalized
            output of ``fc3``. ``output`` holds the raw (unnormalized) scores
            of ``fc4``; apply a softmax to it if probabilities are needed.
        """
        latent = self.relu(self.fc1(input_vector))
        latent = self.relu(self.fc2(latent))
        latent = self.softmax(self.fc3(latent))
        # Return logits, not probabilities: CrossEntropyLoss applies log-softmax
        # internally, so a softmax here would be applied twice, squashing the
        # scores and weakening the gradients.
        output = self.fc4(latent)

        return latent, output

In [4]:
# Seed every random number generator used in this notebook so that a rerun
# starts from the same state.
seed = 24
for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, np.random.seed):
    seed_fn(seed)
# Restrict cuDNN to deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [5]:
# Load the news metadata; the train and test files are stacked into one frame
# with a fresh 0..n-1 index.
column_names = ['News_id', 'Category', 'Subcategory', 'Title', 'Abstract', 'URL', 'Title_Entities', 'Abstract_Entities']
news = pd.concat(
    [pd.read_csv(path, sep='\t', names=column_names)
     for path in ('./data/train_news.tsv', './data/test_news.tsv')],
    ignore_index=True,
)

# Load the behavior logs the same way (train rows first, then test rows).
column_names = ['Impression_id', 'User', 'Time', 'Clicked_News', 'Impressions']
behaviors = pd.concat(
    [pd.read_csv(path, sep='\t', names=column_names)
     for path in ('./data/train_behaviors.tsv', './data/test_behaviors.tsv')],
    ignore_index=True,
)

print('data loaded')

data loaded


In [8]:
# Training schedule and optimizer settings.
number_of_epoch, batch_size = 80, 128
learning_rate = 0.001

# Layer widths: the input, hidden and latent vectors all share the same size.
embedding_dimension = hidden_dimension = latent_dimension = 256

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [9]:
# Give every distinct news id a row number, in order of first appearance.
# NOTE(review): these row numbers are used to index `news_embedding`, so the
# saved embedding file is assumed to follow the same ordering — confirm.
news_id2index_map = {news_id: row for row, news_id in enumerate(news['News_id'].unique())}

news_embedding = np.load('./data/news_embedding/news_embedding_vector.npy')

# One row per behavior record: the mean embedding of the news in its click history.
train_x = np.empty((behaviors.shape[0], embedding_dimension), dtype=np.float32)
for pointer, history in enumerate(behaviors['Clicked_News']):
    vec_list = [news_embedding[news_id2index_map[news_id]] for news_id in history.split()]
    train_x[pointer] = np.mean(vec_list, axis=0)

train_x_tensor = torch.FloatTensor(train_x)

print('user mapped with news vector (input)')

user mapped with news vector (input)


In [10]:
# One-hot encode the news categories, with the columns ordered from the most
# frequent category to the least frequent one.
one_hot_category = pd.get_dummies(news['Category']).astype(float)
columns = news['Category'].value_counts().index
one_hot_category = one_hot_category[columns].values
# Map each news id to its one-hot row; when an id occurs more than once the
# last occurrence wins.
one_hot_category_map = dict(zip(news['News_id'].values, one_hot_category))

num_of_class = one_hot_category.shape[1]

num_of_rows = behaviors.shape[0]

# Label every behavior record with the dominant category of its click history.
# The array is zero-initialized: np.empty returns uninitialized memory, so
# accumulating counts into it with `+=` would add to arbitrary values.
train_y = np.zeros((num_of_rows, num_of_class))
click_news = behaviors['Clicked_News'].values
for i in trange(num_of_rows):
    # Per-category click counts for this record.
    category_counts = np.zeros(num_of_class)
    for click in click_news[i].split():
        category_counts += one_hot_category_map[click]
    # Ties (and an empty history) resolve to the lowest column index, which is
    # the more frequent category given the column ordering above.
    train_y[i, np.argmax(category_counts)] = 1

100%|██████████| 331629/331629 [00:13<00:00, 24591.40it/s]


In [12]:
# Number of distinct label rows, i.e. how many different classes actually occur.
np.unique(train_y, axis=0).shape[0]

14

In [14]:
# Indices of the label columns that are zero in every row.
np.nonzero(~train_y.any(axis=0))

(array([14, 15, 16, 17], dtype=int64),)

In [15]:
# Remove the label columns that never occur, then recount the distinct label rows.
unused_columns = np.where(~train_y.any(axis=0))
train_y2 = np.delete(train_y, unused_columns, axis=1)
np.unique(train_y2, axis=0).shape[0]

14

In [16]:
# (rows, remaining label columns) after dropping the unused columns.
np.shape(train_y2)

(331629, 14)

In [17]:
# The network's output width is the number of label columns that remain.
_, output_dimension = train_y2.shape
# Convert the label matrix into a 32-bit float tensor.
train_y_tensor = torch.FloatTensor(train_y2)

print('user id (target) encoded')

user id (target) encoded


In [18]:
# Pair the input vectors with their labels and serve them in shuffled mini-batches.
user_dataset = CustomDataset(x=train_x_tensor, y=train_y_tensor)
train_loader = DataLoader(dataset=user_dataset, shuffle=True, batch_size=batch_size)

# Drop the references to the raw data frames.
del news
del behaviors

In [20]:
# Create the output directories; exist_ok avoids the check-then-create race.
os.makedirs('./runs/', exist_ok=True)
os.makedirs('./model/', exist_ok=True)

writer = SummaryWriter(log_dir=f'./runs/user_embedding_{time.strftime("%Y%m%d-%H%M%S")}')

network = UserEmbeddingNetwork(input_dimension=embedding_dimension,
                               hidden_dimension=hidden_dimension,
                               latent_dimension=latent_dimension,
                               output_dimension=output_dimension).to(device)

optimizer = torch.optim.Adam(network.parameters(), lr=learning_rate)
criterion = CrossEntropyLoss()

start_time = time.time()
try:
    for epoch in trange(number_of_epoch):
        # Per-epoch buffers holding the scores and labels of every sample, in
        # the (shuffled) order in which the loader served them.
        predicts = np.empty((len(train_x), output_dimension))
        labels = np.empty((len(train_x), output_dimension))
        pointer = 0
        epoch_loss = 0.0
        for x, y in train_loader:
            x = x.to(device)
            y = y.to(device)
            # Call the module itself rather than .forward() so hooks run.
            latent, predict = network(x)
            loss = criterion(predict, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            mb_size = len(predict)
            # The criterion returns the batch mean; weight it by the batch size
            # so the epoch value is a true per-sample mean.
            epoch_loss += loss.item() * mb_size
            # detach() is required: on a CPU device .cpu() returns the same
            # tensor, and .numpy() refuses tensors that still require grad.
            predicts[pointer:pointer + mb_size] = predict.detach().cpu().numpy()
            labels[pointer:pointer + mb_size] = y.cpu().numpy()
            pointer += mb_size

        # Log the mean loss of the whole epoch instead of the last mini-batch only.
        writer.add_scalar('Loss', epoch_loss / max(pointer, 1), epoch)

        auc = roc_auc_score(labels, predicts, multi_class='ovr')

        writer.add_scalar('AUC', auc, epoch)
finally:
    # Flush and close the event file even when training is interrupted.
    writer.close()

print(f'news embedding train time: {time.time() - start_time} seconds')

 20%|██        | 16/80 [02:05<08:30,  7.98s/it]

In [None]:
# Run the trained network once over every user to collect the class scores
# and the latent vectors. This is a pure forward pass: no loss, no backward,
# no optimizer step (backward() cannot run under torch.no_grad(), and the
# weights must not change while the embeddings are being extracted).
network.to(device)
network.eval()
predicts = np.empty((len(train_x), output_dimension))
latents = np.empty((len(train_x), latent_dimension))
# Use a dedicated unshuffled loader so that row i of `latents` belongs to
# row i of `train_x`; the shuffled training loader would scramble that pairing.
inference_loader = DataLoader(user_dataset, batch_size=batch_size, shuffle=False)
pointer = 0
with torch.no_grad():
    for x, _targets in inference_loader:
        latent, predict = network(x.to(device))

        mb_size = len(predict)
        predicts[pointer:pointer + mb_size] = predict.cpu().numpy()
        latents[pointer:pointer + mb_size] = latent.cpu().numpy()
        pointer += mb_size

# Reduce the scores to predicted class indices and show which classes occur.
predicts = np.argmax(predicts, axis=1)
print(np.unique(predicts))

In [None]:
# Store the trained weights; the file name carries a timestamp and the AUC
# of the last finished epoch.
model_directory = './model/'
if not os.path.exists(model_directory):
    os.makedirs(model_directory)
checkpoint_path = f'./model/user_embedding_{time.strftime("%Y%m%d-%H%M%S")}_{auc:.2f}.pth'
torch.save(network.state_dict(), checkpoint_path)

In [None]:
# Write the per-user input vectors and the latent vectors to disk.
embedding_directory = './data/user_embedding/'
if not os.path.exists(embedding_directory):
    os.makedirs(embedding_directory)
for file_path, array in (
    ('./data/user_embedding/user_vector.npy', train_x),
    ('./data/user_embedding/user_embedding_vector.npy', latents),
):
    np.save(file_path, array)