In [1]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

import numpy as np
import pandas as pd

import torch
from torch.nn import *
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler, default_collate
from torch.utils.tensorboard import SummaryWriter

import time
import os
from tqdm import trange

In [2]:
seed=24
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
np.random.seed(seed)
torch.backends.cudnn.deterministic=True

In [3]:
class UserDataset(Dataset):
    def __init__(self, x, y):
        super().__init__()

        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, index):
        return self.x[index], self.y[index]

In [4]:
class UserEmbeddingNetwork(Module):
    def __init__(self, input_dimension, hidden_dimension, latent_dimension, id_dimension):
        super().__init__()

        def init_parameters(layer, scale=1):
            torch.nn.init.xavier_normal_(layer.weight, gain=scale)
            torch.nn.init.zeros_(layer.bias)

        self.fc1 = Linear(in_features=input_dimension, out_features=hidden_dimension)
        init_parameters(self.fc1)
        self.fc2 = Linear(in_features=hidden_dimension, out_features=hidden_dimension)
        init_parameters(self.fc2)
        self.fc3 = Linear(in_features=hidden_dimension, out_features=latent_dimension)
        init_parameters(self.fc3)
        self.fc4 = Linear(in_features=latent_dimension, out_features=id_dimension)
        init_parameters(self.fc4)

        self.relu = ReLU()
        self.softmax = Softmax(dim=-1)

    def forward(self, input_vector):
        latent = self.relu(self.fc1(input_vector))
        latent = self.relu(self.fc2(latent))
        latent = self.softmax(self.fc3(latent))
        id = self.softmax(self.fc4(latent))

        return latent, id

In [5]:
# load news data
column_names = ['News_id', 'Category', 'Subcategory', 'Title', 'Abstract', 'URL', 'Title_Entities', 'Abstract_Entities']
news = pd.read_csv('./data/train_news.tsv', sep='\t', names=column_names)
news2 = pd.read_csv('./data/test_news.tsv', sep='\t', names=column_names)
news = pd.concat([news, news2], axis=0)
column_names = ['Impression_id', 'User', 'Time', 'Clicked_News', 'Impressions']
behaviors = pd.read_csv('./data/train_behaviors.tsv', sep='\t', names=column_names)
behaviors2 = pd.read_csv('./data/test_behaviors.tsv', sep='\t', names=column_names)

print('data loaded')

data loaded


In [6]:
number_of_epoch = 80
learning_rate = 0.005
batch_size = 256
embedding_dimension = 256
hidden_dimension = 256
latent_dimension = 256
output_dimension = len(behaviors['User'].unique())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
news_id2index_map = {id: index for index, id in enumerate(news['News_id'].unique())}

news_embedding = np.load('./data/news_embedding/news_embedding_vector.npy')

user_vector = np.empty((behaviors.shape[0], embedding_dimension), dtype=np.float32)
pointer = 0
for i in behaviors['Clicked_News']:
    vec_list = []
    for j in i.split():
        index = news_id2index_map[j]
        vec_list.append(news_embedding[index])

    user_vector[pointer] = np.mean(vec_list, axis=0)
    pointer += 1

user_vector_tensor = torch.FloatTensor(user_vector)

print('user mapped with news vector (input)')

user mapped with news vector (input)


In [8]:
user_id = LabelEncoder().fit_transform(behaviors['User'].values)
user_id_tensor = torch.LongTensor(user_id)

print('user id (target) encoded')

user id (target) encoded


In [9]:
user_dataset = UserDataset(user_vector_tensor, user_id_tensor)


# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_loader = DataLoader(user_dataset, batch_size=batch_size, shuffle=True)

In [10]:
del news, news2, behaviors, behaviors2

In [11]:
writer = SummaryWriter(log_dir=f'./runs/user_embedding_{time.strftime("%Y%m%d-%H%M%S")}')

network = UserEmbeddingNetwork(input_dimension=embedding_dimension,
                               hidden_dimension=hidden_dimension,
                               latent_dimension=latent_dimension,
                               id_dimension=output_dimension)

optimizer = torch.optim.Adam(network.parameters(), lr=learning_rate)
criterion = CrossEntropyLoss()

start_time = time.time()
for epoch in range(number_of_epoch):
    for i, (x, y) in enumerate(train_loader):
        latent, id = network.forward(x)
        loss = criterion(id, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    writer.add_scalar('Loss', loss, epoch)

    with torch.no_grad():
        _, id = network.forward(user_vector_tensor)
        id = id.cpu().numpy()
        predict = np.argmax(id, axis=1)
        label = user_id
        accuracy = accuracy_score(label, predict)
        f1 = f1_score(label, predict, average='weighted')
        auc = roc_auc_score(label, id, multi_class='ovr')


    writer.add_scalar('Accuracy', accuracy, epoch)
    writer.add_scalar('F1', f1, epoch)
    writer.add_scalar('AUC', auc, epoch)

print(f'user embedding train time: {time.time() - start_time} seconds')
writer.close()

OutOfMemoryError: CUDA out of memory. Tried to allocate 226.65 GiB (GPU 0; 6.00 GiB total capacity; 1.47 GiB already allocated; 2.79 GiB free; 1.60 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
with torch.no_grad():
    _, id = network.forward(user_vector_tensor.to(device))
    id = id.cpu().numpy()
    predict = np.argmax(id, axis=1)
    label = user_id
    accuracy = accuracy_score(label, predict)
    f1 = f1_score(label, predict, average='weighted')
    auc = roc_auc_score(label, id, multi_class='ovr')

In [None]:
if not os.path.exists('./model/'):
    os.makedirs('./model/')
torch.save(network.state_dict(), 
           f'./model/user_embedding_{time.strftime("%Y%m%d-%H%M%S")}_{auc:.2f}.pth')

In [None]:
latent, _ = network.forward(user_vector_tensor.to(device))
latent = latent.detach().cpu().numpy()

if not os.path.exists('./data/user_embedding/'):
    os.makedirs('./data/user_embedding/')
np.save('./data/user_embedding/user_vector.npy', user_vector)
np.save('./data/user_embedding/user_embedding_vector.npy', latent)