In [1]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

import numpy as np
import pandas as pd

import torch
from torch.nn import Module, Linear, ReLU, Sigmoid, BCELoss
from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter

import time
import os
import tqdm

# Set Random Seed

In [2]:
# Single seed shared by every random number generator the notebook touches.
seed = 24

# NumPy's global generator.
np.random.seed(seed)

# PyTorch generators (CPU, then CUDA).
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

# Request deterministic cuDNN kernels.
torch.backends.cudnn.deterministic = True

# Dataset

In [3]:
class CustomDataset(Dataset):
    """Index-able dataset over features with optional tensor labels.

    If ``y`` is a tensor, every item is an ``(x[index], y[index])`` pair.
    For any other ``y`` (including the default ``None``) the labels are not
    stored and every item is just ``x[index]``.
    """

    def __init__(self, x, y=None):
        super().__init__()

        self.x = x
        # Labels are only kept when they arrive as a tensor.
        self.train = torch.is_tensor(y)
        if self.train:
            self.y = y

    def __len__(self):
        """Number of samples, taken from the feature container."""
        return len(self.x)

    def __getitem__(self, index):
        """Return the features at ``index``, paired with the label when labelled."""
        if not self.train:
            return self.x[index]
        return self.x[index], self.y[index]

# Network

In [4]:
class UserClickNetwork(Module):
    """Four-layer fully connected network with a sigmoid output.

    Layer widths are ``input -> hidden -> 2 * hidden -> hidden -> output``.
    The first three layers are followed by ReLU, the last one by a sigmoid.
    Weights use Xavier-normal initialisation and biases start at zero.
    """

    def __init__(self, input_dimension, hidden_dimension, output_dimension):
        super().__init__()

        def make_layer(fan_in, fan_out, scale=1):
            # Build the layer and initialise it straight away so the random
            # number stream is consumed layer by layer.
            layer = Linear(in_features=fan_in, out_features=fan_out)
            torch.nn.init.xavier_normal_(layer.weight, gain=scale)
            torch.nn.init.zeros_(layer.bias)
            return layer

        self.fc1 = make_layer(input_dimension, hidden_dimension)
        self.fc2 = make_layer(hidden_dimension, hidden_dimension*2)
        self.fc3 = make_layer(hidden_dimension*2, hidden_dimension)
        self.fc4 = make_layer(hidden_dimension, output_dimension)

        self.relu = ReLU()
        self.sigmoid = Sigmoid()

    def forward(self, input_vector):
        """Map ``input_vector`` to sigmoid outputs of width ``output_dimension``."""
        latent = input_vector
        for hidden_layer in (self.fc1, self.fc2, self.fc3):
            latent = self.relu(hidden_layer(latent))

        return self.sigmoid(self.fc4(latent))

# Load Data

In [5]:
# News metadata: the train and test files are read with the same header and
# stacked into one frame with a fresh 0..n-1 index.
column_names = ['News_id', 'Category', 'Subcategory', 'Title', 'Abstract', 'URL', 'Title_Entities', 'Abstract_Entities']
news = pd.concat(
    [
        pd.read_csv('./data/train_news.tsv', sep='\t', names=column_names),
        pd.read_csv('./data/test_news.tsv', sep='\t', names=column_names),
    ],
    ignore_index=True,
)

# Behaviour logs: one row per impression, kept separate for train and test.
column_names = ['Impression_id', 'User', 'Time', 'Clicked_News', 'Impressions']
behaviors = pd.read_csv('./data/train_behaviors.tsv', sep='\t', names=column_names)
behaviors_test = pd.read_csv('./data/test_behaviors.tsv', sep='\t', names=column_names)

# User Vector

In [6]:
# One indicator column per news category; row i belongs to news row i.
one_hot_category = pd.get_dummies(news['Category']).astype(float).values
# News_id -> one-hot category vector (a later duplicate id overwrites an earlier one).
one_hot_category_map = dict(zip(news['News_id'].values, one_hot_category))

num_of_class = one_hot_category.shape[1]

num_of_rows = behaviors.shape[0]
num_of_rows_test = behaviors_test.shape[0]

# Number of candidate articles read from every impression row.
num_of_candidates = 15


def _sum_clicked_categories(history_strings, category_map, n_classes):
    """Count, per row, how many clicked news fall into each category.

    Args:
        history_strings: sequence of space-separated news ids, one per row.
        category_map: dict mapping a news id to its one-hot category vector.
        n_classes: number of categories (width of the one-hot vectors).

    Returns:
        Array of shape ``(len(history_strings), n_classes)``.

    Raises:
        KeyError: if a news id is missing from ``category_map``.
    """
    # The accumulator MUST start at zero: np.empty leaves arbitrary memory
    # contents in place and `+=` would add the counts on top of that garbage.
    totals = np.zeros((len(history_strings), n_classes))
    for row, history in enumerate(tqdm.tqdm(history_strings)):
        if not isinstance(history, str):
            # A missing history (pandas reads an empty field as NaN) is
            # treated as "no clicks" and keeps its all-zero row.
            continue
        for news_id in history.split():
            totals[row] += category_map[news_id]
    return totals


def _encode_impressions(impression_strings, category_map, n_candidates, n_classes, labelled):
    """One-hot encode the first ``n_candidates`` news of every impression row.

    Args:
        impression_strings: sequence of space-separated impression tokens.
        category_map: dict mapping a news id to its one-hot category vector.
        n_candidates: how many tokens to read from each row.
        n_classes: number of categories (width of the one-hot vectors).
        labelled: True when tokens look like ``<news id>-<0|1>``; False when
            each token is a bare news id.

    Returns:
        ``(encoded, labels)`` where ``encoded`` has shape
        ``(rows, n_candidates, n_classes)`` and ``labels`` is a list of
        per-row integer label lists (``None`` when ``labelled`` is False).

    Raises:
        IndexError: if a row holds fewer than ``n_candidates`` tokens.
        KeyError: if a news id is missing from ``category_map``.
    """
    # np.empty is fine here: every cell is assigned below, nothing is accumulated.
    encoded = np.empty((len(impression_strings), n_candidates, n_classes))
    labels = [] if labelled else None
    for row, impression_string in enumerate(tqdm.tqdm(impression_strings)):
        tokens = impression_string.split()
        row_labels = []
        for slot in range(n_candidates):
            if labelled:
                parts = tokens[slot].split('-')
                news_id = parts[0]
                row_labels.append(int(parts[1]))
            else:
                news_id = tokens[slot]
            encoded[row, slot] = category_map[news_id]
        if labelled:
            labels.append(row_labels)
    return encoded, labels


# User history sum
user_clicked = _sum_clicked_categories(
    behaviors['Clicked_News'].values, one_hot_category_map, num_of_class)
user_clicked_test = _sum_clicked_categories(
    behaviors_test['Clicked_News'].values, one_hot_category_map, num_of_class)

## User impressions
impression, impression_clicked = _encode_impressions(
    behaviors['Impressions'].values, one_hot_category_map,
    num_of_candidates, num_of_class, labelled=True)
impression_test = _encode_impressions(
    behaviors_test['Impressions'].values, one_hot_category_map,
    num_of_candidates, num_of_class, labelled=False)[0]

# Score of candidate j for row i = number of previously clicked news that share
# candidate j's category, i.e. impression[i] @ user_clicked[i] for every row,
# done as one batched matrix product.
user_vector = (impression @ user_clicked[:, :, np.newaxis])[:, :, 0]
user_vector_test = (impression_test @ user_clicked_test[:, :, np.newaxis])[:, :, 0]

100%|██████████| 285297/285297 [00:12<00:00, 23347.20it/s]
100%|██████████| 46332/46332 [00:01<00:00, 24419.89it/s]
100%|██████████| 285297/285297 [00:06<00:00, 44574.34it/s]
100%|██████████| 46332/46332 [00:00<00:00, 84550.56it/s]
100%|██████████| 285297/285297 [00:00<00:00, 412481.18it/s]
100%|██████████| 46332/46332 [00:00<00:00, 392938.51it/s]


In [7]:
# Convert the NumPy features / labels to float32 tensors.
train_x = torch.as_tensor(user_vector, dtype=torch.float32)
train_y = torch.as_tensor(np.array(impression_clicked), dtype=torch.float32)
test_x = torch.as_tensor(user_vector_test, dtype=torch.float32)

# Report the shapes in the order train features, train labels, test features.
print(train_x.shape)
print(train_y.shape)
print(test_x.shape)

torch.Size([285297, 15])
torch.Size([285297, 15])
torch.Size([46332, 15])


In [None]:
# Optimisation settings.
number_of_epoch = 200
learning_rate = 1e-4
batch_size = 128

# Layer widths; the input width is read from the current feature tensor.
embedding_dimension = train_x.shape[1]
hidden_dimension = 256
latent_dimension = 256
output_dimension = 15

# Prefer the GPU when one is visible to PyTorch.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Class weight = inverse of each label value's relative frequency,
# ordered like the sorted unique label values.
counts = train_y.reshape(-1).unique(return_counts=True)[1]
label_frequency = counts / counts.sum()
class_weights = 1 / label_frequency

In [60]:
# Show the class weights (inverse of each label value's relative frequency).
class_weights

tensor([1.1259, 8.9415])

In [8]:
# Optimisation settings.
number_of_epoch = 200
learning_rate = 1e-4
batch_size = 128

# Layer widths.
hidden_dimension = 256
latent_dimension = 256
output_dimension = 15

# Prefer the GPU when one is visible to PyTorch.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [10]:
# One column per impression slot, each cell holding a "<news id>-<label>" token.
candidate_pairs = behaviors['Impressions'].str.split(expand=True)

# Split every slot column into an article-id column and a label column.
for col in candidate_pairs.columns:
    article_and_label = ['Article '+str(col), 'label '+str(col)]
    candidate_pairs[article_and_label] = candidate_pairs[col].str.split(pat='-', expand=True)

# Re-join the first 15 slots into space-separated strings on the behaviours frame.
behaviors['Target_News'] = (
    candidate_pairs[['Article '+str(i) for i in range(15)]].agg(' '.join, axis=1)
)
behaviors['Target_News_Clicked'] = (
    candidate_pairs[['label '+str(i) for i in range(15)]].agg(' '.join, axis=1)
)

# The wide helper frame is no longer needed.
del candidate_pairs

In [40]:
# Pre-computed embedding matrices loaded from disk.
news_embeddings = np.load('./data/news_embedding/news_embedding_vector.npy')
user_embeddings = np.load('./data/user_embedding/user_embedding_vector.npy')

# News id -> position in order of first appearance in the news frame
# (presumably the row order of news_embeddings - TODO confirm).
news_id2index_map = {news_id: position for position, news_id in enumerate(news['News_id'].unique())}

target_news = behaviors['Target_News']
train_x = np.empty((len(target_news), latent_dimension))
for i in tqdm.tqdm(range(len(target_news))):
    # Embeddings of every candidate article listed for this row.
    candidate_vectors = [news_embeddings[news_id2index_map[news_id]]
                         for news_id in target_news[i].split()]
    # NOTE(review): the train_x[1] output displayed further down shows one
    # value repeated across the whole row, which suggests np.dot returns a
    # scalar here that is then broadcast over all latent_dimension columns.
    # Confirm whether an element-wise product was intended; any change must
    # be mirrored in the test-feature cell.
    # NOTE(review): assumes user_embeddings is row-aligned with `behaviors` - confirm.
    train_x[i] = np.dot(np.mean(candidate_vectors, axis=0), user_embeddings[i])

train_x_tensor = torch.FloatTensor(train_x)

print('user mapped with news vector (input)')

100%|██████████| 285297/285297 [00:12<00:00, 22308.40it/s]

user mapped with news vector (input)





In [71]:
# Inspect one feature row.
# NOTE(review): the displayed row is a single value repeated, which suggests the
# np.dot used to build train_x produces a scalar that is broadcast across the
# row - confirm that this is intended.
train_x[1]

array([1.27778578e-11, 1.27778578e-11, 1.27778578e-11, 1.27778578e-11,
       1.27778578e-11, 1.27778578e-11, 1.27778578e-11, 1.27778578e-11,
       1.27778578e-11, 1.27778578e-11, 1.27778578e-11, 1.27778578e-11,
       1.27778578e-11, 1.27778578e-11, 1.27778578e-11, 1.27778578e-11,
       1.27778578e-11, 1.27778578e-11, 1.27778578e-11, 1.27778578e-11,
       1.27778578e-11, 1.27778578e-11, 1.27778578e-11, 1.27778578e-11,
       1.27778578e-11, 1.27778578e-11, 1.27778578e-11, 1.27778578e-11,
       1.27778578e-11, 1.27778578e-11, 1.27778578e-11, 1.27778578e-11,
       1.27778578e-11, 1.27778578e-11, 1.27778578e-11, 1.27778578e-11,
       1.27778578e-11, 1.27778578e-11, 1.27778578e-11, 1.27778578e-11,
       1.27778578e-11, 1.27778578e-11, 1.27778578e-11, 1.27778578e-11,
       1.27778578e-11, 1.27778578e-11, 1.27778578e-11, 1.27778578e-11,
       1.27778578e-11, 1.27778578e-11, 1.27778578e-11, 1.27778578e-11,
       1.27778578e-11, 1.27778578e-11, 1.27778578e-11, 1.27778578e-11,
      

In [41]:
# Test impressions are looked up token by token as news ids (no "-label" suffix is stripped).
target_news = behaviors_test['Impressions']
test_x = np.empty((len(target_news), latent_dimension))
for i in tqdm.tqdm(range(len(target_news))):
    # Embeddings of every candidate article listed for this row.
    candidate_vectors = [news_embeddings[news_id2index_map[news_id]]
                         for news_id in target_news[i].split()]
    # NOTE(review): user_embeddings is the same array used for the training
    # rows and is indexed here by the test row number - confirm that it holds
    # the matching embeddings for test rows.
    # NOTE(review): if np.dot yields a scalar here it is broadcast over the
    # whole row; keep this in sync with the training-feature cell.
    test_x[i] = np.dot(np.mean(candidate_vectors, axis=0), user_embeddings[i])

test_x_tensor = torch.FloatTensor(test_x)

print('user mapped with news vector (input)')

100%|██████████| 46332/46332 [00:01<00:00, 23552.33it/s]

user mapped with news vector (input)





In [35]:
# Peek at the first row's space-separated click labels.
behaviors['Target_News_Clicked'][0]

'0 0 0 0 0 0 0 0 1 0 0 1 0 1 0'

In [61]:
target_news_clicked = behaviors['Target_News_Clicked']
train_y = np.empty((len(target_news_clicked), output_dimension))

# Every entry is a space-separated string of integer flags; parse it into one label row.
for i in range(len(target_news_clicked)):
    train_y[i] = list(map(int, target_news_clicked[i].split()))

# Show the first parsed row as a sanity check.
train_y[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0.])

In [68]:
# Occurrences of each distinct label value, in sorted label order.
counts = np.unique(train_y.reshape(-1), return_counts=True)[1]
# Weight = inverse of the label's relative frequency (rarer label -> larger weight).
label_frequency = counts / counts.sum()
class_weights = 1 / label_frequency
class_weights

array([1.12592052, 8.94151754])

In [62]:
# float32 copy of the label matrix for the PyTorch dataset.
train_y_tensor = torch.from_numpy(train_y).float()
train_y_tensor.shape

torch.Size([285297, 15])

# Train

In [63]:
# Labelled dataset over the embedding-based features.
train_dataset = CustomDataset(train_x_tensor, train_y_tensor)

# Loader over every labelled row (no hold-out).
whole_train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# 70 / 30 train / validation split. The generator is seeded from the notebook's
# `seed` so the split follows the configured seed instead of the generator's
# built-in default.
generator = torch.Generator().manual_seed(seed)
data_list = random_split(train_dataset, [0.7, 0.3], generator)

train_loader = DataLoader(data_list[0], batch_size=batch_size, shuffle=True)
# Validation order does not matter, so it is left unshuffled.
val_loader = DataLoader(data_list[1], batch_size=batch_size)

# Unlabelled test set: items are feature rows only.
test_dataset = CustomDataset(test_x_tensor)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [39]:
class BCELoss_weight(Module):
    """Binary cross-entropy with a separate weight per class.

    ``weight[1]`` scales the positive (``target == 1``) term and ``weight[0]``
    scales the negative (``target == 0``) term.

    Note the argument order of ``forward``: the target comes FIRST and the
    prediction SECOND, which is the reverse of ``torch.nn.BCELoss``.
    """

    def __init__(self, weight):
        super().__init__()
        self.weight = weight

    def forward(self, target, predict):
        """Return the mean weighted BCE of ``predict`` against ``target``."""
        # Keep probabilities strictly inside (0, 1) so both logs stay finite.
        predict = torch.clamp(predict, min=1e-6, max=1-1e-6)
        positive_term = self.weight[1]*target*torch.log(predict)
        negative_term = self.weight[0]*(1-target)*torch.log(1-predict)
        bce = -positive_term - negative_term
        return bce.mean()

In [69]:
# Output folders for TensorBoard logs and checkpoints.
os.makedirs('./runs/', exist_ok=True)
os.makedirs('./model/', exist_ok=True)

# Probability cut-off that turns scores into click / no-click decisions for
# accuracy and F1. The same value is used for the training and validation
# metrics so the two curves are comparable.
decision_threshold = 0.5


def _log_split_metrics(writer, split, labels, scores, epoch, threshold):
    """Log '<split> AUC', '<split> Accuracy' and '<split> F1'; return the AUC."""
    auc = roc_auc_score(labels, scores)
    decisions = np.where(scores >= threshold, 1, 0)
    writer.add_scalar(f'{split} AUC', auc, epoch)
    writer.add_scalar(f'{split} Accuracy', accuracy_score(labels, decisions), epoch)
    writer.add_scalar(f'{split} F1', f1_score(labels, decisions), epoch)
    return auc


writer = SummaryWriter(log_dir=f'./runs/user_click_{time.strftime("%Y%m%d-%H%M%S")}')

network = UserClickNetwork(input_dimension=latent_dimension,
                            hidden_dimension=hidden_dimension,
                            output_dimension=output_dimension).to(device)

optimizer = torch.optim.Adam(network.parameters(), lr=learning_rate)
criterion = BCELoss_weight(weight=class_weights)

start_time = time.time()
try:
    for epoch in tqdm.tqdm(range(number_of_epoch)):
        # ---- training pass over the 70% split ----
        network.train()
        epoch_losses, epoch_scores, epoch_labels = [], [], []
        for x, y in train_loader:
            x = x.to(device)
            y = torch.flatten(y.to(device))
            predict = torch.flatten(network(x))
            # BCELoss_weight.forward is declared as (target, predict). Passing
            # the prediction first would clamp and log the labels and put each
            # class weight on the wrong class.
            loss = criterion(y, predict)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_losses.append(loss.item())
            epoch_scores.append(predict.detach().cpu().numpy())
            epoch_labels.append(y.cpu().numpy())

        # Only what the loader actually produced is scored; a buffer sized for
        # the full dataset would leave unfilled entries in the metrics.
        predicts = np.concatenate(epoch_scores)
        labels = np.concatenate(epoch_labels)

        # Mean loss over the epoch rather than the last mini-batch only.
        writer.add_scalar('Loss', float(np.mean(epoch_losses)), epoch)
        writer.add_scalar('Predict', predicts.sum(), epoch)
        _log_split_metrics(writer, 'Train', labels, predicts, epoch, decision_threshold)

        # ---- validation pass over the 30% split (model stays on `device`) ----
        network.eval()
        epoch_scores, epoch_labels = [], []
        with torch.no_grad():
            for x, y in val_loader:
                predict = torch.flatten(network(x.to(device)))
                epoch_scores.append(predict.cpu().numpy())
                epoch_labels.append(torch.flatten(y).numpy())
        predicts = np.concatenate(epoch_scores)
        labels = np.concatenate(epoch_labels)
        # `auc` is the validation AUC; it is also used in checkpoint file names.
        auc = _log_split_metrics(writer, 'Val', labels, predicts, epoch, decision_threshold)

        if epoch % 10 == 0:
            # Store CPU tensors so the checkpoint loads on machines without a GPU.
            cpu_state = {key: value.cpu() for key, value in network.state_dict().items()}
            torch.save(cpu_state,
                        f'./model/user_click_{epoch}_{auc:.2f}.pth')
finally:
    # Flush TensorBoard events even when training is interrupted, and leave
    # the model on the CPU so a follow-up checkpoint is device-independent.
    writer.close()
    network.to('cpu')

print(f'train time: {time.time() - start_time} seconds')

 10%|█         | 21/200 [03:37<30:53, 10.35s/it]


KeyboardInterrupt: 

In [None]:
# Manual checkpoint of the current weights; the file name embeds the latest `auc`.
checkpoint_path = f'./model/100_{auc:.2f}.pth'
torch.save(network.state_dict(), checkpoint_path)

In [None]:
writer = SummaryWriter(log_dir=f'./runs/user_click_{time.strftime("%Y%m%d-%H%M%S")}')

# The input width is read from the tensor that actually feeds the loaders, so
# it cannot drift from a width computed in another cell.
network = UserClickNetwork(input_dimension=train_x_tensor.shape[1],
                            hidden_dimension=hidden_dimension,
                            output_dimension=output_dimension).to(device)

optimizer = torch.optim.Adam(network.parameters(), lr=learning_rate)
criterion = BCELoss()

start_time = time.time()
try:
    for epoch in tqdm.tqdm(range(number_of_epoch)):
        network.train()
        epoch_losses, epoch_scores, epoch_labels = [], [], []
        # Final fit on every labelled row: iterate the whole-data loader, not
        # the 70% training split (the checkpoints below are named 'whole_*').
        for x, y in whole_train_loader:
            x = x.to(device)
            y = torch.flatten(y.to(device))
            predict = torch.flatten(network(x))
            # torch.nn.BCELoss takes (input, target).
            loss = criterion(predict, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_losses.append(loss.item())
            epoch_scores.append(predict.detach().cpu().numpy())
            epoch_labels.append(y.cpu().numpy())

        predicts = np.concatenate(epoch_scores)
        labels = np.concatenate(epoch_labels)

        # One 'Loss' point per epoch (mean over its mini-batches) instead of
        # many points written at the same step.
        writer.add_scalar('Loss', float(np.mean(epoch_losses)), epoch)

        auc = roc_auc_score(labels, predicts)

        decisions = np.where(predicts >= 0.5, 1, 0)
        accuracy = accuracy_score(labels, decisions)
        f1 = f1_score(labels, decisions)
        writer.add_scalar('Train AUC', auc, epoch)
        # Log the computed values, not the sklearn metric functions themselves.
        writer.add_scalar('Train Accuracy', accuracy, epoch)
        writer.add_scalar('Train F1', f1, epoch)

        if epoch % 10 == 0:
            torch.save(network.state_dict(),
                        f'./model/whole_{epoch}_{auc:.2f}.pth')
finally:
    # Flush TensorBoard events even when training is interrupted.
    writer.close()

print(f'train time: {time.time() - start_time} seconds')

torch.save(network.state_dict(), f'./model/whole_100_{auc:.2f}.pth')

In [None]:
# map_location='cpu' lets a checkpoint written from a GPU session load on a
# machine without CUDA; inference below runs on the CPU.
network.load_state_dict(torch.load('./model/100_0.87.pth', map_location='cpu'))
network.to('cpu')
network.eval()

with torch.no_grad():
    # The test loader yields feature batches only (the test dataset has no labels).
    batch_outputs = [network(x).numpy() for x in test_loader]

# One row of scores per test impression; the width follows the network output
# instead of a hard-coded 15. float64 matches the dtype of the buffer that was
# previously filled row by row.
predicts = np.concatenate(batch_outputs).astype(np.float64)

In [None]:
# One column per candidate slot (p1 ... p15), one row per test impression.
prediction_columns = [f'p{i}' for i in range(1, 16)]
submit = pd.DataFrame(predicts, columns=prediction_columns).rename_axis('index')
submit.to_csv('./data/submit.csv')