In [None]:
# Load the TensorBoard notebook extension; it provides the `%tensorboard` line magic used in the last cell.
%load_ext tensorboard 

In [None]:
 #%reload_ext tensorboard

In [None]:
import os
# Root directory for the TensorBoard event files; created up front if it does not exist yet.
logs_base_dir = "runs2"
os.makedirs(logs_base_dir, exist_ok=True)

In [None]:
from torch.utils.tensorboard import SummaryWriter

# One writer per recommender, each logging into its own sub-directory of logs_base_dir.
tb_PopularityRS = SummaryWriter(log_dir=f'{logs_base_dir}/{logs_base_dir}_PopularityRS/')
tb_RandomRS = SummaryWriter(log_dir=f'{logs_base_dir}/{logs_base_dir}_RandomRS/')

In [None]:
# Drive == 1 takes the Google Drive branch in the next cell; Drive == 0 switches Path to "./data/".
Drive = 1
# Prefix prepended to the data file names; it stays empty unless Drive == 0.
Path = ""

In [None]:
# With Drive == 1: mount Google Drive (Colab only) and change the working directory to the data
# folder, so the data files can be addressed with the empty Path prefix.
if Drive == 1:
  from google.colab import drive
  drive.mount('/content/drive')
  %cd /content/drive/My Drive/LabData/ 

Mounted at /content/drive
/content/drive/My Drive/LabData


In [None]:
# Without Google Drive the data files are addressed relative to the local ./data/ folder.
if Drive == 0:
    Path = "./data/"


# Código Solución Python (Bernat)

In [None]:
import torch
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
import scipy.sparse as sp
from tqdm import tqdm


def add_label_column(transaction, value):
    """Attach a constant ``label`` column to ``transaction``.

    The frame is modified in place and the very same object is returned, so the
    caller may either use the return value or keep using its own reference.

    Args:
        transaction: DataFrame that receives the new column.
        value: Value written into every row of the ``label`` column.

    Returns:
        The same DataFrame object that was passed in.
    """
    labelled = transaction  # alias only: no copy is made
    labelled["label"] = value
    return labelled


def extract_dictionary(transaction):
    """Map raw customer and article identifiers to consecutive integer ids.

    Ids start at 1 and are handed out in order of first appearance in
    ``transaction``, independently for customers and for articles.

    Args:
        transaction: DataFrame with ``customer_id`` and ``article_id`` columns.

    Returns:
        Tuple ``(cust_dict, art_dict)`` of ``{raw identifier: integer id}`` mappings.
    """
    # Iterate the two columns directly instead of DataFrame.iterrows(): iterrows builds a
    # Series per row (slow on large frames) and may upcast values to a common dtype.
    # dict.fromkeys keeps only the first occurrence of each key, in insertion order.
    unique_customers = dict.fromkeys(transaction["customer_id"].tolist())
    unique_articles = dict.fromkeys(transaction["article_id"].tolist())

    cust_dict = {customer: idx for idx, customer in enumerate(unique_customers, start=1)}
    art_dict = {article: idx for idx, article in enumerate(unique_articles, start=1)}
    return cust_dict, art_dict


def generate_datasets(transaction, cust_dict, art_dict):
    """Split the transactions into a leave-one-out test set and a training set.

    Rows are visited in the order given. The first row seen after the customer
    changes goes to the test list, every following row of that customer goes to
    the training list, so the frame is expected to be grouped by customer.

    Args:
        transaction: DataFrame with ``customer_id``, ``article_id``, ``label`` and
            ``t_dat`` (a ``YYYY-MM-DD`` string) columns.
        cust_dict: Mapping from raw customer identifier to integer id.
        art_dict: Mapping from raw article identifier to integer id.

    Returns:
        Tuple ``(test_data_list, train_data_list)``; each entry is
        ``[customer id, article id, label, date as YYYYMMDD integer]``.
    """
    held_out_rows, training_rows = [], []
    previous_customer = -999  # sentinel that matches no mapped customer id

    for _, record in transaction.iterrows():
        customer_idx = cust_dict[record["customer_id"]]
        article_idx = art_dict[record["article_id"]]
        date_number = int(record["t_dat"].replace('-', ''))
        sample = [customer_idx, article_idx, record["label"], date_number]

        if customer_idx == previous_customer:
            training_rows.append(sample)
        else:
            # First row of a new customer block: keep it for testing.
            previous_customer = customer_idx
            held_out_rows.append(sample)

    len_test = len(held_out_rows)
    len_train = len(training_rows)
    # The summary is only printed when this code runs as a module named "build_dataset".
    if __name__ == "build_dataset":
        print(f' \tTest dataset generated, length:: {len_test}')
        print(f' \tTrain dataset generated, length: {len_train}')
    return held_out_rows, training_rows


def build_adj_mx(dims, interactions):
    """Build a symmetric ``dims x dims`` sparse adjacency matrix.

    For each interaction the first two entries are taken as a pair of indices and
    both ``(first, second)`` and ``(second, first)`` are set to 1.0.

    Args:
        dims: Number of rows and columns of the square matrix.
        interactions: Iterable of indexable rows whose first two entries are indices.

    Returns:
        A ``scipy.sparse.dok_matrix`` of dtype float32.
    """
    adjacency = sp.dok_matrix((dims, dims), dtype=np.float32)
    for interaction in tqdm(interactions, desc="BUILDING ADJACENCY MATRIX..."):
        source, target = interaction[0], interaction[1]
        adjacency[source, target] = 1.0
        adjacency[target, source] = 1.0
    return adjacency


class CustomerArticleDataset(torch.utils.data.Dataset):
    """Customer/article interactions with sampled negatives for implicit feedback.

    The constructor reads ``customer.train.article`` and ``customer.test.article``
    from ``dataset_path`` (no header, columns: customer_id, article_id, label, t_dat),
    shifts the article ids behind the customer ids so both live in one index space,
    and then builds:

    * ``interactions``: array of ``(customer, article, label)`` rows, each positive
      followed by ``num_negatives_train`` sampled negatives (label 0);
    * ``test_set``: one array per test row, whose first row is the ground-truth pair
      and whose remaining rows are ``num_negatives_test`` sampled negatives.

    Negatives are drawn with ``np.random`` and are therefore not reproducible unless
    the caller seeds NumPy beforehand.
    """

    def __init__(self, dataset_path, num_negatives_train=4, num_negatives_test=100, sep='\t'):
        """Load both splits and pre-compute the training and test samples.

        Args:
            dataset_path: Prefix (directory with trailing separator, or "") of the split files.
            num_negatives_train: Negatives sampled per positive training interaction.
            num_negatives_test: Negatives sampled per test interaction.
            sep: Column separator of the split files.
        """
        # Offset added to every article id (see preprocess_items).
        number_customers = 10000
        column_names = ["customer_id", "article_id", "label", "t_dat"]
        train_data = pd.read_csv(f'{dataset_path}customer.train.article', sep=sep,
                                 header=None, names=column_names).to_numpy()
        test_data = pd.read_csv(f'{dataset_path}customer.test.article', sep=sep,
                                header=None, names=column_names).to_numpy()

        # Labels of the positive training rows and their (customer, shifted article) pairs.
        self.targets = train_data[:, 2]
        self.items = self.preprocess_items(train_data, number_customers)

        # Column-wise maximum + 1: [largest customer id + 1, largest shifted article id + 1].
        self.field_dims = np.max(self.items, axis=0) + 1
        # Square matrix large enough to index every customer and every shifted article id.
        self.train_mat = build_adj_mx(self.field_dims[-1], self.items.copy())

        # Populate self.interactions with positives and sampled negatives.
        self.negative_sampling(num_negatives=num_negatives_train)

        # Build the per-row test arrays from the test interactions.
        self.test_set = self.build_test_set(self.preprocess_items(test_data, number_customers),
                                            num_neg_samples_test=num_negatives_test)

    def __len__(self):
        """Return the number of rows that ``__getitem__`` can serve.

        This is the size of ``interactions`` (positives plus sampled negatives).
        Using the number of positives here would hide most rows from a DataLoader.
        """
        return len(self.interactions)

    def __getitem__(self, index):
        """Return the ``(customer, article, label)`` row at ``index``."""
        return self.interactions[index]

    def preprocess_items(self, data, num_customers):
        """Return the first two columns of ``data`` as integers with shifted article ids.

        Args:
            data: 2-D array whose first two columns are customer id and article id.
            num_customers: Offset added to the article id column.

        Returns:
            New integer array of shape ``(len(data), 2)``; ``data`` is left untouched.
        """
        # Built-in int instead of the np.int alias, which was removed in NumPy 1.24.
        reindexed_items = data[:, :2].astype(int)
        reindexed_items[:, 1] = reindexed_items[:, 1] + num_customers
        return reindexed_items

    def negative_sampling(self, num_negatives):
        """Fill ``self.interactions`` with every positive plus ``num_negatives`` negatives.

        A negative for customer ``u`` is an index ``j`` drawn uniformly from
        ``[field_dims[0], field_dims[1])`` such that ``(u, j)`` is not stored in
        ``train_mat``.
        """
        self.interactions = []
        data = np.c_[(self.items, self.targets)].astype(int)
        max_users, max_items = self.field_dims[:2]

        for x in tqdm(data, desc="Performing negative sampling on train data..."):  # x = (u, i, label)
            # Keep the positive interaction itself.
            self.interactions.append(x)
            if num_negatives <= 0:
                # Nothing to sample; np.vstack would fail on an empty list.
                continue
            # Start from copies of the positive row, zero the label, then replace the article.
            neg_triplet = np.vstack([x, ] * num_negatives)
            neg_triplet[:, 2] = 0

            for idx in range(num_negatives):
                j = np.random.randint(max_users, max_items)
                # Redraw while the pair is a known training interaction.
                while (x[0], j) in self.train_mat:
                    j = np.random.randint(max_users, max_items)
                neg_triplet[idx, 1] = j
            self.interactions.append(neg_triplet)

        self.interactions = np.vstack(self.interactions)

    def build_test_set(self, gt_test_interactions, num_neg_samples_test):
        """Build one evaluation array per ground-truth test pair.

        Args:
            gt_test_interactions: Array of ``(customer, shifted article)`` test pairs.
            num_neg_samples_test: Number of negatives sampled for every pair.

        Returns:
            List of arrays of shape ``(num_neg_samples_test + 1, 2)``; row 0 is the
            ground-truth pair, the other rows share the customer and hold negatives
            that are neither training interactions nor the ground-truth article.
        """
        max_users, max_items = self.field_dims[:2]
        test_set = []
        for pair in tqdm(gt_test_interactions, desc="BUILDING TEST SET..."):
            negatives = []
            for _ in range(num_neg_samples_test):
                j = np.random.randint(max_users, max_items)
                while (pair[0], j) in self.train_mat or j == pair[1]:
                    j = np.random.randint(max_users, max_items)
                negatives.append(j)
            # Row 0 keeps the ground truth; rows 1.. get the sampled negatives.
            single_user_test_set = np.vstack([pair, ] * (len(negatives) + 1))
            single_user_test_set[1:, 1] = negatives
            test_set.append(single_user_test_set)
        return test_set


# Load the raw transactions, mark every row as a positive interaction (label 1) and order
# them per customer with the most recent purchase first.
transactions = add_label_column(
    pd.read_csv(Path + "transactions_ddup_2019-09-22_nart_5_ncust_20_ncustr_10000.csv"),
    1,
).sort_values(by=['customer_id', 't_dat'], ascending=[True, False])

# Re-index customers/articles and split: first row per customer -> test, the rest -> train.
customer_dict, article_dict = extract_dictionary(transactions)
test_dataset, train_dataset = generate_datasets(transactions, customer_dict, article_dict)

# Persist both splits as header-less, tab-separated files next to the input data.
column_names = ["customer_id", "article_id", "label", "t_dat"]
train_data = pd.DataFrame(train_dataset, columns=column_names)
test_data = pd.DataFrame(test_dataset, columns=column_names)
train_data.to_csv(Path + "customer.train.article", sep="\t", header=False, index=False)
test_data.to_csv(Path + "customer.test.article", sep="\t", header=False, index=False)

In [None]:
# Read the split files from the location they were written to: they are saved under the
# `Path` prefix, which is "" on Google Drive but "./data/" for a local run.
dataset_path = Path
full_dataset = CustomerArticleDataset(dataset_path, num_negatives_train=4, num_negatives_test=99)
#data_loader = DataLoader(full_dataset, batch_size=256, shuffle=True, num_workers=0)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
BUILDING ADJACENCY MATRIX...: 100%|██████████| 397946/397946 [00:10<00:00, 38019.13it/s]
Performing negative sampling on test data...: 100%|██████████| 397946/397946 [00:13<00:00, 29055.63it/s]
BUILDING TEST SET...: 100%|██████████| 10000/10000 [00:05<00:00, 1857.55it/s]


In [None]:
import math

def getHitRatio(recommend_list, gt_item):
    """Return 1 if ``gt_item`` is among the recommended items, otherwise 0.

    Args:
        recommend_list: Recommended items (list, tuple, NumPy array or pandas Series/Index).
        gt_item: Ground-truth item to look for.

    Returns:
        1 on a hit, 0 otherwise.
    """
    # `x in series` tests the *index* of a pandas Series, not its values, so pandas
    # containers are converted to their underlying values before the membership test.
    if hasattr(recommend_list, "to_numpy"):
        recommend_list = recommend_list.to_numpy()
    return 1 if gt_item in recommend_list else 0

def getNDCG(recommend_list, gt_item):
    """Return the NDCG of a ranked list that has a single relevant item.

    The score is ``log(2) / log(position + 2)`` where ``position`` is the 0-based
    rank of the first occurrence of ``gt_item``; it is 0 when the item is absent.

    Args:
        recommend_list: Ranked recommendations (list, NumPy array or pandas Series).
        gt_item: Ground-truth item.

    Returns:
        A float in (0, 1] on a hit, otherwise 0.
    """
    # Convert first: comparing a plain Python list with a scalar yields a single bool
    # instead of an element-wise mask.
    positions = np.flatnonzero(np.asarray(recommend_list) == gt_item)
    if positions.size > 0:
        # Use the first match only; math.log needs a scalar, not an index array.
        return math.log(2) / math.log(int(positions[0]) + 2)
    return 0

def getCoverage(total_recommended_items, total_items):
    """Return the share of the catalogue that was recommended at least once.

    Args:
        total_recommended_items: Number of distinct items that were recommended.
        total_items: Number of distinct items in the catalogue.

    Returns:
        ``total_recommended_items`` divided by ``total_items``.
    """
    coverage_ratio = total_recommended_items / total_items
    return coverage_ratio

# Popularity Recommender System

In [None]:
import numpy
import pandas

class Popularity_Recommender():
    """Non-personalised baseline that recommends the globally most popular items.

    Popularity of an item is the sum of its label column over the training
    interactions. Items a user already has a positive interaction with are left
    out of that user's recommendations.
    """

    def __init__(self):
        """Create an empty recommender; call ``create`` before ``predict``."""
        self.train_data = None  # interactions DataFrame
        self.user_id = None  # name of the customer/user column
        self.item_id = None  # name of the article/item column
        self.label = None  # name of the label column
        self.popularity_recommendations = None  # items ranked by popularity
        self._items_by_user = {}  # user -> set of items with a positive label

    def create(self, train_data, user_id, item_id, label):
        """Rank all items by popularity and index each user's positive items.

        Args:
            train_data: DataFrame of interactions.
            user_id: Name of the user column in ``train_data``.
            item_id: Name of the item column in ``train_data``.
            label: Name of the label column; its per-item sum is the popularity score.
        """
        self.train_data = train_data
        self.user_id = user_id
        self.item_id = item_id
        self.label = label

        # Summing the label counts only real interactions when negatives carry label 0.
        scores = (
            train_data.groupby([self.item_id])
            .agg({self.label: 'sum'})
            .reset_index()
            .rename(columns={self.label: 'score'})
        )
        # Highest score first; ties are broken by ascending item id.
        ranked = scores.sort_values(['score', self.item_id], ascending=[False, True])
        # Rank 1 is the most popular item (descending score, ties in order of appearance).
        ranked['Rank'] = ranked['score'].rank(ascending=False, method='first')
        self.popularity_recommendations = ranked

        # Cache the positive items per user once, so predict() does not have to scan
        # the whole interaction table on every call.
        positives = train_data[train_data[self.label] > 0]
        self._items_by_user = {
            user: set(items)
            for user, items in positives.groupby(self.user_id)[self.item_id]
        }

    def predict(self, user_id, topk=10):
        """Return the ``topk`` most popular items the user has not interacted with.

        Args:
            user_id: Identifier of the user, as found in the user column.
            topk: Maximum number of items to return.

        Returns:
            pandas Series of item ids, most popular first. The Series keeps the row
            index of the ranking table, so use its values rather than its index.
        """
        already_bought = self._items_by_user.get(user_id, ())
        candidates = self.popularity_recommendations
        if already_bought:
            # Drop items the user already has a positive interaction with.
            candidates = candidates[~candidates[self.item_id].isin(already_bought)]
        return candidates.head(topk)[self.item_id]

In [None]:
# Inspect the sampled training rows: (customer id, shifted article id, label).
full_dataset.interactions

array([[    1, 10002,     1],
       [    1, 40132,     0],
       [    1, 13990,     0],
       ...,
       [10000, 20683,     0],
       [10000, 31647,     0],
       [10000, 24842,     0]])

In [None]:
# Fit the popularity baseline on the sampled training interactions (positives and negatives).
model = Popularity_Recommender()
model.create(pd.DataFrame(full_dataset.interactions, columns=['customer_id','article_id','label']), 'customer_id', 'article_id','label')

In [None]:
# Example: top-5 recommendations for the customer of training row 4000.
model.predict(full_dataset.items[4000][0],5)

16      10017
1414    11415
1514    11515
145     10146
1736    11737
Name: article_id, dtype: int64

In [None]:
from statistics import mean
def test_popularity(model, full_dataset, topk=10):
    """Evaluate ``model`` on the test set with HR@topk, NDCG@topk and item coverage.

    Args:
        model: Recommender exposing ``predict(user_id, topk)``.
        full_dataset: Dataset whose ``test_set`` holds one array per test case; row 0
            of each array is the ground-truth ``(user, item)`` pair.
        topk: Length of the recommendation list requested per user.

    Returns:
        Tuple ``(mean HR, mean NDCG, COVERAGE)`` where ``COVERAGE`` is the list of
        distinct recommended items in order of first appearance.

    Raises:
        statistics.StatisticsError: If the test set is empty.
    """
    HR, NDCG, COVERAGE = [], [], []
    # Membership is checked against a set; scanning the growing COVERAGE list for every
    # recommended item is quadratic in the number of distinct items.
    seen_items = set()

    for user_test in full_dataset.test_set:
        gt_item = user_test[0][1]
        # Work on the recommended values themselves: predict() may return a pandas
        # Series, whose `in` operator would look at the index instead of the items.
        recommend_list = np.asarray(model.predict(user_test[0][0], topk))
        for art in recommend_list.tolist():
            if art not in seen_items:
                seen_items.add(art)
                COVERAGE.append(art)
        HR.append(getHitRatio(recommend_list, gt_item))
        NDCG.append(getNDCG(recommend_list, gt_item))
    return mean(HR), mean(NDCG), COVERAGE

In [None]:
# Check Init performance
# The catalogue size does not depend on the cut-off, so it is computed once up front.
total_items = len(np.unique(full_dataset.interactions[:, 1]))

for i in [10, 50, 100, 300, 500, 1000]:
    hr, ndcg, cov = test_popularity(model, full_dataset, topk=i)

    print("Topk:", i)
    print("initial HR: ", hr)
    print("initial NDCG: ", ndcg)

    Coverage = getCoverage(len(cov), total_items)
    print("initial COVERAGE: ", Coverage)

    # NOTE: the tags are plain strings, so "{topk}" is logged literally; the cut-off is
    # carried by the step argument, which gives one curve per metric over topk.
    tb_PopularityRS.add_scalar('eval/COVERAGE@{topk}', Coverage, i)
    tb_PopularityRS.add_scalar('eval/HR@{topk}', hr, i)
    tb_PopularityRS.add_scalar('eval/NDCG@{topk}', ndcg, i)

# Write buffered events to disk so they are available when TensorBoard is opened.
tb_PopularityRS.flush()

Topk: 10
initial HR:  0
initial NDCG:  0.0036927681667798477
initial COVERAGE:  0.00041546571109553115
Topk: 50
initial HR:  0
initial NDCG:  0.007284893343239425
initial COVERAGE:  0.001635896237438654
Topk: 100
initial HR:  0
initial NDCG:  0.009758870636682165
initial COVERAGE:  0.003012126405442601
Topk: 300
initial HR:  0.0001
initial NDCG:  0.015249354332938591
initial COVERAGE:  0.008543013684401858
Topk: 500
initial HR:  0.0003
initial NDCG:  0.018351483101688038
initial COVERAGE:  0.013918101321700294
Topk: 1000
initial HR:  0.0026
initial NDCG:  0.023686418136420195
initial COVERAGE:  0.027420736932305057


# Random Recommender

In [None]:
import numpy
import pandas
import random

class Random_Recommender():
    """Baseline that recommends items drawn uniformly at random from the catalogue.

    The catalogue is the set of distinct items in the training interactions. Items
    the user already interacted with are NOT filtered out.
    """

    def __init__(self):
        """Create an empty recommender; call ``create`` before ``predict``."""
        self.train_data = None  # interactions DataFrame
        self.user_id = None  # name of the customer/user column
        self.item_id = None  # name of the article/item column
        self.label = None  # name of the label column
        self.list_items = None  # distinct items of the catalogue
        self.random_recommendations = None  # most recently returned recommendation list

    def create(self, train_data, user_id, item_id, label):
        """Store the training data and collect the distinct items.

        Args:
            train_data: DataFrame of interactions.
            user_id: Name of the user column in ``train_data``.
            item_id: Name of the item column in ``train_data``.
            label: Name of the label column (stored, not used for sampling).
        """
        self.train_data = train_data
        self.user_id = user_id
        self.item_id = item_id
        self.label = label

        self.list_items = list(train_data[self.item_id].unique())

    def predict(self, user_id, topk=10):
        """Return up to ``topk`` distinct items drawn at random from the catalogue.

        Args:
            user_id: Identifier of the user; accepted for interface parity with the
                other recommenders and not used.
            topk: Number of items requested; capped at the catalogue size.

        Returns:
            List of distinct items in random order.
        """
        # Draw a sample instead of shuffling the shared catalogue list in place on every
        # call: the catalogue stays untouched and only `topk` items are produced.
        sample_size = max(0, min(topk, len(self.list_items)))
        self.random_recommendations = random.sample(self.list_items, sample_size)
        return self.random_recommendations

In [None]:
# Build the random baseline from the same sampled training interactions.
model_random = Random_Recommender()
model_random.create(pd.DataFrame(full_dataset.interactions, columns=['customer_id','article_id','label']), 'customer_id', 'article_id','label')

In [None]:
# Example: 5 random recommendations for the customer of training row 100.
model_random.predict(full_dataset.items[100][0],5)

[39003, 33913, 28677, 21316, 18629]

In [None]:
from statistics import mean
def test_random(model, full_dataset, topk=10):
    """Evaluate ``model`` on the test set with HR@topk, NDCG@topk and item coverage.

    Args:
        model: Recommender exposing ``predict(user_id, topk)``.
        full_dataset: Dataset whose ``test_set`` holds one array per test case; row 0
            of each array is the ground-truth ``(user, item)`` pair.
        topk: Length of the recommendation list requested per user.

    Returns:
        Tuple ``(mean HR, mean NDCG, COVERAGE)`` where ``COVERAGE`` is the list of
        distinct recommended items in order of first appearance.

    Raises:
        statistics.StatisticsError: If the test set is empty.
    """
    HR, NDCG, COVERAGE = [], [], []
    # A random recommender ends up touching almost the whole catalogue, so membership is
    # tracked in a set; scanning the growing COVERAGE list per item is quadratic.
    seen_items = set()

    for user_test in full_dataset.test_set:
        gt_item = user_test[0][1]
        # A NumPy array gives element-wise comparison and value-based `in` for the metrics.
        recommend_list = np.asarray(model.predict(user_test[0][0], topk))
        for art in recommend_list.tolist():
            if art not in seen_items:
                seen_items.add(art)
                COVERAGE.append(art)
        HR.append(getHitRatio(recommend_list, gt_item))
        NDCG.append(getNDCG(recommend_list, gt_item))
    return mean(HR), mean(NDCG), COVERAGE

In [None]:
# Check Init performance
# The catalogue size does not depend on the cut-off, so it is computed once up front.
total_items = len(np.unique(full_dataset.interactions[:, 1]))

for i in [10, 50, 100, 300, 500, 1000]:
    hr, ndcg, cov = test_random(model_random, full_dataset, topk=i)

    print("Topk:", i)
    print("initial HR: ", hr)
    print("initial NDCG: ", ndcg)

    Coverage = getCoverage(len(cov), total_items)
    print("initial COVERAGE: ", Coverage)

    # NOTE: the tags are plain strings, so "{topk}" is logged literally; the cut-off is
    # carried by the step argument, which gives one curve per metric over topk.
    tb_RandomRS.add_scalar('eval/COVERAGE@{topk}', Coverage, i)
    tb_RandomRS.add_scalar('eval/HR@{topk}', hr, i)
    tb_RandomRS.add_scalar('eval/NDCG@{topk}', ndcg, i)

# Write buffered events to disk so they are available when TensorBoard is opened.
tb_RandomRS.flush()

Topk: 10
initial HR:  0.0004
initial NDCG:  0.00013883320077684737
initial COVERAGE:  0.9263067694944301
Topk: 50
initial HR:  0.0015
initial NDCG:  0.00037428144152842755
initial COVERAGE:  1.0
Topk: 100
initial HR:  0.0026
initial NDCG:  0.0005103057800542293
initial COVERAGE:  1.0
Topk: 300
initial HR:  0.0072
initial NDCG:  0.0011343281135425872
initial COVERAGE:  1.0
Topk: 500
initial HR:  0.0128
initial NDCG:  0.0017788138294628495
initial COVERAGE:  1.0
Topk: 1000
initial HR:  0.0245
initial NDCG:  0.0031863902593946706
initial COVERAGE:  1.0


# VISUALIZING RESULTS

In [None]:
#from tensorboard import notebook
#notebook.list() # View open TensorBoard instances

Known TensorBoard instances:
  - port 6006: logdir runs (started 0:02:33 ago; pid 2117)


In [None]:
%tensorboard --logdir runs2

UsageError: Line magic function `%tensorboard` not found.
