In [None]:
from implicit.cpu.als import AlternatingLeastSquares
from implicit.cpu.bpr import BayesianPersonalizedRanking
from implicit.cpu.lmf import LogisticMatrixFactorization
from implicit.evaluation import precision_at_k,mean_average_precision_at_k
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from scipy import sparse
from datetime import datetime
from datetime import timedelta
from recometrics import split_reco_train_test,calc_reco_metrics
from cmfrec import MostPopular
import csv


In [None]:
# Input locations: both CSV files live in the same dataset directory.
DATASET = 'DatasetsWtime'
PULSE_RED = DATASET + '/pulse_red.csv'        # reading log (spid, article_id, views, published_date)
ARTICLE_CORE = DATASET + '/article_core.csv'  # article metadata (article_id, section_title)

In [None]:
# Load the reading log; the first CSV column becomes the frame's index.
df_pulse_red = pd.read_csv(PULSE_RED, index_col=[0], low_memory=False)

# Binarise the interaction strength: every view count above one is set to one.
repeated_views = df_pulse_red['views'] > 1
df_pulse_red.loc[repeated_views, 'views'] = 1
df_pulse_red

## FILTERS

### Remove Direkte articles and articles published outside the timespan

In [None]:
# Article metadata; only the article id and its section title are used here.
df_article_core = pd.read_csv(ARTICLE_CORE)

# article_id -> section_title lookup, built straight from the two columns
# instead of a Python-level pass over every row. For a repeated article_id
# the last row wins, exactly as with a dict comprehension over the rows.
section_of_article = dict(
    zip(df_article_core['article_id'], df_article_core['section_title'])
)

# Ids of every article whose section title is exactly 'Direkte', selected
# with a boolean mask (missing titles compare unequal and are skipped).
is_direkte = df_article_core['section_title'] == 'Direkte'
direkte_articles = df_article_core.loc[is_direkte, 'article_id'].tolist()

In [None]:
# Every article id known from the metadata, minus the 'Direkte' ones.
known_article_ids = set(section_of_article)
excluded_article_ids = set(direkte_articles)
article_id_to_keep = list(known_article_ids - excluded_article_ids)

In [None]:
# Keep only the reading rows whose article id is in the keep-list.
has_kept_article = df_pulse_red['article_id'].isin(article_id_to_keep)
df_pulse_red_article_keep = df_pulse_red.loc[has_kept_article]
df_pulse_red_article_keep

### Remove environment ids (not registered users)

In [None]:
# Distinct spids that consist solely of digits; any spid containing another
# character is left out.
spids_without_cookies = [
    spid for spid in set(df_pulse_red_article_keep.spid) if spid.isdigit()
]


In [None]:
# Restrict the log to rows whose spid passed the digits-only check.
is_digit_spid = df_pulse_red_article_keep.spid.isin(spids_without_cookies)
df_without_cookies = df_pulse_red_article_keep.loc[is_digit_spid]
df_without_cookies

### Remove articles read by only one user

In [None]:
# Number of rows with a non-null 'views' value per article, most-read first.
views_per_article = df_without_cookies.groupby(by='article_id')['views'].count()
article_reads = views_per_article.sort_values(ascending=False)
article_reads

In [None]:
# Per-article tally of rows with a non-null 'views' value, largest first.
# NOTE(review): this counts rows, which equals the number of distinct readers
# only if each (spid, article_id) pair occurs once in the data — TODO confirm.
article_reads = (
    df_without_cookies
    .groupby(by='article_id')['views']
    .count()
    .sort_values(ascending=False)
)

In [None]:
# Articles with a tally of at least two are retained; the result is the list
# of their ids.
read_at_least_twice = article_reads >= 2
articlestokeep = list(article_reads[read_at_least_twice].index)

In [None]:
# Drop the rows of articles that did not reach the minimum tally.
is_kept_article = df_without_cookies['article_id'].isin(articlestokeep)
df_min_article_reads = df_without_cookies.loc[is_kept_article]

### Remove users who haven't read at least one article per week

In [None]:
# 'published_date' of entry 0 in the unfiltered log; it anchors the weekly
# windows built in the following cells.
# NOTE(review): presumably this is the earliest date in the data — confirm.
published_dates = df_pulse_red['published_date']
datetime_str = published_dates[0]

In [None]:
# Parse the anchor timestamp (layout: 'YYYY-MM-DD HH:MM:SS').
timestamp_layout = '%Y-%m-%d %H:%M:%S'
datetime_object = datetime.strptime(datetime_str, timestamp_layout)

In [None]:
# End points of eight consecutive one-week windows after the anchor date
# (7, 14, ..., 56 days later). Despite the name, these are datetimes.
time_deltas = [datetime_object + timedelta(weeks=week) for week in range(1, 9)]
time_deltas

In [None]:
# Keep only the spids that have at least one row in every weekly window.
# Windows are defined on 'published_date' and the bounds are compared as
# strings; this assumes the column uses the same 'YYYY-MM-DD HH:MM:SS' layout
# that str(datetime) produces — TODO confirm for the whole column.
start = str(datetime_object)
weekly_readers = None  # spids present in every window processed so far
for window_end in time_deltas:
    to = str(window_end)
    # Series.between is inclusive on both ends, so a row stamped exactly on a
    # boundary counts towards both adjacent weeks.
    in_window = df_min_article_reads['published_date'].between(start, to)
    window_spids = set(df_min_article_reads.loc[in_window, 'spid'])
    if weekly_readers is None:
        weekly_readers = window_spids
    else:
        weekly_readers &= window_spids
    start = to  # the next window starts where this one ended
spids_to_keep = list(weekly_readers) if weekly_readers is not None else []
print(len(spids_to_keep))

In [None]:
# Keep only the rows of the spids that were present in every weekly window.
is_weekly_reader = df_min_article_reads['spid'].isin(spids_to_keep)
df_pulse_red_one_article_per_week = df_min_article_reads.loc[is_weekly_reader]
df_pulse_red_one_article_per_week

### Prep dataframe for training

In [None]:
# The publication date was only needed for the weekly filter; drop it.
df_pr = df_pulse_red_one_article_per_week.drop(columns=['published_date'])

In [None]:
# Replace the index left over from filtering with a fresh 0..n-1 range.
# drop=True discards the old index directly, so this also works when the old
# index carries a name (reset_index() would then create a column under that
# name and a follow-up drop('index') would raise a KeyError).
df_pr = df_pr.reset_index(drop=True)
df_pr

In [None]:
# Keep rows that have both a user and an article id, then turn both id
# columns into categoricals so their integer codes can serve as matrix
# coordinates. .assign builds a new frame, which avoids writing columns into
# a .loc slice (a source of SettingWithCopyWarning).
has_user_and_article = df_pr.spid.notnull() & df_pr.article_id.notnull()
df_pr = df_pr.loc[has_user_and_article].assign(
    spid=lambda frame: frame['spid'].astype('category'),
    article_id=lambda frame: frame['article_id'].astype('category'),
)
df_pr.head(5)

### Train test split

In [None]:
# User x article interaction matrix in COO format: rows are spid category
# codes, columns are article_id category codes, values are the 'views'.
# Uses the `sparse` module imported at the top of the notebook rather than a
# second, mid-notebook import. The shape is given explicitly from the number
# of categories so it does not have to be inferred from the largest code.
user_codes = df_pr.spid.cat.codes
item_codes = df_pr.article_id.cat.codes
X = sparse.coo_matrix(
    (df_pr.views, (user_codes, item_codes)),
    shape=(len(df_pr.spid.cat.categories), len(df_pr.article_id.cat.categories)),
)
X

In [None]:
# Hold out a share of the users for evaluation (20%, capped at 20000 users).
# NOTE(review): the four results are bound positionally. Confirm against the
# recometrics documentation that split_type="separated" returns them in this
# (fit, train, test, users) order, and that the row slicing done by the
# evaluation helpers below is valid for this split type.
(X_fit_reco, X_train_reco, X_test_reco, test_users_reco) = split_reco_train_test(
    X,
    split_type="separated",
    users_test_fraction=0.2,
    max_test_users=20000,
)

### Evaluation

### Random and Most popular scores

In [None]:
def calculateRandomAndMostPopular(fit_set,train_set,test_set,k=5):
    """Score a random and a most-popular baseline and return their mean metrics.

    Parameters
    ----------
    fit_set
        Matrix the ``MostPopular`` model is fitted on.
    train_set, test_set
        Matrices handed to ``calc_reco_metrics``. Only the first
        ``test_set.shape[0]`` rows of ``train_set`` are used.
    k : int, default 5
        Cut-off forwarded to ``calc_reco_metrics`` (previously hard-coded).
        Callers that index columns such as ``'P@5'`` rely on the default.

    Returns
    -------
    pandas.DataFrame
        Two rows labelled ``"Random"`` and ``"Most Popular"``; every column is
        the column-wise mean of the table returned by ``calc_reco_metrics``.
    """
    # Random baseline: normally distributed factors with a fixed seed, so the
    # scores are identical on every call.
    n_random_factors = 5
    rng = np.random.default_rng(seed=1)
    UserFactors_random = rng.standard_normal(size=(test_set.shape[0], n_random_factors))
    ItemFactors_random = rng.standard_normal(size=(test_set.shape[1], n_random_factors))

    # Non-personalized baseline: items are ranked by the fitted item biases.
    model_baseline = MostPopular(implicit=True, user_bias=False).fit(fit_set)
    item_biases = model_baseline.item_bias_

    # NOTE(review): assumes the first test_set.shape[0] rows of train_set are
    # the same users as the rows of test_set — confirm against the split.
    eval_train = train_set[:test_set.shape[0]]

    metrics_random = calc_reco_metrics(
        eval_train, test_set,
        UserFactors_random, ItemFactors_random,
        k=k, all_metrics=True
    )
    metrics_baseline = calc_reco_metrics(
        eval_train, test_set,
        None, None, item_biases=item_biases,
        k=k, all_metrics=True
    )

    # Collapse each metric table to a single row of column means.
    all_metrics = pd.concat(
        [m.mean(axis=0).to_frame().T for m in (metrics_random, metrics_baseline)],
        axis=0,
    )
    all_metrics.index = ["Random", "Most Popular"]
    return all_metrics

In [None]:
# Evaluate both baselines and persist the summary table as CSV.
# Note: the following cell writes to this same path.
results_random_most_popular = calculateRandomAndMostPopular(
    X_fit_reco, X_train_reco, X_test_reco
)
baseline_csv_path = '../Results/randomAndMostPopular_hyper.csv'
results_random_most_popular.to_csv(baseline_csv_path)

In [None]:
# Metric columns in the order they are written. The last header is 'PR_AUC'
# so that it matches the metric key and the headers of the other result files.
metric_columns = ['P@5','TP@5','R@5','AP@5','TAP@5','NDCG@5','Hit@5','RR@5','ROC_AUC','PR_AUC']

# Compute before opening the file: mode 'w' truncates it immediately, so a
# failure during evaluation would otherwise leave an empty CSV behind.
randomAndMostPopular_metrics = calculateRandomAndMostPopular(X_fit_reco,X_train_reco,X_test_reco)

with open('../Results/randomAndMostPopular_hyper.csv','w',newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['metric'] + metric_columns)
    # One row per baseline, labelled with the baseline's name.
    for baseline_name in ['Random', 'Most Popular']:
        scores = [randomAndMostPopular_metrics[column][baseline_name] for column in metric_columns]
        writer.writerow([baseline_name] + scores)


### Hyperparameter tuning ALS

In [None]:
def calculate_ALS(fit_set,train_set,test_set,fac,ite,reg,k=5,random_state=None):
    """Fit an ALS model and return its mean evaluation metrics as one row.

    Parameters
    ----------
    fit_set
        Matrix passed to ``AlternatingLeastSquares.fit``.
    train_set, test_set
        Matrices handed to ``calc_reco_metrics``. Only the first
        ``test_set.shape[0]`` rows of ``train_set`` and of the fitted user
        factors are used.
    fac, ite, reg
        ``factors``, ``iterations`` and ``regularization`` of the model.
    k : int, default 5
        Cut-off forwarded to ``calc_reco_metrics`` (previously hard-coded).
    random_state : optional
        Forwarded to the model so a run can be reproduced; ``None`` keeps the
        library's default behaviour.

    Returns
    -------
    pandas.DataFrame
        A single row labelled ``"ALS"``; every column is the column-wise mean
        of the table returned by ``calc_reco_metrics``.
    """
    ALSmodel = AlternatingLeastSquares(
        factors=fac, iterations=ite, regularization=reg, random_state=random_state
    )
    ALSmodel.fit(fit_set)

    # NOTE(review): assumes the first test_set.shape[0] rows of fit_set and
    # train_set are the same users as the rows of test_set — confirm against
    # how the split was produced.
    n_eval_users = test_set.shape[0]
    metrics_als = calc_reco_metrics(
        train_set[:n_eval_users], test_set,
        ALSmodel.user_factors[:n_eval_users], ALSmodel.item_factors,
        k=k, all_metrics=True
    )

    # Collapse the metric table to a single row of column means.
    all_metrics = metrics_als.mean(axis=0).to_frame().T
    all_metrics.index = ["ALS"]
    return all_metrics

In [None]:
# ALS search grid: every combination of the three lists is evaluated.
ALSfactors = [5, 10, 15, 30, 60, 100]         # values tried for `factors`
ALSiterations = [5, 10, 15, 30, 60, 100]      # values tried for `iterations`
ALSregularization = [0.001, 0.01, 0.1]        # values tried for `regularization`

In [None]:
# Grid search for ALS: one CSV row per (factor, iteration, regularization)
# combination. `csv` comes from the import cell at the top of the notebook.
# Metric columns are listed once and reused for the header and for every row,
# so the two cannot drift apart.
metric_columns = ['P@5','TP@5','R@5','AP@5','TAP@5','NDCG@5','Hit@5','RR@5','ROC_AUC','PR_AUC']
k=5
with open('../Results/ALS_hyper.csv','w',newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['model','k','factor','iteration','regularization'] + metric_columns)
    model_name = 'ALS'
    for fac in ALSfactors:
        for ite in ALSiterations:
            for reg in ALSregularization:
                temp_metric = calculate_ALS(X_fit_reco,X_train_reco,X_test_reco,fac,ite,reg)
                scores = [temp_metric[column][model_name] for column in metric_columns]
                writer.writerow([model_name,k,fac,ite,reg] + scores)


### Hyperparameter tuning BPR

In [None]:
def calculate_BPR(fit_set,train_set,test_set,fac,ite,reg,learning,k=5,random_state=None):
    """Fit a BPR model and return its mean evaluation metrics as one row.

    Parameters
    ----------
    fit_set
        Matrix passed to ``BayesianPersonalizedRanking.fit``.
    train_set, test_set
        Matrices handed to ``calc_reco_metrics``. Only the first
        ``test_set.shape[0]`` rows of ``train_set`` and of the fitted user
        factors are used.
    fac, ite, reg, learning
        ``factors``, ``iterations``, ``regularization`` and ``learning_rate``
        of the model.
    k : int, default 5
        Cut-off forwarded to ``calc_reco_metrics`` (previously hard-coded).
    random_state : optional
        Forwarded to the model so a run can be reproduced; ``None`` keeps the
        library's default behaviour.

    Returns
    -------
    pandas.DataFrame
        A single row labelled ``"BPR"``; every column is the column-wise mean
        of the table returned by ``calc_reco_metrics``.
    """
    BPRmodel = BayesianPersonalizedRanking(
        factors=fac, iterations=ite, regularization=reg,
        learning_rate=learning, random_state=random_state
    )
    BPRmodel.fit(fit_set)

    # NOTE(review): assumes the first test_set.shape[0] rows of fit_set and
    # train_set are the same users as the rows of test_set — confirm against
    # how the split was produced.
    n_eval_users = test_set.shape[0]
    metrics_bpr = calc_reco_metrics(
        train_set[:n_eval_users], test_set,
        BPRmodel.user_factors[:n_eval_users], BPRmodel.item_factors,
        k=k, all_metrics=True
    )

    # Collapse the metric table to a single row of column means.
    all_metrics = metrics_bpr.mean(axis=0).to_frame().T
    all_metrics.index = ["BPR"]
    return all_metrics

In [None]:
# BPR search grid: every combination of the four lists is evaluated.
BPRfactors = [5, 10, 15, 30, 60, 100]         # values tried for `factors`
BPRiterations = [5, 10, 15, 30, 60, 100]      # values tried for `iterations`
BPRregularization = [0.001, 0.01, 0.1]        # values tried for `regularization`
BPRlearning_rates = [0.001, 0.01, 0.1]        # values tried for `learning_rate`

In [None]:
# Grid search for BPR: one CSV row per
# (factor, iteration, regularization, learning_rate) combination.
metric_columns = ['P@5','TP@5','R@5','AP@5','TAP@5','NDCG@5','Hit@5','RR@5','ROC_AUC','PR_AUC']
k=5
with open('../Results/BPR_hyper.csv','w',newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['model','k','factor','iteration','regularization','learning_rate'] + metric_columns)
    model_name = 'BPR'
    for fac in BPRfactors:
        for ite in BPRiterations:
            for reg in BPRregularization:
                for learning in BPRlearning_rates:
                    temp_metric = calculate_BPR(X_fit_reco,X_train_reco,X_test_reco,fac,ite,reg,learning)
                    scores = [temp_metric[column][model_name] for column in metric_columns]
                    writer.writerow([model_name,k,fac,ite,reg,learning] + scores)


### Hyperparameter tuning LMF

In [None]:
def calculate_LMF(fit_set,train_set,test_set,fac,ite,reg,learning,k=5,random_state=None):
    """Fit an LMF model and return its mean evaluation metrics as one row.

    Parameters
    ----------
    fit_set
        Matrix passed to ``LogisticMatrixFactorization.fit``.
    train_set, test_set
        Matrices handed to ``calc_reco_metrics``. Only the first
        ``test_set.shape[0]`` rows of ``train_set`` and of the fitted user
        factors are used.
    fac, ite, reg, learning
        ``factors``, ``iterations``, ``regularization`` and ``learning_rate``
        of the model.
    k : int, default 5
        Cut-off forwarded to ``calc_reco_metrics`` (previously hard-coded).
    random_state : optional
        Forwarded to the model so a run can be reproduced; ``None`` keeps the
        library's default behaviour.

    Returns
    -------
    pandas.DataFrame
        A single row labelled ``"LMF"``; every column is the column-wise mean
        of the table returned by ``calc_reco_metrics``.
    """
    LMFmodel = LogisticMatrixFactorization(
        factors=fac, iterations=ite, regularization=reg,
        learning_rate=learning, random_state=random_state
    )
    LMFmodel.fit(fit_set)

    # NOTE(review): assumes the first test_set.shape[0] rows of fit_set and
    # train_set are the same users as the rows of test_set — confirm against
    # how the split was produced.
    n_eval_users = test_set.shape[0]
    metrics_lmf = calc_reco_metrics(
        train_set[:n_eval_users], test_set,
        LMFmodel.user_factors[:n_eval_users], LMFmodel.item_factors,
        k=k, all_metrics=True
    )

    # Collapse the metric table to a single row of column means.
    all_metrics = metrics_lmf.mean(axis=0).to_frame().T
    all_metrics.index = ["LMF"]
    return all_metrics

In [None]:
# LMF search grid: every combination of the four lists is evaluated.
LMFfactors = [5, 10, 15, 30, 60, 100]                  # values tried for `factors`
LMFiterations = [5, 10, 15, 30, 60, 100]               # values tried for `iterations`
LMFregularization = [0.001, 0.01, 0.1, 0.3, 0.6, 1.0]  # values tried for `regularization`
LMFlearning_rates = [0.001, 0.01, 0.1, 0.3, 0.6, 1.0]  # values tried for `learning_rate`

In [None]:
# Grid search for LMF: one CSV row per
# (factor, iteration, regularization, learning_rate) combination.
metric_columns = ['P@5','TP@5','R@5','AP@5','TAP@5','NDCG@5','Hit@5','RR@5','ROC_AUC','PR_AUC']
k=5
with open('../Results/LMF_hyper.csv','w',newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['model','k','factor','iteration','regularization','learning_rate'] + metric_columns)
    model_name = 'LMF'
    for fac in LMFfactors:
        for ite in LMFiterations:
            for reg in LMFregularization:
                for learning in LMFlearning_rates:
                    temp_metric = calculate_LMF(X_fit_reco,X_train_reco,X_test_reco,fac,ite,reg,learning)
                    scores = [temp_metric[column][model_name] for column in metric_columns]
                    writer.writerow([model_name,k,fac,ite,reg,learning] + scores)