In [None]:
from datetime import datetime
from datetime import timedelta

import numpy as np
import pandas as pd
from cmfrec import MostPopular
from implicit.cpu.als import AlternatingLeastSquares
from implicit.cpu.bpr import BayesianPersonalizedRanking
from implicit.cpu.lmf import LogisticMatrixFactorization
from implicit.evaluation import precision_at_k,mean_average_precision_at_k
from recometrics import split_reco_train_test,calc_reco_metrics
from scipy import sparse
from scipy.sparse import coo_matrix
from sklearn.model_selection import train_test_split

In [None]:
# Constants: dataset directory and the CSV files read from it.
DATASET = 'DatasetsWtime'
PULSE_RED = '/'.join((DATASET, 'pulse_red.csv'))
ARTICLE_CORE = '/'.join((DATASET, 'article_core.csv'))

In [None]:
# Load the interaction data, using the first CSV column as the index.
df_pulse_red = pd.read_csv(PULSE_RED, index_col=[0], low_memory=False)
# Binarise the interaction strength: every view count above 1 becomes 1.
df_pulse_red['views'] = df_pulse_red['views'].mask(df_pulse_red['views'] > 1, 1)
df_pulse_red

## FILTERS

### Remove Direkte articles and articles published outside the timespan

In [None]:
# Load the article metadata and derive two lookups from it with vectorised
# column operations (row-by-row `iterrows()` is slow on large frames):
#   * section_of_article: article_id -> section_title (last row wins on duplicates)
#   * direkte_articles:   ids of all articles whose section is 'Direkte'
df_article_core = pd.read_csv(ARTICLE_CORE)
section_of_article = dict(zip(df_article_core["article_id"], df_article_core["section_title"]))
direkte_articles = df_article_core.loc[
    df_article_core['section_title'] == 'Direkte', 'article_id'
].tolist()

In [None]:
# Every article id known to the metadata, minus the 'Direkte' ones.
article_id_to_keep = list(set(section_of_article).difference(direkte_articles))

In [None]:
# Keep only the interactions whose article survived the article filter.
is_kept_article = df_pulse_red['article_id'].isin(article_id_to_keep)
df_pulse_red_article_keep = df_pulse_red[is_kept_article]
df_pulse_red_article_keep

### Remove environment ids (not registered users)

In [None]:
# Collect the distinct spids that consist solely of digits; every other spid
# is dropped by the filter that follows.
# The isinstance guard skips non-string values (e.g. NaN for a missing spid),
# which would otherwise raise AttributeError on `.isdigit()`.
spids_without_cookies = [
    spid
    for spid in set(df_pulse_red_article_keep.spid)
    if isinstance(spid, str) and spid.isdigit()
]


In [None]:
# Restrict the interactions to the all-digit spids collected above.
has_digit_spid = df_pulse_red_article_keep['spid'].isin(spids_without_cookies)
df_without_cookies = df_pulse_red_article_keep[has_digit_spid]
df_without_cookies

### Remove articles read by only one user

In [None]:
# Number of interaction rows (with a non-null `views`) per article, most-read first.
# NOTE(review): this counts rows, not distinct users; the two only coincide if
# each (spid, article_id) pair occurs at most once — confirm.
article_reads = (
    df_without_cookies
    .groupby('article_id')['views']
    .count()
    .sort_values(ascending=False)
)
article_reads

In [None]:
# Rows per article, most-read first.
# NOTE(review): this repeats the computation of the preceding cell and could be removed.
article_reads = (
    df_without_cookies
    .groupby('article_id')['views']
    .count()
    .sort_values(ascending=False)
)

In [None]:
# Ids of the articles that occur in at least two interaction rows.
articlestokeep = article_reads[article_reads >= 2].index.tolist()

In [None]:
# Drop interactions with articles that did not reach the minimum read count.
has_enough_reads = df_without_cookies['article_id'].isin(articlestokeep)
df_min_article_reads = df_without_cookies[has_enough_reads]

### Remove users who haven't read at least one article per week

In [None]:
# Take the `published_date` value selected by key 0 of the unfiltered frame;
# it is parsed below and used as the start of the weekly windows.
# NOTE(review): this assumes key 0 resolves to a single row and that this row
# holds the earliest date of the timespan — confirm.
datetime_str = df_pulse_red['published_date'][0]

In [None]:
# Parse the start timestamp. The string must match 'YYYY-MM-DD HH:MM:SS'
# exactly; strptime raises ValueError otherwise.
datetime_object = datetime.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')

In [None]:
# Eight consecutive weekly boundaries: start + 1 week, ..., start + 8 weeks.
time_deltas = [datetime_object + timedelta(weeks=week) for week in range(1, 9)]
time_deltas

In [None]:
# Keep only the spids that occur in every one of the consecutive weekly windows.
# Windows are delimited by the start timestamp followed by each weekly boundary
# and are compared as strings against `published_date`.
# `between` includes both endpoints, so a row that falls exactly on a boundary
# is counted in both adjacent windows.
week_bounds = [str(bound) for bound in [datetime_object, *time_deltas]]
weekly_readers = [
    set(
        df_min_article_reads.loc[
            df_min_article_reads['published_date'].between(week_start, week_end),
            'spid',
        ]
    )
    for week_start, week_end in zip(week_bounds, week_bounds[1:])
]
# With no windows at all nothing qualifies, matching an empty selection.
spids_to_keep = list(set.intersection(*weekly_readers)) if weekly_readers else []
print(len(spids_to_keep))

In [None]:
# Restrict the interactions to the spids present in every weekly window.
is_weekly_reader = df_min_article_reads['spid'].isin(spids_to_keep)
df_pulse_red_one_article_per_week = df_min_article_reads[is_weekly_reader]
df_pulse_red_one_article_per_week

### Prep dataframe for training

In [None]:
# The publication date has served its purpose in the filters; leave it out of the training frame.
df_pr = df_pulse_red_one_article_per_week.drop(columns='published_date')

In [None]:
# Renumber the rows 0..n-1. `drop=True` discards the old index directly, so
# this also works when the index carries a name (in which case the column
# produced by a plain reset_index() would not be called 'index').
df_pr = df_pr.reset_index(drop=True)
df_pr

In [None]:
# Drop rows lacking a user or article id, then encode both ids as categoricals.
# `.copy()` makes the filtered frame independent of its source, so the column
# assignments below cannot trigger pandas' SettingWithCopyWarning.
df_pr = df_pr.loc[df_pr.spid.notnull() & df_pr.article_id.notnull()].copy()
df_pr['spid'] = df_pr['spid'].astype('category')
df_pr['article_id'] = df_pr['article_id'].astype('category')
df_pr.head(5)

### Train test split

In [None]:
# Build the interaction matrix: one row per spid category code, one column per
# article_id category code, with `views` as the stored values. Duplicate
# (row, column) pairs are summed when the COO matrix is converted or used in
# arithmetic. Uses the `sparse` module from the notebook's import cell.
X = sparse.coo_matrix((df_pr.views, (df_pr.spid.cat.codes, df_pr.article_id.cat.codes)))
X

In [None]:
# Split the interactions with the "separated" strategy: 20% of the users,
# capped at 20000, are set aside as test users.
X_fit_reco, X_train_reco, X_test_reco, test_users_reco = split_reco_train_test(
    X,
    split_type="separated",
    users_test_fraction=0.2,
    max_test_users=20000,
)

In [None]:
def evaluation_of_sets(fit_set, train_set, test_set, k=5, seed=1, random_state=None):
    """Fit a baseline and three implicit-feedback models and score them at top-``k``.

    A most-popular baseline and default-configured ALS, BPR and LMF models are
    fitted on ``fit_set``. Each of them, plus randomly drawn factors, is then
    scored with ``calc_reco_metrics`` on the first ``test_set.shape[0]`` rows
    of ``train_set`` against ``test_set``.

    NOTE(review): the fitted models contribute their first
    ``test_set.shape[0]`` user-factor rows. That is only meaningful if those
    rows of ``fit_set`` belong to the same users as the rows of ``test_set``
    — confirm against how the split was produced.

    Parameters
    ----------
    fit_set : sparse matrix
        User-item matrix the models are fitted on.
    train_set : sparse matrix
        Matrix whose first ``test_set.shape[0]`` rows are passed to
        ``calc_reco_metrics`` as the training interactions of the test users.
    test_set : sparse matrix
        Held-out interactions the recommendations are scored against.
    k : int, default 5
        Cut-off passed to ``calc_reco_metrics``.
    seed : int, default 1
        Seed of the generator that draws the random reference factors.
    random_state : optional, default None
        Forwarded to the ALS, BPR and LMF constructors; the default leaves
        their initialisation unseeded.

    Returns
    -------
    tuple
        ``(all_metrics, parameters)``: a DataFrame with one row of
        column-wise mean metrics per model ("Random", "Most Popular", "ALS",
        "BPR", "LMF"), and a dict of the hyperparameters the three implicit
        models were run with.
    """
    n_test_users, n_items = test_set.shape
    # The same slice of the training data is scored for every model.
    train_for_test_users = train_set[:n_test_users]

    # Random factors act as a lower reference point for the metrics.
    n_random_factors = 5
    rng = np.random.default_rng(seed=seed)
    UserFactors_random = rng.standard_normal(size=(n_test_users, n_random_factors))
    ItemFactors_random = rng.standard_normal(size=(n_items, n_random_factors))

    ### Non-personalized recommendations
    model_baseline = MostPopular(implicit=True, user_bias=False).fit(fit_set)
    item_biases = model_baseline.item_bias_

    ALSmodel = AlternatingLeastSquares(random_state=random_state)
    BPRmodel = BayesianPersonalizedRanking(random_state=random_state)
    LMFmodel = LogisticMatrixFactorization(random_state=random_state)

    ALSmodel.fit(fit_set)
    BPRmodel.fit(fit_set)
    LMFmodel.fit(fit_set)

    def _score(user_factors, item_factors, **extra):
        # Score one set of factors with the shared data and settings.
        return calc_reco_metrics(
            train_for_test_users, test_set,
            user_factors, item_factors,
            k=k, all_metrics=True, **extra
        )

    # Insertion order defines the row order of the result table.
    metrics_by_model = {
        "Random": _score(UserFactors_random, ItemFactors_random),
        "Most Popular": _score(None, None, item_biases=item_biases),
        "ALS": _score(ALSmodel.user_factors[:n_test_users], ALSmodel.item_factors),
        "BPR": _score(BPRmodel.user_factors[:n_test_users], BPRmodel.item_factors),
        "LMF": _score(LMFmodel.user_factors[:n_test_users], LMFmodel.item_factors),
    }

    # Collapse the per-user metrics of each model into a single row of means.
    all_metrics = pd.concat(
        [m.mean(axis=0).to_frame().T for m in metrics_by_model.values()],
        axis=0,
    )
    all_metrics.index = list(metrics_by_model)

    parameters = {
        'ALSfactors': ALSmodel.factors,
        'ALSiterations': ALSmodel.iterations,
        'ALSregularizations': ALSmodel.regularization,
        'BPRfactors': BPRmodel.factors,
        'BPRiterations': BPRmodel.iterations,
        'BPRregularizations': BPRmodel.regularization,
        'BPRlearningrate': BPRmodel.learning_rate,
        'LMFfactors': LMFmodel.factors,
        'LMFiterations': LMFmodel.iterations,
        'LMFregularizations': LMFmodel.regularization,
        'LMFlearningrate': LMFmodel.learning_rate
    }

    return all_metrics,parameters

In [None]:
# Run the evaluation and persist the metric table; `results` is the
# (metrics, hyperparameters) tuple returned by the function.
results = evaluation_of_sets(
    fit_set=X_fit_reco,
    train_set=X_train_reco,
    test_set=X_test_reco,
)
metrics_table = results[0]
metrics_table.to_csv('../Results/experiment_b2.csv')

In [None]:
# Second element of the evaluation result: the hyperparameters the ALS, BPR
# and LMF models were run with.
results[1] #hyperparameters used