# H&M Recommendation - Artur Xarles & Enric Azuara

In [None]:
import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm

### Read data and small modifications

In [None]:
#Read the data

path = '../input/h-and-m-personalized-fashion-recommendations/'
articles = pd.read_csv(path + 'articles.csv')
transactions = pd.read_csv(path + 'transactions_train.csv', dtype = {'article_id': 'str'})
customers = pd.read_csv(path + 'customers.csv')

#Transform t_dat to date format
transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])

This notebook shows step by step the thought process and the final algorithm selection. First of all we will try a set of non personalized recommender systems and after that, we will use personalized methods such User and Item based Collaborative Filtering (only to show how it would work with an small dataset) and Matrix factorization methods.

## Non personalized recommender systems

The first approach we are going to follow is non personalized recommender systems. There are 2 main types:

   - Aggregated opinions (i.e. ranking)
   - Basic product associations

### Aggregated opinions

Since there is not a measure to rank the items, we will look out for the most bought products.

In [None]:
transactions["article_id"].value_counts()

Since we are treating with clothes, this first approach is too naive, so we will take into account seasonality looking at the time when it was bought.

If we look at the last 4 weeks, the most bought products are the following ones:

In [None]:
print(transactions['t_dat'].max())
print(transactions['t_dat'].min())
transactions[(transactions['t_dat'] > '2020-08-26') & (transactions['t_dat'] < '2020-09-22')]["article_id"].value_counts()

If we look at the most bought items during september...

In [None]:
transactions[transactions['t_dat'].dt.month == 9]["article_id"].value_counts()

And last, the most purchased product during last september

In [None]:
transactions[(transactions['t_dat'].dt.month == 9) & (transactions["t_dat"].dt.year == 2020)]["article_id"].value_counts()

As we have seen, the most popular items vary depending which span times we take. We will consider this factor in order to make the predictions with this approach

### Basic products association

We will compute the most common products purchased together

In [None]:
transactions_sample = transactions[(transactions['t_dat'] > '2020-08-26')]
df_train = transactions[(transactions['t_dat'].dt.month == 9) & (transactions["t_dat"].dt.year == 2020) & (transactions['t_dat'] < '2020-09-15')]
vc = df_train.article_id.value_counts()
vc = transactions.article_id.value_counts()
pairs = {}
for j,i in tqdm(enumerate(vc.index.values[1000:1032])):
    #if j%10==0: print(j,', ',end='')
    USERS = transactions.loc[transactions.article_id==i,'customer_id'].unique()
    vc2 = transactions.loc[(transactions.customer_id.isin(USERS))&(transactions.article_id!=i),'article_id'].value_counts()
    pairs[i] = [vc2.index[0], vc2.index[1], vc2.index[2]]

In [None]:
pairs

We can see that this method is a bit computationally slow so we only show an example of how it would be with an small subset of 32 articles.

### Comparing the different approaches

#### Aggregated rankings (count)

We will split the data for testing purposes. (Test data will be after 15 Sep)

In [None]:
last_week_start = datetime.datetime.strptime("16/09/20 00:00:00", '%d/%m/%y %H:%M:%S')
transactions["days_distance"] = (last_week_start - transactions["t_dat"]).dt.days
transactions["weight"] = 1
transactions["weight"] *= np.exp(-(transactions["days_distance"] / 3))
transactions['pop_factor'] = 1
df_train = transactions[(transactions['t_dat'].dt.month == 9) & (transactions["t_dat"].dt.year == 2020) & (transactions['t_dat'] < '2020-09-15')]
test = transactions[(transactions['t_dat'] >= '2020-09-15') & (transactions['t_dat'] <= '2020-09-22')].groupby(['customer_id'])['article_id'].apply(list)

We define the first weight by time

In [None]:
df_train_g = df_train.groupby("article_id").sum().reset_index()
df_train_sorted = df_train_g.sort_values(by="weight",ascending=False)
weigh1 = df_train_sorted["article_id"].to_numpy()[:12]

Second way of weighting by time

In [None]:
df_train['pop_factor'] = df_train['t_dat'].apply(lambda x: 1/(datetime.datetime(2020,9,16) - x).days)
df_train_h = df_train.groupby("article_id").sum().reset_index()
df_train_sorted2 = df_train_h.sort_values(by="pop_factor",ascending=False)
weigh2 = df_train_sorted2["article_id"].to_numpy()[:12]

Not weighting and just taking some time spans that can make sense

In [None]:
Last4W = transactions[(transactions['t_dat'] > '2020-08-26') & (transactions['t_dat'] < '2020-09-15')]["article_id"].value_counts().index[0:12].values
AllSep = transactions[(transactions['t_dat'].dt.month == 9) & (transactions['t_dat'] < '2020-09-15')]["article_id"].value_counts().index[0:12].values
Sep2020 = transactions[(transactions['t_dat'].dt.month == 9) & (transactions["t_dat"].dt.year == 2020) & (transactions['t_dat'] < '2020-09-15')]["article_id"].value_counts().index[0:12].values
All = transactions[(transactions['t_dat'] < '2020-09-15')]["article_id"].value_counts().index[0:12].values

Define metric to evaluate (MAP):

In [None]:
def apk(actual, predicted, k=12):
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=12):
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [None]:
outputsAll = []
outputsAllSep = []
outputsSep2020 = []
outputs4W = []
outputsWeigh1 = []
outputsWeigh2 = []

for i in range(test.values.shape[0]):
    outputsAll.append(list(All))
    outputsAllSep.append(list(AllSep))
    outputsSep2020.append(list(Sep2020))
    outputs4W.append(list(Last4W))
    outputsWeigh1.append(list(weigh1))
    outputsWeigh2.append(list(weigh2))
    
print(mapk(list(test.values), outputsAll))
print(mapk(list(test.values), outputsAllSep))
print(mapk(list(test.values), outputsSep2020))
print(mapk(list(test.values), outputs4W))
print(mapk(list(test.values), outputsWeigh1))
print(mapk(list(test.values), outputsWeigh2))

As we can see, we get very diverse results, where the weighted by time methods acquire a better mAP score.

#### Combined algorithm

As we have seen, taking weights from the time or not has different results, but it looks like it is still not enough. We will take as insipiration one of the notebooks provided to improve this result and combine weights with repetition.

We will make a recommendation that will have 3 scenarios:

    1) The user has bought before 12 or more items: We will recommend the most recent 12 items bought by the user.
    2) The user has bought before less than 12 items: We will recommend all the items bought by the user and the ones more popular weighted by time up to 12 items.
    3) The user has not bought before: We will recommend the top12 popular items weighted by time.
    
To test this algorithm, we will separate the data within the last 4 weeks in order to priorize the most recent ones.

In [None]:
train = transactions[(transactions['t_dat'] >= '2020-08-15') & (transactions['t_dat'] < '2020-09-16')]
train_week_1 = transactions[(transactions['t_dat'] >= '2020-09-08') & (transactions['t_dat'] < '2020-09-16')]
train_week_2 = transactions[(transactions['t_dat'] >= '2020-09-01') & (transactions['t_dat'] < '2020-09-08')]
train_week_3 = transactions[(transactions['t_dat'] >= '2020-08-23') & (transactions['t_dat'] < '2020-09-01')]
train_week_4 = transactions[(transactions['t_dat'] >= '2020-08-15') & (transactions['t_dat'] < '2020-08-23')]
test2 = transactions[(transactions['t_dat'] >= '2020-09-16')]

positive_items_whole = train.groupby(['customer_id'])['article_id'].apply(list)
positive_items_per_user1 = train_week_1.groupby(['customer_id'])['article_id'].apply(list)
positive_items_per_user2 = train_week_2.groupby(['customer_id'])['article_id'].apply(list)
positive_items_per_user3 = train_week_3.groupby(['customer_id'])['article_id'].apply(list)
positive_items_per_user4 = train_week_4.groupby(['customer_id'])['article_id'].apply(list)

We issolate the most popular items the last two weeks before the test dataset to determine them as the most popular ones taking into account the most bought ones weighted by time.

In [None]:
popular_items_week_1_2_aux = pd.concat([train_week_1, train_week_2], axis=0).groupby(['article_id'])['pop_factor'].value_counts()
popular_items_week_1_2 = list(popular_items_week_1_2_aux.droplevel(level=1).sort_values(ascending=False).index.values)

Let's check how it performs on the test dataset

In [None]:
positive_items_val = test2.groupby(['customer_id'])['article_id'].apply(list)

val_users = positive_items_val.keys()
val_items = []

for i,user in enumerate(val_users):
    val_items.append(positive_items_val[user])

In [None]:
from collections import Counter

positive_items_val = test2.groupby(['customer_id'])['article_id'].apply(list)
val_users = positive_items_val.keys()
val_items = []
outputs = []

for i,user in enumerate(val_users):
    val_items.append(positive_items_val[user])

for user in tqdm(val_users):
    user_output = []
    if user in positive_items_per_user1.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user1[user]).most_common()}
        user_output += list(most_common_items_of_user.keys())[:12]
    if user in positive_items_per_user2.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user2[user]).most_common()}
        user_output += list(most_common_items_of_user.keys())[:12]
    if user in positive_items_per_user3.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user3[user]).most_common()}
        user_output += list(most_common_items_of_user.keys())[:12]
    if user in positive_items_per_user4.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user4[user]).most_common()}
        user_output += list(most_common_items_of_user.keys())[:12]
    
    user_output += list(popular_items_week_1_2[:12 - len(user_output)])
    outputs.append(user_output)
    
print("mAP Score on Validation set:", mapk(val_items, outputs))

As we can see, with this approach we improved the result by a large amount.

### Comparing with Basic products Association

In [None]:
train['article_id2'] = train.article_id.map(pairs)
train2 = train.loc[:, ['customer_id','article_id2']].copy()
train2 = train2.loc[train2.article_id2.notnull()]

In [None]:
train2 = train[['customer_id','article_id2']].copy()
train2 = train2.loc[train2.article_id2.notnull()]
train2 = train2.rename({'article_id2':'article_id'},axis=1)
train = train[['customer_id','article_id']]
train = pd.concat([train,train2],axis=0,ignore_index=True)

To compute this, we will select a subset of the users from the validation set, since finding the previous purchases for each of them is computationally exigent.

For the users that we do not have data before we will use the method proposed on the previous section.

In [None]:
from collections import Counter

positive_items_val = test2.groupby(['customer_id'])['article_id'].apply(list)
val_users = positive_items_val.keys()
val_items = []
outputs2 = []
subset = val_users[0:5000]

for i,user in enumerate(subset):
    val_items.append(positive_items_val[user])

for user in tqdm(subset):
    user_output2 = []
    if user in train["customer_id"].values:
        user_output2 += list(train[train["customer_id"]==user]["article_id"].values)

    else:
        if user in positive_items_per_user1.keys():
            most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user1[user]).most_common()}
            user_output += list(most_common_items_of_user.keys())[:12]
        if user in positive_items_per_user2.keys():
            most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user2[user]).most_common()}
            user_output += list(most_common_items_of_user.keys())[:12]
        if user in positive_items_per_user3.keys():
            most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user3[user]).most_common()}
            user_output += list(most_common_items_of_user.keys())[:12]
        if user in positive_items_per_user4.keys():
            most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user4[user]).most_common()}
            user_output += list(most_common_items_of_user.keys())[:12]
    
    user_output2 += list(popular_items_week_1_2[:12 - len(user_output)])
    outputs2.append(user_output2)
    
print("mAP Score on Validation set:", mapk(val_items, outputs2))

As we can see, we get a worse result (Map: 0.014). With more time, it would be good to explore which results we would get when trying different combined algorithms between weighting by time and commonly bought together products.

## Personalized recommender systems - Collaborative Filtering

### User and Item based Collaborative Filtering

After seeing the personalized methods, we will show how we would implement a User-Based and Item-Based CF recommender system to this dataset. As these methods are really slow when dealing with large datasets, we will do an example using only a really small dataset. At the end, this will not be used to make predictions and it will only be shown as an example.

We use a function to reduce the data and introduce a rating for each customer and article combination being the number of times that the article has been bought by a customer (we introduce 0 ratings if we do not have a transaction for these).

In [None]:
'''
Function to create a subset of the data for CF. We only use the transactions made after a certain date and the users with a 
minimum number of purchases after that day
'''
def subset_transactions(transactions, init_date, min_purchases_customer, min_purchases_article, add_0rat = True, final_date = datetime.datetime(2022, 9, 20)):
    #Get only recent transactions
    transactions_small = transactions[(transactions['t_dat'] >= init_date) & (transactions['t_dat'] < final_date)]
    #Get the customers with at least min_purchases purchases
    customers = transactions_small['customer_id'].value_counts()[transactions_small['customer_id'].value_counts() > min_purchases_customer].index.to_list()
    articles = transactions_small['article_id'].value_counts()[transactions_small['article_id'].value_counts() > min_purchases_article].index.to_list()
    #Get transactions for these customers
    transactions_small = transactions_small[transactions_small['customer_id'].isin(customers) & transactions_small['article_id'].isin(articles)]
    #Each combination of customer-article one transaction with rating the number of times that this combination happens
    transactions_small = transactions_small.groupby(['customer_id', 'article_id']).size().reset_index().rename(columns={0:'rating'})
    #Add transactions with rating 0 for transaction combinations that has not been done
    if add_0rat:
        for customer in tqdm(transactions_small['customer_id'].unique()):
            for article in transactions_small['article_id'].unique():
                if len(transactions_small[(transactions_small['customer_id'] == customer) & (transactions_small['article_id'] == article)]) == 0:
                    transactions_small = transactions_small.append({'customer_id': customer, 'article_id': article, 'rating': 0}, ignore_index = True)


    print('New data has shape: ' + str(transactions_small.shape))
    return transactions_small

initial_date = datetime.datetime(2020, 9, 20) #We use last two weeks
min_purchases_customer = 20
min_purchases_article = 70

#20 and 50 correct

transactions_small = subset_transactions(transactions = transactions, init_date = initial_date, min_purchases_customer = min_purchases_customer, min_purchases_article = min_purchases_article)
print('Number of customers: ' + str(len(transactions_small['customer_id'].unique())))
print('Number of articles: ' + str(len(transactions_small['article_id'].unique())))
print(transactions_small.head())


Once we have the reduced dataset, we split the data into train and test to be able to compare different CF methodologies.

In [None]:
def assign_to_set(df):
    sampled_ids = np.random.choice(df.index,
                                   size=np.int64(np.ceil(df.index.size * 0.2)),
                                   replace=False)
    df.loc[sampled_ids, 'for_testing'] = True
    return df

transactions_small['for_testing'] = False
grouped = transactions_small.groupby('customer_id', group_keys=False).apply(assign_to_set)
transactions_small_train = transactions_small[grouped.for_testing == False]
transactions_small_test = transactions_small[grouped.for_testing == True]

print(transactions_small_train.shape)
print(transactions_small_test.shape)


print("Training data_set has "+ str(transactions_small_train.shape[0]) +" ratings")
print("Test data set has "+ str(transactions_small_test.shape[0]) +" ratings")
print("The database has ", transactions_small.article_id.nunique(), " articles")


In the following code we create some functions for different similarities and classes to implement Collaborative Filtering in a user-based and item-based way. These classes contain a fit function, predict function (for customer and article specification) and a predict function for a customer that gives the top k articles respect to the rating. We also define a function to evaluate the recommendations based on RMSE. 
For the User-Based we will use the following function to make the prediction:
$$pred(a,p) = \bar{r_a} + \frac{\sum_{b \in N}{sim(a,b)*(r_{b,p}-\bar{r_b})}}{\sum_{b \in N}{sim(a,b)}}$$
as different users can have different mean of articles bought. Moreover, we will only use the subset of the top-20 most similar users in order to make the predictions.
For the Item-Based, we will also use the previous formula modificated to use the mean for the article and the top-20 most similar users.

In [None]:
from scipy.stats import pearsonr
from scipy.spatial.distance import euclidean
from tqdm import tqdm

'''
Function to compute Euclidean Similarity
'''
def SimEuclid(DataFrame,Customer1,Customer2,min_common_items=1):
    # GET MOVIES OF USER1
    movies_user1=DataFrame[DataFrame['customer_id'] ==Customer1 ]
    # GET MOVIES OF USER2
    movies_user2=DataFrame[DataFrame['customer_id'] ==Customer2 ]
    
    # FIND SHARED FILMS
    rep=pd.merge(movies_user1 ,movies_user2,on='article_id')    
    if len(rep)<2:
        return 0
    if(len(rep)<min_common_items):
        return 0
    #return distEuclid(rep['rating_x'],rep['rating_y']) 
    return 1.0/(1.0+euclidean(rep['rating_x'],rep['rating_y'])) 

'''
Function to compute Pearson Similarity
'''
def SimPearson(DataFrame,Customer1,Customer2,min_common_items=1):
    # GET MOVIES OF USER1
    movies_user1=DataFrame[DataFrame['customer_id'] ==Customer1 ]
    # GET MOVIES OF USER2
    movies_user2=DataFrame[DataFrame['customer_id'] ==Customer2 ]
    
    # FIND SHARED FILMS
    rep=pd.merge(movies_user1 ,movies_user2,on='article_id',)
    if len(rep)<2:
        return 0    
    if(len(rep)<min_common_items):
        return 0    
    res=pearsonr(rep['rating_x'],rep['rating_y'])[0]
    if(np.isnan(res)):
        return 0
    return res


'''
Class for Collaborative Filtering (user based)
'''
class CollaborativeFilteringUB:
    """ Collaborative filtering using a custom sim(u,u'). """
    
    def __init__(self,DataFrame, similarity=SimPearson):
        """ Constructor """
        self.sim_method=similarity# Gets recommendations for a person by using a weighted average
        self.df=DataFrame
        self.sim = {}
        
    def fit(self):
        """ Prepare data structures for estimation. Similarity matrix for users """
        allUsers=set(self.df['customer_id'])
        for person1 in tqdm(allUsers):
            self.sim.setdefault(person1, {})
            a=transactions_small_train[transactions_small_train['customer_id']==person1][['article_id']]
            data_reduced=pd.merge(transactions_small_train,a,on='article_id')
            for person2 in allUsers:
                if person1==person2: continue
                self.sim.setdefault(person2, {})
                if(person1 in self.sim[person2]):continue # since is a simetric matrix
                sim=self.sim_method(data_reduced,person1,person2)
                if(sim<0):
                    self.sim[person1][person2]=0
                    self.sim[person2][person1]=0
                else:
                    self.sim[person1][person2]=sim
                    self.sim[person2][person1]=sim
    #Prediction for a given customer and article            
    def predict(self, customer_id, article_id, topK = 20):
        totals={}
        movie_users=self.df[self.df['article_id'] ==article_id]
        rating_num=0.0
        rating_den=0.0
        allUsers=set(movie_users['customer_id'])
        user_mean = self.df[self.df['customer_id'] == customer_id].rating.mean()
        top_similar_customers = sorted([(self.sim[customer_id][other], other) if (customer_id != other) else (0, other) for other in allUsers])[:topK]
        for _, other in top_similar_customers:
            if customer_id==other: continue 
            rating_num += self.sim[customer_id][other] * float(movie_users[movie_users['customer_id']==other]['rating'] - self.df[self.df['customer_id'] == other]['rating'].mean())
            rating_den += self.sim[customer_id][other]
        if rating_den==0: 
            if self.df.rating[self.df['article_id']==article_id].mean()>0:
                # return the mean movie rating if there is no similar for the computation
                return self.df.rating[self.df['article_id']==article_id].mean()
            else:
                # else return mean user rating 
                return self.df.rating[self.df['customer_id']==customer_id].mean()
        return user_mean + rating_num / rating_den
    
    #Prediction of the top k articles for a customer
    def predict2(self, customer_id, k):
        totals={}
        allArticles=set(self.df['article_id'])
        articles_rating = []
        for article2 in (allArticles):
            rating = self.predict(customer_id, article2)
            articles_rating.append([article2, rating])
        preds = np.array(articles_rating)
        return preds[np.argsort(preds[:, 1])][-k:, 0].astype('int').astype('str').tolist()

    
'''
Class for Collaborative Filtering (item based)
'''
class CollaborativeFilteringIB:
    """ Collaborative filtering using a custom sim(i,i'). """
    
    def __init__(self,DataFrame, similarity=SimPearson):
        """ Constructor """
        self.sim_method=similarity# Gets recommendations for a person by using a weighted average
        self.df=DataFrame
        self.sim = {}
        
    def fit(self):
        """ Prepare data structures for estimation. Similarity matrix for items """
        allItems=set(self.df['article_id'])
        for item1 in tqdm(allItems):
            self.sim.setdefault(item1, {})
            a=transactions_small_train[transactions_small_train['article_id']==item1][['customer_id']]
            data_reduced=pd.merge(transactions_small_train,a,on='customer_id')
            for item2 in allItems:
                if item1==item2: continue
                self.sim.setdefault(item2, {})
                if(item1 in self.sim[item2]):continue # since is a simetric matrix
                sim=self.sim_method(data_reduced,item1,item2)
                if(sim<0):
                    self.sim[item1][item2]=0
                    self.sim[item2][item1]=0
                else:
                    self.sim[item1][item2]=sim
                    self.sim[item2][item1]=sim
    #Prediction for a given customer and article            
    def predict(self, customer_id, article_id, topK = 20):
        totals={}
        movie_users=self.df[self.df['customer_id'] == customer_id]
        rating_num=0.0
        rating_den=0.0
        allItems=set(movie_users['article_id'])
        article_mean = self.df[self.df['article_id'] == article_id].rating.mean()
        top_similar_articles = sorted([(self.sim[article_id][other], other) if (article_id != other) else (0, other) for other in allItems])[:topK]
        for _, other in top_similar_articles:
            if article_id==other: continue 
            rating_num += self.sim[article_id][other] * float(movie_users[movie_users['article_id']==other]['rating'] - self.df[self.df['article_id'] == other]['rating'].mean())
            rating_num += self.sim[article_id][other] * float(movie_users[movie_users['article_id']==other]['rating'])
            rating_den += self.sim[article_id][other]
        if rating_den==0: 
            if self.df.rating[self.df['customer_id']==customer_id].mean()>0:
                # return the mean movie rating if there is no similar for the computation
                return self.df.rating[self.df['customer_id']==customer_id].mean()
            else:
                # else return mean user rating 
                return self.df.rating[self.df['article_id']==article_id].mean()
        return article_mean + rating_num/rating_den
    
    #Prediction of the top k articles for a customer
    def predict2(self, customer_id):
        totals={}
        allArticles=set(self.df['article_id'])
        articles_rating = []
        for article2 in (allArticles):
            rating = self.predict(customer_id, article2)
            articles_rating.append([article2, rating])
        preds = np.array(articles_rating)
        return preds[np.argsort(preds[:, 1])][-k:, 0].astype('int').astype('str').tolist()

    
def compute_rmse(y_pred, y_true):
    """ Compute Root Mean Squared Error. """
    return np.sqrt(np.mean(np.power(y_pred - y_true, 2)))
    
def evaluate(predict_f,data_train,data_test):
    """ RMSE-based predictive performance evaluation with pandas. """
    ids_to_estimate = zip(data_test.customer_id, data_test.article_id)
    list_customers = set(data_train.customer_id)
    estimated = np.array([predict_f(u,i) if u in list_customers else 3 for (u,i) in ids_to_estimate ])
    real = data_test.rating.values
    return compute_rmse(estimated, real)
           
        

We will fit the IBCF and UBCF using both distances, Euclidean and Pearson, and we will compare them using the test data and the RMSE as metric.

In [None]:
'''
User-Based Collaborative Filtering
'''
reco1 = CollaborativeFilteringUB(transactions_small)
reco1.fit()
ids_to_estimate = zip(transactions_small_test.customer_id, transactions_small_test.article_id)
estimated = np.array([reco1.predict(u,i) if u in transactions_small_train.customer_id else 3 for (u,i) in ids_to_estimate ])
print('RMSE for User-Based Collaborative Recomender: %s' % evaluate(reco1.predict,transactions_small_train,transactions_small_test))

In [None]:
'''
User-Based Collaborative Filtering
'''
reco2 = CollaborativeFilteringUB(transactions_small, similarity = SimEuclid)
reco2.fit()
ids_to_estimate = zip(transactions_small_test.customer_id, transactions_small_test.article_id)
estimated = np.array([reco2.predict(u,i) if u in transactions_small_train.customer_id else 3 for (u,i) in ids_to_estimate ])
print('RMSE for User-Based Collaborative Recomender: %s' % evaluate(reco2.predict,transactions_small_train,transactions_small_test))

In [None]:
'''
Item-Based Collaborative Filtering
'''
reco3 = CollaborativeFilteringIB(transactions_small)
reco3.fit()
ids_to_estimate = zip(transactions_small_test.customer_id, transactions_small_test.article_id)
estimated = np.array([reco3.predict(u,i) if u in transactions_small_train.customer_id else 3 for (u,i) in ids_to_estimate ])
print('RMSE for Item-Based Collaborative Recomender: %s' % evaluate(reco3.predict,transactions_small_train,transactions_small_test))

In [None]:
'''
Item-Based Collaborative Filtering
'''
reco4 = CollaborativeFilteringIB(transactions_small, similarity = SimEuclid)
reco4.fit()
ids_to_estimate = zip(transactions_small_test.customer_id, transactions_small_test.article_id)
estimated = np.array([reco4.predict(u,i) if u in transactions_small_train.customer_id else 3 for (u,i) in ids_to_estimate ])
print('RMSE for Item-Based Collaborative Recomender: %s' % evaluate(reco4.predict,transactions_small_train,transactions_small_test))

As we can see the recommender system that has better results is Item Based using any of the similarities, so we would use this mehtod. However, as commented before this methods cannot be used for the whole dataset or at least for a relevant part of it as it takes too much time. In the following code we will use a more efficient recommender system to perform the recommendations:

### Factorization Models

In this case we will use a much larger dataset as this methods have less computational cost. However, we will also only use a part of the data as we believe that for users and articles for which we don't have much information (have not bought anything in a long period or only a few articles) it is better to use a non personalized recommender system.

First of all, we select the data from the last month and customers with at least 3 transactions and articles with at least 3 transactions. We also split in train and test data, taking as train the first two weeks and test the last one with available data.

In [None]:
initial_date = datetime.datetime(2020, 9, 1) #We use last two weeks
min_purchases_customer = 3
min_purchases_article = 3
train_date = datetime.datetime(2020, 9, 15)

transactions_reduced_train = subset_transactions(transactions = transactions, init_date = initial_date, min_purchases_customer = min_purchases_customer, min_purchases_article = min_purchases_article, add_0rat = False, final_date = train_date)
transactions_reduced_test = subset_transactions(transactions = transactions, init_date = train_date, min_purchases_customer = min_purchases_customer, min_purchases_article = min_purchases_article, add_0rat = False)

print('Train data has shape: ' + str(transactions_reduced_train.shape))
print('Test data has shape: ' + str(transactions_reduced_test.shape))


### SVD

The factorization method that we will use is the SVD factorization. The matrix that will be factorized contains the number of times that an article has been bought in our data for a customer. We also use only a part of the data to fit the model (from first two weeks of september) and another part to evaluate it through MAP (last week with data).

In [None]:
from scipy import sparse
from scipy.linalg import sqrtm

class RecSys_mf():
    """ Collaborative filtering using SVD. """
    
    def __init__(self,df, num_components=10):
        """ Constructor """
        self.num_components=num_components
        self.train = pd.pivot_table(df[['customer_id','article_id','rating']],columns='article_id',index='customer_id',values='rating')
        # We create a dictionary where we will store the user_id and movie_id which correspond 
        # to each index in the Rating matrix
        
        customer_index = np.arange(len(self.train.index))
        self.customers = dict(zip(customer_index,self.train.index ))
        self.customers_id2index = dict(zip(self.train.index,customer_index)) 
        
        article_index = np.arange(len(self.train.columns))
        self.articles = dict(zip(article_index,self.train.columns )) 
        self.articles_id2index= dict(zip(self.train.columns, article_index))
        self.articles_index2id= dict(zip(article_index,self.train.columns))

        
        
    def fit(self):
        """ We decompose the R matrix into to submatrices using the training data """
        print('Fitting the model...')
        train_matrix = np.array(self.train)
        # we fill the nan values with 0 as have not been bought and remove the item average
        train_matrix[np.isnan(train_matrix)] = 0
        item_means = np.mean(train_matrix, axis=0)
        x = np.tile(item_means, (train_matrix.shape[0],1))         
        
        # we remove the per item average from all entries.
        # the above mentioned nan entries will be essentially zero now
        train_matrix = train_matrix - x
        print('Factorizing matrix...')
        U, s, V = np.linalg.svd(train_matrix, full_matrices=False)

        # reconstruct rating matix
        S = np.diag(s[0:self.num_components])
        U = U[:,0:self.num_components]
        V = V[0:self.num_components,:]
        S_root = sqrtm(S)
        
        print('Computing the predictions...')
        USk=np.dot(U,S_root)
        SkV=np.dot(S_root,V)
        Y_hat = np.dot(USk, SkV)
        self.Y_hat = Y_hat + x
        

        
    def predict(self, customer_id, article_id):
        if article_id in self.articles:
            return self.Y_hat[self.customers_id2index[customer_id],self.articles_index2id[article_id]]
        else: #in case it is a new article 
            return 0
    def predict2(self, customer_id, k=12):
        if customer_id in self.customers.values():
            a = (np.argsort(-self.Y_hat[self.customers_id2index[customer_id]])<k)
            b = ([i for i, x in enumerate(a) if x])
            rec = []
            for element in b:
                rec.append(self.articles_index2id[element])
        else:
            return preds
        return rec

We fit the model with the train data.

In [None]:
reco = RecSys_mf(transactions_reduced_train,num_components=200)
reco.fit()
#print('RMSE for SVD: %s' % evaluate(reco.predict,data_train,data_test))

After fitting the model we evaluate it with the test dataset. (we only evaluate users that are in both datasets, train and test.

In [None]:
apks = []
for customer in tqdm(transactions_reduced_test['customer_id'].unique()):
    if customer in transactions_reduced_train['customer_id'].unique():
        ground_truth = list(transactions_reduced_test[transactions_reduced_test['customer_id'] == customer]['article_id'])
        pred = reco.predict2(customer)
        apks.append(apk(ground_truth, pred))
        
print('Map achieved with SVD: ' + str(np.array(apks).mean()))

As we can see, this method achieves a MAP of 0.00025 which is clearly worse than previous results.

## So, after trying several methods and evaluating them, we have seen that the one that achieves better results is the one that recommends the most recent articles bought by the user (if there are) or the most popular ones.

Following code create the predictions for the new data

In [None]:
transactions['pop_factor'] = 1
train_week_1 = transactions[(transactions['t_dat'] >= '2020-09-16')]
train_week_2 = transactions[(transactions['t_dat'] >= '2020-09-08') & (transactions['t_dat'] < '2020-09-16')]
train_week_3 = transactions[(transactions['t_dat'] >= '2020-09-01') & (transactions['t_dat'] < '2020-09-08')]
train_week_4 = transactions[(transactions['t_dat'] >= '2020-08-23') & (transactions['t_dat'] < '2020-09-01')]


#Issolate users + their items purchased
positive_items_per_user1 = train_week_1.groupby(['customer_id'])['article_id'].apply(list)
positive_items_per_user2 = train_week_2.groupby(['customer_id'])['article_id'].apply(list)
positive_items_per_user3 = train_week_3.groupby(['customer_id'])['article_id'].apply(list)
positive_items_per_user4 = train_week_4.groupby(['customer_id'])['article_id'].apply(list)

#Determine most popular items
popular_items_week_1_2_aux = pd.concat([train_week_1, train_week_2], axis=0).groupby(['article_id'])['pop_factor'].value_counts()
popular_items_week_1_2 = list(popular_items_week_1_2_aux.droplevel(level=1).sort_values(ascending=False).index.values)

In [None]:
from collections import Counter
outputs = []

for user in tqdm(customers["customer_id"].values):
    user_output = []
    if user in positive_items_per_user1.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user1[user]).most_common()}
        user_output += list(most_common_items_of_user.keys())[:12]
    if user in positive_items_per_user2.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user2[user]).most_common()}
        user_output += list(most_common_items_of_user.keys())[:12]
    if user in positive_items_per_user3.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user3[user]).most_common()}
        user_output += list(most_common_items_of_user.keys())[:12]
    if user in positive_items_per_user4.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user4[user]).most_common()}
        user_output += list(most_common_items_of_user.keys())[:12]
    
    user_output += list(popular_items_week_1_2[:12 - len(user_output)])
    outputs.append(" ".join(map(str,user_output)))



In [None]:
submission = pd.DataFrame({'customer_id': customers['customer_id'], 'prediction': outputs})
submission.to_csv('submissions.csv', index = False)

# The result achieved in the kaggle competition with these predictions for the 1% of evaluation data is a MAP of 0.0206