# Import libraries


In [None]:
import numpy as np
import pandas as pd
import os
import glob
#import reco
from tqdm import tqdm
import datetime
from functools import partial
from dask.diagnostics import ProgressBar
ProgressBar().register()
import dask.dataframe as dd
from collections import Counter
from sklearn.preprocessing import OneHotEncoder

In [None]:
tqdm.pandas()

# Read data

In [None]:
# Load the full transaction log. article_id is forced to str instead of
# letting pandas infer a dtype for it.
data = pd.read_csv('transactions_train.csv',dtype={'article_id':str})
print(data.shape)

Calculate groups of price:

In [None]:
# Bucket the price into 10 quantile-based bins (pd.qcut) and keep the bin
# in a new 'price2' column.
data['price2'] = pd.qcut(data['price'], 10)
data.head()

In [None]:
# Parse the transaction date in one vectorized call.
data.t_dat = pd.to_datetime(data.t_dat)
# Keep only the columns used by the rest of the notebook.
data = data[['t_dat','customer_id','article_id','price2']]

## Select only the last weeks

This way we will keep the relevant data and keep its size reasonable. We will take 2 weeks for training and leave the last one for validation

In [None]:
print("All Transactions Date Range: {} to {}".format(data['t_dat'].min(), data['t_dat'].max()))

# Four consecutive training windows, newest first; each one is the half-open
# date range [start, end).
# NOTE(review): the windows span 8, 7, 9 and 8 days rather than exactly one
# week each - confirm this is intended.
train1 = data.loc[(data["t_dat"] >= datetime.datetime(2020,9,8)) & (data['t_dat'] < datetime.datetime(2020,9,16))]
train2 = data.loc[(data["t_dat"] >= datetime.datetime(2020,9,1)) & (data['t_dat'] < datetime.datetime(2020,9,8))]
train3 = data.loc[(data["t_dat"] >= datetime.datetime(2020,8,23)) & (data['t_dat'] < datetime.datetime(2020,9,1))]
train4 = data.loc[(data["t_dat"] >= datetime.datetime(2020,8,15)) & (data['t_dat'] < datetime.datetime(2020,8,23))]

# Everything from 2020-09-16 onwards is held out for validation.
val = data.loc[data["t_dat"] >= datetime.datetime(2020,9,16)]

# `data` is deliberately not deleted: the test-submission section slices it again.

In [None]:
# Per-customer purchase lists for each training window (repeat purchases are kept).
def _purchases_by_customer(window):
    """Return a Series mapping customer_id to the list of article_ids bought in `window`."""
    return window.groupby(['customer_id'])['article_id'].apply(list)


(
    positive_items_per_user1,
    positive_items_per_user2,
    positive_items_per_user3,
    positive_items_per_user4,
) = (_purchases_by_customer(window) for window in (train1, train2, train3, train4))

In [None]:
# Only the two most recent windows (train1 and train2) form the training set.
train = pd.concat([train1,train2], axis=0)

In [None]:
# train = train.drop('t_dat', axis = 1).reset_index(drop = True)

In [None]:
train.head()

Keep only the transactions for users that have more than 20 transactions for training the FM:

In [None]:
# Restrict the factorization-machine training data to customers that have
# more than 20 transactions in the training windows.
train_fm = train.loc[:, ['customer_id', 'article_id', 'price2']].copy()
print(train_fm.shape)

v = train_fm['customer_id'].value_counts()
heavy_buyers = v[v > 20].index
train_fm = train_fm[train_fm['customer_id'].isin(heavy_buyers)]
print(train_fm.shape)

n_users = len(train_fm['customer_id'].value_counts())
print(f"There're {n_users} users")

### Add other characteristics (TRAIN)

Load age of customers and calculate deciles to add them to train dataset:

In [None]:
# Attach each customer's age to the training rows ...
customers = pd.read_csv('customers.csv')
customers = customers[['customer_id','age']]
train_fm = train_fm.merge(customers, how='left', on='customer_id')
# ... and replace it with one of 10 quantile-based bins.
train_fm['age'] = pd.qcut(train_fm['age'], 10)

Load article dataset and add to train the group category of the items and the colour group name:

In [None]:
# article_id is read as str, the same dtype used when loading the transactions.
articles = pd.read_csv('articles.csv',dtype={'article_id':str})
# Keep only the article attributes that are joined onto the training rows.
articles = articles[['article_id','product_group_name','perceived_colour_value_name','index_name']]

train_fm = train_fm.merge(articles, how='left', on='article_id')

Finally, add the last item bought for every customer:

In [None]:
# For every row, take the article_id of the same customer's previous row
# (missing for the customer's first row).
# NOTE(review): "previous" follows the current row order of train_fm, which
# comes from concatenating train1 and then train2 and is not re-sorted by date
# here - confirm this really yields the last item bought.
shifted = train_fm[['customer_id','article_id']].groupby("customer_id").shift(+1)
# The shifted frame only carries article_id, so this adds 'article_id_lag'.
train_fm = train_fm.join(shifted.rename(columns=lambda x: x+"_lag"))

In [None]:
train_fm

### Add other characteristics (VAL)

In [None]:
# Same enrichment as for the training rows: customer age (binned) and
# article attributes.
# NOTE(review): the age bins are re-fitted on the validation rows, so their
# edges may differ from the bins computed for train_fm - confirm this is fine.
val_fm = val.merge(customers, how='left', on='customer_id')
val_fm['age'] = pd.qcut(val_fm['age'], 10)

val_fm = val_fm.merge(articles, how='left', on='article_id')

In [None]:
# Previous-row article per customer, built the same way as for train_fm.
shifted_val = val_fm[['customer_id','article_id']].groupby("customer_id").shift(+1)
val_fm = val_fm.join(shifted_val.rename(columns=lambda x: x+"_lag"))
val_fm

## Factorization Machine

In [None]:
import matplotlib.pyplot as plt 
import math

class Oh_factorization_machine():
    """Second-order factorization machine over one-hot encoded categorical features.

    The model scores a transaction as
    ``sigmoid(w0 + sum_i bias_i + sum_{i<j} <V_i, V_j>)`` where ``i`` and ``j``
    run over the active (non-zero) one-hot columns of the row. Observed
    transactions are labelled 1 and synthetic negatives (see
    ``__generatedf__``) are labelled 0.

    All encoded features are binary, which every scoring routine below
    relies on.
    """

    def __init__(self, df, df_val, caract, num_components=10):
        """Build training/validation matrices.

        Parameters
        ----------
        df : DataFrame with the training transactions. Must contain the
            columns in ``caract`` plus 'article_id' and 'index_name'.
        df_val : DataFrame with the validation transactions (same columns).
        caract : list of column names used as categorical features.
        num_components : size of each latent factor vector.
        """
        self.caract = caract
        self.articles = df[['article_id', 'index_name']].drop_duplicates()
        self.data = self.__generatedf__(df[self.caract])
        # NOTE(review): the validation rows are the inner join of the training
        # and validation features, i.e. only feature combinations that also
        # occur in training are evaluated - confirm this is intended.
        self.data_val = self.__generatedf__(
            df[self.caract].merge(df_val[self.caract], how='inner', on=self.caract)
        )

        print(self.data_val.shape)

        self.ratings = self.data['rating'].values
        self.ratings_val = self.data_val['rating'].values

        self.K = num_components

        # Sparse output is the encoder's default; the explicit `sparse=`
        # keyword is not accepted by recent scikit-learn releases.
        self.oh = OneHotEncoder()

        self.oh_matrix = self.oh.fit_transform(self.data[self.caract])
        self.oh_matrix_val = self.oh.transform(self.data_val[self.caract])

        print(self.oh_matrix.shape, self.oh_matrix_val.shape)

    def get_df(self):
        """Return the training frame (positives plus sampled negatives)."""
        return self.data

    def __generatedf__(self, df):
        """Return `df` labelled with rating 1 plus sampled rating-0 negatives.

        Negatives are built from a 10% sample of the rows whose article_id
        column is shuffled among the sampled rows. The caller's frame is not
        modified.

        NOTE(review): only 'article_id' is shuffled, so other article-level
        columns of a negative row still describe the original article, and a
        shuffled pair may coincide with a real purchase - confirm acceptable.
        """
        positives = df.assign(rating=1)

        negatives = positives.sample(frac=0.1, random_state=42).copy()
        negatives['article_id'] = negatives['article_id'].sample(frac=1).values
        negatives = negatives.drop_duplicates(subset=['article_id', 'customer_id'])
        # Label used with the squared-error metric; the log loss below
        # expects -1 here instead.
        negatives['rating'] = 0

        # Shuffle positives and negatives together.
        return pd.concat([positives, negatives]).sample(frac=1)

    @staticmethod
    def _sigmoid(z):
        """Numerically stable logistic function (scalar or array)."""
        return np.exp(-np.logaddexp(0.0, -z))

    def _raw_score(self, active):
        """Return the pre-sigmoid score for the active column indices `active`."""
        factors = self.V[active]
        factor_sum = factors.sum(axis=0)
        # sum_{i<j} <V_i, V_j> = 0.5 * (|sum_i V_i|^2 - sum_i |V_i|^2)
        pairwise = 0.5 * (factor_sum @ factor_sum - np.sum(factors * factors))
        return self.w0 + self.bias[active].sum() + pairwise

    def _predict_batch(self, matrix):
        """Return predictions for every row of a binary sparse matrix at once."""
        linear = np.asarray(matrix @ self.bias).ravel()
        factor_sum = np.asarray(matrix @ self.V)
        # For binary features x_i^2 == x_i, so the same matrix is reused here.
        squared_sum = np.asarray(matrix @ np.square(self.V))
        pairwise = 0.5 * (np.square(factor_sum) - squared_sum).sum(axis=1)
        return self._sigmoid(self.w0 + linear + pairwise)

    def __sgd__(self, epoch):
        """Run one epoch of stochastic gradient descent over `training_indices`."""
        progress = tqdm(
            self.training_indices,
            leave=True,
            desc='Epoch {}'.format(epoch),
            postfix='train_error: {:.3f}  val_error: {:.3f}'.format(self.train_rmse[-1], self.val_rmse[-1]),
        )
        for idx in progress:
            # [-1] selects the column indices for a 1xN row.
            active = self.oh_matrix[idx].nonzero()[-1]
            target = self.ratings[idx]

            prediction = self._sigmoid(self._raw_score(active))
            step = self.learning_rate * (target - prediction)

            factors = self.V[active]
            factor_sum = factors.sum(axis=0)

            self.w0 += step
            # Only the active columns have a non-zero gradient.
            self.bias[active] += step
            self.V[active] = factors + step * (factor_sum - factors)

    def fit(self, n_epochs=50, learning_rate=0.001, lmbda=0.1, verbose=True, patience=15, early=True):
        """Train the model.

        Parameters
        ----------
        n_epochs : maximum number of passes over the training rows.
        learning_rate : SGD step size.
        lmbda : stored on the instance; the update rule applies no
            regularisation.
        verbose : print the early-stopping message and plot learning curves.
        patience : look-back (in epochs) used by the early-stopping test.
        early : enable early stopping on the validation error.
        """
        self.verbose = verbose
        self.learning_rate = learning_rate
        self.lmbda = lmbda

        n_rows, self.n_cols = self.oh_matrix.shape

        self.train_rmse = []
        self.val_rmse = []

        # initialize latent vectors
        self.V = np.random.normal(scale=1. / self.K, size=(self.n_cols, self.K))
        self.bias = np.random.normal(scale=1 / self.n_cols, size=(self.n_cols))
        self.w0 = 1.0

        for epoch in range(n_epochs):
            # One index per training ROW (not per feature column), reshuffled
            # every epoch.
            self.training_indices = np.arange(n_rows)
            np.random.shuffle(self.training_indices)

            # Errors are recorded before the epoch's updates are applied.
            self.train_rmse.append(self.evaluate(self.predict, self.ratings, self.oh_matrix))
            self.val_rmse.append(self.evaluate(self.predict, self.ratings_val, self.oh_matrix_val))

            self.__sgd__(epoch)

            if early and len(self.val_rmse) > patience and self.val_rmse[-1] >= self.val_rmse[-patience]:
                if verbose:
                    print("Early stopping at epoch {}".format(epoch))
                break

        if self.verbose:
            self.__plot_learning_curves__()

    def __plot_learning_curves__(self):
        """Plot the per-epoch training and validation errors."""
        plt.plot(self.train_rmse, '--o', label="train_error")
        plt.plot(self.val_rmse, '--o', label="val_error")
        plt.legend()
        plt.show()

    def predict(self, row):
        """Return the predicted probability for a single one-hot encoded row.

        `row` is a 1xN sparse (or dense) binary row; only the positions of
        its non-zero entries are used, so it is never densified.
        """
        active = row.nonzero()[-1]
        return float(self._sigmoid(self._raw_score(active)))

    def compute_rmse(self, y_pred, y_true):
        """Compute the Root Mean Squared Error; the residuals are kept in `self.error`."""
        self.error = np.asarray(y_pred, dtype=float) - np.asarray(y_true, dtype=float)
        return float(np.sqrt(np.mean(np.square(self.error))))

    def log_loss(self, y_pred, y_true):
        """Compute the logistic loss log(1 + exp(-y_pred * y_true))."""
        return np.log(np.exp(-y_pred * y_true) + 1.0)

    def evaluate(self, predict_f, y_set, data_train):
        """Return the RMSE of the model on `data_train` against `y_set`.

        Parameters
        ----------
        predict_f : kept for interface compatibility; the model's own
            predictor is always used.
        y_set : array of ratings for the rows of `data_train`.
        data_train : one-hot encoded sparse matrix (train or validation).
        """
        return self.compute_rmse(self._predict_batch(data_train), y_set)

    def __top_12__(self, user):
        """Return the 12 article_ids with the highest predicted score for `user`.

        Every known article is scored for the given customer; `user` must be
        a customer seen during fitting, otherwise the encoder rejects it.
        """
        # assign() returns a new frame, so self.articles is left untouched.
        article_df = self.articles[['article_id', 'index_name']].assign(customer_id=user)
        article_df = article_df[['customer_id', 'article_id', 'index_name']]

        to_predict = self.oh.transform(article_df[self.caract])
        article_df['pred'] = self._predict_batch(to_predict)
        best = article_df.sort_values(by=['pred'], ascending=False)[:12]

        return list(best['article_id'])
    

We will only add the index name, because we think it is a very representative feature of the item:

In [None]:
fact_machine = Oh_factorization_machine(train_fm,val_fm,caract=['customer_id','article_id','index_name'],num_components=50)

In [None]:
fact_machine.fit(patience=8)

Predict top12 items for every user trained

In [None]:
# Map every customer used to train the factorization machine to a running
# position, then compute their 12 recommended articles.
users = {customer: position for position, customer in enumerate(train_fm.customer_id.unique())}
recom = pd.DataFrame({'customer_id': list(users)})
recom['recom'] = recom['customer_id'].map(fact_machine.__top_12__)
recom

# Non-personalized 

For users that aren't in our training set we will use a non-personalized method. The method will be the same as in the last assignment: we compute the top items in our data and stratify the customer population by age (divided into deciles), as we assume that different age groups buy different items. 

The best items will have a score sensible to time, as items that have not been bought in the recent weeks might not be as relevant as new ones. 

Let's read the customer dataset to find the age:

In [None]:
# Reload the customer table and keep the same two columns as before.
customers = pd.read_csv('customers.csv')
customers = customers[['customer_id','age']]

Merge it with the training dataset:

In [None]:
# Left joins keep every transaction, including those of customers with no
# row in the customer table (their age is then missing).
train = train.merge(customers, how='left', on='customer_id')
val = val.merge(customers, how='left', on='customer_id')

Calculate deciles of age and a popularity factor based on the time of the transaction:

In [None]:
# Age split into 10 quantile-based bins, used to stratify the popularity ranking.
train['age2'] = pd.qcut(train['age'], 10)
# Bare expression in the middle of the cell: its result is neither shown nor stored.
train['age2'].value_counts()
# Recency weight: 1 / (number of days between the purchase and 2020-09-16),
# so the most recent purchases weigh the most.
train['pop_factor'] = train['t_dat'].apply(lambda x: 1/(datetime.datetime(2020,9,16) - x).days)

We compute the most frequently bought items in the training dataset for every age interval found in the training set:

In [None]:
# Distinct age bins present in the training data (missing ages excluded).
intervals = train.age2.unique().dropna()

# For each age bin: the 12 articles with the largest summed recency weight.
top_by_age = {}
for inter in intervals: 
    train_age = train.loc[train.age2 == inter]
    popular_items_group = train_age.groupby(['article_id'])['pop_factor'].sum()
    # Sort (score, article_id) pairs in descending order and keep the article ids of the first 12.
    _, popular_items = zip(*sorted(zip(popular_items_group, popular_items_group.keys()))[::-1][:12])
    
    top_by_age.setdefault(inter,popular_items)

# Lookup from an integer age (15..99) to the bin that contains it; ages
# that fall in no bin get no entry.
age_interval = {age : interval for age in range(15,100) for interval in intervals if age in interval}

# Overall top 12 articles, used when no age-specific ranking applies.
popular_items_group = train.groupby(['article_id'])['pop_factor'].sum()
_, top = zip(*sorted(zip(popular_items_group, popular_items_group.keys()))[::-1][:12])


In [None]:
train.pop_factor.describe()

In [None]:
top

# RecSys Implementation

We implement the recommendation system. We also add code to simulate the score. The implementation is the following:

If a user is found in the training set used to fit the factorization machine (users with more than 20 transactions), we give the recommendation computed by that model. If, on the contrary, the user is found in the training data but not in the subset used for the factorization machine, we recommend the items this user has bought most often, starting with the items bought in the last week and continuing with the earlier weeks. Finally, if the user is not in any of the sets used for training, we give them the most popular items for their age group (if the age is known) or the overall most popular items.

Calculation of the score:

In [None]:
def apk(actual, predicted, k=12):
    """Return the average precision at `k` of `predicted` against `actual`.

    Only the first `k` predictions are considered, and a prediction that
    repeats an earlier one is never counted as a hit. An empty `actual`
    yields 0.0.
    """
    shortlist = predicted[:k] if len(predicted) > k else predicted

    if not actual:
        return 0.0

    hits = 0
    precision_sum = 0.0
    for rank, item in enumerate(shortlist, start=1):
        if item in actual and item not in shortlist[:rank - 1]:
            hits += 1
            precision_sum += hits / rank

    return precision_sum / min(len(actual), k)

def mapk(actual, predicted, k=12):
    """Return the mean of `apk` over paired `actual` / `predicted` lists.

    With no pairs to score the result is 0.0 (instead of NaN plus a NumPy
    warning from averaging an empty list).
    """
    scores = [apk(a, p, k) for a, p in zip(actual, predicted)]
    if not scores:
        return 0.0
    return np.mean(scores)

Implement the recommender system in the validation dataset:

In [None]:
# Compare the items actually bought during validation with the items
# predicted for the validation users from the training data.
positive_items_val = val.groupby(['customer_id'])['article_id'].apply(list)
val_users = positive_items_val.keys()
val_items = [positive_items_val[user] for _, user in tqdm(enumerate(val_users))]

In [None]:
from collections import Counter

outputs = []
cnt = 0   # validation users answered by the factorization machine
cnt2 = 0  # users whose list is padded with age-specific popular items

user_age = dict(zip(val.customer_id, val.age))
# One lookup table instead of scanning `recom` for every user.
fm_recommendations = dict(zip(recom['customer_id'], recom['recom']))
# Purchase history, most recent window first.
weekly_purchases = (
    positive_items_per_user1,
    positive_items_per_user2,
    positive_items_per_user3,
    positive_items_per_user4,
)
oldest_age = max(age_interval.keys())

for user in tqdm(val_users):
    if user in users:
        # The stored value is already a list of article ids; it must be
        # extended into the output, not nested inside it.
        candidates = list(fm_recommendations[user])
        fallback = top
        cnt += 1
    else:
        candidates = []
        for purchases in weekly_purchases:
            if user in purchases.index:
                candidates += [item for item, _ in Counter(purchases[user]).most_common()]

        age = user_age.get(user)
        if pd.notna(age) and 15.0 <= age < oldest_age and int(age) in age_interval:
            fallback = top_by_age[age_interval[int(age)]]
            cnt2 += 1
        else:
            fallback = top

    # Drop repeats (keeping the first occurrence) and never exceed 12 items,
    # so the padding below cannot be driven by a negative slice bound.
    user_output = list(dict.fromkeys(candidates))[:12]
    for item in fallback:
        if len(user_output) >= 12:
            break
        if item not in user_output:
            user_output.append(item)

    outputs.append(user_output)

print(cnt2,cnt)
print("mAP Score on Validation set:", mapk(val_items, outputs))

# Test submission:

Implement the RecSys for the test dataset:

In [None]:
# Same four-window layout as for validation, moved forward so that the most
# recent window now starts on 2020-09-16 (each range is half-open).
train1_t = data.loc[(data["t_dat"] >= datetime.datetime(2020,9,16)) & (data['t_dat'] < datetime.datetime(2020,9,23))]
train2_t = data.loc[(data["t_dat"] >= datetime.datetime(2020,9,8)) & (data['t_dat'] < datetime.datetime(2020,9,16))]
train3_t = data.loc[(data["t_dat"] >= datetime.datetime(2020,8,31)) & (data['t_dat'] < datetime.datetime(2020,9,8))]
train4_t = data.loc[(data["t_dat"] >= datetime.datetime(2020,8,23)) & (data['t_dat'] < datetime.datetime(2020,8,31))]

# Per-customer purchase lists (with repetitions) for each window.
positive_items_per_user1_t = train1_t.groupby(['customer_id'])['article_id'].apply(list)
positive_items_per_user2_t = train2_t.groupby(['customer_id'])['article_id'].apply(list)
positive_items_per_user3_t = train3_t.groupby(['customer_id'])['article_id'].apply(list)
positive_items_per_user4_t = train4_t.groupby(['customer_id'])['article_id'].apply(list)

# Popularity is computed from the two most recent windows only.
train_t = pd.concat([train1_t,train2_t], axis=0)
train_t = train_t.merge(customers, how='left', on='customer_id')
# Recency weight: 1 / (number of days between the purchase and 2020-09-23).
train_t['pop_factor'] = train_t['t_dat'].apply(lambda x: 1/(datetime.datetime(2020,9,23) - x).days)

# Age split into 10 quantile-based bins.
train_t['age2'] = pd.qcut(train_t['age'], 10)
train_t

In [None]:
intervals_t = train_t.age2.unique().dropna()

top_by_age = {}
for inter in intervals_t: 
    train_age_t = train_t.loc[train_t.age2 == inter]
    popular_items_group_t = train_age_t.groupby(['article_id'])['pop_factor'].sum()
    _, popular_items_t = zip(*sorted(zip(popular_items_group_t, popular_items_group_t.keys()))[::-1][:12])
    
    top_by_age.setdefault(inter,popular_items_t)

age_interval = {age : interval for age in range(15,100) for interval in intervals_t if age in interval}

popular_items_group_t = train.groupby(['article_id'])['pop_factor'].sum()
_, top = zip(*sorted(zip(popular_items_group_t, popular_items_group_t.keys()))[::-1][:12])

user_group = pd.concat([train1, train2, train3, train4], axis=0).groupby(['customer_id'])['article_id'].apply(list)

Load test submission:

In [None]:
# Only the customer ids of the sample submission are needed; the predictions
# are generated below.
test = pd.read_csv("sample_submission.csv")
test = test[['customer_id']]
test.head()

Apply the RecSys on these customers ids to get predictions:

In [None]:
test1 = test.copy()

In [None]:
test = test1.merge(customers, how='left', on='customer_id')

def to_submission(data):
    """Render an iterable of items as one space-separated string."""
    return " ".join(map(str, data))
        
def recommend(user,age):
    """Return up to 12 distinct recommended article ids for `user`.

    Customers known to the factorization machine get its recommendations.
    Everyone else gets their own most frequently bought items (most recent
    window first), padded with the popular items of their age bin, or with
    the overall popular items when `age` is missing or outside the bins.

    Parameters
    ----------
    user : customer id.
    age : the customer's age; may be NaN/None when unknown.
    """
    if user in users:
        stored = recom.loc[recom['customer_id'] == user, 'recom'].values
        # Each stored value is itself a list of article ids: unpack it
        # instead of returning a list that contains a list.
        candidates = list(stored[0]) if len(stored) else []
        fallback = top
    else:
        candidates = []
        weekly_purchases = (
            positive_items_per_user1_t,
            positive_items_per_user2_t,
            positive_items_per_user3_t,
            positive_items_per_user4_t,
        )
        for purchases in weekly_purchases:
            if user in purchases.index:
                candidates += [item for item, _ in Counter(purchases[user]).most_common()]

        # Use the age passed in for this customer rather than a lookup that
        # only covers customers of the validation set.
        if pd.notna(age) and 15.0 <= age < max(age_interval.keys()) and int(age) in age_interval:
            fallback = top_by_age[age_interval[int(age)]]
        else:
            fallback = top

    # Drop repeats (keeping the first occurrence) and never exceed 12 items,
    # so the padding below cannot be driven by a negative slice bound.
    user_output = list(dict.fromkeys(candidates))[:12]
    for item in fallback:
        if len(user_output) >= 12:
            break
        if item not in user_output:
            user_output.append(item)

    return user_output


test['prediction'] = test.progress_apply(lambda x: to_submission(recommend(x.customer_id,x.age)),axis=1)


In [None]:
test

Add the predictions to the test dataset to create the final submission:

In [None]:
# The age column was only needed to build the predictions; drop it so the
# file contains customer_id and prediction only.
test.drop(columns=['age'], inplace=True)

test.to_csv('submission.csv', index=False)
test.head()