In [None]:
# This colab was made by Adam Dudek, Aleksandra Tomczak and Jakob Holden Hansen as a group

In [None]:
# Libraries

import pandas as pd
import numpy as np

### Part 1 - Data preprocessing.

In [None]:
# customers = pd.read_csv("data/customers.csv")
# min(customers['age']) # 16.0
# max(customers['age']) # 99.0

In [None]:
# Creating age groups
# 16 - 24 : group 1
# 25 - 34 : group 2
# 35 - 54 : group 3
# 55 - 99 : group 4
# agegroup_1 = customers.loc[customers['age'] <= 24]
# agegroup_2 = customers.loc[(customers['age'] > 24) & (customers['age'] <= 34)]
# agegroup_3 = customers.loc[(customers['age'] > 34) & (customers['age'] <= 54)]
# agegroup_4 = customers.loc[(customers['age'] > 54)]

In [None]:
# Saving customers groups to separate CSV files
# agegroup_1.to_csv('age_1.csv', index = False)
# agegroup_2.to_csv('age_2.csv', index = False)
# agegroup_3.to_csv('age_3.csv', index = False)
# agegroup_4.to_csv('age_4.csv', index = False)

In [None]:
# Loading first 10 rows to investigate the data
# df = pd.read_csv('data/transactions_train.csv', nrows = 10)
# df

In [None]:
# Checking the type of the date variable
# type(df['t_dat'][0])
# Output: str

In [None]:
# Loading the transactions for 22.08.2020 - 22.09.2020
# transactions = pd.read_csv("data/transactions_train.csv")
# last_month = transactions.loc[transactions['t_dat'] >= '2020-08-22']

In [None]:
# Upload the pre-filtered CSV files through the file picker.
# NOTE(review): `google.colab` is presumably only importable when running in Google Colab — confirm before running elsewhere.

from google.colab import files
# The next cell indexes `uploaded` by file name ('age_1.csv', 'last_month.csv').
uploaded = files.upload()

In [None]:
# Turn the uploaded payloads into data frames

import io


def _read_uploaded_csv(filename):
    """Parse one entry of `uploaded` as CSV and return it as a DataFrame."""
    return pd.read_csv(io.BytesIO(uploaded[filename]))


customers = _read_uploaded_csv('age_1.csv')
transactions = _read_uploaded_csv('last_month.csv')

In [None]:
# Read the same two files from the working directory.
# NOTE(review): this overwrites the `customers` / `transactions` frames parsed from
# `uploaded` in the previous cell — presumably only one of the two loading paths is
# needed for a given environment; confirm and keep one.
customers = pd.read_csv('age_1.csv')
transactions = pd.read_csv('last_month.csv')

### Part 2 - Non-personalized recommendations.

In [None]:
# IDs of every customer in our age group
customer_list = customers['customer_id'].tolist()

# Keeping only the purchases made by customers from our age group,
# renumbered from 0 so the old row labels are discarded
transactions_age = (
    transactions
    .loc[transactions['customer_id'].isin(customer_list)]
    .reset_index(drop = True)
)

# How many times each article was bought (most frequent first)
frequency = transactions_age['article_id'].value_counts()

# Data frame with the top 12 products and their purchase counts
top_items = (
    frequency
    .head(12)
    .rename_axis('article_id')
    .reset_index(name = 'frequency')
)

# Plain list of the top 12 article IDs
top_items_list = top_items['article_id'].tolist()

In [None]:
# Top 12 items: the best-selling article IDs in this age group
# (used further down as the recommendation for customers with no purchases)
top_items_list

In [None]:
# Only the (customer, article) pairs are needed from here on
transactions_age_ = transactions_age[['customer_id','article_id']]

# One row per customer holding the list of distinct articles they bought;
# the customer ID is moved back from the index into its own column
transactions_age_agg = (
    transactions_age_
    .groupby('customer_id')
    .agg(lambda x: list(set(x)))
    .reset_index()
)

# The baskets as a plain list of lists
purchased_items = transactions_age_agg['article_id'].tolist()

In [None]:
# Ranks the products that most often share a basket with a given product
def top_associated_products(df, product, N = 12):
    """Return up to N items that co-occur most often with `product`.

    df      -- iterable of baskets (each basket an iterable of item ids)
    product -- the item whose companions are wanted
    N       -- maximum number of items returned

    Every basket containing `product` adds one count to each other item in
    it. Counts are divided by the number of such baskets and the items are
    returned by descending share; ties keep first-seen order.
    """
    co_counts = {}
    baskets_with_product = 0
    for basket in df:
        if product not in basket:
            continue
        baskets_with_product += 1
        for other in basket:
            if other == product:
                continue
            co_counts[other] = co_counts.get(other, 0.0) + 1.0

    # co_counts is empty whenever no basket matched, so no division by zero here
    shares = {
        other: count / baskets_with_product
        for other, count in co_counts.items()
    }
    ranked = sorted(shares.items(), key=lambda pair: pair[1], reverse=True)
    return [other for other, _ in ranked[:N]]

In [None]:
# Checking if the function is working properly: the five items most often
# bought together with article 683001028
# (presumably an article ID that occurs in this age group's baskets — confirm)
most_associated = top_associated_products(purchased_items, 683001028, N = 5)
print(most_associated)

### Part 3 - Personalized recommendations.

In [None]:
# Collaborative filtering class
class CollaborativeFiltering():
    """Item-item collaborative filtering on unary (bought / not bought) data.

    Construct with a transactions frame holding `customer_id` and
    `article_id` columns, call `fit()` once, then `predict()` per customer.
    The similarity measure is `CosineSimilarity`; `fit()` calls it through
    `self`, so a subclass can override it to plug in a custom measure.
    """

    def __init__(self,DataFrame):
        """Store the transactions frame; nothing is computed until `fit()`."""
        self.df=DataFrame
        self.sim_mat = None  # item-item similarity DataFrame, set by fit()
        self.pivotdata = None  # customer x article table, set by fit()

    def fit(self):
        """Build the customer-item table and the item-item similarity matrix.

        Only the most frequently bought articles (see `top_k_items`) are kept.
        Afterwards `self.df` is the filtered frame with a constant `rating`
        column, `self.pivotdata` the customer x article table (0.0 where
        nothing was bought) and `self.sim_mat` the symmetric similarity
        matrix. An item's similarity to itself is never computed, so the
        diagonal is missing (NaN).
        """
        # assign() returns a new frame, so the column is not written into a
        # filtered slice (no SettingWithCopyWarning) and the caller's frame
        # is left untouched.
        self.df = self.top_k_items().assign(rating=1.0)

        # creating pivot table (U-I table)
        self.pivotdata = self.df.pivot_table(
            values='rating', index='customer_id', columns='article_id'
        ).fillna(0.0)

        # Walk the columns in their table order (not a set) so the layout of
        # sim_mat is the same on every run; each unordered pair is computed
        # once and mirrored.
        items = list(self.pivotdata.columns)
        similarity = {item: {} for item in items}
        for position, item1 in enumerate(items):
            for item2 in items[position + 1:]:
                sim = self.CosineSimilarity(item1, item2)
                if sim < 0:
                    sim = 0  # negative similarities are clamped to zero
                similarity[item1][item2] = sim
                similarity[item2][item1] = sim

        self.sim_mat = pd.DataFrame(similarity)

    def predict(self, article_ids,customer_id,n_recommendations=12):
        """Recommend the items most similar, on average, to `article_ids`.

        article_ids       -- articles bought by the customer (list or array)
        customer_id       -- accepted for interface compatibility; not used
        n_recommendations -- maximum number of article IDs returned

        Articles the model was not fitted on (outside the top-k items) are
        ignored instead of raising KeyError. If none of the articles is
        known, an empty list is returned so the caller can choose a fallback.

        Raises RuntimeError if called before `fit()`.
        """
        if self.sim_mat is None:
            raise RuntimeError("fit() must be called before predict()")

        known_ids = [article for article in article_ids
                     if article in self.sim_mat.columns]
        if not known_ids:
            return []

        # One column per purchased article, one row per candidate item.
        # Missing similarities (the diagonal) count as 0, then each candidate
        # is scored by its mean similarity to the purchased articles.
        scores = self.sim_mat[known_ids].fillna(0).mean(axis=1)
        ranked = scores.sort_values(ascending=False)
        return ranked.index[:n_recommendations].to_list()

    #definition of CosineSimilarity between two items
    def CosineSimilarity(self,Article1, Article2):
        """Cosine similarity between the customer vectors of two articles.

        Cosine similarity is used because the data is unary (an article was
        either bought or not). Returns 0.0 when either vector has zero norm,
        where the ratio would otherwise be undefined.
        """
        customers_article1=self.pivotdata[Article1]
        customers_article2=self.pivotdata[Article2]
        denominator = np.linalg.norm(customers_article1)*np.linalg.norm(customers_article2)
        if denominator == 0:
            return 0.0
        return np.dot(customers_article1, customers_article2) / denominator

    def top_k_items(self,top_k=100):
        """Return the rows of `self.df` whose article is among the `top_k`
        articles with the most rows."""
        data_grouped_top = self.df.groupby('article_id').size().sort_values(ascending=False).head(top_k)
        article_numbers = np.array(data_grouped_top.index)
        art_state = self.df['article_id'].isin(article_numbers)
        new_df= self.df[art_state]
        return new_df

In [None]:
# Fitting the collaborative filter on this age group's (customer, article) pairs
cf = CollaborativeFiltering(transactions_age_)
cf.fit()

In [None]:
# Function to predict from multiple items
def prediction_multiple_items(col_fil, customer_id, articles):
    '''
    Ask an already-fitted collaborative filter for recommendations.

    col_fil     = the collaborative filter object; passed in explicitly so
                  that several pre-trained filters (fitted with different
                  numbers of items) can be tried
    customer_id = forwarded to col_fil.predict
    articles    = forwarded to col_fil.predict
    '''
    return col_fil.predict(articles, customer_id)

In [None]:
# Creating a fake "test subject": a hand-picked customer ID and two article IDs.
# NOTE(review): predict() looks the article IDs up in cf.sim_mat, so the result is
# only meaningful if both articles are among the items the filter was fitted on — confirm.
article_ids_poo = [827968001, 827968004]
customer_id_poo = '92f038c76d9be61640143b22aa524317059cbc6b97177964e5650b7a9353094c'
joik = prediction_multiple_items(cf, customer_id_poo, article_ids_poo)
joik

### Part 4 - Putting the predictions together.

In [None]:
# Creating a data frame with only customer IDs. The column is selected by
# name, so extra or missing attribute columns in the customers file cannot
# break this cell. Every customer gets an empty placeholder basket
# (len('') == 0, so these rows later count as "nothing bought").
all_customers = customers[['customer_id']].assign(article_id = '')

In [None]:
# Concatenating the data frames.
# Order matters: the aggregated baskets come first, the placeholder rows second,
# so the de-duplication in the next cell keeps the real basket for every purchaser.
final = pd.concat([transactions_age_agg, all_customers], join = "outer", ignore_index = True)

In [None]:
# Dropping the duplicates, keeping the first occurrence of every customer ID.
# This way we drop the placeholder rows (empty article_id) of the customers
# who actually purchased something.
final = final.drop_duplicates(subset = ['customer_id'], keep = 'first')

In [None]:
# Checking that the final data frame has one row per customer; fail loudly
# instead of only displaying True / False
assert len(final) == len(customers), "final should contain one row per customer"

In [None]:
# Shuffling the observations once (a second shuffle adds nothing), with a
# fixed seed so the sample drawn afterwards is the same on every run
final = final.sample(frac = 1, random_state = 42)

In [None]:
# Resetting the index to 0..n-1 after the shuffle; the loops further down
# address rows by these integer labels (e.g. sample['article_id'][i])
final = final.reset_index(drop = True)

In [None]:
# Creating a sample of (at most) 10 000 customers from the shuffled frame.
# Slice first, then copy: only the needed rows are duplicated and `sample`
# is an independent frame, so adding columns to it later is safe.
sample = final.iloc[:10000].copy()

In [None]:
# Creating a flag variable (empty placeholder; the basket size is filled in by the next cell)
sample['flag'] = ''

In [None]:
# Flag variable contains the number of items bought by the customer.
# Computed for the whole column at once: writing element by element through
# sample['flag'][i] is chained assignment, which pandas warns about and which
# does not write back to the frame when copy-on-write is enabled.
sample['flag'] = sample['article_id'].map(len)

In [None]:
# Inspecting the sample: customer_id, article_id (the basket) and flag (basket size)
sample

In [None]:
# Placeholder column for the recommendations; filled in by the next cell
sample['reco'] =''

In [None]:
def _recommend(customer_id, articles):
    """Pick the recommendation strategy from the size of the customer's basket.

    0 items  -> the overall top items
    1 item   -> the products most associated with that item
    2+ items -> collaborative filtering on the articles the filter knows
    """
    n_bought = len(articles)
    if n_bought == 0:
        return top_items_list
    if n_bought == 1:
        return top_associated_products(purchased_items, articles[0], 12)
    # The filter is fitted on a subset of articles only; looking up an
    # unknown article in its similarity matrix raises KeyError, so unknown
    # articles are filtered out here.
    known = [article for article in articles if article in cf.sim_mat.columns]
    if not known:
        # nothing the filter can work with: fall back to the top items
        return top_items_list
    return prediction_multiple_items(cf, customer_id, np.array(known))


# Built in one pass and assigned as a whole column, because writing through
# sample['reco'][i] is chained assignment (warned about by pandas and not
# written back to the frame when copy-on-write is enabled).
sample['reco'] = pd.Series(
    [
        _recommend(customer_id, articles)
        for customer_id, articles in zip(sample['customer_id'], sample['article_id'])
    ],
    index = sample.index,
)

In [None]:
# Writing the sample together with its recommendations to sample.csv (without the row index).
# NOTE(review): DataFrame.to_csv presumably returns None when given a path, which would make
# `submission` always None — confirm and drop the assignment if nothing uses it.
submission = sample.to_csv('sample.csv', index = False)