In [70]:
import pandas as pd

In [19]:
# Read csv files
transactions = pd.read_csv("data/transactions_train.csv",parse_dates = ['t_dat'])
articles = pd.read_csv("data/articles.csv")
customers = pd.read_csv("data/customers.csv")

In [5]:
# Number of articles, number of customers and total volume change over the time

# aggregate the transaction by date
transaction_aggr = transactions.groupby(['t_dat']).nunique().reset_index()[['t_dat','customer_id','article_id']]

# create a column showing sum per user per day
transaction_aggr['sum'] = transactions.groupby(['t_dat']).sum().reset_index()[['price']]
transaction_aggr = transaction_aggr.rename(columns={"customer_id": "nr_customer",
                                                   "article_id":"nr_article",
                                                   "sum":"total_volume"})

# Save csv
transaction_aggr.to_csv("data/transaction_aggr.csv",index=False)

In [6]:
# Colors

# join transaction dataset with articles
transaction_article_colour = pd.merge(transactions[['article_id', 't_dat']], articles[['article_id', 'colour_group_name']], how = 'inner', on = ['article_id']).reset_index()

# colours articles by month
articles_transaction_color_aggr = transaction_article_colour.groupby(['t_dat','colour_group_name']).count().reset_index()[['t_dat','colour_group_name','article_id']]
articles_transaction_color_aggr['t_dat'] = pd.to_datetime(articles_transaction_color_aggr['t_dat'])

# Convert date into month and year
articles_transaction_color_aggr['month_year']=articles_transaction_color_aggr['t_dat'].dt.strftime('%m/%Y')
articles_transaction_color_aggr['year']= pd.DatetimeIndex(articles_transaction_color_aggr['t_dat']).year

# Save csv
articles_transaction_color_aggr.to_csv("data/articles_transaction_color_aggr.csv",index=False)

By season

In [8]:
# Get the month of each date
transactions['month'] =  pd.DatetimeIndex(transactions['t_dat']).month

# Define the seasons along the year
transactions.loc[(transactions["month"] >= 3) & (transactions["month"] <= 5) , "season"] = "Spring"
transactions.loc[(transactions["month"] >= 6) & (transactions["month"] <= 8) , "season"] = "Summer"
transactions.loc[(transactions["month"] >= 9) & (transactions["month"] <= 11) , "season"] = "Autumn"
transactions.loc[(transactions["month"] == 12) , "season"] = "Winter"
transactions.loc[(transactions["month"] >= 1) & (transactions["month"] <= 2) , "season"] = "Winter"

In [None]:
# Products per season
transactions_season = transactions[['season' , 'article_id']].merge(articles[['article_id' , 'product_type_name' , 'product_type_no']] , on = 'article_id').groupby(['season' , 'product_type_no' ,'product_type_name']).agg({'product_type_no': 'count'}).rename(columns={'product_type_no': 'quantity'}).reset_index()

# Save csv
transactions_season.to_csv("data/transactions_season.csv",index=False)

In [13]:
# Age Group
import numpy as np
bins = np.array([0,15,19,35,50,99])
labels = ['unknown' , 'teens' , 'young' , 'middle-aged' , 'old']
customers['age_cat'] = pd.cut(customers['age'], bins=bins, labels=labels, include_lowest=True)
customers_age = customers[['customer_id' , 'age_cat']].merge(transactions[['customer_id' , 'season']] , on = 'customer_id').groupby(['season' , 'age_cat']).agg({'age_cat': 'count'}).rename(columns={'age_cat': 'quantity'}).reset_index()

# Save csv
customers_age.to_csv("data/customers_age.csv",index=False)

In [14]:
# Colors per season
colors_season = transactions[['season' , 'article_id']].merge(articles[['article_id' , 'perceived_colour_master_name']] , on = 'article_id').groupby(['season' , 'perceived_colour_master_name']).agg({'perceived_colour_master_name': 'count'}).rename(columns={'perceived_colour_master_name': 'quantity'}).reset_index()
# Save csv
colors_season.to_csv("data/colors_season.csv",index=False)

Product recommendation

In [21]:
from dateutil.relativedelta import relativedelta
from tqdm.notebook import tqdm

Baseline Model

In [61]:
N_MONTHS = 1

window = relativedelta(months=N_MONTHS)
last_date = transactions['t_dat'].max()

threshold = last_date - window

mask = transactions['t_dat'] > threshold
transactions_baseline = transactions[mask]

purchase_dict = {} # Dict that contains each article the user bought and the count of times it was bought

for x in zip(transactions['customer_id'], transactions['article_id']):
    cust_id, art_id = x
    
    if cust_id not in purchase_dict:
        purchase_dict[cust_id] = {}
    purchase_dict[cust_id][art_id] = purchase_dict[cust_id].get(art_id, 0) + 1
    
# List of the most bought articles for all users
best_ever = list(transactions['article_id'].value_counts().index)

# Save
import pickle
f = open("data/best_ever.pkl", "wb")
pickle.dump(best_ever, f)
f.close()

f = open("data/purchase_dict.pkl", "wb")
pickle.dump(purchase_dict, f)
f.close()

Collaborative filtering

In [63]:
# Preprocessing

# Get the transactions from September 1, 2020 on
transactions_copy = transactions.copy()
transactions = transactions_copy[transactions_copy['t_dat'] > '2020-08-31'].sort_values(by=['customer_id'])

# Count the number of same articles bought by the same person and convert to dataframe
counts_df = transactions.groupby(['t_dat', 'customer_id', 'article_id', 'price', 'sales_channel_id']).size()
counts_df = counts_df.to_frame()
counts_df.reset_index(inplace=True)
small_counts = counts_df.rename(columns={0: 'count'})

# Transactions file after September 1, 2020 while the number of same articles bought by each person
small_counts = small_counts.sort_values('customer_id')

from scipy.sparse import csr_matrix, dok_matrix
from pandas.api.types import CategoricalDtype

# Auxiliary function
def to_dense(array):
    """
    Converts a spare matrix (where a lot of elements are zero) to a dense 
    array (an array where the elements are all sequential starting at index 0). 
    
    :param array: Matrix to be converted
    :return: Dense array
    """
    try:
        array = array.todense()
    except:
        pass
    
    return np.array(array).squeeze()

# Create sparse matrix user-item 
def build_counts_table(df):
    """
    Gives an sparse matrix where the columns and the items and the rows the customer. 
    The value is the number of times that a customer has bought an item. 
    
    :param df: original dataframe with transactions
    :return: 
        * Sparse matrix
        * Customer ids corresponding to each row
        * Items ids corresponding to each column
    """
    # Get customer ids and item ids
    customer_ids = CategoricalDtype(sorted(df.customer_id.unique()), ordered=True)
    item_ids = CategoricalDtype(sorted(df.article_id.unique()), ordered=True)

    # Get sparse matrix
    row = df.customer_id.astype(customer_ids).cat.codes
    col = df.article_id.astype(item_ids).cat.codes
    sparse_matrix = csr_matrix((df["count"], (row, col)), \
                           shape=(customer_ids.categories.size, item_ids.categories.size))

    return sparse_matrix, customer_ids, item_ids

# Get sparse matrix for the transactions from Sept 1, 2020
counts, indexes, columns = build_counts_table(small_counts)

# Number of rows is number of customers, number of columns is number of articles
print(counts.shape)

# Get the ids of the top n customers
def top_active_customers(counts, indexes, columns, n):
    """
    Returns the id of the top n customers, in terms of items bought 
    
    :param counts, indexes, columns: Tuple returned by build_counts_table
    :param n: Number of users
    :return: Series of customerID of the top users
    """
    # Operate with the sparse matrix, convert to dense the result
    sums = to_dense(counts.sum(axis=1))
    # Get indices
    indices = sums.argsort()
    return indexes.categories[indices[-n:]]

# Get the ids of the top n articles
def top_bought_articles(counts, indexes, columns, n):
    """
    Returns the top n most bought items
    
    :param counts, indexes, columns: Tuple returned by build_counts_table
    :param n: Number of items
    :return: Series of itemID of the top items
    """
    # Operate with the sparse matrix, convert to dense the result
    sums = to_dense(counts.sum(axis=0))
    # Get indices
    indices = sums.argsort()
    return columns.categories[indices[-n:]]

# Get the top 5,000 articles and users from the transactions after Sept 1, 2020
top_customers = top_active_customers(counts, indexes, columns, 5000)
top_items = top_bought_articles(counts, indexes, columns, 5000)

# Transactions from Sept 1, 2020
s = small_counts.copy()
# Transactions from Sept 1, 2020 that include one of the most bought 5,000 items
s = s[s.article_id.isin(top_items)]

# Transactions from Sept 1, 2020 that include one of the most bought 5,000 items and belong to one of the most active 5,000 customers
s = s[s.customer_id.isin(top_customers)]

# Drop the non-relevant info
s = s.drop(s.columns[[0, 3, 4]], axis=1)

(189510, 26252)
Transactions from Sept 1, 2020:  727334


In [64]:
# Get the sparse matrix for the transactions from Sept 1, 2020 that include the top n customers and items
counts, indexes, columns = build_counts_table(s) # build the counts matrix

from sklearn.metrics.pairwise import pairwise_distances

# Compute similarities
def similarity_matrix(similarity_function, counts): 
    # Convert the sparse matrix to a df
    x = pd.DataFrame.sparse.from_spmatrix(counts)
    # Create a pairwise distance matrix for the df considered using similarity function
    matrix = pairwise_distances(X = x, metric = similarity_function, n_jobs = -1)
    del x
    # Get the sparse matrix of the pairwise distance matrix
    matrix = csr_matrix(matrix)

    return matrix

# The similarity function to compute the predictions will be correlation
similarities = similarity_matrix(similarity_function="correlation", counts=counts.T)

In [65]:
counts

<4989x4903 sparse matrix of type '<class 'numpy.int64'>'
	with 69478 stored elements in Compressed Sparse Row format>

In [66]:
indexes

CategoricalDtype(categories=['00077dbd5c4a4991e092e63893ccf29294a9d5c46e85010e95f2fc10bf9437a4',
                  '000fb6e772c5d0023892065e659963da90b1866035558ec16fca51b0dcfb7e59',
                  '004d932f7a27ac3167c77db81d9cfd89392729e7f7e0d4c27e57ba355fc93988',
                  '004eba6e5f4705ea033b34f454b43524e41eb3d5c63923870c9845fb0c960706',
                  '00754012108569f9c99871720111a2b50aa7b6ebebe2a415914df8b8e5e120ff',
                  '007e3d72925742fd024933e33be4e5064815c6dc52c3fc98e23e505ef1edc828',
                  '00902bb69f7c13348e4b781baa4316ce05755e1c5e327ace8177835b30e341fe',
                  '0091bb09e49f45bdaeb3dd80f7bf98b4b8e3e3e49347e6bc9ec9818d631dddee',
                  '00944aac87d67eb28bb5d3b5dc02dafa6b34c821ff6a3b788360da7e864703a5',
                  '009a85913aa6f503ed0d2b5ac02ab919d6565bbbaa934a761ce376d643144f88',
                  ...
                  'ffbacd31aca5acf46e9afd95a633abb93b14ac3f1e40845efda1c49867497c3a',
                  'ff

In [67]:
columns

CategoricalDtype(categories=[111565001, 111586001, 111593001, 111609001, 123173001,
                  130035001, 146730001, 148033001, 153115019, 153115020,
                  ...
                  946795001, 946827001, 947060001, 947509001, 947934001,
                  949198001, 949551001, 949551002, 952267001, 953763001],
                 ordered=True)

In [73]:
# Save csv
import io
csv = io.StringIO()
counts.to_csv(csv)
indexes.to_csv("data/indexes.csv",index=False)
columns.to_csv("data/columns.csv",index=False)
similarities.to_csv("data/similarities.csv",index=False)

AttributeError: to_csv not found