In [None]:
# Imports
import calendar
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the articles table from the relative '../input' data directory
articles_df = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/articles.csv')

In [None]:
# Load the customers table from the relative '../input' data directory
customers_df = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/customers.csv')

In [None]:
# Load the transactions table; parse_dates makes 't_dat' a datetime column, which the
# date-based grouping and filtering further down rely on
transactions_df = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv', parse_dates=['t_dat'])

### Analyzing the articles dataset

In [None]:
articles_df.head()

In [None]:
print(f'The articles dataset has {articles_df.shape[0]} records, each with {articles_df.shape[1]} features')

In [None]:
articles_df.info()

In [None]:
# Checking for missing values
articles_df.isnull().sum()

In [None]:
# Checking number of unique values per column
articles_df.nunique()

There seem to be categories where the number of unique names is lower than the number of unique ids.
For instance, there are 45875 unique values in the 'prod_name' column, while there are 47224 unique values in the 'product_code' column.
This is most likely because distinct articles were actually named the same. I will therefore use only the columns corresponding to ids, not the ones with names.

In [None]:
# Number of articles for each value of 'index_name'
fig, ax = plt.subplots(figsize=(15, 7))
sns.histplot(data=articles_df, y='index_name', ax=ax)
ax.set_xlabel('count')
ax.set_ylabel('index name')
plt.show()

### Analyzing the customers dataset

In [None]:
customers_df.head()

In [None]:
print(f'The customers dataset has {customers_df.shape[0]} records, each with {customers_df.shape[1]} features')

In [None]:
# Checking for missing values
customers_df.isnull().sum()

There are quite a few missing values in this dataset. Luckily, the age column has only a few missing values and the postal_code column has none. These two should be useful for recommending articles to a customer.

In [None]:
# Checking number of unique values per column
customers_df.nunique()

In [None]:
# Histogram of customer ages, split into 40 bins
fig, ax = plt.subplots(figsize=(15, 7))
sns.histplot(data=customers_df, x='age', bins=40, ax=ax)
ax.set_xlabel('age')
ax.set_ylabel('count')
plt.show()

There are a lot of customers in their early 20s, and there is another spike at ages 45-55. This could motivate predicting differently for customers in those age groups.

In [None]:
# Median of the known ages (missing values are skipped, which is the pandas default)
median_age = customers_df['age'].median()
print(f"The customers' median age is {median_age}")

In [None]:
# Number of customers for each club membership status
club_member_statuses = (
    customers_df
    .groupby('club_member_status', as_index=False)['customer_id']
    .count()
)
fig, ax = plt.subplots(figsize=(15, 7))
sns.barplot(data=club_member_statuses, x='club_member_status', y='customer_id', ax=ax)
ax.set_xlabel("club member status")
ax.set_ylabel("number of customers")
plt.show()

Given that almost all customers have an 'ACTIVE' membership, the column probably would not help the model.

In [None]:
# Number of customers for each fashion news frequency.
# The counts get their own name instead of reusing 'club_member_statuses', so the result of
# the club-status cell is not silently overwritten with unrelated data.
# NOTE(review): groupby drops missing keys by default, so customers without a
# fashion_news_frequency value are presumably absent from this chart - confirm.
fashion_news_frequencies = (
    customers_df
    .groupby('fashion_news_frequency', as_index=False)['customer_id']
    .count()
)
fig, ax = plt.subplots(figsize=(15, 7))
sns.barplot(data=fashion_news_frequencies, x='fashion_news_frequency', y='customer_id', ax=ax)
ax.set_xlabel("fashion news frequency")
ax.set_ylabel("number of customers")
plt.show()

### Analyzing the transactions dataset

In [None]:
transactions_df.head()

In [None]:
# Checking for missing values
transactions_df.isnull().sum()

There are no missing values in this dataset.

In [None]:
# Daily transaction volume over the whole period covered by the data
transactions_per_day = transactions_df.groupby('t_dat', as_index=False)['article_id'].count()
first_day = transactions_per_day['t_dat'].min()
last_day = transactions_per_day['t_dat'].max()

fig, ax = plt.subplots(figsize=(15, 7))
ax.plot(transactions_per_day['t_dat'], transactions_per_day['article_id'])
ax.set_xlabel("date")
ax.set_ylabel("number of transactions")
# Clamp the x axis to the observed date range so that no empty margin is drawn
ax.set_xlim(first_day, last_day)

plt.show()

In [None]:
# Month number (1-12) of every transaction, taken with the vectorised datetime accessor
# rather than a Python-level call per row
transactions_df['t_dat_month'] = transactions_df['t_dat'].dt.month

In [None]:
# Number of transactions per calendar month (all years pooled together)
transactions_by_month = (
    transactions_df
    .groupby('t_dat_month', as_index=False)['article_id']
    .count()
)
fig, ax = plt.subplots(figsize=(15, 7))
sns.barplot(data=transactions_by_month, x='t_dat_month', y='article_id', ax=ax)
ax.set_xlabel("month")
ax.set_ylabel("number of transactions")
plt.show()

Most transactions occur during summer and the fewest occur during winter.

In [None]:
# Day of the week of every transaction (Monday=0 ... Sunday=6), taken with the vectorised
# datetime accessor rather than a Python-level call per row
transactions_df['t_dat_weekday'] = transactions_df['t_dat'].dt.weekday

In [None]:
# Plotting transactions grouped by the day of the week
# ('calendar' is imported together with the other modules at the top of the notebook)
transactions_by_weekday = transactions_df.groupby('t_dat_weekday', as_index=False)['article_id'].count()
# Swap the weekday numbers (Monday=0 ... Sunday=6) for their names; the rows stay in
# weekday order because the groupby result is sorted by the numeric key
transactions_by_weekday['t_dat_weekday'] = transactions_by_weekday['t_dat_weekday'].map(
    lambda day_number: calendar.day_name[day_number]
)

fig, ax = plt.subplots(figsize=(15, 7))
sns.barplot(data=transactions_by_weekday, x='t_dat_weekday', y='article_id', ax=ax)
ax.set_xlabel("day of week")
ax.set_ylabel("number of transactions")
plt.show()

### Creating the Dataset for the first model

In [None]:
# Only recent transactions feed the recommender model.
# The comparison is strict, so transactions dated exactly STARTING_DATE are left out.
STARTING_DATE = '2020-09-15'
recent_transactions_df = transactions_df.loc[transactions_df['t_dat'].gt(STARTING_DATE)]

In [None]:
recent_transactions_df.head()

In [None]:
# Joining the recent transactions with the article attributes and keeping a subset of the columns.
# The code/id columns are preferred over the name columns, since the name columns had fewer unique values
# in the articles dataset; product_group_name is the exception because it has no numeric equivalent.
kept_columns = [
    'customer_id', 'article_id', 'product_group_name', 'graphical_appearance_no', 'colour_group_code',
    'perceived_colour_value_id', 'perceived_colour_master_id', 'department_no', 'index_code', 'index_group_no',
    'section_no', 'garment_group_no',
]
transactions_articles_merged = recent_transactions_df.merge(articles_df, on='article_id')[kept_columns]

In [None]:
transactions_articles_merged.head()

In [None]:
# Preparing the fashion_news_frequency column for encoding:
#  - missing values are considered to correspond to a 'NONE' frequency
#  - the two entries spelled "None" are folded into "NONE"
# A label encoding can be used afterwards because the values can be ordered: NONE < Monthly < Regularly
customers_df['fashion_news_frequency'] = (
    customers_df['fashion_news_frequency']
    .fillna('NONE')
    .replace('None', 'NONE')
)

def frequency_type_to_code(type):
    """Map a fashion news frequency label to an ordinal code.

    The codes follow the ordering NONE < Monthly < Regularly:

    * 0 - 'NONE', its alternative spelling 'None', or a missing value (None / NaN)
    * 1 - 'Monthly'
    * 2 - any other value (in practice 'Regularly')

    Missing values and the 'None' spelling are handled here as well, so the function no
    longer reports them as the highest frequency when it is called on data that has not
    been cleaned beforehand.

    The parameter keeps its original name for compatibility, although it shadows the
    builtin ``type`` inside the function body.
    """
    # NaN is the only value that is not equal to itself
    is_missing = type is None or type != type
    if is_missing or type in ('NONE', 'None'):
        return 0
    if type == 'Monthly':
        return 1
    return 2

# Encode only while the column still holds the textual labels: on a second run of this cell
# the already-encoded 0/1 values would otherwise fall through to the default branch and become 2
if not pd.api.types.is_numeric_dtype(customers_df['fashion_news_frequency']):
    customers_df['fashion_news_frequency'] = customers_df['fashion_news_frequency'].apply(frequency_type_to_code)

In [None]:
# Removing columns that do not offer enough value to the model: postal code is too varied, and the other
# columns have few unique values with one of them dominating in frequency, so it would be hard for the
# model to extract insights from them
low_value_columns = ['FN', 'Active', 'club_member_status', 'postal_code']
customers_df = customers_df.drop(columns=low_value_columns)

In [None]:
customers_df.head()

In [None]:
# Replacing the (few) missing ages with the median age computed during the exploration above
customers_df = customers_df.fillna({'age': median_age})

In [None]:
# Merging with the customers dataset and keeping only age and fashion news frequency as features from the customers dataset
all_merged = transactions_articles_merged.merge(customers_df, on='customer_id')
# Append each customer feature only if it is not listed yet, so that re-running this cell
# does not add it to kept_columns a second time and select duplicated columns
customer_feature_columns = ['age', 'fashion_news_frequency']
kept_columns.extend([column for column in customer_feature_columns if column not in kept_columns])
all_merged = all_merged[kept_columns]

In [None]:
all_merged.head()

In [None]:
# One-hot encoding the categorical columns.
# The slice skips the first two columns (customer_id, article_id), which do not need to be encoded,
# and the last two (age, fashion_news_frequency), which can be used as they are because their values
# can be ordered (for news frequency: NONE < Monthly < Regularly)
categorical_columns = list(all_merged.columns[2:-2])
all_merged_ohe = pd.get_dummies(all_merged, columns=categorical_columns)
all_merged_ohe.shape

In [None]:
all_merged_ohe['customer_id'].nunique()

Too many customer ids for the model to finish in a reasonable time; selecting fewer transactions by using only customers that bought at least 3 times recently

In [None]:
# Restricting the data to customers that have at least CUSTOMER_MIN_TRANSACTIONS purchases
CUSTOMER_MIN_TRANSACTIONS = 3

# One row per customer with the number of purchase rows of that customer
customers_num_purchases = (
    all_merged_ohe
    .groupby('customer_id')
    .size()
    .reset_index(name='count')
)
has_enough_purchases = customers_num_purchases['count'].ge(CUSTOMER_MIN_TRANSACTIONS)
customers_min_purchases = customers_num_purchases.loc[has_enough_purchases, 'customer_id']

all_merged_ohe = all_merged_ohe.loc[all_merged_ohe['customer_id'].isin(customers_min_purchases)]
all_merged_ohe['customer_id'].nunique()

In [None]:
all_merged_ohe.columns

In [None]:
# Standardising every feature column, i.e. everything after customer_id and article_id.
# Note that scaled_df gets a fresh default index rather than the index of all_merged_ohe.
features = all_merged_ohe.iloc[:, 2:]
scaler = StandardScaler()
scaled = scaler.fit_transform(features)
scaled_df = pd.DataFrame(scaled, columns=features.columns)
scaled_df.head()

In [None]:
# Putting the identifier columns back next to the scaled features.
# Both parts get a fresh 0..n-1 index first, so that concat pairs the rows by position
all_merged_scaled = pd.concat(
    [
        all_merged_ohe[['customer_id', 'article_id']].reset_index(drop=True),
        scaled_df.reset_index(drop=True),
    ],
    axis=1,
)
all_merged_scaled.head()

In [None]:
def save_dataframe_to_csv(dataframe, filename, output_dir='/kaggle/working'):
    """Write a dataframe to a CSV file, without its index.

    Parameters
    ----------
    dataframe : pandas.DataFrame or pandas.Series
        Object to save; anything exposing a compatible ``to_csv`` method works.
    filename : str
        Name of the CSV file, relative to ``output_dir``.
    output_dir : str, default '/kaggle/working'
        Directory the file is written to. It is created when it does not exist yet,
        so the function can also be used outside of the Kaggle environment.
    """
    os.makedirs(output_dir, exist_ok=True)
    filepath = os.path.join(output_dir, filename)
    dataframe.to_csv(filepath, index=False)

In [None]:
save_dataframe_to_csv(all_merged_scaled, 'all_merged_scaled.csv')

### Dataset for the second model

In [None]:
# Selecting a small part of the dataset.
# The seed makes the sample identical on every run of the notebook, and the sample size is
# capped at the number of available rows because sampling more rows than exist
# (without replacement) raises an error
SAMPLE_SIZE = 200000
SAMPLE_SEED = 42
sampled = recent_transactions_df.sample(
    n=min(SAMPLE_SIZE, len(recent_transactions_df)),
    random_state=SAMPLE_SEED,
)[['customer_id', 'article_id']]

In [None]:
sampled.head()

In [None]:
save_dataframe_to_csv(sampled, 'small_transactions.csv')

### Computing the top 12 most popular items recently bought (to recommend by default if we didn't compute a recommendation for the customer)

In [None]:
# Number of recent transactions per article, most popular first.
# size() counts the rows of each group directly, instead of counting every column and then
# picking the first one by position; the two agree here because the transactions dataset
# has no missing values
article_popularity_df = (
    recent_transactions_df
    .groupby('article_id')
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

In [None]:
# Default recommendation: the 12 most bought products in the timeframe. It is meant for the customers
# that are included in the dataset but for whom recommendations could not be made due to the size of
# the problem
N_DEFAULT_RECOMMENDATIONS = 12
top12_popular = article_popularity_df['article_id'].iloc[:N_DEFAULT_RECOMMENDATIONS]
save_dataframe_to_csv(top12_popular, 'top12_popular.csv')

In [None]:
# The ids of all customers are stored separately, because not every customer is part of the datasets
# used in training the models
all_customers = customers_df['customer_id'].to_frame()

In [None]:
save_dataframe_to_csv(all_customers, 'all_customers.csv')