This is part 1 of a complete end-to-end recommendation system.
Part 2 is coming soon.

# Import Libraries

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as ss
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import category_encoders as ce
import pickle

from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

# Basic EDA and Data Cleaning

In [None]:
# Location of the competition data; the three raw tables are read from it in one pass.
root_dir = '../input/h-and-m-personalized-fashion-recommendations'
articles_df_raw, customers_df_raw, transactions_df_raw = (
    pd.read_csv(root_dir + csv_name)
    for csv_name in ('/articles.csv', '/customers.csv', '/transactions_train.csv')
)

In [None]:
# Work on copies so the *_raw frames keep the data exactly as it was read from disk.
articles_df = articles_df_raw.copy()
customers_df = customers_df_raw.copy()
transactions_df = transactions_df_raw.copy()

### Exploring articles 

In [None]:
articles_df.shape

In [None]:
articles_df.head().T

In [None]:
articles_df.dtypes

In [None]:
# Cast every object-dtype column of articles_df to the pandas category dtype
categorical_columns = articles_df.select_dtypes(include='object').columns
articles_df = articles_df.astype({name: 'category' for name in categorical_columns})

In [None]:
all_columns = articles_df.columns.values

In [None]:
# Share of distinct values per column (NaN counts as one value), in percent of rows
total_rows = articles_df.shape[0]
for column in all_columns:
    distinct_values = articles_df[column].nunique(dropna=False)
    per = distinct_values / total_rows * 100.0
    print(f'Percentage of unique {column}:\t {per}%')

In [None]:
articles_df.isnull().sum()

In [None]:
articles_df['detail_desc'].isnull().sum() / articles_df.shape[0] * 100.0

Only 0.39 percent of the `detail_desc` values are NaN, so we drop those rows.

In [None]:
articles_df = articles_df.dropna().reset_index()
articles_df.shape

In [None]:
# Top 10 product names
product_name = articles_df['prod_name'].value_counts()
product_name.head(10)

In [None]:
# Top 10 product types
# value_counts() sorts by frequency, so head(10) gives the ten most common types
# (consistent with the other "Top 10" cells instead of dumping the whole series).
prod_type_name = articles_df['product_type_name'].value_counts()
prod_type_name.head(10)

In [None]:
# Top 10 product_group_name
product_group_name = articles_df['product_group_name'].value_counts()
product_group_name.head(10)

In [None]:
# Top 10 graphical_appearance_name
graphical_appearance_name = articles_df['graphical_appearance_name'].value_counts()
graphical_appearance_name.head(10)

In [None]:
# Top 10 colour_group_name
colour_group_name = articles_df['colour_group_name'].value_counts()
colour_group_name.head(10)

In [None]:
# Top 10 perceived_colour_value_name
perceived_colour_value_name = articles_df['perceived_colour_value_name'].value_counts()
perceived_colour_value_name.head(10)

In [None]:
# Top 10 perceived_colour_master_name
perceived_colour_master_name = articles_df['perceived_colour_master_name'].value_counts()
perceived_colour_master_name.head(10)

In [None]:
# Top 10 department names
department_name = articles_df['department_name'].value_counts()
department_name.head(10)

In [None]:
# Top 10 index_name
index_name = articles_df['index_name'].value_counts()
index_name.head(10)

In [None]:
# Top 10 section_name
section_name = articles_df['section_name'].value_counts()
section_name.head(10)

In [None]:
# Top 10 detail_desc
detail_desc = articles_df['detail_desc'].value_counts()
detail_desc.head(10)

In [None]:
# Article counts per perceived_colour_value_name, with one bar colour per perceived_colour_master_name
sns.countplot(x='perceived_colour_value_name',data=articles_df, hue='perceived_colour_master_name')
# Place the legend using the axes-relative anchor (1.1, 1)
plt.legend(bbox_to_anchor=(1.1, 1))
# Rotate the x tick labels by 90 degrees
plt.xticks(rotation=90)

In [None]:
# Article counts per index_group_name, with one bar colour per index_name
sns.countplot(x='index_group_name',data=articles_df, hue='index_name')
# Place the legend using the axes-relative anchor (1.1, 1)
plt.legend(bbox_to_anchor=(1.1, 1))

#### Computing correlation between two categorical features

In [None]:
# function courtesy - 
# https://stackoverflow.com/questions/46498455/categorical-features-correlation/46498792#46498792
def cramers_v(confusion_matrix):
    """Bias-corrected Cramér's V for categorical-categorical association.

    Uses the correction from Wicher Bergsma, "A bias-correction for
    Cramér's V and Tschuprow's T", Journal of the Korean Statistical
    Society 42 (2013): 323-328.

    Parameters
    ----------
    confusion_matrix : 2-D array-like of counts
        Contingency table (e.g. ``pd.crosstab(a, b)`` or its ``.values``).

    Returns
    -------
    float
        Association strength in [0, 1]. Returns 0.0 when the statistic is
        undefined: fewer than two rows or columns, fewer than two
        observations, or a non-positive corrected denominator.
    """
    # Accept DataFrames as well as ndarrays: DataFrame.sum() would return a
    # per-column Series rather than the grand total needed below.
    observed = np.asarray(confusion_matrix)
    n = observed.sum()
    r, k = observed.shape

    # A constant variable (single row/column) or a table with <= 1 observation
    # carries no association information and would otherwise divide by zero.
    if n <= 1 or min(r, k) < 2:
        return 0.0

    chi2 = ss.chi2_contingency(observed)[0]
    phi2 = chi2 / n
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)

    # With very sparse tables (as many levels as observations) the corrected
    # dimension can collapse to zero or below.
    denominator = min((kcorr-1), (rcorr-1))
    if denominator <= 0:
        return 0.0
    return np.sqrt(phi2corr / denominator)

In [None]:
f1 = 'department_name'
f2 = 'garment_group_name'

confusion_matrix = pd.crosstab(articles_df[f1], articles_df[f2])
cramers_v(confusion_matrix.values)

There is a high correlation between **department_name** and **garment_group_name**,
so these two features can be combined into one.

In [None]:
f1 = 'index_name'
f2 = 'index_group_name'

confusion_matrix = pd.crosstab(articles_df[f1], articles_df[f2])
cramers_v(confusion_matrix.values)

High correlation again.

**index_name** and **index_group_name** are also strongly associated.

Combine these as well.

In [None]:
# Pairwise Cramér's V between the three colour-related features
f1, f2, f3 = ('colour_group_name',
              'perceived_colour_value_name',
              'perceived_colour_master_name')

pair_scores = []
for row_feature, col_feature in ((f1, f2), (f1, f3), (f2, f3)):
    confusion_matrix = pd.crosstab(articles_df[row_feature], articles_df[col_feature])
    pair_scores.append(cramers_v(confusion_matrix.values))
f1_f2, f1_f3, f2_f3 = pair_scores

print(f'Between f1 and f2: {f1_f2}')
print(f'Between f1 and f3: {f1_f3}')
print(f'Between f2 and f3: {f2_f3}')

Combine all three.

In [None]:
# Pairwise Cramér's V between the three product-related features
f1, f2, f3 = ('prod_name',
              'product_type_name',
              'product_group_name')

pair_scores = []
for row_feature, col_feature in ((f1, f2), (f1, f3), (f2, f3)):
    confusion_matrix = pd.crosstab(articles_df[row_feature], articles_df[col_feature])
    pair_scores.append(cramers_v(confusion_matrix.values))
f1_f2, f1_f3, f2_f3 = pair_scores

print(f'Between f1 and f2: {f1_f2}')
print(f'Between f1 and f3: {f1_f3}')
print(f'Between f2 and f3: {f2_f3}')

Combine all three. 

Note: We combine features only if their Cramér's V value is greater than 0.5.

### Exploring customers

In [None]:
customers_df.shape

In [None]:
customers_df.head(2).T

In [None]:
customers_df.dtypes

In [None]:
# Cast the object-dtype columns of customers_df to category dtype,
# leaving any column whose name contains 'customer_id' untouched
categorical_columns = customers_df.select_dtypes(include='object').columns
customers_df = customers_df.astype(
    {name: 'category' for name in categorical_columns if 'customer_id' not in name}
)

In [None]:
100.0 * customers_df.isnull().sum() / customers_df.shape[0]

**FN** and **Active** have more than 30% NaNs. Drop them.

Also drop `postal_code` (for this version; we will analyze it later).

In [None]:
customers_df.drop(['FN', 'Active', 'postal_code'], inplace=True, axis=1)

In [None]:
customers_df.isnull().sum()

We still have NaNs. Let's fill them.

Let's plot the distributions first

In [None]:
# Bar chart of club_member_status frequencies, followed by the exact counts.
# The column is passed as `x=`: seaborn >= 0.12 interprets the first positional
# argument as `data`, so a positional Series no longer means "the x variable".
sns.countplot(x=customers_df.club_member_status)
customers_df.club_member_status.value_counts()

In [None]:
# Bar chart of fashion_news_frequency frequencies, followed by the exact counts.
# The column is passed as `x=`: seaborn >= 0.12 interprets the first positional
# argument as `data`, so a positional Series no longer means "the x variable".
sns.countplot(x=customers_df.fashion_news_frequency)
customers_df.fashion_news_frequency.value_counts()

In [None]:
# Age distribution: density-normalised histogram with a KDE overlay.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is the
# documented axes-level replacement that draws the same kind of figure.
sns.histplot(x=customers_df.age, kde=True, stat='density')

In [None]:
sns.barplot(x='club_member_status', y='age', data=customers_df)

In [None]:
# Mean age per club_member_status value, as a {status: mean age} dict.
# It is used further down to impute missing ages from the customer's status.
customers_df_ = customers_df.copy()
map_means = customers_df_.groupby('club_member_status')['age'].mean().to_dict()
map_means

In [None]:
# Impute club_member_status with the majority value, "ACTIVE".
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection is chained assignment, which pandas deprecates and which does not
# update the parent frame once Copy-on-Write is enabled.
customers_df['club_member_status'] = customers_df['club_member_status'].fillna('ACTIVE')

In [None]:
# Impute fashion_news_frequency with the majority value, "NONE".
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection is chained assignment, which pandas deprecates and which does not
# update the parent frame once Copy-on-Write is enabled.
customers_df['fashion_news_frequency'] = customers_df['fashion_news_frequency'].fillna('NONE')

In [None]:
# Impute missing ages with the mean age of the customer's club_member_status
# group (map_means holds {club_member_status: mean age}).
#
# club_member_status is categorical, and mapping a categorical Series can
# itself produce a categorical result, which does not assign cleanly into the
# numeric `age` column. Casting to object first yields plain floats, so all
# missing ages are filled in a single index-aligned assignment instead of a
# row-by-row loop.
mask = customers_df['age'].isnull()
age_values = (
    customers_df.loc[mask, 'club_member_status']
    .astype(object)
    .map(map_means)
    .astype(float)
)
customers_df.loc[mask, 'age'] = age_values

In [None]:
customers_df.isnull().sum()

In [None]:
customers_df.fashion_news_frequency.unique()

Looking at the unique values of **fashion_news_frequency**, we see both **NONE** and **None**.

Are they the same? Probably, unless **NONE** is an acronym for something.

For now, let's treat them as the same value.

In [None]:
customers_df.fashion_news_frequency.value_counts()

In [None]:
# Fold the 'None' spelling into 'NONE'.
mask = customers_df['fashion_news_frequency'] == 'None'
customers_df.loc[mask, 'fashion_news_frequency'] = 'NONE'

# The column was converted to a categorical earlier, so 'None' would otherwise
# linger as an empty category and pd.get_dummies would later emit an all-zero
# 'fashion_news_frequency_None' column. Drop categories that no row uses.
if isinstance(customers_df['fashion_news_frequency'].dtype, pd.CategoricalDtype):
    customers_df['fashion_news_frequency'] = (
        customers_df['fashion_news_frequency'].cat.remove_unused_categories()
    )

In [None]:
customers_df.fashion_news_frequency.value_counts()

### Explore transactions

In [None]:
transactions_df.shape

In [None]:
transactions_df.head()

In [None]:
transactions_df['t_dat'] = pd.to_datetime(transactions_df['t_dat'])

In [None]:
transactions_df.dtypes

In [None]:
transactions_df.isnull().sum()

In [None]:
transactions_df.nunique()

In [None]:
dup_df = transactions_df[transactions_df.duplicated()]
dup_df.head()

In [None]:
# Show every transaction row that matches the duplicate stored under label `idx`
idx = 15
dup_row = dup_df.loc[idx]
same_purchase = (
    transactions_df.t_dat.eq(dup_row.t_dat)
    & transactions_df.customer_id.eq(dup_row.customer_id)
    & transactions_df.article_id.eq(dup_row.article_id)
    & transactions_df.price.eq(dup_row.price)
    & transactions_df.sales_channel_id.eq(dup_row.sales_channel_id)
)
transactions_df[same_purchase]

We now collapse each group of duplicate rows into a single row and store the number of duplicates in a new column, **article_purchase_count**.

In [None]:
%%time
# Group on every column so identical rows fall into one group; the group size
# becomes the new article_purchase_count column.
group_keys = transactions_df.columns.tolist()
transactions_df = (
    transactions_df
    .groupby(group_keys)
    .size()
    .reset_index(name='article_purchase_count')
)
transactions_df.head()

In [None]:
# Look up, in the collapsed table, the customer/article pair of the duplicate stored under label `idx`
idx = 20
dup_row = dup_df.loc[idx]
same_customer = transactions_df.customer_id.eq(dup_row.customer_id)
same_article = transactions_df.article_id.eq(dup_row.article_id)
transactions_df[same_customer & same_article]

Let's see the purchasing history of a single customer.

In [None]:
# All transactions of the customer that appears in the first row
# (positional column 1 of transactions_df holds the customer id)
first_row_customer = transactions_df.iloc[0].iloc[1]
is_first_customer = transactions_df.customer_id.eq(first_row_customer)
customer_0_df = transactions_df[is_first_customer]
customer_0_df

# Feature Engineering

### Customers FE

In [None]:
customers_df.head()

In [None]:
customers_df.dtypes

In [None]:
# One-hot encode the two status columns (first level dropped) and swap the
# original columns for their dummy columns.
status_columns = ['club_member_status', 'fashion_news_frequency']
dummies = pd.get_dummies(customers_df[status_columns], drop_first=True)
customers_df = pd.concat([customers_df.drop(status_columns, axis=1), dummies], axis=1)
customers_df.head()

In [None]:
# Min-max scale the age column; the fitted scaler is kept in age_scaler
age_columns = ['age']
age_scaler = MinMaxScaler()
customers_df[age_columns] = age_scaler.fit_transform(customers_df[age_columns])
customers_df.head()

### Articles FE

In [None]:
articles_df.head().T

In [None]:
# Columns removed before encoding: the 'index' column left by the earlier
# reset_index() call, plus the numeric code/id columns.
non_interesting_columns = [
    'index',
    'product_code',
    'product_type_no',
    'graphical_appearance_no',
    'colour_group_code',
    'perceived_colour_value_id',
    'perceived_colour_master_id',
    'department_no',
    'index_code',
    'index_group_no',
    'section_no',
    'garment_group_no',
]

articles_df = articles_df.drop(non_interesting_columns, axis=1)
articles_df.head().T

In [None]:
# Fuse department_name and garment_group_name into one 'department' feature
f1, f2 = 'department_name', 'garment_group_name'

department_part = articles_df[f1].astype(str)
garment_part = articles_df[f2].astype(str)
articles_df['department'] = department_part.str.cat(garment_part, sep='_')
articles_df = articles_df.drop([f1, f2], axis=1)

In [None]:
# Fuse index_name and index_group_name into one 'index' feature
f1, f2 = 'index_name', 'index_group_name'

index_part = articles_df[f1].astype(str)
index_group_part = articles_df[f2].astype(str)
articles_df['index'] = index_part.str.cat(index_group_part, sep='_')
articles_df = articles_df.drop([f1, f2], axis=1)

In [None]:
# Fuse the three colour columns into one 'color' feature
f1, f2, f3 = ('colour_group_name',
              'perceived_colour_value_name',
              'perceived_colour_master_name')

colour_parts = [articles_df[name].astype(str) for name in (f1, f2, f3)]
articles_df['color'] = colour_parts[0].str.cat(colour_parts[1:], sep='_')
articles_df = articles_df.drop([f1, f2, f3], axis=1)

In [None]:
# Fuse the three product columns into one 'product' feature
f1, f2, f3 = ('prod_name',
              'product_type_name',
              'product_group_name')

product_parts = [articles_df[name].astype(str) for name in (f1, f2, f3)]
articles_df['product'] = product_parts[0].str.cat(product_parts[1:], sep='_')
articles_df = articles_df.drop([f1, f2, f3], axis=1)

In [None]:
articles_df.head().T

In [None]:
articles_df.nunique()

In [None]:
# hash encoding categorical columns of articles_df
# Only the seven columns listed in `cols` are configured for hashing.
# NOTE(review): presumably these columns are hashed jointly into n_components
# (1000) output columns and the remaining columns pass through unchanged —
# confirm against the category_encoders HashingEncoder documentation.

encoder = ce.HashingEncoder(cols=['graphical_appearance_name',
                                  'section_name',
                                  'detail_desc',
                                  'department',
                                  'index',
                                  'color',
                                  'product'
                                 ], n_components=1000)

In [None]:
articles_df = encoder.fit_transform(articles_df)

In [None]:
articles_df.columns

### Transactions FE

In [None]:
transactions_df.head()

In [None]:
# Standardise price and article_purchase_count; the fitted scaler is kept in apc_scaler
scaled_columns = ['price', 'article_purchase_count']
apc_scaler = StandardScaler()
transactions_df[scaled_columns] = apc_scaler.fit_transform(transactions_df[scaled_columns])
transactions_df.head()

In [None]:
# save as a pickle file
transactions_df.to_pickle('t_df.pkl')
articles_df.to_pickle('a_df.pkl')
customers_df.to_pickle('c_df.pkl')

# Persist the fitted transformers next to the frames. Without them a later
# stage cannot apply the same scaling/hashing to new data or invert the
# scaling of price, purchase count and age.
fitted_transformers = {
    'age_scaler': age_scaler,
    'apc_scaler': apc_scaler,
    'encoder': encoder,
}
with open('transformers.pkl', 'wb') as transformers_file:
    pickle.dump(fitted_transformers, transformers_file)

In [None]:
print('Done!')