In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

# Kaggle input directory holding the dataset CSVs. The trailing slash matters:
# file names are appended to this value by plain string concatenation.
input_dir = '/kaggle/input/h-and-m-personalized-fashion-recommendations/'

In [None]:
# Read the article catalogue and use article_id as the row index.
articles_path = f'{input_dir}articles.csv'
articles = pd.read_csv(articles_path, index_col='article_id')
articles.head(n=5)

In [None]:
# Number of missing entries in each column of the article catalogue.
articles.isna().sum()

Note that only the detail_desc column has missing data. Since our approach won't make use of NLP, there is no need to apply a missing-value strategy for this dataset.

In [None]:
# Number of distinct (non-missing) values in each column.
articles.nunique(axis=0, dropna=True)

Note that some features are related: they represent the same thing, expressed differently (for example, product_type_no, product_type_name and product_group_name). Generally, the features come in groups where one member is a numeric code and the others are names or short descriptions. The numeric columns have more unique values than the others, so we could try using those for our analysis.

In [None]:
# Code columns used to describe an article. All but one are numeric; the check
# below reports which ones are not.
article_features = [
    'product_type_no', 'graphical_appearance_no', 'colour_group_code',
    'perceived_colour_value_id', 'perceived_colour_master_id',
    'department_no', 'index_code', 'index_group_no', 'section_no', 'garment_group_no',
]

# .copy() makes train_articles independent of `articles`, so later column
# assignments on it do not raise SettingWithCopyWarning.
train_articles = articles[article_features].copy()

# select_dtypes(exclude='number') reports every non-numeric column, whether it
# is stored as `object` or as a dedicated string dtype (which a plain
# `dtype == 'object'` comparison would miss).
object_cols = train_articles.select_dtypes(exclude='number').columns.tolist()
print(object_cols)

Here we check whether any of those code columns are non-numeric. There is indeed one such column: 'index_code'.

In [None]:
# Frequency of each distinct index_code (missing values are not tallied).
train_articles['index_code'].value_counts(dropna=True)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Map each distinct index_code string to an integer code.
encoder = OrdinalEncoder()
index_code_encoded = encoder.fit_transform(articles[['index_code']]).astype(int).ravel()

# assign() builds a new frame instead of writing into a slice of `articles`,
# so it does not raise SettingWithCopyWarning, and the column ends up with an
# integer dtype. The encoder is fitted on `articles`, which this cell never
# modifies, so re-running the cell gives the same result.
train_articles = train_articles.assign(index_code=index_code_encoded)
train_articles.head()

We are simply going to ordinal-encode the index_code column. Another option would be one-hot encoding.

In [None]:
# Read the customer table; it keeps the default integer row index.
customers_path = f'{input_dir}customers.csv'
customers = pd.read_csv(customers_path)
customers.head(n=5)

In [None]:
# Number of missing entries in each column of the customer table.
customers.isna().sum()

In [None]:
# Frequency of each distinct Active value (missing values are not tallied).
customers['Active'].value_counts(dropna=True)

In [None]:
# Frequency of each distinct FN value (missing values are not tallied).
customers['FN'].value_counts(dropna=True)

In [None]:
# Frequency of each club_member_status category (missing values are not tallied).
customers['club_member_status'].value_counts(dropna=True)

Note that the FN and Active columns have many missing values, and it is not clear what those could be replaced with. For example, one might try to replace the missing Active values with 0, suggesting inactivity; however, there are many cases where the Active value is NaN but the club_member_status is ACTIVE. As a result, we are just going to drop these two columns. On the other hand, we may assume that a missing club_member_status means that the customer is not yet active in the club, so we will replace those NaN values with PRE-CREATE.

In [None]:
# Frequency of each fashion_news_frequency category (missing values are not tallied).
customers['fashion_news_frequency'].value_counts(dropna=True)

We can also assume that the missing values in the fashion_news_frequency column correspond to NONE, so we are going to replace them with that value.

In [None]:
from sklearn.impute import SimpleImputer

features = ['customer_id', 'club_member_status', 'fashion_news_frequency', 'age']

# Ordinal codes for the two categorical columns. A category missing from these
# dicts maps to NaN, which makes the .astype(int) below fail loudly rather than
# silently producing a wrong code.
club_status_codes = {'LEFT CLUB': 0, 'PRE-CREATE': 1, 'ACTIVE': 2}
news_frequency_codes = {'NONE': 0, 'None': 0, 'Monthly': 1, 'Regularly': 2}

# .copy() gives train_customers its own data, so the column assignments below
# do not raise SettingWithCopyWarning and cannot touch `customers`.
train_customers = customers[features].copy()

# Missing club status is treated as PRE-CREATE, missing news frequency as NONE.
train_customers['club_member_status'] = (
    customers['club_member_status'].fillna('PRE-CREATE').map(club_status_codes).astype(int)
)
train_customers['fashion_news_frequency'] = (
    customers['fashion_news_frequency'].fillna('NONE').map(news_frequency_codes).astype(int)
)

# Replace missing ages with the mean of the observed ages.
imputer = SimpleImputer(strategy='mean').fit(customers[['age']])
train_customers[['age']] = imputer.transform(train_customers[['age']])

train_customers = train_customers.set_index('customer_id')
train_customers.head()

In [None]:
# Read the purchase history, parsing the t_dat column as datetimes.
transactions_path = f'{input_dir}transactions_train.csv'
transactions = pd.read_csv(transactions_path, parse_dates=['t_dat'])
transactions.head(n=5)

In [None]:
# Number of missing entries in each column of the transactions table.
transactions.isna().sum()

There are no missing values here, so there is no need for any imputation.

In [None]:
# customer_id -> set of article_ids that customer has bought.
purchases_per_customer = transactions.groupby('customer_id')['article_id']
articles_by_customers = purchases_per_customer.apply(set).to_dict()

# article_id -> number of distinct customers who bought it, most popular first.
buyers_per_article = transactions.groupby('article_id')['customer_id'].nunique()
articles_popularity = buyers_per_article.sort_values(ascending=False)

We are trying to create a simple measure of an article's popularity: the number of distinct customers who have ever bought it. We could instead have used the total number of times an article has been bought, regardless of how many times the same customer bought it, but that may introduce bias; for example, a single customer buying one article many times would skew the measurement.

In [None]:
from sklearn.preprocessing import StandardScaler


def _standardize(frame):
    """Return `frame` with every column passed through a freshly fitted StandardScaler.

    The row index and column labels of the input are carried over unchanged.
    """
    scaled_values = StandardScaler().fit_transform(frame)
    return pd.DataFrame(scaled_values, index=frame.index, columns=frame.columns)


scaled_customers = _standardize(train_customers)
display(scaled_customers.head())

scaled_articles = _standardize(train_articles)
display(scaled_articles.head())

Our approach will use the NearestNeighbors algorithm. Because of that, even though our data is mostly discrete rather than continuous, it makes sense to scale it; otherwise, the distances computed by NN may be heavily influenced by a subset of features with wider ranges.

In [None]:
# Write the scaled customer features to CSV; the row index is written as the first column.
scaled_customers.to_csv('scaled_customers.csv', index=True)

In [None]:
# Write the scaled article features to CSV; the row index is written as the first column.
scaled_articles.to_csv('scaled_articles.csv', index=True)