# H&M Recommender System

## Importing the libraries

In [1]:
#working with files and memory management
import gc
import pickle

In [2]:
import pandas as pd
import numpy as np

In [3]:
#used during data exploration and model evaluation
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

In [4]:
#working with datetime feature
from datetime import datetime

In [5]:
#handling missing values where not dropped
from sklearn.impute import SimpleImputer
from sklearn import preprocessing

In [6]:
#for evaluating our model
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score

In [96]:
#for dimension reduction
from sklearn.pipeline import Pipeline # to sequence training events
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA

In [98]:
#model
#from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids

In [9]:
#used to provide information to the user when running this notebook
from IPython.display import display, clear_output

## Importing the dataset

In [10]:
#get transaction data
transactions_train_df = pd.read_csv("input/transactions_train.csv") # import the transactions dataset

In [11]:
#get product meta data
articles_df = pd.read_csv("input/articles.csv")

In [12]:
#get customer meta data
customers_df = pd.read_csv("input/customers.csv")

## Exploratory Data Analysis & Dataset Preparation

In this section we first looked at what data was available, it's distribution, what was missing and what opportunities were available to reduce the number of features or dimensions in our dataset. Secondly we determined what models could work well with the data, finally we looked to fix any missing values or encoding categorical variables where needed.

An exploritory data analysis was conducted already by various other kaggle contestants such as (Karpov, 2022). Their analysis was reviewed as part of this workbook in order to reduce this EDA section and allow us to focus on model building, prediction and evaluation. 

Data available consisted of images of every product, detailed metadata of every product, detailed metadata of every customer and purchase details for customers who bought products. These will be refered to as images,articles,customers and transactions respectively. Although it is assumed that images are an important part of how customers decide on the products they purchase, due to the data size and limited processing power, they will not be used here.

In [13]:
transactions_train_df.head(3)

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2


In [14]:
transactions_train_df.nunique()

t_dat                   734
customer_id         1362281
article_id           104547
price                  9857
sales_channel_id          2
dtype: int64

In [15]:
customers_df.head(3)

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,,,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...


In [16]:
customers_df.nunique()

customer_id               1371980
FN                              1
Active                          1
club_member_status              3
fashion_news_frequency          4
age                            84
postal_code                352899
dtype: int64

In [17]:
articles_df.head(3)

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.


In [18]:
articles_df.nunique()

article_id                      105542
product_code                     47224
prod_name                        45875
product_type_no                    132
product_type_name                  131
product_group_name                  19
graphical_appearance_no             30
graphical_appearance_name           30
colour_group_code                   50
colour_group_name                   50
perceived_colour_value_id            8
perceived_colour_value_name          8
perceived_colour_master_id          20
perceived_colour_master_name        20
department_no                      299
department_name                    250
index_code                          10
index_name                          10
index_group_no                       5
index_group_name                     5
section_no                          57
section_name                        56
garment_group_no                    21
garment_group_name                  21
detail_desc                      43404
dtype: int64

There are over 31.9 million transactions and over 3gb in size. With our limited space and processing power, this made working with the dataset slow and unweidly. Instead we were only able to sample this dataset.

For the customer's dataset we have 1,37 million customers from 352,899 locations, it is assumed that FN stands for whether h&m have the customers is signed up for fashion news. Several other features are also available such as post code. We will have to convert NaNs into zero values, we will do the same for Active. For Fashion news frequency we will have to encode these orginal categories. This might also determine customer quality for recommendation.

105,542 products are in the articles dataset. Regarding columns in this dataset, every item had a unique identifier called the article id, but it also had a product code and a product name. The identifier and the product code were not the same, it was assumed that this was due to size differences or colour variations in H&M's clothing (e.g a v-neck polo shirt could be in a small, medium and large, as well as having two colours, black and white). 

The product name could probably be dropped later as the product code and name seem to match. This seems to be true for the product type name, colour group name and graphical appearance name. We could drop the names and keep the numbers. We will however keep product group name as there doesn't seem to be a corisponding type_no. We will have to encode this ourselves.

### Making Recommendations

In a perfect scenario for a recommendation system we would have a table of m users by n items, with each product given a rating r_ij by each user. However We could have hundreds of users and thousands of products. A user may not have tried every product so our table would have missing values. To solve this issue we would need to predict the value for missing cells (rhat_ij). A good prediction would mean a good recommendation to the user. Another way to make recommendations to users would be to rank the top k products for each user. This would be based on information we have available on products and users. In essence the prediction problem boils down to how we rate products.

In order for us to rate products we would first need some metric to rate them by. Secondly, we would need to decide the prerequisites that a product must meet to recieve said rating. Thirdly we would then need to calculate the score for each item that satisfies the prerequisites and finally we would output a list of items in decreasing order. Unfortunately, H&M have not provided any labelling for us. W+e must make our own. This makes the problem of recommendation more difficult.

In our Transaction dataset we have customers who bought products at a particular price and time. We started here, with these features to try create a simple collaborative model recommendation system. The model tried to learn from a customer's historical purchases and make predictions about their future purchases.  

Since we don't actual have a customer item ratings like a 1 to 5 rating per item, we will assume qty of purchase indicates customer interest in products. If we have outliers they may bias our data. In this case we can assume that 68% of customer transaction will lie within 1 standard deviation from the mean so we could take anything 3 standard deviations from the mean as rare events (1%) and remove them.

We will pick a random customer with a few recent transactions and try to predict their future buying habits based on past data about them. We will configure a dataset of purchase made by the customer in the past. We will then use the meta data of the customer and the products to form a model we can use to predict if the customer will by a certain product or not.

### Prepare the Transaction Data

In [19]:
#First we will convert our date text into a panda date type.
transactions_train_df["t_dat"] = pd.to_datetime(transactions_train_df["t_dat"])

In [20]:
#we convert articles to string instead of default int.
transactions_train_df['article_id'] = transactions_train_df['article_id'].values.astype(str)

In [21]:
# we want to see distributions and std dev
transactions_train_df.describe()

Unnamed: 0,price,sales_channel_id
count,31788320.0,31788320.0
mean,0.02782927,1.704028
std,0.01918113,0.4564786
min,1.694915e-05,1.0
25%,0.01581356,1.0
50%,0.02540678,2.0
75%,0.03388136,2.0
max,0.5915254,2.0


We will use a 3 week date range between 2020-09-8 and 2020-09-22 to reduce our dataset size.

In [22]:
mask = (transactions_train_df['t_dat'] >= '2020-09-01') & (transactions_train_df['t_dat'] <= '2020-09-22')

In [23]:
features_df = transactions_train_df.loc[mask]
features_df['customer_id'].size

798269

In [24]:
features_df = features_df[['article_id','customer_id', 't_dat', 'price', 'sales_channel_id']]

### Prepare Product Data

In [25]:
#we convert articles to string instead of default int.
articles_df['article_id'] = articles_df['article_id'].values.astype(str)

In [26]:
#merge product meta data with transactions
features_df = features_df.merge(articles_df, left_on='article_id', right_on='article_id')

In [27]:
features_df.columns

Index(['article_id', 'customer_id', 't_dat', 'price', 'sales_channel_id',
       'product_code', 'prod_name', 'product_type_no', 'product_type_name',
       'product_group_name', 'graphical_appearance_no',
       'graphical_appearance_name', 'colour_group_code', 'colour_group_name',
       'perceived_colour_value_id', 'perceived_colour_value_name',
       'perceived_colour_master_id', 'perceived_colour_master_name',
       'department_no', 'department_name', 'index_code', 'index_name',
       'index_group_no', 'index_group_name', 'section_no', 'section_name',
       'garment_group_no', 'garment_group_name', 'detail_desc'],
      dtype='object')

Regarding columns in the articles data set, every item had a unique identifier called the article id, but it also had a product code and a product name. The identifier and the product code were not the same, it was assumed that this was due to size differences or colour variations in H&M's clothing (e.g a v-neck polo shirt could be in a small, medium and large, as well as having two colours, black and white). 

The product name could probably be dropped as the product code and name seem to match. This seems to be true for the product type name, colour group name and graphical appearance name. We could drop the names and keep the numbers. We will however keep product_group_name as there doesn't seem to be a corisponding type_no. We will have to encode this ourselves.

There was no missing data so we did not need to do anything like imputing missing data with sklearn SimpleImputer

In [28]:
features_df.drop(['prod_name',
                  'product_type_name',
                  'graphical_appearance_name',
                  'colour_group_name',
                  'perceived_colour_value_name',
                  'perceived_colour_master_name',
                  'department_name',
                  'index_name',
                  'index_group_name',
                  'section_name',
                  'garment_group_name',
                  'detail_desc'], axis=1)

Unnamed: 0,article_id,customer_id,t_dat,price,sales_channel_id,product_code,product_type_no,product_group_name,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,garment_group_no
0,777148006,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,2020-09-01,0.013542,1,777148,252,Garment Upper body,1010010,52,7,4,1626,A,1,15,1003
1,777148006,5ac5e1825104ed5fe3333e75b9337eebc4b45ad761056b...,2020-09-03,0.013542,1,777148,252,Garment Upper body,1010010,52,7,4,1626,A,1,15,1003
2,777148006,0dcf3023ea1992a78a1fcc769b6befc956f7308186496d...,2020-09-06,0.013542,1,777148,252,Garment Upper body,1010010,52,7,4,1626,A,1,15,1003
3,777148006,28b30893bbe946358103760387e3dcd09fdb7b077a942f...,2020-09-06,0.042356,2,777148,252,Garment Upper body,1010010,52,7,4,1626,A,1,15,1003
4,777148006,278f23c7fac720c2b96b25455d640860bdfa8bb3c867cf...,2020-09-10,0.013542,1,777148,252,Garment Upper body,1010010,52,7,4,1626,A,1,15,1003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
798264,737994021,f71529889de7a28df0015fad0a043941ecc98883286ef0...,2020-09-22,0.030492,1,737994,273,Garment Lower body,1010023,72,2,2,7917,H,4,76,1016
798265,533261032,f79e372e21c1359dfebc7da0bf7f321d55e47b3275c351...,2020-09-22,0.033881,2,533261,256,Garment Upper body,1010016,17,2,13,6515,G,4,44,1002
798266,865792012,f82c91decd5f9abd0a7a72eae0d4911b00ed4f5b4f04f9...,2020-09-22,0.008458,2,865792,273,Garment Lower body,1010001,73,4,2,6525,G,4,40,1005
798267,772659001,f96661e9e56449885d4c4b90d3227e4abea5e0d2382e2d...,2020-09-22,0.016932,1,772659,274,Garment Lower body,1010016,33,4,3,1948,A,1,18,1009


### Prepare Customer Data

In [29]:
#merge customer meta data with transactions
features_df = features_df.merge(customers_df, left_on='customer_id', right_on='customer_id')

In [30]:
#drop post code as it is similar to customer_id
features_df = features_df.drop(['postal_code'], axis=1)

In [31]:
#we reorganise columns
features_df = features_df[['customer_id',#the customer
                           'FN',#customer meta data
                           'Active',
                           'club_member_status', 
                           'fashion_news_frequency', 
                           'age',
                           'product_code',#product meta data
                           'product_type_no',
                           'product_group_name',
                           'graphical_appearance_no',
                           'colour_group_code', 
                           'perceived_colour_value_id', 
                           'perceived_colour_master_id', 
                           'department_no',  
                           'index_code', 
                           'index_group_no',  
                           'section_no', 
                           'garment_group_no', 
                           't_dat',#transaction meta data
                           'price',
                           'sales_channel_id', 
                           'article_id']]#the product

In [32]:
#convert from objects and floats to categories and ints
features_df['club_member_status'] = features_df['club_member_status'].astype('category')
features_df['fashion_news_frequency'] = features_df['fashion_news_frequency'].astype('category')

In [33]:
#check for missing values
def find_missing(df):
    missing = df.isnull().sum() # ref: https://stackoverflow.com/questions/59694988/python-pandas-dataframe-find-missing-values
    print(df.shape)
    print(missing)

In [34]:
find_missing(features_df)

(798269, 22)
customer_id                        0
FN                            443296
Active                        448465
club_member_status              1358
fashion_news_frequency          1752
age                             2931
product_code                       0
product_type_no                    0
product_group_name                 0
graphical_appearance_no            0
colour_group_code                  0
perceived_colour_value_id          0
perceived_colour_master_id         0
department_no                      0
index_code                         0
index_group_no                     0
section_no                         0
garment_group_no                   0
t_dat                              0
price                              0
sales_channel_id                   0
article_id                         0
dtype: int64


In [35]:
features_df.iloc[:, 8:-13].values

array([['Garment Upper body'],
       ['Garment Upper body'],
       ['Garment Lower body'],
       ...,
       ['Accessories'],
       ['Garment Full body'],
       ['Garment Upper body']], dtype=object)

In [36]:
features_df.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,product_code,product_type_no,product_group_name,graphical_appearance_no,...,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,garment_group_no,t_dat,price,sales_channel_id,article_id
0,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,1.0,1.0,ACTIVE,Regularly,44.0,777148,252,Garment Upper body,1010010,...,4,1626,A,1,15,1003,2020-09-01,0.013542,1,777148006
1,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,1.0,1.0,ACTIVE,Regularly,44.0,835801,252,Garment Upper body,1010016,...,9,1626,A,1,15,1003,2020-09-01,0.018627,1,835801001
2,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,1.0,1.0,ACTIVE,Regularly,44.0,923134,272,Garment Lower body,1010016,...,19,1636,A,1,15,1005,2020-09-01,0.012695,1,923134005
3,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,1.0,1.0,ACTIVE,Regularly,44.0,865929,254,Garment Upper body,1010001,...,11,1636,A,1,15,1005,2020-09-01,0.016932,1,865929003
4,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,1.0,1.0,ACTIVE,Regularly,44.0,935858,252,Garment Upper body,1010016,...,5,4091,D,2,50,1001,2020-09-07,0.016932,1,935858001


In [37]:
features_df['FN'] = features_df['FN'].fillna(0)
features_df['Active'] = features_df['Active'].fillna(0)

club_member_status = features_df.iloc[:, 3:-18].values
fashion_news_frequency = features_df.iloc[:, 4:-17].values
age = features_df.iloc[:, 5:-16].values

#ref: https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
imputer_med = SimpleImputer(missing_values=np.nan, strategy='median')
imputer_mf = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

#we replace missing values with the most frequent
imputer_mf.fit(club_member_status)
club_member_status = imputer_mf.transform(club_member_status)

imputer_mf.fit(fashion_news_frequency)
fashion_news_frequency = imputer_mf.transform(fashion_news_frequency)

#we replace any missing age values with the median age
imputer_med.fit(age)
age = imputer_med.transform(age)

#now add corrected columns back into our main customer dataframe
features_df.iloc[:, 3:-18] = club_member_status
features_df.iloc[:, 4:-17] = fashion_news_frequency
features_df.iloc[:, 5:-16] = age

#replace minus sign in text and check result of dataset after imputing missing values
features_df.columns = features_df.columns.str.replace('-', '')

#lower case columns
features_df.columns = map(str.lower, features_df.columns)

find_missing(features_df)

(798269, 22)
customer_id                   0
fn                            0
active                        0
club_member_status            0
fashion_news_frequency        0
age                           0
product_code                  0
product_type_no               0
product_group_name            0
graphical_appearance_no       0
colour_group_code             0
perceived_colour_value_id     0
perceived_colour_master_id    0
department_no                 0
index_code                    0
index_group_no                0
section_no                    0
garment_group_no              0
t_dat                         0
price                         0
sales_channel_id              0
article_id                    0
dtype: int64


In [38]:
#We will encode and scale after we have split our data
features_df.tail()

Unnamed: 0,customer_id,fn,active,club_member_status,fashion_news_frequency,age,product_code,product_type_no,product_group_name,graphical_appearance_no,...,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,garment_group_no,t_dat,price,sales_channel_id,article_id
798264,d5013b57392ac330a87fdf6c04d439594bfb8776afe035...,0.0,0.0,ACTIVE,NONE,34.0,828321,265,Garment Full body,1010014,...,5,4314,J,4,43,1019,2020-09-22,0.033881,1,828321001
798265,d98a24c79ecfe9e1e52f9ff5d0ffc4f84740c317591530...,1.0,1.0,ACTIVE,Regularly,75.0,818890,76,Accessories,1010016,...,5,3519,C,1,65,1019,2020-09-22,0.016932,1,818890001
798266,d98a24c79ecfe9e1e52f9ff5d0ffc4f84740c317591530...,1.0,1.0,ACTIVE,Regularly,75.0,818890,76,Accessories,1010016,...,5,3519,C,1,65,1019,2020-09-22,0.016932,1,818890001
798267,e991c3fcb6730496d8ba1c521121d55fb1dfd0ab98b748...,1.0,1.0,ACTIVE,Regularly,21.0,930405,265,Garment Full body,1010026,...,20,1322,A,1,15,1013,2020-09-22,0.06778,2,930405002
798268,f1b9cf466441305d09034354ccbb6f18faf9deaa99b85b...,0.0,0.0,ACTIVE,NONE,28.0,790006,262,Garment Upper body,1010016,...,5,1201,A,1,19,1007,2020-09-22,0.084729,2,790006001


### Split Data Into Train & Test Set 
We take the past 2 weeks as training data and 1 week in the future as test data

In [75]:
train_mask = (features_df['t_dat'] >= '2020-09-21') & (features_df['t_dat'] <= '2020-09-21')
train_df = features_df.loc[train_mask]
train_df['customer_id'].size

32130

In [76]:
test_mask = (features_df['t_dat'] >= '2020-09-22') & (features_df['t_dat'] <= '2020-09-22')
test_df = features_df.loc[test_mask]
test_df['customer_id'].size

32866

### Encode Data (After Timesplit)

In [77]:
#we now encode any categorical variables in our training and testing data
le = preprocessing.LabelEncoder()
train_df.iloc[:,3] = le.fit_transform(train_df.iloc[:,3])#club_member_status
train_df.iloc[:,4] = le.fit_transform(train_df.iloc[:,4])#fashion_news_frequency
train_df.iloc[:,8] = le.fit_transform(train_df.iloc[:,8])#product_group_name
train_df.iloc[:,14] = le.fit_transform(train_df.iloc[:,14])#index_code

test_df.iloc[:,3] = le.fit_transform(test_df.iloc[:,3])#club_member_status
test_df.iloc[:,4] = le.fit_transform(test_df.iloc[:,4])#fashion_news_frequency
test_df.iloc[:,8] = le.fit_transform(test_df.iloc[:,8])#product_group_name
test_df.iloc[:,14] = le.fit_transform(test_df.iloc[:,14])#index_code


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.iloc[:,3] = le.fit_transform(train_df.iloc[:,3])#club_member_status
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.iloc[:,4] = le.fit_transform(train_df.iloc[:,4])#fashion_news_frequency
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.iloc[:,8] = le.fit_transform(train_df.

In [78]:
#encode date as ordinal after we split our training and test data
train_df['t_dat'] = train_df['t_dat'].apply(lambda x: x.toordinal())
#reverse encoding. convert from ordinal to date
#features_df['t_dat'] = features_df['t_dat'].apply(lambda x: datetime.fromordinal(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['t_dat'] = train_df['t_dat'].apply(lambda x: x.toordinal())


In [79]:
#encode date as ordinal after we split our training and test data
test_df['t_dat'] = test_df['t_dat'].apply(lambda x: x.toordinal())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['t_dat'] = test_df['t_dat'].apply(lambda x: x.toordinal())


In [80]:
train_df.head(1)

Unnamed: 0,customer_id,fn,active,club_member_status,fashion_news_frequency,age,product_code,product_type_no,product_group_name,graphical_appearance_no,...,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,garment_group_no,t_dat,price,sales_channel_id,article_id
180,44cedf42ef296b66eb0842bed7234b7f83ba3bb49a45df...,1.0,1.0,0,1,31.0,918292,273,5,1010010,...,5,8310,9,26,5,1005,737689,0.031763,2,918292001


In [81]:
test_df.head(1)

Unnamed: 0,customer_id,fn,active,club_member_status,fashion_news_frequency,age,product_code,product_type_no,product_group_name,graphical_appearance_no,...,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,garment_group_no,t_dat,price,sales_channel_id,article_id
35,4078d35f7b2ae7a56cfdaec8c42959bf28f1f7ed742ab6...,0.0,0.0,0,1,25.0,835801,252,4,1010016,...,9,1626,0,1,15,1003,737690,0.011847,1,835801001


### Create Feature & Target Matrix

In [82]:
#These are the attributes of our customers
X_train = train_df.iloc[:, 1: 20].values
X_train.shape

(32130, 19)

In [83]:
# this is the product or in our case the class
y_train = train_df.iloc[:, 21].values
y_train.shape

(32130,)

In [84]:
# these are future customers
X_test = test_df.iloc[:, 1: 20].values
X_test.shape

(32866, 19)

In [85]:
# these are future products
y_test = test_df.iloc[:, 21].values
y_test.shape

(32866,)

### Feature Scaling: MinMax - range (0,1)

In [86]:
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))

In [87]:
X_train_scaled = min_max_scaler.fit_transform(X_train)
X_train_scaled

array([[1.        , 1.        , 0.        , ..., 0.16666667, 0.        ,
        0.0734004 ],
       [1.        , 1.        , 0.        , ..., 0.83333333, 0.        ,
        0.03818913],
       [1.        , 1.        , 0.        , ..., 0.66666667, 0.        ,
        0.04321932],
       ...,
       [0.        , 0.        , 0.        , ..., 0.08333333, 0.        ,
        0.01002012],
       [1.        , 1.        , 0.        , ..., 0.25      , 0.        ,
        0.23939638],
       [1.        , 1.        , 0.        , ..., 0.25      , 0.        ,
        0.23939638]])

In [105]:
X_test_scaled = min_max_scaler.fit_transform(X_test)
X_test_scaled

array([[0.        , 0.        , 0.        , ..., 0.08333333, 0.        ,
        0.02174204],
       [0.        , 0.        , 0.        , ..., 0.75      , 0.        ,
        0.02174204],
       [1.        , 1.        , 0.        , ..., 0.04166667, 0.        ,
        0.06529313],
       ...,
       [1.        , 1.        , 0.        , ..., 0.75      , 0.        ,
        0.03179229],
       [1.        , 1.        , 0.        , ..., 0.5       , 0.        ,
        0.13229481],
       [0.        , 0.        , 0.        , ..., 0.25      , 0.        ,
        0.16579564]])

### Export Cleaned Data to Text File

In [88]:
#np.savetxt("data/X_train_scaled_sample.csv", X_train_scaled, delimiter=",")

In [89]:
#np.savetxt("data/y_test_scaled_sample.csv", y_train, delimiter=",")

## Model Building

### Training the K-NN model on our Training set

In [90]:
#steps = [('svd', TruncatedSVD(n_components=15)), ('knn', hm_clf)]
# n_neighbors: number of neighbors. Default is 5
# metric="minkowski", p=2: will calculate distance as eucledian distance formula

#steps = [('pca', PCA(n_components = 0.95)), ('knn', KNeighborsClassifier(n_neighbors=5, metric="minkowski", p=2))]
#steps = [('svd', TruncatedSVD(n_components=15)), ('knn', KNeighborsClassifier(n_neighbors=5, metric="minkowski", p=2))]
#steps = [('svd', TruncatedSVD(n_components=15)), ('knn', KNeighborsClassifier(n_neighbors=4, metric="minkowski", p=2))]
#steps = [('svd', TruncatedSVD(n_components=15)), ('knn', KNeighborsClassifier(n_neighbors=3, metric="minkowski", p=2))]

### Training the Kmeans model on our Training set

In [99]:
steps = [('pca', PCA(n_components=2)), ('km', KMeans(n_clusters=12))]

In [100]:
model = Pipeline(steps=steps)

In [101]:
#model.fit(X_train_scaled, y_train) # for knn

In [102]:
model.fit(X_train_scaled) # for kmedoids

Pipeline(steps=[('pca', PCA(n_components=2)), ('km', KMeans(n_clusters=12))])

### Predicting a new result

In [103]:
#print(classifier.predict(sc.transform([[0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37, 30,2020-09-08]])))

### Predicting the Test set results

In [106]:
y_pred = model.predict(X_test_scaled)

In [107]:
y_pred # here we see the model predict products for the test data.

array([ 0,  6, 10, ...,  1,  5,  0])

In [108]:
len(y_pred)

32866

## Model Evaluation

In [87]:
print("Accuracy of H&M KNN Classifier:", accuracy_score(y_test, y_pred))

Accuracy of H&M KNN Classifier: 0.4259922512051178


In [88]:
print("Precision Score for H&M KNN Classifier:", precision_score(y_test, y_pred, average='macro'))

Precision Score for H&M KNN Classifier: 0.17221084804834588


  _warn_prf(average, modifier, msg_start, len(result))


## Results & Discussion

### Results
TEST 1: H&M KNN Classifier Model Accuracy (k=5) was 0.00497 and Precision Score was 0.001, both less than 1% accurate.
 * We used the last 3 weeks in September 2020 from our transaction dataset to be used later for training and testing data.
 * We used Customer attributes, the full date range (YYYY-MM-DD) and transaction attributes as features to predict products.
 * We filled in missing data in our customer attributes using SimpleImputer to impute most frequent categories and median was used to impute missing ages.
 * We split our data up by time instead of random assignment. This meant 2 weeks (in the past) were used for training and 1 week (in the future) was used for testing testing
 * We encoded our categorical data using the LabelEncoder and date feature by converting the date into an ordinal number. This was after the training/test split.
 * We created our training and testing datasets and scaled them using MinMax range (0 to 1)
 * To build the KNN Classifier model, we used k=5 and our metric was minkowski p2, This meant we calculated the distance using eucledian distance formula.
 * To evaluate the model, we used accuracy as a measuring score. Which compared our predicted products to what was in the test dataset.
 
TEST 2: H&M KNN Classifier Model (k=5) Accuracy increased to 0.432 and Precision Score increased to 0.161:
 * We included product attributes in with Customer attributes, the full date range (YYYY-MM-DD) and transaction attributes as features to predict products. This can lead to curse of dimensionality. The KNN Classifier can perform poorly with too many features. In our case the results were better but the time it took to process the data won't scale to the full transaction file.
 * For the next test we should reduce the amount of features we have. To do this we must use dimension reduction techniques such as principle component analysis (when we have lots of features) or singular value decomposition (SVD) when we have sparse data.

TEST 3: H&M KNN Classifier Model (k=5) Accuracy decreased to 0.432 and Precision Score increased to 0.161:
 * we tried PCA with 95% variance kept. It didn't improve our score or our speed. We will try SVD in the next text
 
TEST 4: H&M KNN Classifier Model (k=5) Accuracy decreased to 0.422 and Precision Score increased to 0.151:
* SVD reduced our features down to 15 and also reduced our accuracy but sped up processing, we will try PCA to compare

TEST 5: H&M KNN Classifier Model (k=5) Accuracy decreased to 0.038 and Precision Score increased to 0.0144:
* We removed customer and tried to predict customer age with products as a test. Speed increased dramatically. The idea behind this would be to use age to select customer_id per product.

TEST 6: H&M KNN Classifier Model (k=4) Accuracy decreased to 0.415 and Precision Score increased to 0.1601:
* We tried k=4

TEST 7: H&M KNN Classifier Model (k=3) Accuracy decreased to 0.426 and Precision Score increased to 0.1722:
* We tried k=3
 

### Discussion

Overall Test 2 was the best. We tried in Test 3 to introduce PCA but it did not improve our result. We did however use SVD to reduce our number of features to 15. This reduced our score but significantly increased our processing time. This trade off was deemed acceptable to progress to a full train and test with Kaggle.

## References

* https://www.kaggle.com/code/martandsay/knn-multi-classification-animal-classification/notebook
* https://towardsdatascience.com/multiclass-classification-using-k-nearest-neighbours-ca5281a9ef76
* https://analyticsindiamag.com/singular-value-decomposition-svd-application-recommender-system/
* https://machinelearningmastery.com/singular-value-decomposition-for-dimensionality-reduction-in-python/
* https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60
* https://www.kaggle.com/code/lichtlab/h-m-data-deep-dive-chap-1-understand-article
* https://www.kaggle.com/code/vanguarde/h-m-eda-first-look
* https://www.kaggle.com/code/debarshichanda/understanding-mean-average-precision
* https://www.kaggle.com/code/paweljankiewicz/hm-create-dataset-samples
* https://www.kaggle.com/code/souamesannis/tips-to-work-efficiently-with-the-dataset

# Generate Kaggle Predictions File

In [1]:
#working with files and memory management
import gc
import pickle

#working with data
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#working with datetime feature
from datetime import datetime

#handling missing values where not dropped
from sklearn.impute import SimpleImputer
from sklearn import preprocessing

#for evaluating our model
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score

#for dimension reduction
from sklearn.pipeline import Pipeline # to sequence training events
from sklearn.decomposition import TruncatedSVD

#model
from sklearn.neighbors import KNeighborsClassifier

#used to provide information to the user when running this notebook
from IPython.display import display, clear_output

In [2]:
#ref: https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
imputer_cms_mf = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer_fnf_mf = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer_age_med = SimpleImputer(missing_values=np.nan, strategy='median')

#create encoders for categorical variables
le_cms = preprocessing.LabelEncoder()
le_fnf = preprocessing.LabelEncoder()
le_pgn = preprocessing.LabelEncoder() #product group name
le_ic = preprocessing.LabelEncoder()

#create scaler
min_max_scaler_train = preprocessing.MinMaxScaler(feature_range=(0, 1))
min_max_scaler_test = preprocessing.MinMaxScaler(feature_range=(0, 1))
    
#H&M Collaborative KNN Model Based Recommendation System
def HmRecSys_data_prep():
    
    #GET DATA
    #output message for user
    clear_output(wait=True)
    display('Importing Data. Please wait...')
    
    #get transaction data
    transactions_train_df = pd.read_csv("data/transactions_train.csv", 
                                        dtype={"article_id": "str"}) # import the transactions dataset

    #get product meta data
    articles_df = pd.read_csv("data/articles.csv", dtype={"article_id": "str"})

    #get customer meta data
    customers_df = pd.read_csv("data/customers.csv")

    #output message for user
    clear_output(wait=True)
    display('Preparing Data. Please wait...')
    
    #PREPARE DATA
    #prepare transactions dataset
    features_df = transactions_train_df[['article_id',
                                         'customer_id', 
                                         't_dat', 
                                         'price', 
                                         'sales_channel_id']]
    
    del transactions_train_df
    gc.collect()
    
    clear_output(wait=True)
    display('imported transactions and arranged columns...')
    
    #First we will convert our date text into a panda date type.
    features_df["t_dat"] = pd.to_datetime(features_df["t_dat"])
    
    clear_output(wait=True)
    display('converted date into datetime object...')

    clear_output(wait=True)
    display('converted article_ids to strings...')
    
    #merge product meta data with transactions
    features_df = features_df.merge(articles_df, left_on='article_id', right_on='article_id')
    
    del articles_df
    gc.collect()
    
    clear_output(wait=True)
    display('merged articles with transactions...')
    
    #we drop cols we don't need from products dataset
    features_df.drop(['prod_name',
                      'product_type_name',
                      'graphical_appearance_name',
                      'colour_group_name',
                      'perceived_colour_value_name',
                      'perceived_colour_master_name',
                      'department_name',
                      'index_name',
                      'index_group_name',
                      'section_name',
                      'garment_group_name',
                      'detail_desc'], axis=1)    
    
    clear_output(wait=True)
    display('rearranged columns of features dataset for merger...')

    clear_output(wait=True)
    display('merging customers with features dataset...')
    
    #merge customer meta data with transactions
    features_df = features_df.merge(customers_df, left_on='customer_id', right_on='customer_id')
    
    del customers_df
    gc.collect()
    
    clear_output(wait=True)
    display('rearranging customers and articles with feature dataset...')
    
    #we reorganise columns
    features_df = features_df[['customer_id',#the customer
                               'FN',#customer meta data
                               'Active',
                               'club_member_status', 
                               'fashion_news_frequency', 
                               'age',
                               'product_code',#product meta data
                               'product_type_no',
                               'product_group_name',
                               'graphical_appearance_no',
                               'colour_group_code', 
                               'perceived_colour_value_id', 
                               'perceived_colour_master_id', 
                               'department_no',  
                               'index_code', 
                               'index_group_no',  
                               'section_no', 
                               'garment_group_no', 
                               't_dat',#transaction meta data
                               'price',
                               'sales_channel_id', 
                               'article_id']]#the product
    clear_output(wait=True)
    display('rearranged columns of features dataset...')
    
    #FIX MISSING DATA
    #convert from objects and floats to categories and ints
    features_df['club_member_status'] = features_df['club_member_status'].astype('category')
    features_df['fashion_news_frequency'] = features_df['fashion_news_frequency'].astype('category')
    
    features_df['FN'] = features_df['FN'].fillna(0)
    features_df['Active'] = features_df['Active'].fillna(0)

    club_member_status = features_df.iloc[:, 3:-18].values
    fashion_news_frequency = features_df.iloc[:, 4:-17].values
    age = features_df.iloc[:, 5:-16].values

    #we replace missing values with the most frequent
    imputer_cms_mf.fit(club_member_status)
    club_member_status = imputer_cms_mf.transform(club_member_status)

    imputer_fnf_mf.fit(fashion_news_frequency)
    fashion_news_frequency = imputer_fnf_mf.transform(fashion_news_frequency)

    #we replace any missing age values with the median age
    imputer_age_med.fit(age)
    age = imputer_age_med.transform(age)

    #now add corrected columns back into our main customer dataframe
    features_df.iloc[:, 3:-18] = club_member_status
    features_df.iloc[:, 4:-17] = fashion_news_frequency
    features_df.iloc[:, 5:-16] = age

    #replace minus sign in text and check result of dataset after imputing missing values
    features_df.columns = features_df.columns.str.replace('-', '')

    #lower case columns
    features_df.columns = map(str.lower, features_df.columns)

    clear_output(wait=True)
    display('filled in missing values and fixed column names...')
    
    # ENCODE DATA
    #encode our categorical variables
    features_df.iloc[:,3] = le_cms.fit_transform(features_df.iloc[:,3])#club_member_status
    features_df.iloc[:,4] = le_fnf.fit_transform(features_df.iloc[:,4])#fashion_news_frequency
    features_df.iloc[:,8] = le_pgn.fit_transform(features_df.iloc[:,8])#product_group_name
    features_df.iloc[:,14] = le_ic.fit_transform(features_df.iloc[:,14])#index_code
    

    clear_output(wait=True)
    display('encoded features...')

    # SPLIT DATA X FEATURES Y PREDICTOR
    
    train_mask = (features_df['t_dat'] >= '2020-09-21') & (features_df['t_dat'] <= '2020-09-21')
    train_df = features_df.loc[train_mask]
    
    test_mask = (features_df['t_dat'] >= '2020-09-22') & (features_df['t_dat'] <= '2020-09-22')
    test_df = features_df.loc[test_mask]
    
    #encode date as ordinal after we split our training and test data
    train_df['t_dat'] = features_df['t_dat'].apply(lambda x: x.toordinal())
    test_df['t_dat'] = features_df['t_dat'].apply(lambda x: x.toordinal())

    del features_df
    gc.collect()
    
    #These are the attributes of our customers for training
    X_train = train_df.iloc[:, 1: 20].values
    
    # this is the product or in our case the class for training
    y_train = train_df.iloc[:, 21].values


    #These are the attributes of our customers for test
    X_test = test_df.iloc[:, 1: 20].values
    
    # this is the product or in our case the class for test
    y_test = test_df.iloc[:, 21].values
    
    clear_output(wait=True)
    display('split data into X features and y predictor...')
    
    # SCALE FEATURES: MinMax - range (0,1)  
    X_train_scaled = min_max_scaler_train.fit_transform(X_train)
    X_test_scaled = min_max_scaler_test.fit_transform(X_test)

    clear_output(wait=True)
    display('features scaling complete...')
 
    clear_output(wait=True)
    display('exporting X_train_scaled np array to text file...')
    np.savetxt("data/X_train_scaled_sample1.csv", X_train_scaled, delimiter=",")

    
    clear_output(wait=True)
    display('exporting X_test_scaled np array to text file...')
    np.savetxt("data/X_test_scaled_sample1.csv", X_test_scaled, delimiter=",")  


    clear_output(wait=True)
    display('exporting y_train np array to text file...')
    np.savetxt("data/y_train_sample1.csv", y_train, delimiter=",", fmt='%s')
    
    clear_output(wait=True)
    display('exporting y_test np array to text file...')
    np.savetxt("data/y_test_sample1.csv", y_test, delimiter=",", fmt='%s')
    
    del X_train
    del y_train
    del X_train_scaled
    
    del X_test
    del y_test
    del X_test_scaled
    gc.collect()
    
    clear_output(wait=True)
    display('Data preparation complete.')

In [3]:
def HmRecSys_model_train(): 

    #output message for user
    clear_output(wait=True)
    display('Importing X_train np array from text file. Please wait...')  
    X_train_scaled = np.loadtxt('data/X_train_scaled_sample1.csv', delimiter=',')
    
    #output message for user
    clear_output(wait=True)
    display('Importing y_train df from text file. Please wait...') 
    y_train = np.loadtxt('data/y_train_sample1.csv', delimiter=',')
    
    
    #output message for user
    clear_output(wait=True)
    display('Importing X_test np array from text file. Please wait...')  
    X_test_scaled = np.loadtxt('data/X_test_scaled_sample1.csv', delimiter=',')
    
    #output message for user
    clear_output(wait=True)
    display('Importing y_train df from text file. Please wait...') 
    y_test = np.loadtxt('data/y_test_sample1.csv', delimiter=',')
    
    # CREATE PIPELINE
    #create model pipeline
    clear_output(wait=True)
    display('Creating pipeline...')
    
    steps = [('svd', TruncatedSVD(n_components=15)), 
             ('knn', KNeighborsClassifier(n_neighbors=3, metric="minkowski", p=2))]
    
    model = Pipeline(steps=steps)

    #output message for user
    clear_output(wait=True)
    display('training model please wait...')
                                 
    #TRAIN MODEL
    model.fit(X_train_scaled, y_train)
    
    del X_train_scaled
    del y_train
    gc.collect()
    
    #output message for user
    clear_output(wait=True)
    display('model trained...')
    
    #MODEL EVALUATION
    y_pred = model.predict(X_test_scaled)
    clear_output(wait=True)
    display("Accuracy of H&M KNN Classifier:" + str(accuracy_score(y_test, y_pred)) + "\nPrecision Score for H&M KNN Classifier:" + str(precision_score(y_test, y_pred, average='macro')) + "\nsaving model. please wait...")

    del X_test_scaled
    del y_test
    del y_pred
    gc.collect()
    
    pickle.dump(model, open('data/hm_knn_model.sav', 'wb'))
    del model
    gc.collect()

In [38]:
def HmRecSys_model_predict():
    
    #GET DATA
    #output message for user
    clear_output(wait=True)
    display('Importing model. Please wait...')
    
    model = pickle.load(open('data/hm_knn_model.sav', 'rb'))
    
    #output message for user
    clear_output(wait=True)
    display('Importing Data. Please wait...')
    
    #get product meta data
    articles_df = pd.read_csv("data/articles.csv", dtype={"article_id": "str"})

    #get customer meta data
    customers_df = pd.read_csv("data/customers.csv")
    
    #get transaction data
    transactions_train_df = pd.read_csv("data/transactions_train.csv", 
                                        dtype={"article_id": "str"}) # import the transactions dataset

    clear_output(wait=True)
    display('Getting price mode')
    
    #get popular price to pay
    p = transactions_train_df['price'].mode()
    
    clear_output(wait=True)
    display('Getting sales mode')
    
    #get popular sales channel to buy from
    s = transactions_train_df['sales_channel_id'].mode()

    del transactions_train_df
    gc.collect() 

    #for encoding data and filling missing values
    #ref: https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
    imputer_med = SimpleImputer(missing_values=np.nan, strategy='median')
    imputer_mf = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    le = preprocessing.LabelEncoder()
    
    #output message for user
    clear_output(wait=True)
    display('opening CSV file to start writing...')
    
    write_file = "ros_predictions4.csv"
    with open(write_file, "wt", encoding="utf-8") as output:
        #add headers first
        output.write("customer_id,prediction" + '\n')
        
        #now we loop through each row and write predictions to csv file
        for index_i, cus in customers_df.iterrows():
                
            #create prediction csv file
            output.write(cus.customer_id + "," + '\n')
            
            #random sample 4 products
            sample_4 = articles_df.sample(n=4, random_state=1)
            
            #predict 3 products near each 4
            for index_j, art in sample_4.iterrows():
                #clear_output(wait=True)
                #display('Predicting customer: ' + str(cus.customer_id) + " recommendations from: " + str(art.article_id))

                #get their meta data   
                data = {'FN': [cus['FN']],
                        'Active': [cus['Active']],
                        'club_member_status': [cus['club_member_status']],
                        'fashion_news_frequency': [cus['fashion_news_frequency']],
                        'age': [cus['age']],
                        'product_code': [art['product_code']], 
                        'product_type_no': [art['product_type_no']],
                        'product_group_name': [art['product_group_name']],
                        'graphical_appearance_no': [art['graphical_appearance_no']],
                        'colour_group_code': [art['colour_group_code']], 
                        'perceived_colour_value_id': [art['perceived_colour_value_id']], 
                        'perceived_colour_master_id': [art['perceived_colour_master_id']], 
                        'department_no': [art['department_no']],  
                        'index_code': [art['index_code']], 
                        'index_group_no': [art['index_group_no']],  
                        'section_no': [art['section_no']], 
                        'garment_group_no': [art['garment_group_no']],
                        'date': [['2020-09-29']],#future date in next 7 days (2020-09-29)
                        'price': [[p]], #most popular price to pay
                        'sales_channel_id': [[s]] #most popular way to buy (in store or online)
                       }
                
                # Create DataFrame
                features_df = pd.DataFrame(data)

                #FIX MISSING DATA
                #convert from objects and floats to categories and ints
                #features_df['club_member_status'] = features_df['club_member_status'].astype('category')
                #features_df['fashion_news_frequency'] = features_df['fashion_news_frequency'].astype('category')

                features_df['FN'] = features_df['FN'].astype('Int16')
                features_df['Active'] = features_df['Active'].astype('Int16')
                features_df['age'] = features_df['age'].astype('float')
                
                
                #features_df['FN'] = features_df['FN'].fillna(0)
                #features_df['Active'] = features_df['Active'].fillna(0)
                
                club_member_status = features_df.iloc[:, 2:-19].values
                fashion_news_frequency = features_df.iloc[:, 3:-18].values
                age = features_df.iloc[:, 4:-17].values
                #club_member_status = features_df['club_member_status'].iloc[0].values
                #fashion_news_frequency = features_df['fashion_news_frequency'].iloc[0]
                #age = features_df['age'].iloc[0]
                
                
                features_df['date'] = pd.to_datetime(features_df['date'].iloc[0], format='%Y-%m-%d')
                features_df['date'] = features_df['date'].apply(lambda x: x.toordinal())
                
                #we replace missing values with the most frequent
                club_member_status = imputer_cms_mf.fit_transform(club_member_status)




                fashion_news_frequency = imputer_fnf_mf.fit_transform(fashion_news_frequency)
                
                #we replace any missing age values with the median age
                age = imputer_age_med.fit_transform(age)


                #now add corrected columns back into our main customer dataframe
                features_df.iloc[:, 2] = club_member_status
                features_df.iloc[:, 3] = fashion_news_frequency
                features_df.iloc[:, 4] = age
                
                #replace minus sign in text and check result of dataset after imputing missing values
                features_df.columns = features_df.columns.str.replace('-', '')

                #lower case columns
                features_df.columns = map(str.lower, features_df.columns)

                #encode our categorical variables
                features_df.iloc[:,2] = le_cms.fit_transform(features_df.iloc[:,3])#club_member_status
                features_df.iloc[:,3] = le_fnf.fit_transform(features_df.iloc[:,4])#fashion_news_frequency
                features_df.iloc[:,7] = le_pgn.fit_transform(features_df.iloc[:,8])#product_group_name
                features_df.iloc[:,13] = le_ic.fit_transform(features_df.iloc[:,14])#index_code
                """
                clear_output(wait=True)
                display('filled in missing values and fixed column names...')

                #normalise data to query customer
                q_cus_scaled = min_max_scaler_test.transform(features_df)
                
                #free up memory
                del features_df
                gc.collect()  
                
                #make a prediction
                y_pred = model.predict(q_cus_scaled)
                
                result = []
                r = []
                prediction = ""
                
                #add 3 predicted products
                result.append(y_pred)
            
                for n in result:
                    r.append("0" + str(n))
                    prediction =  ' '.join(r)
                    
                #write predictions to csv file
                output.write(prediction + '\n')
                
                del result
                del r
                del prediction
                gc.collect()
                """

In [5]:
HmRecSys_data_prep()

'Data preparation complete.'

In [6]:
HmRecSys_model_train()

  _warn_prf(average, modifier, msg_start, len(result))


'Accuracy of H&M KNN Classifier:0.40376072536968294\nPrecision Score for H&M KNN Classifier:0.13095032692545291\nsaving model. please wait...'

In [39]:
HmRecSys_model_predict()

'opening CSV file to start writing...'

ValueError: Found array with 0 feature(s) (shape=(1, 0)) while a minimum of 1 is required.