# Requirement

For each customer (customer_id), H&M want a prediction of up to 12 products (article_ids), which is the predicted items a customer will buy in the next 7-day period after the training time period. The file should contain a header and have the following format.

In [40]:
#working with files and memory management
import gc
import pickle

import pandas as pd
import numpy as np

#used during data exploration
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

#handling missing values where not dropped
from sklearn.impute import SimpleImputer

from sklearn import preprocessing

from sklearn.neighbors import NearestNeighbors

from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score

from IPython.display import display, clear_output

# Get the Data

In [41]:
#get transaction data
transactions_train_df = pd.read_csv("data/transactions_train.csv",
                                    index_col=['article_id'],
                                    usecols=['article_id', 
                                             't_dat', 
                                             'price', 
                                             'customer_id'],
                                    parse_dates=["t_dat"]) # import the transactions dataset dtype={'article_id': 'str'}

In [42]:
transactions_train_df.head()

Unnamed: 0_level_0,t_dat,customer_id,price
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
663713001,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.050831
541518023,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.030492
505221004,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,0.015237
685687003,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,0.016932
685687004,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,0.016932


In [43]:
#get product meta data
articles_df = pd.read_csv('data/articles.csv',
                          index_col=['article_id'],
                          usecols=['article_id',
                                   'product_type_no', #e.g Scarf
                                   'graphical_appearance_no', # e.g stripe
                                   'colour_group_code', #e.g white
                                   'index_group_no',#e.g ladies wear
                                   'detail_desc'],
                          dtype={'article_id': 'str'}) # drop text, no clothing size available

In [44]:
articles_df.head()

Unnamed: 0_level_0,product_type_no,graphical_appearance_no,colour_group_code,index_group_no,detail_desc
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
108775015,253,1010016,9,1,Jersey top with narrow shoulder straps.
108775044,253,1010016,10,1,Jersey top with narrow shoulder straps.
108775051,253,1010017,11,1,Jersey top with narrow shoulder straps.
110065001,306,1010016,9,1,"Microfibre T-shirt bra with underwired, moulde..."
110065002,306,1010016,10,1,"Microfibre T-shirt bra with underwired, moulde..."


In [45]:
#get customer meta data
customers_df = pd.read_csv('data/customers.csv',
                           usecols=['customer_id'])#just import customer ids

In [46]:
customers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1371980 entries, 0 to 1371979
Data columns (total 1 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   customer_id  1371980 non-null  object
dtypes: object(1)
memory usage: 10.5+ MB


# Prepare the Transaction Dataset

In [47]:
#We will use a 3 week date range between 2020-09-8 and 2020-09-22 to reduce our dataset size.
mask = (transactions_train_df['t_dat'] >= '2020-09-01') & (transactions_train_df['t_dat'] <= '2020-09-22')
transactions_train_df = transactions_train_df.loc[mask]
transactions_train_df['customer_id'].size

798269

In [48]:
#merge product meta data with transactions
transactions_train_df = transactions_train_df.merge(articles_df, left_on='article_id', right_on='article_id')

ValueError: You are trying to merge on int64 and object columns. If you wish to proceed you should use pd.concat

In [None]:
#now we split out the date into seperate columns for day, month and year making use of python zip for memory efficiency
days, months, years  = zip(*[(d.day, d.month, d.year) for d in transactions_train_df['t_dat']])
transactions_train_df = transactions_train_df.assign(day=days, month=months, year=years)

In [None]:
transactions_train_df.head()

## Split Data Into Train & Test Set 
We take the past 2 weeks as training data and 1 week in the future as test data

In [None]:
train_mask = (transactions_train_df['t_dat'] >= '2020-09-01') & (transactions_train_df['t_dat'] <= '2020-09-14')
train_df = transactions_train_df.loc[train_mask]
train_df['customer_id'].size

In [None]:
train_df['day'].values[0]

In [None]:
test_mask = (transactions_train_df['t_dat'] >= '2020-09-15') & (transactions_train_df['t_dat'] <= '2020-09-22')
test_df = transactions_train_df.loc[test_mask]
test_df['customer_id'].size

In [None]:
# Store features as numpy array Xy
#names = ts_train_df.index
names = train_df.index

In [None]:
names

# Plot Clusters

In [None]:
product_type = train_df['product_type_no'].values # x
colour = train_df['colour_group_code'].values # y

q_product_type = test_df['product_type_no'].values[1] # x
q_colour = test_df['colour_group_code'].values[1]

#place day/price points on graph
plt.figure(figsize=(8,5))
plt.scatter(product_type, colour, color='green')

#place query on graph
plt.scatter(q_product_type, q_colour,color='black')
plt.annotate('q',(q_product_type+0.2, q_colour))

#label graph
plt.title("H&M")
plt.xlabel("Product Type")
plt.ylabel("Colour")

#display graph
plt.grid()

#populate product names on the graph
#for i, txt in enumerate(names):
#    plt.annotate(txt, (days[i]+0.09, prices[i]))

In [None]:
train_df.iloc[:, np.r_[3, 5]]

# Normalise the Sample Transaction Data

In [None]:
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
X_scaled = min_max_scaler.fit_transform(train_df.iloc[:, np.r_[3, 5]])

#scaler = preprocessing.StandardScaler().fit(train_df.iloc[:, np.r_[3, 4]])  #need a handle on the scaler to apply to training and test data
#X_scaled = scaler.fit_transform(train_df.iloc[:, np.r_[3, 4]])

In [None]:
#q_scaled = scaler.fit_transform([[test_df['product_type_no'].values[1], test_df['colour_group_code'].values[1]]])
q_scaled = min_max_scaler.transform([[test_df['product_type_no'].values[550], test_df['colour_group_code'].values[550]]])

In [None]:
X_scaled

In [None]:
q_scaled[:,:]

In [None]:
#check new scale
product_type = X_scaled[:,0] # x
colour = X_scaled[:,1] # y

plt.figure(figsize=(8,5))
plt.scatter(product_type, colour, color='green')

plt.scatter(q_scaled[:,0],q_scaled[:,1],color='black')
plt.annotate('q',(q_scaled[:, 0]+0.03,q_scaled[:, 1]))

plt.title("H&M")
plt.xlabel("Product Type")
plt.ylabel("Colour")
plt.grid()
#plt.legend(handles=[red_patch, blue_patch],loc=4)
#for i, txt in enumerate(names):
#    plt.annotate(txt, (days[i]+0.09, prices[i]))

# Train a KNN Model on Sample

In [None]:
knn_model = NearestNeighbors(n_neighbors=12, radius=0.4)
knn_model.fit(X_scaled)

# Predict with KNN Model

In [None]:
#get neighbours' names where k=12
result = knn_model.kneighbors(q_scaled, 12)[1][0]

In [None]:
#for n in result:
#    i = names[n]
#    print(str(n))
names[500]

In [49]:
# result contains the 'index' of the nearest neighbours
for n in result:
    p = names[n]
    item = articles_df.query('index == ' + str(p))
    print("PRODUCT: " + str(p))
    print(item.iloc[0][4])
    print()

SyntaxError: invalid syntax (<unknown>, line 1)

# Evaluate Model on Sample

In [None]:
X_test_scaled = scaler.transform(test_query[:,3:5])

# Generate Predictions File

In [None]:
#H&M Collaborative KNN Model Based Recommendation System
def hm_rec_sys(r_model, cus_df, write_file):  
    
    #write_file = "ros_predictions.csv"
    with open(write_file, "wt", encoding="utf-8") as output:
        #add headers first
        output.write("customer_id,prediction" + '\n')
        
        #now we loop through each row and write predictions to csv file
        for index, cus in cus_df.iterrows():
            #select day and price and convert them to np array
            q_cus = np.array([cus['day'],cus['price']], dtype=float) #cus[3:5]
            
            #normalise data
            q_cus_scaled = scaler.transform([q_cus])

            #get neighbours' names where k=12
            result = r_model.kneighbors(q_cus_scaled, 12)[1][0]
            
            #create prediction csv file
            r = []
            r.append(cus.customer_id + ",")
            for n in result:
                p = names.iloc[n]
                r.append("0" + str(p))
                prediction =  ' '.join(r)
            #write predictions to csv file
            output.write(prediction + '\n')
            clear_output(wait=True)
            display('Processed Row: ' + str(index))

In [None]:
#we now generate our intial predictions list and save it as a csv file
hm_rec_sys(knn_model, cus_pred_df, "data/ros_predictions3.csv")

In [None]:
#inspect our prediction data
predictions_df = pd.read_csv("data/ros_predictions3.csv")

In [None]:
predictions_df.head()

In [None]:
predictions_df['customer_id'].size