In [1]:
# importing required libraries
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise.model_selection.split import train_test_split
from surprise.model_selection import cross_validate, GridSearchCV
import pandas as pd
import numpy as np
import os, io
from surprise import KNNBasic, KNNWithMeans
from surprise import SVDpp
from surprise import SVD
from surprise import accuracy

In [2]:
# Load training data
rating_df = pd.read_csv('./train.csv')

# Print the headers for columns
print(rating_df.head())
print(rating_df.shape)


   user_id  item_id  rating
0    12572     2953       8
1    31098     1545      10
2    23468     4926       6
3    98202       18       9
4    28873       63       2
(100000, 3)


In [3]:
# Print all unique values for rating column
rating_df.rating.unique()

array([ 8, 10,  6,  9,  2,  4,  1,  5,  3,  7])

The rating database has 100K entries and 3 columns. The ratings are for a scale of 1 to 10.

In [4]:
# Load testing data
predict_df = pd.read_csv('./test.csv')
print(predict_df.head())

   user_id  item_id
0    52491     1455
1    27265       91
2      226     5725
3    26368     1072
4   103122     2953


In [5]:
# Print the size of the data frame
predict_df.shape

(20998, 2)

The predict databse to test the values has 20998 entries and only two columns.

In [6]:
# Adding a new column to the dataframe
predict_df.insert(2,'rating',0)
print(predict_df.head())

   user_id  item_id  rating
0    52491     1455       0
1    27265       91       0
2      226     5725       0
3    26368     1072       0
4   103122     2953       0


    I am  adding another column with rating to the predict database. 
    This is done to resolve the error seen while using KNNBasic which requires 3 columns.

In [7]:
# Print the size of the data frame after adding another column
predict_df.shape

(20998, 3)

Now We will extract the data from rating dataframe and split it to test and training set to verify the accuracy of the model.

In [8]:
# Using reader to extract data from dataframe.
reader = Reader(rating_scale=(1,10))
data=Dataset.load_from_df(rating_df,reader)
data_test=Dataset.load_from_df(predict_df,reader)

In [40]:
# splitting the data into train and test sections.
trainingSet, testSet = train_test_split(data, test_size=0.3, train_size=None, random_state=None, shuffle=True)

    I have initially selected a test size of 30% and training size of 70%.
    Up next I am using different algorithms starting with pearson.
    The number of neighbors I want to start is with 7. More the numbers or neighbors the effiency could improve.?

In [9]:
# Defining the sim options for KNN algorithm.
sim_options = {
    'name': 'pearson',
    'user_based': True #user-based CF
}

    I am using the pearson algorithm with user based classification. 
    The default is MSD but I am starting with Pearson and will  make comparisons with other algorithms to get the best score.

In [11]:
# Defining the knn algorithm with following options
knn = KNNBasic(sim_options=sim_options,k=7,min_k=1) #neighbours=7, other parameters set as above

knn.fit(trainingSet) #fit model to the training set

predictions_knn = knn.test(testSet) #predict for test set values


Computing the pearson similarity matrix...
Done computing similarity matrix.


Calculating the accuracy of the model using test data.

In [12]:
# calculating the rmse value
accuracy.rmse(predictions_knn, verbose=True) 

RMSE: 2.8799


2.8798832702150894

We need the RMSE score to be lower in order to get the best prediction model.

Printing the predicted knn will give us more information on the predictions.

In [13]:
# printing the predictions information for future processing.
predictions_knn

[Prediction(uid=105246, iid=1236, r_ui=7.0, est=5.512914285714285, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid=31331, iid=1931, r_ui=6.0, est=5.512914285714285, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid=97453, iid=1959, r_ui=8.0, est=5.512914285714285, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}),
 Prediction(uid=15230, iid=1284, r_ui=8.0, est=5.512914285714285, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}),
 Prediction(uid=78188, iid=4521, r_ui=2.0, est=5.512914285714285, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}),
 Prediction(uid=43415, iid=1313, r_ui=1.0, est=5.512914285714285, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid=53527, iid=135, r_ui=4.0, est=5.512914285714285, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}),
 Prediction(uid=88339, iid=

    Parsing through the prediction information, a huge chunck of USER ID or ITEM ID is unknown,
    indicating we have a cold start issue.
    The 2nd information which we observe is that there are not enough neighbors for many user_id and Item ID.
    We will tackle the neighbor issue first and reduce the number of required neighbors.

# Generating the submission data below

In [14]:
# generating the predicted rating for test data set
predicted_knn=[]
i=0
for user in predict_df.user_id:
    predicted_rating=knn.predict(predict_df.user_id[i],predict_df.item_id[i])
    predicted_knn.append(predicted_rating.est)
    i += 1
#validating rating predictions using RMSE
print(predicted_knn[0])


5.5037


Capturing the predicted rating in an array. This wil be used to generate the submission file. 

Creating the submission CSV.

In [15]:
# Generating the dataframe as per expected submission format.
my_submission = pd.DataFrame({'Id': predict_df.user_id.astype(str)+'-'+predict_df.item_id.astype(str) , 'Rating': predicted_knn})

my_submission.to_csv('submission1.csv', index=False)

Creating a dataframe to match the required formatiing of submission. Using the DF to csv option to generate the output file

Redcuing the neighbor to 5.

In [16]:
# Creating another knn model with 5 neighbors
knn_5 = KNNBasic(sim_options=sim_options,k=5,min_k=1) #neighbours=5, other parameters set as above
knn_5.fit(trainingSet) #fit model to the training set
predictions_knn5 = knn_5.test(testSet) #predict for test set values

Computing the pearson similarity matrix...
Done computing similarity matrix.


In [17]:
# Measuring the RMSE
accuracy.rmse(predictions_knn5, verbose=True) 

RMSE: 2.8842


2.884154246262036

We do not see much change in the RMSE score by reducing the number of neighbors to 5.
I am reducing the neighbors to 2 to check the result next.

In [18]:
# Creating another knn model with 2 neighbors
knn_2 = KNNBasic(sim_options=sim_options,k=2,min_k=1) #neighbours=2, other parameters set as above
knn_2.fit(trainingSet) #fit model to the training set
predictions_knn2 = knn_2.test(testSet) #predict for test set values

Computing the pearson similarity matrix...
Done computing similarity matrix.


In [19]:
# Measuring the RMSE
accuracy.rmse(predictions_knn2, verbose=True) 

RMSE: 2.8842


2.884154246262036

    The Score did not improve even with a reduction to neighbour count of 2. 
    The neighbour count is not responsible for the current score but it is the cold start issue due to missing information for User or item.

In [20]:
# printing the predictions information for future processing.
predictions_knn2

[Prediction(uid=89672, iid=100, r_ui=9.0, est=5.5037, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid=48060, iid=3645, r_ui=7.0, est=5.5037, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid=52677, iid=5093, r_ui=8.0, est=5.5037, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid=69198, iid=82, r_ui=9.0, est=5.5037, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid=20475, iid=20, r_ui=10.0, est=5.5037, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid=64026, iid=941, r_ui=4.0, est=5.5037, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid=74826, iid=232, r_ui=9.0, est=5.5037, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}),
 Prediction(uid=2067, iid=2284, r_ui=8.0, est=5.5037, details={'was_impossible': True, 'r

    We do not see much improvement in rmse score with 2 neighbors also. 
    Going through the details of test predictions we still see a huge chunk of predictions giving message 'Not enough neighbors.'
    I am trying a neighbor or 1 to see if the results improve.

In [41]:
# Creating another knn model with 1 neighbor
knn_1 = KNNBasic(sim_options=sim_options,k=1,min_k=1) #neighbours=1, other parameters set as above
knn_1.fit(trainingSet) #fit model to the training set
predictions_knn1 = knn_1.test(testSet) #predict for test set values

Computing the pearson similarity matrix...
Done computing similarity matrix.


In [42]:
# Measuring the RMSE
accuracy.rmse(predictions_knn1, verbose=True) 

RMSE: 2.8689


2.868885402187469

In [46]:
# generating the predicted rating for test data set
predicted_knn=[]
i=0
for user in predict_df.user_id:
    predicted_rating=knn_1.predict(predict_df.user_id[i],predict_df.item_id[i])
    predicted_knn.append(predicted_rating.est)
    i += 1
#validating rating predictions using RMSE
print(predicted_knn[0])

5.506428571428572


In [47]:
# Generating the dataframe as per expected submission format.
my_submission = pd.DataFrame({'Id': predict_df.user_id.astype(str)+'-'+predict_df.item_id.astype(str) , 'Rating': predicted_knn})

my_submission.to_csv('submission2.csv', index=False)

    Knn (7 neighbor): 2.8799
    Knn (5 neighbor): 2.8842
    Knn (2 neighbor): 2.8842
    Knn (1 neighbor): 2.8689
    The best scores are with only 1 neighbor.

# To provide access to more USER ID or Item ID increasing the training size to 80 %

In [9]:
# Changing the training set to 80%, increasing from 70%.
trainingSet_80, testSet_20 = train_test_split(data, test_size=0.2, train_size=None, random_state=None, shuffle=True)

In [11]:
# Creating another knn model with 1 neighbors
knn_1_80 = KNNBasic(sim_options=sim_options,k=1,min_k=1) #neighbours=2, other parameters set as above
knn_1_80.fit(trainingSet_80) #fit model to the training set
predictions_knn1 = knn_1_80.test(testSet_20) #predict for test set values

Computing the pearson similarity matrix...
Done computing similarity matrix.


In [13]:
# Measuring the RMSE
accuracy.rmse(predictions_knn1, verbose=True) 

RMSE: 2.8575


2.857521817090488

    KNN pearson with 70% training set: 2.8842
    KNN pearson with 80% training set: 2.8750
    The algorithm provides better fit with 80:20 split compared to 70:30 split.

In [14]:
# generating the predicted rating for test data set
predicted_knn=[]
i=0
for user in predict_df.user_id:
    predicted_rating=knn_1_80.predict(predict_df.user_id[i],predict_df.item_id[i])
    predicted_knn.append(predicted_rating.est)
    i += 1
#validating rating predictions using RMSE
print(predicted_knn[0])

5.508475


In [15]:
# Generating the dataframe as per expected submission format.
my_submission = pd.DataFrame({'Id': predict_df.user_id.astype(str)+'-'+predict_df.item_id.astype(str) , 'Rating': predicted_knn})

my_submission.to_csv('submission3.csv', index=False)

# Pearson Baseline Algorithm:

In [11]:
# defining sim options for pearson_baseline
sim_options = {
    'name': 'pearson_baseline',
    'user_based': True #user-based CF
}

In [12]:
# Creating another knn model with pearson baseline algo and 2 neighbors
knn_pb = KNNBasic(sim_options=sim_options,k=2,min_k=1) #neighbours=2, other parameters set as above
knn_pb.fit(trainingSet_80) #fit model to the training set 70%
predictions_knn_pb = knn_pb.test(testSet_20) #predict for test set values (30%)


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [13]:
# Measuring the RMSE
accuracy.rmse(predictions_knn_pb, verbose=True) 

RMSE: 2.8895


2.8894555638332258

Pearson_baseline has no improvement in performance over pearson. Will use pearson as it provides the best results so far.

In [15]:
# generating the predicted rating for test data set
predicted_knn_pb=[]
i=0
for user in predict_df.user_id:
    predicted_rating=knn_pb.predict(predict_df.user_id[i],predict_df.item_id[i])
    predicted_knn_pb.append(predicted_rating.est)
    i += 1
#validating rating predictions using RMSE
print(predicted_knn_pb[0])

5.5012625


In [17]:
# Generating the dataframe as per expected submission format.
my_submission = pd.DataFrame({'Id': predict_df.user_id.astype(str)+'-'+predict_df.item_id.astype(str) , 'Rating': predicted_knn_pb})

my_submission.to_csv('submission5.csv', index=False)

# Cosine Algorithm:

In [17]:
# sim options for cosine algo
sim_options = {
    'name': 'cosine',
    'user_based': True #user-based CF
}

In [18]:
# Creating another knn model with cosine algo and 2 neighbors
knn_cos = KNNBasic(sim_options=sim_options,k=2,min_k=1) #neighbours=2, other parameters set as above
knn_cos.fit(trainingSet) #fit model to the training set 70%
predictions_knn_cos = knn_cos.test(testSet) #predict for test set values (30%)

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [19]:
# Measuring the RMSE
accuracy.rmse(predictions_knn_cos, verbose=True) 

RMSE: 2.9293


2.929314714339132

Cosine has a relatively poor performance compared to pearson and pearsonbaseline. 

# MSD Algorithm:

In [20]:
# sim options for MSD algo
sim_options = {
    'name': 'msd',
    'user_based': True #user-based CF
}

In [21]:
# Creating another knn model with MSD algo and 2 neighbors
knn_msd = KNNBasic(sim_options=sim_options,k=2,min_k=1) #neighbours=2, other parameters set as above
knn_msd.fit(trainingSet) #fit model to the training set 70%
predictions_knn_msd = knn_msd.test(testSet) #predict for test set values (30%)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [22]:
# Measuring the RMSE
accuracy.rmse(predictions_knn_msd, verbose=True) 

RMSE: 2.9313


2.9312654402210976

    pearson = 2.8842
    pearson_baseline = 2.8805
    cosine = 2.9293
    MSD    = 2.9313
    The best numbers are for pearson_baseline. 
    But these results cannot be compared as the seeds could vary for accurate comparison. 
    We must used same seed for each algorithm.

# Item based collaboration

In [None]:
# sim options for item based collaboration
sim_options = {
    'name': 'pearson',
    'user_based': False #user-based CF
}

In [None]:
# Creating another knn model with pearson algo and 2 neighbors
knn_item = KNNBasic(sim_options=sim_options,k=2,min_k=1) #neighbours=7, other parameters set as above
knn_item.fit(trainingSet) #fit model to the training set
predictions_knn_item = knn_item.test(testSet) #predict for test set values

In [None]:
# Measuring the RMSE
accuracy.rmse(predictions_knn_item, verbose=True) 

        pearson(user based) = 2.8842, 
        pearson (item_based) = 2.8807 
        pearson_baseline = 2.8805 
        cosine = 2.9293 MSD = 2.9313 T
        The best numbers are for pearson_baseline. 
        But these results cannot be compared as the seeds could vary for accurate comparison. 
        We must used same seed for each algorithm.

# Using Meta Data to improve prediction:

In [6]:
# Load Clothing Metadata
data_df = pd.read_csv('./metadata.csv')

# Print the headers for columns
data_df.head(10)

Unnamed: 0,fitting_condition,user_id,item_id,occasion,somato_category,clothing,size
0,fit,12572,2953,ceremony,6,sheath,6
1,fit,31098,1545,ceremony,4,gown,10
2,small,23468,4926,regular,5,jacket,4
3,fit,98202,18,formal meet,5,gown,4
4,fit,28873,63,get-together,3,mini,2
5,fit,82759,6,formal meet,4,gown,2
6,fit,54140,115,formal meet,4,dress,4
7,fit,6685,3014,get-together,4,dress,8
8,fit,66936,1972,ceremony,4,dress,2
9,large,35616,5663,office,2,sweater,4


In [7]:
# printing the size of data frame
data_df.shape

(121000, 7)

Getting the information about the Meta Data.

In [8]:
# checking for null values in dataset
data_df.isnull().sum()
rating_df.isnull().sum()

user_id    0
item_id    0
rating     0
dtype: int64

    Checking for null values in the dataset. 
    If there is null value we will impute either mean or most frequent value.
    Since the column information is categorical we cannot impute mean value as is. 
    So we can either chose nearest to categorical value or ffill/bfill option to impute missing data.

    We have 0 rows with null values. So we will not impute any values.

In [9]:
# checking information in clothes column
data_df['clothing'].value_counts()

dress         58374
gown          28093
sheath        12033
shift          3359
jumpsuit       3263
              ...  
jeans             2
for               2
crewneck          1
buttondown        1
sweatpants        1
Name: clothing, Length: 68, dtype: int64

    We have total 68 diffferent types of dress. 
    Some types of dress and not represented well. 
    It will be difficult to provide prediction for those dress as the dataset is inbalanced.

In [10]:
# Getting distribution information for values in clothing column
data_df['clothing'].value_counts(normalize=True)

dress         0.482430
gown          0.232174
sheath        0.099446
shift         0.027760
jumpsuit      0.026967
                ...   
jeans         0.000017
for           0.000017
crewneck      0.000008
buttondown    0.000008
sweatpants    0.000008
Name: clothing, Length: 68, dtype: float64

In [11]:
# Getting distribution information for values in occasion column
data_df['occasion'].value_counts()

ceremony        36249
formal meet     25558
get-together    22390
other           14348
regular         10448
office           9462
holiday          2544
party               1
Name: occasion, dtype: int64

    We have total 8 diffferent types occasion fields. 
    Party occasion is not represented well. 
    But since this is an attribute given by user and not by the dress provides we should not drop it as an outlier.

In [12]:
# Getting distribution information for values in Somato category column
data_df['somato_category'].value_counts()

4    34726
2    27403
6    13982
5    13929
3     9383
7     9354
0     9154
1     3069
Name: somato_category, dtype: int64

Somato category is a good indicator to map users to find nearest neighbor. 

Based on the study of Meta Data these three categories should be converted to one hot encoding.

Using the get_dummies option of pandas to create onehot encoded data frame.

Adding appropriate prefix for better naming of new columns of one hot encoded data frame.

In [13]:
# Creating dataframe with onehot encoding for occasion, clothing and somato_category
data_df_onehot = data_df.copy()
data_df_onehot = pd.get_dummies(data_df_onehot, columns=['occasion','clothing','somato_category'], prefix = ['occasion','clothing','somato'])

data_df_onehot.head()

Unnamed: 0,fitting_condition,user_id,item_id,size,occasion_ceremony,occasion_formal meet,occasion_get-together,occasion_holiday,occasion_office,occasion_other,...,clothing_turtleneck,clothing_vest,somato_0,somato_1,somato_2,somato_3,somato_4,somato_5,somato_6,somato_7
0,fit,12572,2953,6,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,fit,31098,1545,10,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,small,23468,4926,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,fit,98202,18,4,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,fit,28873,63,2,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [14]:
# Checking size of new dataframe with one hot data
data_df_onehot.shape

(121000, 88)

Merging the meta data dataframe with train data frame. 

In [15]:
# merging the rating information into metadata data frame
df_merge=pd.merge(data_df_onehot,rating_df)

In [16]:
# checking size of newly created data frame
df_merge.shape

(100160, 89)

    The merged dataframe is larger than the training data frame. 
    To confirm all values are correct, checking for null values.

In [17]:
# checking if the merge is correct
df_merge.isnull().sum()

fitting_condition    0
user_id              0
item_id              0
size                 0
occasion_ceremony    0
                    ..
somato_4             0
somato_5             0
somato_6             0
somato_7             0
rating               0
Length: 89, dtype: int64

    All values are legal values with 0 null values. 
    We can now use this dataset for processing.

The meta data is process below using sklearn knn algorithm.

# Using sklearn Kneighbors Colaboration

In [24]:
# importing knn from sklearn
from sklearn.neighbors import KNeighborsClassifier
knn_sk = KNeighborsClassifier(n_neighbors=7)

# Import train_test_split function
from sklearn.model_selection import train_test_split

predictor_cols = ['user_id', 'item_id']
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(rating_df[predictor_cols], rating_df.rating, test_size=0.3)

knn_sk.fit(X_train,y_train)
y_pred = knn_sk.predict(X_test)

#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(y_pred)

Accuracy: 0.09646666666666667
[ 9  2  7 ...  6 10  1]


Accuracy of the knn from sklearn is only 9.6 percent. Dropping this algorithm.

In [49]:
#creating function for SVD algorithm.
svd=SVD()
# Train the algorithm on the trainset, and predict ratings for the testset
svd.fit(trainingSet)
predictions_svd = svd.test(testSet)

In [50]:
# checking RMSE score
accuracy.rmse(predictions_svd, verbose=True) 

RMSE: 2.9215


2.921541148105007

SVD algorithm provides a  lower rmse score. Dropping the algorithm from future considerations. 

# Metadata prediction processing.

In [227]:
#Printing all column names
df_merge.columns

Index(['fitting_condition', 'user_id', 'item_id', 'size', 'occasion_ceremony',
       'occasion_formal meet', 'occasion_get-together', 'occasion_holiday',
       'occasion_office', 'occasion_other', 'occasion_party',
       'occasion_regular', 'clothing_ballgown', 'clothing_blazer',
       'clothing_blouse', 'clothing_blouson', 'clothing_bomber',
       'clothing_buttondown', 'clothing_caftan', 'clothing_cami',
       'clothing_cape', 'clothing_cardigan', 'clothing_coat', 'clothing_combo',
       'clothing_crewneck', 'clothing_culotte', 'clothing_culottes',
       'clothing_down', 'clothing_dress', 'clothing_duster', 'clothing_for',
       'clothing_frock', 'clothing_gown', 'clothing_henley', 'clothing_hoodie',
       'clothing_jacket', 'clothing_jeans', 'clothing_jogger',
       'clothing_jumpsuit', 'clothing_kaftan', 'clothing_kimono',
       'clothing_knit', 'clothing_legging', 'clothing_leggings',
       'clothing_maxi', 'clothing_midi', 'clothing_mini', 'clothing_overalls',
      

In [228]:
from sklearn.neighbors import KNeighborsClassifier
knn_sk_meta = KNeighborsClassifier(n_neighbors=7)

# Import train_test_split function
from sklearn.model_selection import train_test_split
# selecting columns to be used for predictions
predictor_cols_meta = ['user_id', 'item_id', 'occasion_ceremony',
       'occasion_formal meet', 'occasion_get-together', 'occasion_holiday',
       'occasion_office', 'occasion_other', 'occasion_party',
       'occasion_regular', 'clothing_ballgown', 'clothing_blazer',
       'clothing_blouse', 'clothing_blouson', 'clothing_bomber',
       'clothing_buttondown', 'clothing_caftan', 'clothing_cami',
       'clothing_cape', 'clothing_cardigan', 'clothing_coat', 'clothing_combo',
       'clothing_crewneck', 'clothing_culotte', 'clothing_culottes',
       'clothing_down', 'clothing_dress', 'clothing_duster', 'clothing_for',
       'clothing_frock', 'clothing_gown', 'clothing_henley', 'clothing_hoodie',
       'clothing_jacket', 'clothing_jeans', 'clothing_jogger',
       'clothing_jumpsuit', 'clothing_kaftan', 'clothing_kimono',
       'clothing_knit', 'clothing_legging', 'clothing_leggings',
       'clothing_maxi', 'clothing_midi', 'clothing_mini', 'clothing_overalls',
       'clothing_overcoat', 'clothing_pant', 'clothing_pants',
       'clothing_parka', 'clothing_peacoat', 'clothing_poncho',
       'clothing_print', 'clothing_pullover', 'clothing_romper',
       'clothing_sheath', 'clothing_shift', 'clothing_shirt',
       'clothing_shirtdress', 'clothing_skirt', 'clothing_skirts',
       'clothing_skort', 'clothing_suit', 'clothing_sweater',
       'clothing_sweatershirt', 'clothing_sweatpants', 'clothing_sweatshirt',
       'clothing_t-shirt', 'clothing_tank', 'clothing_tee', 'clothing_tight',
       'clothing_top', 'clothing_trench', 'clothing_trouser',
       'clothing_trousers', 'clothing_tunic', 'clothing_turtleneck',
       'clothing_vest', 'somato_0', 'somato_1', 'somato_2', 'somato_3',
       'somato_4', 'somato_5', 'somato_6', 'somato_7']
# Split dataset into training set and test set
X_train_meta, X_test_meta, y_train_meta, y_test_meta = train_test_split(df_merge[predictor_cols_meta], df_merge.rating, test_size=0.3)

knn_sk_meta.fit(X_train_meta,y_train_meta)
y_pred_meta = knn_sk_meta.predict(X_test_meta)

#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test_meta, y_pred_meta))
print(y_pred_meta)

Accuracy: 0.09960729499467519
[3 5 5 ... 5 4 6]


    Knn from sklearn using metadata: 9.87% accuracy.
    Knn from sklearn               : 9.64% accuracy.
    Using the metasata improves the accuracy of the model.

In [229]:
# merging test data with meta data
df_merge_pred=pd.merge(predict_df,data_df_onehot, on=['user_id','item_id'], how='left')

In [230]:
df_merge_pred.head(20)

Unnamed: 0,user_id,item_id,fitting_condition,size,occasion_ceremony,occasion_formal meet,occasion_get-together,occasion_holiday,occasion_office,occasion_other,...,clothing_turtleneck,clothing_vest,somato_0,somato_1,somato_2,somato_3,somato_4,somato_5,somato_6,somato_7
0,52491,1455,fit,10,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,27265,91,fit,4,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,226,5725,fit,2,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,26368,1072,fit,4,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,103122,2953,fit,6,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
5,54025,2741,fit,19,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
6,12068,51,fit,6,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
7,63062,1479,fit,12,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
8,39907,1281,fit,8,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
9,72128,2251,fit,6,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [231]:
predict_df.head(20)

Unnamed: 0,user_id,item_id
0,52491,1455
1,27265,91
2,226,5725
3,26368,1072
4,103122,2953
5,54025,2741
6,12068,51
7,63062,1479
8,39907,1281
9,72128,2251


In [232]:
# Checking size of merge data frame
df_merge_pred.shape

(21028, 88)

In [233]:
# comparing with original predict data frame
predict_df.shape

(20998, 2)

    We observe there is a duplication of 30 entries in the merged dataframe. We need to remove those duplicates.

In [234]:
# checking unique users in test data
predict_df.user_id.nunique()

18027

In [235]:
# checking unique items in test data
predict_df.item_id.nunique()

4102

    We have duplication of users and items in the original df so we cannot just drop the duplicates.

In [236]:
# checking if there is a user id with number 0.
# If not we will match the two data frames length by adding rows with userid 0 in predict_df
unique_user_id=predict_df.user_id.unique()

In [237]:
# sorting the unique_user_id
print(sorted(unique_user_id))

[2, 13, 15, 19, 23, 32, 45, 48, 56, 58, 59, 60, 68, 70, 74, 78, 80, 83, 97, 100, 103, 104, 106, 110, 112, 120, 140, 144, 145, 146, 147, 148, 159, 166, 167, 169, 171, 173, 178, 186, 189, 191, 201, 207, 212, 220, 221, 225, 226, 228, 229, 230, 231, 234, 235, 247, 251, 259, 274, 277, 281, 282, 291, 298, 309, 310, 311, 323, 326, 329, 336, 337, 345, 348, 356, 362, 363, 376, 387, 391, 394, 398, 401, 411, 416, 438, 442, 444, 457, 471, 479, 483, 484, 486, 488, 499, 504, 516, 520, 527, 530, 533, 538, 540, 544, 548, 556, 562, 564, 568, 570, 577, 593, 599, 600, 606, 609, 613, 615, 616, 624, 632, 635, 636, 654, 662, 663, 664, 670, 680, 687, 692, 702, 709, 713, 729, 735, 736, 744, 747, 749, 750, 756, 772, 774, 790, 791, 798, 811, 812, 813, 837, 840, 845, 852, 860, 871, 882, 895, 898, 909, 912, 919, 920, 921, 925, 939, 956, 967, 974, 983, 993, 997, 998, 999, 1006, 1014, 1027, 1031, 1035, 1036, 1037, 1041, 1044, 1048, 1050, 1076, 1079, 1097, 1099, 1106, 1116, 1119, 1121, 1127, 1129, 1130, 1131, 1132, 

In [238]:
predict_df.shape

(20998, 2)

In [239]:
# using the logic below to iteratively clear all redundant logic. This will clean the mismatch one iteration per run
# at the end of this for loop, the test dataframe and the merged dataframe with meta data will have same combinations of user-id and item-id
#i = 30
#for temp in range (0,30):
#    predict_df_backup = predict_df
#    for temp_2 in range (0,i):
    #predict_df_backup = predict_df_backup.append({'user_id': 0},ignore_index=True)
    #df_merge_pred['user_id_match?'] = np.where(df_merge_pred['user_id'] == predict_df_backup['user_id'], 'True', 'False') # # checking the data which maps between test dataframe and merged data frame.
    #df_merge_pred_trimmed = df_merge_pred[df_merge_pred['user_id_match?'] == 'False']
    #df_merge_pred=pd.merge(predict_df,df_merge_pred_trimmed, on=['user_id','item_id'], how='left')
    #print(df_merge_pred.shape)
    #print(df_merge_pred.isnull().sum())
    #i -= 1

In [240]:
df_merge_pred_bckup = df_merge_pred

In [241]:
df_merge_pred = df_merge_pred_bckup

In [243]:

## Code to match the merged dataset to original test dataset

#j=30
#ind_list = []
#for temp in range (0,30):
#    duplicate = 0
#    i = 0
#    ind = 0
#    predict_df_backup = predict_df
#    for temp2 in range (0,j):
#        predict_df_backup = predict_df_backup.append({'user_id': 0},ignore_index=True)
#    print(i)
##    print(ind)
#    print(predict_df_backup.shape)
#    #print(df_merge_pred.shape)
#    print('entering for loop')
#    for row in df_merge_pred.iterrows():
#        if (i not in ind_list):
#            if ((duplicate == 0) and (df_merge_pred['user_id'][i] != predict_df_backup['user_id'][i])):
#                duplicate = 1
#                print('dropping Row')
#                print(i)
#                ind = i
#                ind_list.append(i)
#        i += 1
#    j -= 1
#    if (duplicate == 1):
#        df_merge_pred_updated = df_merge_pred.drop(df_merge_pred.index[ind])
#    print(i)
#    df_merge_pred = df_merge_pred_updated

0
0
(21026, 2)
entering for loop


KeyError: 1055

### df_merge_pred.shape

In [48]:
# checking if the merge is correct
df_merge_pred.isnull().sum()

user_id                  0
item_id                  0
fitting_condition    20660
size                 20660
occasion_ceremony    20660
                     ...  
somato_4             20660
somato_5             20660
somato_6             20660
somato_7             20660
user_id_match?       20660
Length: 89, dtype: int64

In [244]:
y_pred_test = knn_sk_meta.predict(df_merge_pred[predictor_cols_meta])

In [26]:
# Generating the dataframe as per expected submission format.
my_submission = pd.DataFrame({'Id': df_merge_pred.user_id.astype(str)+'-'+df_merge_pred.item_id.astype(str) , 'Rating': y_pred_test})

my_submission.to_csv('submission4.csv', index=False)