**Content-based filtering recommends similar items by**:

```
Creating item-feature and user-feature matrices
Determine similarity using dot product
Recommend top-k items
```


Creating item-feature and user-feature matrices

Determine similarity using dot product

In [1]:
import time
import numpy as np
import pandas as pd

import multiprocessing as mp
from multiprocessing import Pool
from functools import partial

In [2]:
# Mount Google Drive at /gdrive (Google Colab only); the data paths used by
# this notebook live under that mount point.
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [3]:
import os

In [4]:
# Root folder of the dataset on the mounted Google Drive.
root= '/gdrive/MyDrive/hnm/data/'
# Paths of the individual CSV files below <root>/raw.
csv_train = os.path.join(root,'raw/transactions_train.csv')
csv_sub = os.path.join(root, 'raw/sample_submission.csv')
csv_users = os.path.join(root, 'raw/customers.csv')
csv_items = os.path.join(root, 'raw/articles.csv')

# Disabled: load the complete transactions file (article_id kept as str,
# t_dat parsed as dates) and the sample submission.
#df = pd.read_csv(csv_train, dtype={'article_id': str}, parse_dates=['t_dat'])
#df_sub = pd.read_csv(csv_sub)

In [5]:
# Only the first 100,000 rows of the transactions file are used. Reading them
# with `nrows` returns the same rows as the first chunk of a chunked reader,
# but does not leave an open file reader behind.
# NOTE: despite its name, `users` holds transaction rows read from csv_train.
users = pd.read_csv(csv_train, nrows=100000)
articles = pd.read_csv(csv_items)

# Categorical article attributes that make up the feature space.
feature_subset = ['product_group_name',
       'graphical_appearance_name', 'colour_group_name',
       'perceived_colour_value_name',
       'perceived_colour_master_name',
       'department_name', 'index_name',
       'index_group_name', 'section_name',
       'garment_group_name']

# Attach the article attributes to every transaction (inner join on
# article_id), then keep the identifying columns, the feature columns and the
# free-text description, in that order.
df = users.merge(articles, on='article_id')
df = df[['t_dat', 'customer_id', 'article_id', 'prod_name', 'product_type_name']
        + feature_subset
        + ['detail_desc']]

In [6]:
# Pick the categorical columns that span the feature space and one-hot encode
# them; customer_id and article_id are carried along unchanged.
features = feature_subset
id_columns = ['customer_id', 'article_id']
df1 = df.loc[:, id_columns + features]
dummies_df = pd.get_dummies(data=df1, columns=features)
dummies_df

Unnamed: 0,customer_id,article_id,product_group_name_Accessories,product_group_name_Bags,product_group_name_Cosmetic,product_group_name_Garment Full body,product_group_name_Garment Lower body,product_group_name_Garment Upper body,product_group_name_Items,product_group_name_Nightwear,...,garment_group_name_Shorts,garment_group_name_Skirts,garment_group_name_Socks and Tights,garment_group_name_Special Offers,garment_group_name_Swimwear,garment_group_name_Trousers,garment_group_name_Trousers Denim,"garment_group_name_Under-, Nightwear",garment_group_name_Unknown,garment_group_name_Woven/Jersey/Knitted mix Baby
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,3681748607f3287d2c3a65e00bb5fb153de30e9becf158...,663713001,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,4ef5967ff17bf474bffebe5b16bd54878e1d4105f7b4ed...,663713001,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,6b7b10d2d47516c82a6f97332478dab748070f09693f09...,663713001,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,8ac137752bbe914aa4ae6ad007a9a0c5b67a1ab2b2d474...,663713001,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,38c4fd64a42108a4c821cabe4fd99f87f43981a924cfb6...,661351001,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99996,38d1ebd07fec370af9dff972572157516f4d7e49818dcf...,637028001,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
99997,3915b86df4116d0ddbccfb88d49da8f627431c2120dafa...,623873001,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99998,3934131f5acca8b4aa2e036df4adf23c752e024e1d2495...,548837002,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# A customer needs at least this many distinct articles to get a profile.
minimum_items = 2
groupby_customer = dummies_df.groupby('customer_id')

l = []            # summed one-hot feature vector of each qualifying customer
cutomer_ids = []  # customer ids, aligned with the entries of `l`
article_ids = []  # article ids of all rows belonging to qualifying customers
for key, temp in groupby_customer:
    # Skip customers with too few distinct articles.
    if temp.article_id.nunique() < minimum_items:
        continue
    # Column-wise sum over the customer's rows; numeric_only leaves out the
    # non-numeric customer_id column.
    l.append(temp.drop('article_id', axis=1).sum(numeric_only=True).values)
    cutomer_ids.append(key)
    article_ids.extend(temp.article_id.values.tolist())

In [8]:
# Stack the per-customer feature sums into a frame whose columns are the
# one-hot feature columns (everything after the first two columns of dummies_df).
user_feature = pd.DataFrame(l, columns=dummies_df.columns[2:])

# Scale every row so that its entries add up to 1.
row_totals = user_feature.sum(axis=1)
normalized_user_feature = user_feature.div(row_totals, axis='index')

# Label the rows with the matching customer ids.
normalized_user_feature.index = pd.Index(cutomer_ids, name='customer_id')
normalized_user_feature

Unnamed: 0_level_0,product_group_name_Accessories,product_group_name_Bags,product_group_name_Cosmetic,product_group_name_Garment Full body,product_group_name_Garment Lower body,product_group_name_Garment Upper body,product_group_name_Items,product_group_name_Nightwear,product_group_name_Shoes,product_group_name_Socks & Tights,...,garment_group_name_Shorts,garment_group_name_Skirts,garment_group_name_Socks and Tights,garment_group_name_Special Offers,garment_group_name_Swimwear,garment_group_name_Trousers,garment_group_name_Trousers Denim,"garment_group_name_Under-, Nightwear",garment_group_name_Unknown,garment_group_name_Woven/Jersey/Knitted mix Baby
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa,0.000000,0.0,0.0,0.033333,0.033333,0.033333,0.0,0.0,0.0,0.0,...,0.0,0.033333,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0
000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.100000,0.000000,0.0
00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2,0.000000,0.0,0.0,0.000000,0.000000,0.100000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.060000,0.0,0.000000,0.0,0.000000,0.000000,0.0
0003abe64294e66a6310c3436fa9e5b754cc5603deef4f26fc8ab8d043af9358,0.000000,0.0,0.0,0.000000,0.000000,0.100000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.025000,0.000000,0.0
0004068f54dbe1c7054b23c615edc5f733a508ecc54930bf323209f20410898c,0.000000,0.0,0.0,0.050000,0.000000,0.050000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fff0ac18093a702a0a06f4cc76582632df3ede9a36556e345150befbeed6885a,0.000000,0.0,0.0,0.000000,0.050000,0.050000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0
fff15526121f7d914a54784e68761a1d30b7547e3555738dcceb386eaaa24c4b,0.000000,0.0,0.0,0.000000,0.040000,0.060000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.020000,0.0,0.000000,0.000000,0.0
fff3e75605ec575be9b95eda1e6557299e81bba12668d750c0e973528e48b7ee,0.000000,0.0,0.0,0.000000,0.020000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.020000,0.0,0.000000,0.0,0.080000,0.000000,0.0
fff4b145d7469e023b147b0f8375c565b1be43944987792153ccc0af41466cf3,0.000000,0.0,0.0,0.000000,0.033333,0.066667,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.033333,0.0,0.033333,0.0,0.000000,0.000000,0.0


In [9]:
# One row of one-hot features per article: keep the first row seen for each
# article_id, restrict to articles collected in `article_ids`, and index the
# result by article_id.
item_feature = (
    dummies_df
    .drop_duplicates(subset='article_id')
    .loc[lambda frame: frame.article_id.isin(article_ids)]
    .drop(columns='customer_id')
    .set_index('article_id')
)
item_feature

Unnamed: 0_level_0,product_group_name_Accessories,product_group_name_Bags,product_group_name_Cosmetic,product_group_name_Garment Full body,product_group_name_Garment Lower body,product_group_name_Garment Upper body,product_group_name_Items,product_group_name_Nightwear,product_group_name_Shoes,product_group_name_Socks & Tights,...,garment_group_name_Shorts,garment_group_name_Skirts,garment_group_name_Socks and Tights,garment_group_name_Special Offers,garment_group_name_Swimwear,garment_group_name_Trousers,garment_group_name_Trousers Denim,"garment_group_name_Under-, Nightwear",garment_group_name_Unknown,garment_group_name_Woven/Jersey/Knitted mix Baby
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
663713001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
541518023,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
505221004,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
685687003,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
685687004,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
661351001,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
637028001,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
623873001,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
548837002,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Customer-by-article score matrix: dot product of each normalised customer
# profile with each article's feature vector.
scores = normalized_user_feature @ item_feature.T
scores

article_id,663713001,541518023,505221004,685687003,685687004,685687001,505221001,688873012,501323011,598859003,...,327821026,661929001,466381012,714429001,641228003,661351001,637028001,623873001,548837002,538977001
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa,0.166667,0.100000,0.233333,0.166667,0.20,0.200000,0.233333,0.200000,0.233333,0.166667,...,0.033333,0.233333,0.133333,0.200,0.366667,0.233333,0.233333,0.033333,0.166667,0.100000
000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,0.800000,0.800000,0.050000,0.150000,0.15,0.150000,0.000000,0.150000,0.350000,0.250000,...,0.000000,0.000000,0.350000,0.100,0.150000,0.050000,0.000000,0.150000,0.050000,0.050000
00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2,0.100000,0.100000,0.520000,0.600000,0.58,0.580000,0.480000,0.220000,0.160000,0.160000,...,0.000000,0.180000,0.040000,0.100,0.080000,0.100000,0.180000,0.040000,0.080000,0.260000
0003abe64294e66a6310c3436fa9e5b754cc5603deef4f26fc8ab8d043af9358,0.275000,0.125000,0.325000,0.150000,0.30,0.250000,0.325000,0.150000,0.275000,0.225000,...,0.000000,0.325000,0.225000,0.225,0.300000,0.150000,0.325000,0.075000,0.150000,0.175000
0004068f54dbe1c7054b23c615edc5f733a508ecc54930bf323209f20410898c,0.150000,0.200000,0.050000,0.200000,0.20,0.200000,0.050000,0.300000,0.200000,0.200000,...,0.000000,0.100000,0.350000,0.150,0.250000,0.150000,0.200000,0.150000,0.100000,0.100000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fff0ac18093a702a0a06f4cc76582632df3ede9a36556e345150befbeed6885a,0.450000,0.150000,0.050000,0.350000,0.45,0.450000,0.050000,0.350000,0.600000,0.400000,...,0.100000,0.100000,0.400000,0.050,0.050000,0.000000,0.100000,0.050000,0.000000,0.050000
fff15526121f7d914a54784e68761a1d30b7547e3555738dcceb386eaaa24c4b,0.160000,0.060000,0.320000,0.120000,0.18,0.220000,0.320000,0.100000,0.240000,0.200000,...,0.100000,0.380000,0.200000,0.200,0.220000,0.180000,0.300000,0.040000,0.160000,0.120000
fff3e75605ec575be9b95eda1e6557299e81bba12668d750c0e973528e48b7ee,0.760000,0.500000,0.000000,0.180000,0.28,0.280000,0.000000,0.120000,0.400000,0.440000,...,0.020000,0.000000,0.340000,0.080,0.080000,0.000000,0.000000,0.080000,0.000000,0.000000
fff4b145d7469e023b147b0f8375c565b1be43944987792153ccc0af41466cf3,0.200000,0.133333,0.200000,0.366667,0.40,0.433333,0.233333,0.233333,0.366667,0.300000,...,0.066667,0.133333,0.133333,0.100,0.133333,0.133333,0.133333,0.033333,0.100000,0.166667


In [11]:
def get_rcmnd(customer_id, scores):
    """Rank the articles a customer has not bought yet by score.

    Parameters
    ----------
    customer_id
        Label of the customer; must be present in ``scores.index`` and be a
        group key of the module-level ``groupby_customer``.
    scores : pandas.DataFrame
        Score matrix with customers as rows and article ids as columns.

    Returns
    -------
    tuple
        ``(ordered, customer_prev_items)``: the customer's scores for the
        remaining articles sorted from highest to lowest, and the Series of
        article ids taken from the customer's rows in ``groupby_customer``.

    Raises
    ------
    KeyError
        If the customer is unknown, or one of the customer's articles is not
        a column of ``scores``.
    """
    score_row = scores.loc[customer_id]
    purchased = groupby_customer.get_group(customer_id)['article_id']
    # Already-bought articles are removed before ranking.
    ranked = score_row.drop(labels=purchased.values).sort_values(ascending=False)
    return ranked, purchased

In [14]:
from sklearn.decomposition import PCA

In [15]:
# Compress the user feature space to 100 principal components.
# random_state is fixed because, depending on the input shape, scikit-learn's
# default solver choice may be a randomized SVD; without a seed the components
# (and every score derived from them) could differ between runs.
pca = PCA(n_components=100, random_state=0)
pca.fit(normalized_user_feature)
# Share of the total variance retained by the kept components.
pca.explained_variance_ratio_.sum()

0.9582976680229423

In [16]:
def _to_component_frame(features):
    """Project `features` with the fitted `pca`, keeping the row labels.

    The columns are named component_1 ... component_N, where N is read from
    the fitted model instead of being hard-coded.
    """
    component_names = ['component_{}'.format(i)
                       for i in range(1, pca.n_components_ + 1)]
    return pd.DataFrame(pca.transform(features),
                        index=features.index,
                        columns=component_names)


# Customers and articles are projected with the same PCA (fitted on the
# normalised user features) so their coordinates are comparable.
user_feature_pca = _to_component_frame(normalized_user_feature)
item_feature_pca = _to_component_frame(item_feature)

In [17]:
# Customer-by-article scores computed in the reduced PCA space.
scores_pca = user_feature_pca @ item_feature_pca.T

In [18]:
# Number of recommendations to keep for the example customer.
k = 6
# Example customer: the second row of the score matrix.
customer_id = scores.index[1]

# Rank unseen articles with the raw-feature scores and with the PCA scores.
ranked, prev_items = get_rcmnd(customer_id, scores)
ranked_pca, prev_items = get_rcmnd(customer_id, scores_pca)

# Keep only the article ids of the k best-scoring articles.
rcmnds = ranked.head(k).index.values
rcmnds_pca = ranked_pca.head(k).index.values

# Image directory; not used within this cell.
path = "../input/h-and-m-personalized-fashion-recommendations/images"

In [19]:
# Inspect the column labels (article ids) of the PCA score matrix.
scores_pca.columns

Int64Index([663713001, 541518023, 505221004, 685687003, 685687004, 685687001,
            505221001, 688873012, 501323011, 598859003,
            ...
            327821026, 661929001, 466381012, 714429001, 641228003, 661351001,
            637028001, 623873001, 548837002, 538977001],
           dtype='int64', name='article_id', length=15059)

In [20]:
# An article is recommended to a customer when its PCA-space score is
# strictly greater than this threshold.
score_threshold = 0.27

# Threshold one customer row at a time on the underlying arrays. This selects
# the same articles as a scalar lookup per (customer, article) pair but
# avoids the per-cell indexing cost.
article_labels = scores_pca.columns.values
predictions = [
    article_labels[customer_scores > score_threshold].tolist()
    for customer_scores in scores_pca.values
]

# One row per customer, indexed by customer_id; each `prediction` cell holds
# the list of recommended article ids. dtype=object keeps each list as a
# single cell value.
submission = pd.Series(
    predictions, index=scores_pca.index, dtype=object, name='prediction'
).to_frame().rename_axis('customer_id')

# NOTE(review): the lists are written as their Python repr (e.g. "[1, 2]").
# If this file is meant for a competition upload, the expected format of the
# prediction column should be checked against sample_submission.csv - TODO confirm.
submission.to_csv('submission.csv',index=True)



KeyboardInterrupt: ignored