**Using Content Based Filtering to recommend similar items by:**
1. Creating user-feature matrix
1. Creating item-feature matrix
1. Measuring similarity using the dot product as the metric
1. Recommending top-k similar items 


**Second Approach**

Perform dimensionality reduction using PCA on user_feature and item_feature matrices

**I used the first 100,000 rows of the transactions record**

**Only customers who bought at least two distinct items are kept**


In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from skimage import io

In [None]:
# Categorical article attributes that make up the feature space shared by the
# user profiles and the item profiles.
feature_subset = ['product_group_name',
                  'graphical_appearance_name', 'colour_group_name',
                  'perceived_colour_value_name',
                  'perceived_colour_master_name',
                  'department_name', 'index_name',
                  'index_group_name', 'section_name',
                  'garment_group_name']

# Only the first 100,000 transactions are used. `nrows` reads exactly those
# rows and closes the file afterwards; the earlier `chunksize` reader was left
# open once its first chunk had been taken.
users = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv', nrows=100000)
articles = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/articles.csv')

# Attach the article attributes to every transaction row (inner join on
# article_id), then keep the identifying columns, the descriptive text columns
# and the categorical features.
df = users.merge(articles, on='article_id')
df = df[['t_dat', 'customer_id', 'article_id', 'prod_name', 'product_type_name']
        + feature_subset
        + ['detail_desc']]

In [None]:
# Sanity check: (rows, columns) of the merged transactions/articles frame.
df.shape

In [None]:
# Preview the first rows of the merged transactions/articles frame.
df.head()

In [None]:
# One-hot encode the selected categorical attributes: every distinct value of
# every feature becomes its own indicator column, while the two id columns are
# carried through unchanged.
features = feature_subset
id_columns = ['customer_id', 'article_id']
df1 = df.loc[:, id_columns + features]
dummies_df = pd.get_dummies(df1, columns=features)
dummies_df

In [None]:
# Aggregate the one-hot rows per customer, keeping only customers who bought
# at least `minimum_items` distinct articles.
minimum_items = 2
groupby_customer = dummies_df.groupby('customer_id')

l = []            # one summed feature-count vector per kept customer
cutomer_ids = []  # customer ids, aligned row by row with `l`
article_ids = []  # every article bought by a kept customer (may repeat)

# Iterating the GroupBy yields (customer_id, sub-frame) pairs directly, so no
# separate get_group() lookup is needed for each customer.
for key, temp in groupby_customer:
    if temp.article_id.nunique() < minimum_items:
        continue
    # numeric_only=True leaves the string customer_id column out of the sum,
    # so only the feature indicator columns are added up.
    l.append(temp.drop(columns='article_id').sum(numeric_only=True).values)
    cutomer_ids.append(key)
    article_ids.extend(temp.article_id.values.tolist())

In [None]:

# Customer profiles: the summed feature counts, rescaled so that every row
# adds up to 1, indexed by customer id.
feature_columns = dummies_df.columns[2:]  # skip the two leading id columns
user_feature = pd.DataFrame(l, columns=feature_columns)
row_totals = user_feature.sum(axis=1)
normalized_user_feature = (
    user_feature.div(row_totals, axis=0)
    .assign(customer_id=cutomer_ids)
    .set_index('customer_id')
)
normalized_user_feature

In [None]:
# Item profiles: one one-hot row per article, restricted to the articles
# bought by the customers that were kept, indexed by article id.
unique_articles = dummies_df.drop_duplicates(subset='article_id')
bought_by_kept_customers = unique_articles['article_id'].isin(article_ids)
item_feature = (
    unique_articles.loc[bought_by_kept_customers]
    .drop(columns='customer_id')
    .set_index('article_id')
)
item_feature

In [None]:
# Similarity of every customer profile to every article: the dot product of
# the two feature vectors (rows = customers, columns = articles).
scores = normalized_user_feature @ item_feature.T
scores

In [None]:
# normalized_user_feature.to_csv('normalized_user_feature.csv')
# item_feature.to_csv('item_feature.csv')
# scores.to_csv('scores.csv')

In [None]:
def get_rcmnd(customer_id, scores):
    """Rank the articles a customer has not bought yet by their score.

    Args:
        customer_id: Row label of the customer in ``scores``.
        scores: DataFrame of scores with customers as rows and articles as
            columns.

    Returns:
        A tuple ``(ordered, customer_prev_items)``: the customer's scores for
        all not-yet-bought articles sorted from highest to lowest, and the
        ``article_id`` Series of that customer's past purchases.

    Reads the module-level ``groupby_customer`` to look up past purchases.
    """
    score_row = scores.loc[customer_id]
    purchased = groupby_customer.get_group(customer_id)['article_id']
    # Already-bought articles must not be recommended again.
    unseen_ranked = score_row.drop(labels=purchased.values).sort_values(ascending=False)
    return unseen_ranked, purchased

In [None]:
def plot_prev(prev_items):
    """Show the product images of previously bought articles in one row.

    Args:
        prev_items: Iterable of article ids to display.

    Reads the module-level ``path`` (root directory of the image files), which
    must be defined before the function is called.
    """
    items = list(prev_items)
    # Keep the usual six-slot row, but widen it when there are more images:
    # with a fixed 1x6 grid, add_subplot raises ValueError on the seventh one.
    n_cols = max(6, len(items))
    fig = plt.figure(figsize=(20, 10))
    for position, item in enumerate(items, start=1):
        # File names carry a leading '0' in front of the id, and each image
        # sits in a sub-folder named after the first three characters.
        item = '0' + str(item)
        sub = item[:3]
        image = plt.imread(path + "/" + sub + "/" + item + ".jpg")
        fig.add_subplot(1, n_cols, position)
        plt.imshow(image)

In [None]:
def plot_rcmnd(rcmnds):
    """Show the product images of recommended articles in one row.

    At most ``k`` articles are drawn, in the order given.

    Args:
        rcmnds: Iterable of recommended article ids.

    Reads the module-level ``k`` (number of recommendations to show) and
    ``path`` (root directory of the image files); both must be defined before
    the function is called.
    """
    # Keep the usual six-slot row, but widen it when k is larger: with a fixed
    # 1x6 grid, add_subplot raises ValueError as soon as k exceeds 6.
    n_cols = max(6, k)
    fig = plt.figure(figsize=(20, 10))
    for item, position in zip(rcmnds, range(1, k + 1)):
        # File names carry a leading '0' in front of the id, and each image
        # sits in a sub-folder named after the first three characters.
        item = '0' + str(item)
        sub = item[:3]
        image = plt.imread(path + "/" + sub + "/" + item + ".jpg")
        fig.add_subplot(1, n_cols, position)
        plt.imshow(image)

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Fit a 100-component PCA on the customer profiles, then display the share of
# the total variance those components retain.
pca = PCA(n_components=100).fit(normalized_user_feature)
pca.explained_variance_ratio_.sum()

In [None]:
# Project the customer and item profiles into the PCA space. The component
# names are derived from the fitted model instead of a hard-coded 100, so this
# cell keeps working if n_components is changed where the PCA is created.
component_names = ['component_{}'.format(i) for i in range(1, pca.n_components_ + 1)]
user_feature_pca = pd.DataFrame(
    pca.transform(normalized_user_feature),
    columns=component_names,
    index=normalized_user_feature.index,
)
# NOTE(review): the PCA was fitted on row-normalised customer profiles, while
# item_feature holds raw 0/1 indicators - confirm that projecting the items
# with the same model is intended.
item_feature_pca = pd.DataFrame(
    pca.transform(item_feature),
    columns=component_names,
    index=item_feature.index,
)

In [None]:
# Customer-by-article scores computed in the reduced PCA space
# (dot product of the projected profiles).
scores_pca = user_feature_pca @ item_feature_pca.T

In [None]:
# user_feature_pca.to_csv('user_feature_pca.csv')
# item_feature_pca.to_csv('item_feature_pca.csv')
# scores_pca.to_csv('scores_pca.csv')

In [None]:
# Demo: take the second customer in the score table and pick the top-k
# recommendations from both the raw and the PCA-based scores.
k = 6
path = "../input/h-and-m-personalized-fashion-recommendations/images"
customer_id = scores.index[1]

ranked_raw, prev_items = get_rcmnd(customer_id, scores)
ranked_pca, prev_items = get_rcmnd(customer_id, scores_pca)

# Only the article ids of the k best-scoring candidates are needed for plotting.
rcmnds = ranked_raw.index.values[:k]
rcmnds_pca = ranked_pca.index.values[:k]

In [None]:
# Articles the selected customer has already bought.
plot_prev(prev_items)

In [None]:
# Top-k recommendations from the raw feature-space scores.
plot_rcmnd(rcmnds)

In [None]:
# Top-k recommendations from the PCA-space scores.
plot_rcmnd(rcmnds_pca)