In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Read a subset of the data and keep only customers who have made at least three transactions

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from skimage import io

In [None]:
# Load the full transaction log and the article metadata from the competition data set.
df = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv')
articles = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/articles.csv')
# Randomly sample 100,000 (1 lakh) transactions. The fixed seed keeps the sample, and
# therefore every downstream result (including positional lookups into the score matrix),
# reproducible across "Restart & Run All".
users = df.sample(n=100000, random_state=42)

In [None]:
# Categorical article attributes that are one-hot encoded further down.
feature_subset = [
    'product_group_name',
    'graphical_appearance_name',
    'colour_group_name',
    'perceived_colour_value_name',
    'perceived_colour_master_name',
    'department_name',
    'index_name',
    'index_group_name',
    'section_name',
    'garment_group_name',
]

# Attach the article metadata to each sampled transaction, then keep the identifying
# columns, the categorical features and the free-text description (in that order).
df = users.merge(articles, on='article_id')
df = df[
    ['t_dat', 'customer_id', 'article_id', 'prod_name', 'product_type_name']
    + feature_subset
    + ['detail_desc']
]

# We keep only the categorical article features and ignore the transaction date

In [None]:
# Keep the two id columns plus the categorical features, then one-hot encode the features.
Only_features = df[['customer_id', 'article_id', *feature_subset]]
dummies_df = pd.get_dummies(data=Only_features, columns=feature_subset)

In [None]:
# Preview the first five one-hot encoded rows (5 is the default for head()).
dummies_df.head()

In [None]:
# A customer is only profiled if they bought at least this many distinct articles.
minimum_transaction = 3
groupby_customer = dummies_df.groupby('customer_id')

l = []            # summed one-hot feature vector of each qualifying customer
cutomer_ids = []  # customer id belonging to each entry of `l`
article_ids = []  # every article bought by a qualifying customer
for customer in groupby_customer.groups:
    purchases = groupby_customer.get_group(customer)
    if purchases['article_id'].nunique() < minimum_transaction:
        continue
    feature_totals = purchases.drop(columns='article_id').sum(numeric_only=True)
    l.append(feature_totals.values)
    cutomer_ids.append(customer)
    article_ids.extend(purchases['article_id'].tolist())

In [None]:
# One row per qualifying customer; columns are the one-hot feature columns (everything
# after the two leading id columns of dummies_df).
user_feature = pd.DataFrame(l, columns=dummies_df.columns[2:])
# Scale every row so it sums to 1, then label the rows with the customer ids.
normalized_user_feature = (
    user_feature
    .div(user_feature.sum(axis=1), axis=0)
    .set_axis(pd.Index(cutomer_ids, name='customer_id'), axis=0)
)
normalized_user_feature

In [None]:
# One one-hot feature row per article, restricted to articles bought by qualifying
# customers and indexed by article id.
item_feature = (
    dummies_df
    .drop_duplicates(subset='article_id')
    .loc[lambda frame: frame['article_id'].isin(article_ids)]
    .drop(columns='customer_id')
    .set_index('article_id')
)
item_feature

In [None]:
# Customer-by-article score matrix: each customer's normalized feature profile dotted
# with each article's one-hot feature vector.
scores = normalized_user_feature @ item_feature.T
scores

# We will now perform matrix decomposition

In [None]:

from numpy.linalg import svd
# Plain NumPy view of the score table, used as input to the decomposition.
matrix = scores.to_numpy()


In [None]:
# Reduced singular value decomposition of the score matrix.
u, s, vh = np.linalg.svd(matrix, full_matrices=False)

In [None]:
# Show the shape of each SVD factor, in the order u, s, vh.
for factor in (u, s, vh):
    print(factor.shape)

In [None]:
# Multiply the factors back together and confirm they reproduce the score matrix.
reconstructed_vectors = np.matmul(np.matmul(u, np.diag(s)), vh)
np.allclose(reconstructed_vectors, matrix)

# We will now use cosine similarity to get the recommendations

In [None]:
def cosine_similarity(v, u):
    """Return the cosine of the angle between two 1-D vectors.

    Parameters
    ----------
    v, u : array-like
        Vectors of equal length.

    Returns
    -------
    float
        ``(v . u) / (|v| * |u|)``. If either vector has zero length the cosine is
        undefined; 0.0 is returned in that case instead of dividing by zero and
        producing NaN with a RuntimeWarning.
    """
    denominator = np.linalg.norm(v) * np.linalg.norm(u)
    if denominator == 0:
        return 0.0
    return (v @ u) / denominator

In [None]:
# Display the score table again (rows: customer_id, columns: article_id).
scores

# Get similarity Scores

In [None]:
# Given the column position of an article, return the articles most similar to it.
def similarity_score_recommendation(column_value, top_n=10, item_factors=None, item_ids=None):
    """Return the ids of the ``top_n`` articles most similar to one article.

    Similarity is the cosine similarity between columns of the item-factor matrix.
    Every other column is scored and ranked. The previous implementation only recorded
    a column when it beat the running maximum, so it returned the chain of successive
    maxima (often a single article) rather than the most similar articles.

    Parameters
    ----------
    column_value : int
        Column position of the reference article.
    top_n : int, default 10
        Maximum number of article ids to return.
    item_factors : array-like of shape (n_factors, n_items), optional
        Matrix whose columns are compared. Defaults to the notebook-level ``vh``.
    item_ids : sequence, optional
        Label for each column position. Defaults to the notebook-level
        ``scores.columns``.

    Returns
    -------
    list
        Article ids ordered from most to least similar. The reference article itself
        and columns whose similarity is undefined (zero-length vectors) are excluded.
    """
    factors = vh if item_factors is None else np.asarray(item_factors)
    ids = scores.columns if item_ids is None else item_ids

    norms = np.linalg.norm(factors, axis=0)
    # Zero-length columns would divide by zero; silence the warning and drop them below.
    with np.errstate(divide='ignore', invalid='ignore'):
        similarities = (factors[:, column_value] @ factors) / (norms * norms[column_value])

    similarities = np.where(np.isfinite(similarities), similarities, -np.inf)
    similarities[column_value] = -np.inf  # never recommend the article itself

    # Stable sort keeps the original column order among exact ties.
    ranked_columns = np.argsort(-similarities, kind='stable')[:top_n]
    return [ids[col] for col in ranked_columns if similarities[col] > -np.inf]

In [None]:
def get_rcmnd_top_ten(customer_id, scores, top_n=10):
    """Recommend up to ``top_n`` articles similar to those a customer already bought.

    For every distinct article in the customer's transactions the most similar articles
    are fetched with ``similarity_score_recommendation``. The per-article lists are then
    merged rank by rank (all first choices, then all second choices, ...) with
    duplicates removed. This makes the result deterministic and keeps the best-ranked
    candidates, where the previous ``list(set(...))[:10]`` returned an arbitrary subset.

    Parameters
    ----------
    customer_id : hashable
        Customer to recommend for; must be a row label of ``scores``.
    scores : pandas.DataFrame
        Score table whose index holds customer ids and whose columns hold article ids.
    top_n : int, default 10
        Maximum number of recommendations returned.

    Returns
    -------
    list
        Up to ``top_n`` distinct article ids.

    Raises
    ------
    KeyError
        If ``customer_id`` is not in ``scores.index``, or one of the customer's articles
        is not a column of ``scores``.
    """
    if customer_id not in scores.index:
        raise KeyError(customer_id)
    customer_prev_items = groupby_customer.get_group(customer_id)['article_id']

    # Series.iteritems() was removed in pandas 2.0; iterate the distinct values directly.
    per_item_recommendations = [
        similarity_score_recommendation(scores.columns.get_loc(article))
        for article in customer_prev_items.unique()
    ]

    recommendations = []
    seen = set()
    deepest_rank = max((len(recs) for recs in per_item_recommendations), default=0)
    for rank in range(deepest_rank):
        for recs in per_item_recommendations:
            if rank >= len(recs) or recs[rank] in seen:
                continue
            seen.add(recs[rank])
            recommendations.append(recs[rank])
            if len(recommendations) >= top_n:
                return recommendations
    return recommendations
        

In [None]:
def previous_transaction_articles(customer_id, scores):
    """Return the article ids found in a customer's transactions.

    Parameters
    ----------
    customer_id : hashable
        Customer to look up; must be a row label of ``scores``.
    scores : pandas.DataFrame
        Score table; only its index is used, to validate ``customer_id``.

    Returns
    -------
    list
        Article ids from the customer's rows of the notebook-level ``groupby_customer``
        grouping, in row order (repeat purchases appear more than once).

    Raises
    ------
    KeyError
        If ``customer_id`` is not in ``scores.index``.
    """
    if customer_id not in scores.index:
        raise KeyError(customer_id)
    # Series.iteritems() was removed in pandas 2.0; tolist() yields the same values.
    return groupby_customer.get_group(customer_id)['article_id'].tolist()
        

In [None]:
# Try an example customer, picked by position in the score table. The position is
# clamped to the last row because the number of qualifying customers depends on the
# sample and may be smaller than 357.
if len(scores.index) == 0:
    raise ValueError("no customer met the minimum transaction threshold")
customer_id = scores.index[min(356, len(scores.index) - 1)]
recommendation = get_rcmnd_top_ten(customer_id, scores)

In [None]:
# Articles the example customer bought in the sampled transactions, for comparison with
# the recommendations.
prev_items = previous_transaction_articles(customer_id, scores)

# Now we will plot our Recommendation


In [None]:
def plot_prev(prev_items):
    """Show the product images of previously bought articles in a single row.

    Images are read from ``<path>/<first three digits of the id>/<id>.jpg``, where
    ``path`` is the notebook-level image folder and the id is zero-padded to ten digits.

    Parameters
    ----------
    prev_items : sequence
        Article ids (integers or strings).

    Notes
    -----
    The row has at least six slots and grows when more articles are passed; the
    previous fixed six-slot grid raised ``ValueError`` on the seventh image. An article
    whose image file does not exist gets an empty, labelled panel instead of stopping
    the plot with ``FileNotFoundError``.
    """
    fig = plt.figure(figsize=(20, 10))
    n_cols = max(6, len(prev_items))
    for position, item in enumerate(prev_items, start=1):
        # Integer ids lose their leading zero; restore the ten-digit form.
        article = str(item).zfill(10)
        image_file = path + "/" + article[:3] + "/" + article + ".jpg"
        ax = fig.add_subplot(1, n_cols, position)
        if os.path.exists(image_file):
            ax.imshow(plt.imread(image_file))
        else:
            ax.axis("off")
            ax.set_title(article + " (no image)")

In [None]:
def plot_rcmnd(rcmnds):
    """Show the product images of recommended articles in a single row.

    Images are read from ``<path>/<first three digits of the id>/<id>.jpg``, where
    ``path`` is the notebook-level image folder and the id is zero-padded to ten digits.

    Parameters
    ----------
    rcmnds : sequence
        Recommended article ids (integers or strings).

    Notes
    -----
    The row has at least ten slots and grows when more articles are passed; the
    previous fixed ten-slot grid raised ``ValueError`` on the eleventh image. An article
    whose image file does not exist gets an empty, labelled panel instead of stopping
    the plot with ``FileNotFoundError``.
    """
    fig = plt.figure(figsize=(20, 10))
    n_cols = max(10, len(rcmnds))
    for position, item in enumerate(rcmnds, start=1):
        # Integer ids lose their leading zero; restore the ten-digit form.
        article = str(item).zfill(10)
        image_file = path + "/" + article[:3] + "/" + article + ".jpg"
        ax = fig.add_subplot(1, n_cols, position)
        if os.path.exists(image_file):
            ax.imshow(plt.imread(image_file))
        else:
            ax.axis("off")
            ax.set_title(article + " (no image)")

In [None]:
# Root folder of the article images; plot_prev and plot_rcmnd read this global when called.
path = "../input/h-and-m-personalized-fashion-recommendations/images"

In [None]:
# Images of the articles the example customer already bought.
plot_prev(prev_items)

In [None]:
# Images of the articles recommended for the example customer.
plot_rcmnd(recommendation)