In [None]:
import cv2
import numpy as np # linear algebra
import os
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

import seaborn as sns
import plotly.express as px
from os import listdir
from os.path import isfile, join

from termcolor import colored
from IPython.display import HTML
from PIL import Image

import warnings
# Notebook-wide pandas display settings: no row/column truncation and
# fixed-point (non-scientific) float rendering.
display_options = {
    'display.max_rows': None,
    'display.max_columns': None,
    'float_format': '{:f}'.format,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# This notebook is a continuation of my previous notebook on finding similar products.

## Go through it to understand the full cycle: https://www.kaggle.com/code/sussudharsan/h-m-similar-products-recommender-script

In the previous notebook, we built a recommender system to find similar articles.

Using this recommender, we'll find products similar to what a customer has already purchased.

The top 12 products for each customer are then selected using a frequency approach.

This approach is shown below for a sample population of articles and transactions, which gave a promising score.

This can be applied to the entire dataset.

# <span style="color:green">*Upvote if this notebook is useful!*</span>

In [None]:
# Directory holding the competition CSVs. Defined once so the four reads
# below cannot drift apart if the input location changes.
DATA_DIR = "../input/h-and-m-personalized-fashion-recommendations"

articles = pd.read_csv(f"{DATA_DIR}/articles.csv")
customers = pd.read_csv(f"{DATA_DIR}/customers.csv")
transactions = pd.read_csv(f"{DATA_DIR}/transactions_train.csv")
sample_sub = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

In [None]:
# Preview the first five transaction rows.
transactions.head(5)

# Get only Customers X Articles Purchased from transaction data

In [None]:
# Keep just the (customer, article) pair of every transaction.
purchase_cols = ['customer_id', 'article_id']
cust_pur = transactions[purchase_cols]

# Let's recreate the similar product recommender script from this notebook
https://www.kaggle.com/code/sussudharsan/h-m-similar-products-recommender-script

The below script is run only for a small sample population. This concept can be expanded to the full article and transaction dataset.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Number of articles used to build the similarity matrix. The matrix is dense
# and N x N, so the sample is kept small for computational reasons.
N_ARTICLES_SAMPLE = 1000

# Descriptive text columns that are merged into one "document" per article.
cols = ['prod_name', 'product_type_name', 'product_group_name',
        'graphical_appearance_name', 'colour_group_name',
        'perceived_colour_value_name', 'perceived_colour_master_name',
        'department_name', 'index_name', 'index_group_name', 'section_name',
        'garment_group_name', 'detail_desc']

# .copy() so the column assignments below write to an independent frame
# instead of a slice of `articles`.
articles_sub = articles[['article_id'] + cols].copy()

# Replace missing values with an empty string *before* joining; otherwise the
# string conversion turns NaN into the literal token "nan". Then remove the
# spaces inside every value so each attribute value becomes a single
# whitespace-delimited token.
for col in cols:
    articles_sub[col] = articles_sub[col].fillna('').str.replace(' ', '', regex=False)

# Combine all info from the columns into a single column separated by spaces.
articles_sub['combined'] = articles_sub[cols].apply(' '.join, axis=1)

# Take the first N_ARTICLES_SAMPLE articles. The index is reset so that index
# labels equal row positions: `indices` below stores labels, and they are used
# to address rows of `cosine_sim` positionally.
articles_final = (
    articles_sub[['article_id', 'combined']]
    .head(N_ARTICLES_SAMPLE)
    .reset_index(drop=True)
)

# Define a TF-IDF vectorizer that removes English stop words such as 'the', 'a'.
tfidf = TfidfVectorizer(stop_words='english')

# Construct the TF-IDF matrix by fitting and transforming the data.
tfidf_matrix = tfidf.fit_transform(articles_final['combined'])
assert tfidf_matrix.shape[0] == len(articles_final), "one TF-IDF row per article expected"

# Pairwise dot products of the TF-IDF rows. With the vectorizer's default
# L2 normalisation this equals the cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# article_id -> row position in articles_final / cosine_sim. Duplicated
# article_ids (if any) keep their first occurrence so a lookup is a scalar.
indices = pd.Series(articles_final.index, index=articles_final['article_id'])
indices = indices[~indices.index.duplicated(keep='first')]

# Function that takes in article_id as input and outputs most similar articles
def get_recommendations(title, cosine_sim=cosine_sim, top_n=12):
    """Return the article_ids most similar to the given article.

    Parameters
    ----------
    title
        The article_id to look up; it must be a key of the module-level
        ``indices`` Series.
    cosine_sim
        2-D array of pairwise similarity scores whose rows/columns follow the
        row order of ``articles_final``. Defaults to the module-level matrix.
    top_n : int, default 12
        Number of articles to return.

    Returns
    -------
    pandas.Series
        ``article_id`` values from ``articles_final`` ordered by descending
        similarity; ties keep their original row order. The queried article is
        not excluded, so it is normally part of the result.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # Row position of the queried article in the similarity matrix.
    idx = indices[title]

    # Similarity of the queried article to every article.
    scores = np.asarray(cosine_sim[idx])

    # Positions of the top_n highest scores. A stable sort on the negated
    # scores gives descending order while preserving row order among ties.
    article_indices = np.argsort(-scores, kind='stable')[:top_n]

    return articles_final['article_id'].iloc[article_indices]


# Let's predict the 12 similar items for every article purchased by a customer

## For now, take only transactions whose article id is present in our articles_final dataset.
This is because of memory constraints.

In [None]:
# Preview the first five sampled articles and their combined text.
articles_final.head(5)

In [None]:
# Keep only the purchases whose article is part of articles_final.
in_sample = cust_pur['article_id'].isin(articles_final['article_id'])
cust_pur = cust_pur.loc[in_sample]
cust_pur.head()

In [None]:
# Subset only 20k rows to build the pipeline.
N_TRANSACTIONS_SAMPLE = 20000

# Positional slice so exactly N_TRANSACTIONS_SAMPLE rows are kept (a label
# slice .loc[:N] is inclusive and would keep N + 1). reset_index returns a new
# frame, so later column assignments do not write into a filtered slice.
cust_pur = cust_pur.iloc[:N_TRANSACTIONS_SAMPLE].reset_index(drop=True)

## Predicting articles to be purchased by every customer based on their history

In [None]:
# Compute the recommendations once per distinct article instead of once per
# transaction row: cust_pur only contains article_ids from articles_final, so
# there are far fewer distinct articles than rows.
recs_by_article = {
    article_id: list(get_recommendations(article_id))
    for article_id in cust_pur['article_id'].unique()
}

# Rows with the same article_id share one list object; treat these lists as
# read-only downstream.
cust_pur['similar_articles'] = cust_pur['article_id'].map(recs_by_article)

In [None]:
def app_func(dataf):
    """Concatenate the per-row ``similar_articles`` lists of a frame.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Frame with a ``similar_articles`` column whose cells are iterables of
        article ids (one customer's rows when used with ``groupby.apply``).

    Returns
    -------
    list
        All article ids in row order, duplicates kept. Empty list for an
        empty frame.

    The input frame is not modified.
    """
    # Iterate over the column values directly: this is independent of the
    # frame's index labels, so no index reset (and no mutation) is needed.
    return [article for similar in dataf['similar_articles'] for article in similar]

In [None]:
from collections import Counter

# Number of articles predicted per customer.
TOP_K = 12

# One row per customer holding the flattened list of all candidate articles
# gathered from that customer's purchases.
fin = pd.DataFrame(cust_pur.groupby(['customer_id']).apply(app_func))
fin = fin.reset_index()
fin.columns = ['customer_id','next_articles']

# Keep the TOP_K most frequent candidates per customer (ties keep first-seen
# order) and render them as a space-separated string of 10-character,
# zero-padded article ids. The whole column is assigned at once: element-wise
# chained assignment (fin[col][i] = ...) is not guaranteed to write back.
fin['next_articles'] = fin['next_articles'].apply(
    lambda candidates: ' '.join(
        str(article).zfill(10)
        for article, _count in Counter(candidates).most_common(TOP_K)
    )
)

In [None]:
# Relabel the two columns of `fin` positionally: customer_id, prediction.
submission_columns = ['customer_id', 'prediction']
fin.columns = submission_columns

# The above approach can be expanded to the full article and transaction datasets