## Purpose
The purpose of this notebook is to find similar products for any given `article_id`.
This is done with a TF-IDF representation of the article metadata and cosine similarity between articles.

Given any article ID the script finds 10 similar articles.

Please upvote if you find this helpful!

In [None]:
import cv2
import numpy as np # linear algebra
import os
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from os import listdir
from os.path import isfile, join

from termcolor import colored
from IPython.display import HTML
from PIL import Image

import warnings
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('float_format', '{:f}'.format)
warnings.filterwarnings('ignore')

In [None]:
# Load the three competition tables from the Kaggle input directory.
# `articles` feeds the similarity model below; `customers` and `transactions` are only
# used further down, in the "Under development" section, where `transactions` is
# filtered to three customer ids before being merged.
# NOTE(review): article_id looks like it is parsed as an integer (the demo calls pass
# int literals and display_articles re-adds a leading "0") — confirm.
articles = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/articles.csv")
customers = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/customers.csv")
transactions = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv")

In [None]:
articles.head()

# Let's build a single word column for every article id present

**<span style="color:#023e8a;"> This table contains all h&m articles with details such as a type of product, a color, a product group and other features.</span>**  
**<span style="color:#023e8a;"> Article data description: </span>**
  
- 105542 rows and 25 columns  
- No nulls apart from detail_desc  
- 11 int and 14 obj types are present 

> `article_id` **<span style="color:#023e8a;">: A unique identifier of every article.</span>**  
>  - The primary column  
  
> `product_code`, `prod_name` **<span style="color:#023e8a;">: A unique identifier of every product and its name (not the same).</span>**  
>  - 47224 unique product_code  
>  - product_code and article id are highly correlated  
>  - 45875 unique prod_name. **Different from product_code**. Which one to use?  
>  - No specific dominant levels in prod_name  
  
> `product_type`, `product_type_name` **<span style="color:#023e8a;">: The group of product_code and its name</span>**  
>  - 132 unique product types, but 131 unique product names  
>  - dominant levels are present in both. first 8 form ~80% of total data  
  
> `product_group_name` **<span style="color:#023e8a;">: 19 unique values. highly dominant levels are present.</span>**   
  
> `graphical_appearance_no`, `graphical_appearance_name` **<span style="color:#023e8a;">: The group of graphics and its name</span>**  
>  - both have 30 unique values. 1-1 mapping  
>  - highly dominant levels present  
  
> `colour_group_code`, `colour_group_name` **<span style="color:#023e8a;">: The group of color and its name</span>**  
>  - both 50 unique values. 1-1 mapping  
>  - mildly dominant levels  
  
> `perceived_colour_value_id`, `perceived_colour_value_name`, `perceived_colour_master_id`, `perceived_colour_master_name` **<span style="color:#023e8a;">: The added color info</span>**    
>  - only 8 levels in both
  
> `department_no`, `department_name` **<span style="color:#023e8a;">: A unique identifier of every dep and its name</span>**  
>  - 299 unique department_no, 250 unique department_name. **Not matching**  
>  - no dominant levels.   
  
> `index_code`, `index_name` **<span style="color:#023e8a;">: A unique identifier of every index and its name</span>**  
>  - 10 levels in both 1-1 mapping.  
>  - obviously dominant  
  
> `index_group_no`, `index_group_name` **<span style="color:#023e8a;">: A group of indices and its name</span>**  
>  - 5 levels in both  
>  - obviously dominant  
  
> `section_no`, `section_name` **<span style="color:#023e8a;">: A unique identifier of every section and its name</span>**  
>  - 57 in section no, 56 in section name . **Not one-one matching**  
>  - Non dominant  
  
> `garment_group_no`, `garment_group_name` **<span style="color:#023e8a;">: A unique identifier of every garment and its name</span>**  
>  - 21 in both levels. 1-1 mapping  
>  - some dominant levels  
  
> `detail_desc` **<span style="color:#023e8a;">: Details</span>**  
>  - All unique descriptions. many are nulls. Not sure how helpful it'll be

## Most of the columns are paired and hence only the string versions(& not ID) of these columns are taken.
These are then processed as shown below

In [None]:
# Keep article_id, the name (string) column of every id/name pair and the free-text
# description. .copy() makes articles_sub an independent frame, so the column
# assignments in the following cells modify it directly instead of writing into a
# slice of `articles` (which triggers pandas' chained-assignment warning).
articles_sub = articles[['article_id','prod_name','product_type_name','product_group_name','graphical_appearance_name','colour_group_name'
                         ,'perceived_colour_value_name','perceived_colour_master_name','department_name','index_name','index_group_name'
                         ,'section_name','garment_group_name','detail_desc']].copy()
articles_sub.shape

In [None]:
# Let's remove space in all string columns (every column after article_id), so that
# each multi-word value collapses into a single word.
# assign() returns a new frame rather than writing column-by-column into a slice of
# `articles`, and regex=False states explicitly that " " is a literal, not a pattern.
string_cols = articles_sub.columns[1:]
articles_sub = articles_sub.assign(
    **{col: articles_sub[col].str.replace(" ", "", regex=False) for col in string_cols}
)

In [None]:
#Combine all info from columns to a single column separated by space

cols = ['prod_name', 'product_type_name', 'product_group_name',
       'graphical_appearance_name', 'colour_group_name',
       'perceived_colour_value_name', 'perceived_colour_master_name',
       'department_name', 'index_name', 'index_group_name', 'section_name',
       'garment_group_name', 'detail_desc']
# Missing values are blanked out *before* joining. Casting NaN with astype(str) yields
# the literal word "nan", which would end up in the text as a term shared by every
# article without a description (and would make any later fillna('') a no-op).
# assign() is used so that no column is written into a slice of `articles`.
articles_sub = articles_sub.assign(
    combined=articles_sub[cols].fillna('').astype(str).apply(' '.join, axis=1)
)

In [None]:
articles_sub.head()

In [None]:
# Only the id and the combined text are needed from here on
articles_final = articles_sub.loc[:, ['article_id', 'combined']]

# Find related articles of all articles
Given an article_id, let's find 10 similar products for it

In [None]:
#Only 5000 products are taken because of computational issues
#iloc is positional and end-exclusive, so exactly 5000 rows are kept; loc[:5000] is
#label-based and end-inclusive and would keep 5001. copy() detaches the subset so the
#later column assignment on articles_final does not write into a slice.
articles_final = articles_final.iloc[:5000].copy()

In [None]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Make sure every document is text: NaN is replaced with an empty string
articles_final['combined'] = articles_final['combined'].fillna('')

#TF-IDF vectoriser that removes english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Fit the vocabulary on the combined text and turn it into the TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(articles_final['combined'])

#Show (number of articles, number of terms)
tfidf_matrix.shape

In [None]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix.
# linear_kernel is a plain dot product between rows; it equals cosine similarity as
# long as the TF-IDF rows have unit length, which is TfidfVectorizer's default
# (norm='l2') and is not overridden where `tfidf` is created.
# Both arguments are the same matrix, so the result is square: entry [i, j] is the
# similarity between row i and row j of tfidf_matrix.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
# Lookup table: article_id -> index label of that article's row in articles_final.
# NOTE(review): Series.drop_duplicates() compares the *values* (the row labels), not the
# article_id index, so it does not remove repeated article_ids — presumably article_id
# is already unique; confirm.
indices = pd.Series(articles_final.index, index=articles_final['article_id']).drop_duplicates()

In [None]:
# Function that takes in article_id as input and outputs most similar articles
def get_recommendations(title, cosine_sim=cosine_sim, top_n=12):
    """Return the ids of the articles most similar to a given article.

    Parameters
    ----------
    title : article_id to look up; must be a key of the module-level `indices`.
    cosine_sim : square pairwise-similarity matrix whose row/column order matches
        the rows of `articles_final`. Defaults to the matrix that exists when
        this function is defined.
    top_n : total number of ids returned, *including* the query article itself.
        The default of 12 matches the previous hard-coded slice.

    Returns
    -------
    pandas.Series of article_id values: the query article first, followed by the
    other articles in descending order of similarity.

    Raises
    ------
    KeyError if `title` is not present in `indices`.
    """
    try:
        idx = indices[title]
    except KeyError:
        raise KeyError(f"article_id {title!r} is not among the vectorised articles") from None

    # Pairwise similarity scores of every article against the query article
    scores = np.asarray(cosine_sim[idx])

    # Stable sort on the negated scores: descending similarity, ties keep row order
    ranked = np.argsort(-scores, kind='stable')

    # Put the query article first explicitly instead of relying on it winning the
    # sort: an article with identical text has the same score and may precede it.
    neighbours = ranked[ranked != idx][:max(top_n - 1, 0)]
    article_indices = [idx, *neighbours][:max(top_n, 0)]

    # Positions in the similarity matrix are positions in articles_final
    return articles_final['article_id'].iloc[article_indices]


In [None]:
recom = list(get_recommendations(108775044))
recom

# Visualising the predictions

In [None]:
def display_articles(article_ids, cols=3):
    """Draw the product images of the given articles in a grid.

    Parameters
    ----------
    article_ids : iterable of article ids (ints or strings). Each id is left-padded
        with zeros to 10 characters to form the image file name.
    cols : number of grid columns (default 3).

    Images are read from ``<image_path>/<first 3 characters of id>/<id>.jpg``.
    An article whose image is missing or unreadable leaves its panel blank.
    """
    article_ids = list(article_ids)
    # At least the original 4 rows, but grow when more ids are passed so that
    # plt.subplot never receives a position outside the grid.
    rows = max(4, (len(article_ids) + cols - 1) // cols)
    image_path = "/kaggle/input/h-and-m-personalized-fashion-recommendations/images/"
    plt.figure(figsize=(2 + 3 * cols, 2 + 4 * rows))
    for position, raw_id in enumerate(article_ids, start=1):
        # zero-pad to the 10-character form used in the image file names
        article_id = str(raw_id).zfill(10)
        plt.subplot(rows, cols, position)
        plt.axis('off')
        try:
            image = Image.open(f"{image_path}{article_id[:3]}/{article_id}.jpg")
            plt.imshow(image)
        except OSError:
            # Missing or unreadable image file: keep the panel empty. Other
            # errors are no longer silently swallowed.
            continue

In [None]:
#First image (top left) is the input article and rest are all recommended similar articles
display_articles(recom)


# Let's try with few more articles

In [None]:
recom = list(get_recommendations(252298006))
display_articles(recom)

In [None]:
recom = list(get_recommendations(224337008))
display_articles(recom)

In [None]:
recom = list(get_recommendations(245348002))
display_articles(recom)

In [None]:
recom = list(get_recommendations(112679048))
display_articles(recom)

In [None]:
#Denim short
recom = list(get_recommendations(156289011))
display_articles(recom)

In [None]:
#Greenish Khakhi short
recom = list(get_recommendations(212766041))
display_articles(recom)

# Under development

# Join articles and transaction details

In [None]:
# Keep only the transactions of three hand-picked customers
transactions = transactions[
    transactions['customer_id'].isin([
        '000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318',
        '00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2',
        '00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4c73235dccbbc132280',
    ])
]

In [None]:
# Attach the combined article text to every transaction of a retained article
art_trans = articles_final.merge(transactions, on='article_id')

In [None]:
# Add the customer attributes to each article/transaction row
tot_trans = art_trans.merge(customers, on='customer_id')

In [None]:
tot_trans.head()

In [None]:
tot_trans.shape

In [None]:
# One row per customer: the combined text of every article in that customer's
# transactions, joined with spaces
fin_art = (
    tot_trans
    .groupby(['customer_id'])['combined']
    .apply(' '.join)
    .reset_index()
)

In [None]:
fin_art

# Customer ID X Article description

In [None]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Make sure every customer document is text: NaN is replaced with an empty string
fin_art['combined'] = fin_art['combined'].fillna('')

#TF-IDF vectoriser that removes english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Fit on the per-customer text and turn it into the TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(fin_art['combined'])

#Show (number of customers, number of terms)
tfidf_matrix.shape

In [None]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix between customers.
# linear_kernel is a plain dot product; it equals cosine similarity provided the
# TF-IDF rows are unit length (TfidfVectorizer's default norm='l2', not overridden
# where `tfidf` is created).
# This rebinds `cosine_sim`: from here on it holds customer-by-customer scores.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
cosine_sim.shape

In [None]:
# Lookup table: customer_id -> index label of that customer's row in fin_art.
# This rebinds `indices`, which previously mapped article ids.
# NOTE(review): drop_duplicates() acts on the values (row labels), not on the
# customer_id index, so it removes nothing here.
indices = pd.Series(fin_art.index, index=fin_art['customer_id']).drop_duplicates()

In [None]:
# Function that takes in a customer_id as input and outputs the most similar customers
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the customers whose purchased-article text is most similar.

    Parameters
    ----------
    title : customer_id to look up; must be a key of the module-level `indices`.
    cosine_sim : square pairwise-similarity matrix whose row/column order matches
        the rows of `fin_art`. Defaults to the matrix that exists when this
        function is defined.
    top_n : maximum number of customers returned (default 10). The query
        customer is never part of the result.

    Returns
    -------
    pandas.Series of customer_id values in descending order of similarity.

    Raises
    ------
    KeyError if `title` is not present in `indices`.
    """
    try:
        idx = indices[title]
    except KeyError:
        raise KeyError(f"customer_id {title!r} is not among the vectorised customers") from None

    # Pairwise similarity scores of every customer against the query customer
    scores = np.asarray(cosine_sim[idx])

    # Stable sort on the negated scores: descending similarity, ties keep row order
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query customer by position in the matrix rather than assuming it
    # sorted first; with tied scores the first entry can be a different customer.
    customer_indices = ranked[ranked != idx][:max(top_n, 0)]

    return fin_art['customer_id'].iloc[customer_indices]


In [None]:
get_recommendations('00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4c73235dccbbc132280')

# Find related articles of all articles

In [None]:
# Shrink the working set of articles for this section.
# NOTE: .loc slicing is label-based and includes the end label, so this keeps every
# row up to and including the one labelled 500 (501 rows if the index still is the
# default 0..n-1 range — confirm).
articles_final = articles_final.loc[:500]

In [None]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#NaN documents become empty strings so the vectoriser only sees text
articles_final['combined'] = articles_final['combined'].fillna('')

#Vectoriser that ignores english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Learn the vocabulary from the reduced article set and build its TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(articles_final['combined'])

#Show (number of articles, number of terms)
tfidf_matrix.shape

In [None]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix for the reduced article set.
# linear_kernel is a plain dot product; it equals cosine similarity provided the
# TF-IDF rows are unit length (TfidfVectorizer's default norm='l2', not overridden
# where `tfidf` is created).
# This rebinds `cosine_sim` back to article-by-article scores.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
# Lookup table: article_id -> index label of that article's row in articles_final.
# This rebinds `indices` back from customer ids to article ids.
# NOTE(review): drop_duplicates() acts on the values (row labels), not on the
# article_id index, so it removes nothing here.
indices = pd.Series(articles_final.index, index=articles_final['article_id']).drop_duplicates()

In [None]:
# Function that takes in article_id as input and outputs most similar articles
def get_recommendations(title, cosine_sim=cosine_sim, top_n=12):
    """Return the ids of the articles most similar to a given article.

    Parameters
    ----------
    title : article_id to look up; must be a key of the module-level `indices`.
    cosine_sim : square pairwise-similarity matrix whose row/column order matches
        the rows of `articles_final`. Defaults to the matrix that exists when
        this function is defined.
    top_n : total number of ids returned, *including* the query article itself.
        The default of 12 matches the previous hard-coded slice.

    Returns
    -------
    pandas.Series of article_id values: the query article first, followed by the
    other articles in descending order of similarity.

    Raises
    ------
    KeyError if `title` is not present in `indices`.
    """
    try:
        idx = indices[title]
    except KeyError:
        raise KeyError(f"article_id {title!r} is not among the vectorised articles") from None

    # Pairwise similarity scores of every article against the query article
    scores = np.asarray(cosine_sim[idx])

    # Stable sort on the negated scores: descending similarity, ties keep row order
    ranked = np.argsort(-scores, kind='stable')

    # Put the query article first explicitly instead of relying on it winning the
    # sort: an article with identical text has the same score and may precede it.
    neighbours = ranked[ranked != idx][:max(top_n - 1, 0)]
    article_indices = [idx, *neighbours][:max(top_n, 0)]

    # Positions in the similarity matrix are positions in articles_final
    return articles_final['article_id'].iloc[article_indices]


In [None]:
recom = list(get_recommendations(108775044))
recom

# Visualise the predictions

In [None]:
def display_articles(article_ids, cols=3):
    """Draw the product images of the given articles in a grid.

    Parameters
    ----------
    article_ids : iterable of article ids (ints or strings). Each id is left-padded
        with zeros to 10 characters to form the image file name.
    cols : number of grid columns (default 3).

    Images are read from ``<image_path>/<first 3 characters of id>/<id>.jpg``.
    An article whose image is missing or unreadable leaves its panel blank.
    """
    article_ids = list(article_ids)
    # At least the original 4 rows, but grow when more ids are passed so that
    # plt.subplot never receives a position outside the grid.
    rows = max(4, (len(article_ids) + cols - 1) // cols)
    image_path = "/kaggle/input/h-and-m-personalized-fashion-recommendations/images/"
    plt.figure(figsize=(2 + 3 * cols, 2 + 4 * rows))
    for position, raw_id in enumerate(article_ids, start=1):
        # zero-pad to the 10-character form used in the image file names
        article_id = str(raw_id).zfill(10)
        plt.subplot(rows, cols, position)
        plt.axis('off')
        try:
            image = Image.open(f"{image_path}{article_id[:3]}/{article_id}.jpg")
            plt.imshow(image)
        except OSError:
            # Missing or unreadable image file: keep the panel empty. Other
            # errors are no longer silently swallowed.
            continue

In [None]:
display_articles(recom)