In [163]:
# Importing necessary libraries
import re
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
from sklearn.metrics.pairwise import cosine_similarity

In [108]:
data_path = "summer-products-with-rating-and-performance_2020-08.csv"
data_usecols = ["product_color", "product_variation_size_id", "product_id"]
data_textcols = ["product_color", "product_variation_size_id"]

In [109]:
df = pd.read_csv(data_path, usecols= data_usecols)
df

Unnamed: 0,product_color,product_variation_size_id,product_id
0,white,M,5e9ae51d43d6a96e303acdb0
1,green,XS,58940d436a0d3d5da4e95a38
2,leopardprint,XS,5ea10e2c617580260d55310a
3,black,M,5cedf17ad1d44c52c59e4aca
4,yellow,S,5ebf5819ebac372b070b0e70
...,...,...,...
1568,navyblue,S,5d5fadc99febd9356cbc52ee
1569,lightblue,S,5eccd22b4497b86fd48f16b4
1570,white,SIZE S,5e74be96034d613d42b52dfe
1571,white,Size S.,5eda07ab0e295c2097c36590


In [110]:
#Text Preprocessing
def _removeNonAscii(s):
    return "".join(i for i in s if  ord(i)<128)

def make_lower_case(text):
    return text.lower()

def remove_stop_words(text):
    text = text.split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops]
    text = " ".join(text)
    return text

def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

def remove_punctuation(text):
    tokenizer = RegexpTokenizer(r'\w+')
    text = tokenizer.tokenize(text)
    text = " ".join(text)
    return text

In [112]:
df['text_combined'] = df[data_textcols].astype(str).apply(lambda x: ' '.join(x), axis=1)
df

Unnamed: 0,product_color,product_variation_size_id,product_id,text_combined,text_cleaned
0,white,M,5e9ae51d43d6a96e303acdb0,white M,white M
1,green,XS,58940d436a0d3d5da4e95a38,green XS,green XS
2,leopardprint,XS,5ea10e2c617580260d55310a,leopardprint XS,leopardprint XS
3,black,M,5cedf17ad1d44c52c59e4aca,black M,black M
4,yellow,S,5ebf5819ebac372b070b0e70,yellow S,yellow S
...,...,...,...,...,...
1568,navyblue,S,5d5fadc99febd9356cbc52ee,navyblue S,navyblue S
1569,lightblue,S,5eccd22b4497b86fd48f16b4,lightblue S,lightblue S
1570,white,SIZE S,5e74be96034d613d42b52dfe,white SIZE S,white SIZE S
1571,white,Size S.,5eda07ab0e295c2097c36590,white Size S.,white Size S.


In [122]:
df['text_cleaned'] = df['text_combined'].apply(_removeNonAscii)
df['text_cleaned'] = df['text_cleaned'].apply(func = make_lower_case)
df['text_cleaned'] = df['text_cleaned'].apply(func = remove_stop_words)
df['text_cleaned'] = df['text_cleaned'].apply(func=remove_punctuation)
df['text_cleaned'] = df['text_cleaned'].apply(func=remove_html)
df

Unnamed: 0,product_color,product_variation_size_id,product_id,text_combined,text_cleaned
0,white,M,5e9ae51d43d6a96e303acdb0,white M,white
1,green,XS,58940d436a0d3d5da4e95a38,green XS,green xs
2,leopardprint,XS,5ea10e2c617580260d55310a,leopardprint XS,leopardprint xs
3,black,M,5cedf17ad1d44c52c59e4aca,black M,black
4,yellow,S,5ebf5819ebac372b070b0e70,yellow S,yellow
...,...,...,...,...,...
1568,navyblue,S,5d5fadc99febd9356cbc52ee,navyblue S,navyblue
1569,lightblue,S,5eccd22b4497b86fd48f16b4,lightblue S,lightblue
1570,white,SIZE S,5e74be96034d613d42b52dfe,white SIZE S,white size
1571,white,Size S.,5eda07ab0e295c2097c36590,white Size S.,white size s


In [136]:
#Building TFIDF model and calculate TFIDF score
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df = 5, stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['text_cleaned'])

In [143]:
# Calculating the similarity measures based on Cosine Similarity
sg = cosine_similarity(tfidf_matrix, tfidf_matrix)
sg 

array([[1.        , 0.        , 0.        , ..., 0.32581916, 0.32581916,
        0.        ],
       [0.        , 1.        , 0.37642164, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.37642164, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.32581916, 0.        , 0.        , ..., 1.        , 1.        ,
        0.        ],
       [0.32581916, 0.        , 0.        , ..., 1.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [150]:
results = {}
for idx, row in df.iterrows():
    similar_indices = sg[idx].argsort()[:-100:-1]
    similar_items = [(sg[idx][i], df['product_id'][i]) for i in similar_indices]

    # First item is the item itself, so remove it.
    # Each dictionary entry is like: [(1,2), (3,4)], with each tuple being (score, item_id)
    results[row['product_id']] = similar_items[1:]

In [160]:
def item(id):
    return df.loc[df['product_id'] == id]["product_id"].tolist()[0].split(' - ')[0]

In [161]:
# Top 5 recommendation
def recommend(item_id, num):
    print("Recommending " + str(num) + " products similar to " + item(item_id) + "...")
    print("-------")
    recs = results[item_id][:num]
    for rec in recs:
        print("Recommended: " + item(rec[1]) + " (score:" + str(rec[0]) + ")")

In [162]:
recommend(item_id="5e9ae51d43d6a96e303acdb0", num=5)

Recommending 5 products similar to 5e9ae51d43d6a96e303acdb0...
-------
Recommended: 5eec6643d6854068e20df558 (score:1.0)
Recommended: 5d6f980190bd5f5694d25a6d (score:1.0)
Recommended: 5b12401145089b206d7a455e (score:1.0)
Recommended: 5aea67918a743b21013e41f6 (score:1.0)
Recommended: 5d11e3e51bb88e735249cbbb (score:1.0)
