In [59]:
#importing some important libraries 

import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from gensim.models import Word2Vec

import numpy as np

In [19]:
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [20]:
p_data =pd.read_csv('product_data.csv')

In [21]:
p_data.head()

Unnamed: 0,id,description
0,1,Active classic boxers - There's a reason why o...
1,2,Active sport boxer briefs - Skinning up Glory ...
2,3,Active sport briefs - These superbreathable no...
3,4,"Alpine guide pants - Skin in, climb ice, switc..."
4,5,"Alpine wind jkt - On high ridges, steep ice an..."


In [22]:
p_data.shape

(500, 2)

In [23]:
p_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           500 non-null    int64 
 1   description  500 non-null    object
dtypes: int64(1), object(1)
memory usage: 7.9+ KB


In [24]:
def preprocess_text(text: str) -> list:
    """
    pre process text data by tokenizing, lemmatizing and then removing stop words
    Args:
        text: product description as a string
    Returns:
        list of lower-cased, verb-lemmatised tokens with English stop words removed
        (order and duplicates of the remaining tokens are preserved)
    """

    # Tokenise words while ignoring punctuation: r'\w+' keeps only runs of
    # word characters, so punctuation and markup symbols are dropped.
    # NOTE(review): any HTML tag names inside the text (e.g. "br", "li") would
    # survive as ordinary tokens - confirm whether markup should be stripped first.
    tokeniser = RegexpTokenizer(r'\w+')
    tokens = tokeniser.tokenize(text)

    # Lowercase and lemmatise; pos='v' asks WordNet to treat every token as a verb
    lemmatiser = WordNetLemmatizer()
    lemmas = [lemmatiser.lemmatize(token.lower(), pos='v') for token in tokens]

    # Remove stopwords. The stop-word list is fetched once and turned into a set,
    # instead of calling stopwords.words('english') for every single lemma and
    # scanning the returned list each time.
    stop_words = set(stopwords.words('english'))
    keywords = [lemma for lemma in lemmas if lemma not in stop_words]
    return keywords

In [41]:
pre_processed_data = pd.DataFrame(p_data['description'].apply(preprocess_text))

In [65]:
pre_processed_data['id'] = p_data['id']

In [43]:
pre_processed_data['length'] = pre_processed_data['description'].str.len()

In [44]:
pre_processed_data.head()

Unnamed: 0,description,length
0,"[active, classic, boxers, reason, boxers, cult...",132
1,"[active, sport, boxer, brief, skin, glory, req...",146
2,"[active, sport, brief, superbreathable, fly, b...",125
3,"[alpine, guide, pant, skin, climb, ice, switch...",184
4,"[alpine, wind, jkt, high, ridge, steep, ice, a...",239


In [47]:
def train_word2vec(window_size: int, train_df: pd.DataFrame, skip_gram: bool = 0, 
                   vector_size: int = 100, epochs:int = 10, hier_softmax: bool = 1) -> Word2Vec:
    """
    build the vocabulary for, and then train, a gensim Word2Vec model
    Args:
        window_size: value forwarded to Word2Vec as `window`
            (maximum distance between the current and predicted word)
        train_df: the corpus; it is handed unchanged to both `build_vocab` and `train`.
            NOTE(review): annotated as a DataFrame, but the notebook passes a
            Series whose elements are token lists - confirm the intended type.
        skip_gram: forwarded as `sg` (1 for skip-gram, otherwise CBOW)
        vector_size: dimensionality of the word vectors
        epochs: number of passes over the corpus during training
        hier_softmax: forwarded as `hs` (1 enables hierarchical softmax)
    Returns:
        the trained gensim Word2Vec model
    """
    # Everything that is fixed for this notebook lives next to the tunable
    # arguments so the full configuration can be read in one place.
    hyperparams = {
        'window': window_size,
        'sg': skip_gram,
        'vector_size': vector_size,
        'min_count': 3,          # words seen fewer than 3 times are ignored
        'alpha': 0.03,           # initial learning rate
        'min_alpha': 0.0007,     # learning rate floor
        'compute_loss': True,
        'hs': hier_softmax,
        'seed': 14,
    }
    w2v = Word2Vec(**hyperparams)

    # The model is created without a corpus, so the vocabulary has to be
    # built explicitly before training can start.
    w2v.build_vocab(train_df, progress_per=200)
    w2v.train(
        train_df,
        total_examples=w2v.corpus_count,
        epochs=epochs,
        report_delay=1,
        compute_loss=True,
    )
    return w2v


In [48]:
# Choose the Word2Vec context window from the data: `length` holds the number of
# tokens left in each description after preprocessing, and the window is set to the
# 90th percentile of those lengths (truncated to an int). For roughly 90% of the
# descriptions the window is therefore at least as long as the whole description,
# so the model can look at as many neighbours as possible.

ideal_window_size = int(pre_processed_data['length'].quantile(0.9))
w2v_model = train_word2vec(window_size=ideal_window_size, train_df=pre_processed_data['description'])

In [56]:
# get an average embedding for a particular product description by finding vectors for 
# all words present in that description and then taking an average

def vectors(df: pd.DataFrame, model: Word2Vec) -> list:
    """
    get an average embedding for a particular product description by finding vectors for 
    all words present in that description and then taking an average
    Args:
        df: pre processed input dataframe; must have a 'description' column whose
            values are iterables of tokens
        model: Word2Vec model
    Returns:
        list with one average word embedding per row of `df`, in row order.
        A description with no word in the model vocabulary gets an all-zero
        vector, so that position i of the result always belongs to row i of `df`.
    """

    # The result is also published as a module-level name; this side effect is
    # kept so that any cell reading `word_embeddings` directly keeps working.
    global word_embeddings
    word_embeddings = []

    # Dict lookup (key -> index) instead of scanning the index_to_key list
    # for every single word.
    vocabulary = model.wv.key_to_index

    # Reading each product description
    for sentence in df['description']:
        known_vectors = [model.wv[word] for word in sentence if word in vocabulary]

        if known_vectors:
            avgword2vec = np.mean(known_vectors, axis=0)
        else:
            # Previously such rows were dropped, which shifted every later
            # embedding by one position relative to the dataframe rows.
            avgword2vec = np.zeros(model.wv.vector_size, dtype=np.float32)

        word_embeddings.append(avgword2vec)

    return word_embeddings


In [57]:
embeddings = vectors(pre_processed_data, w2v_model)

In [70]:
# calculating cosine similarity matrix

# Cosine similarity is the dot product of *unit-length* vectors, so every
# embedding is divided by its L2 norm first; a plain dot product of the raw
# averages would favour vectors with a large magnitude.
embedding_matrix = np.asarray(embeddings)
embedding_norms = np.linalg.norm(embedding_matrix, axis=1, keepdims=True)
# An all-zero embedding has norm 0; dividing by 1 instead keeps it all-zero
# (similarity 0 to everything) rather than producing NaNs.
embedding_norms[embedding_norms == 0] = 1.0
unit_embeddings = embedding_matrix / embedding_norms
cos_similarity_matrix = np.dot(unit_embeddings, unit_embeddings.T)

In [79]:
def most_similar(input_df: pd.DataFrame, p_id: int, 
                 similarity_matrix: np.ndarray, top_n: int = 5) -> None:
    """
    print the products most similar to a given product, based on a similarity matrix
    Args:
        input_df: raw input dataframe with a 'description' column
        p_id: row of the query product. It is used both as a label for
            `input_df.loc` and as a position in `similarity_matrix`, so it is the
            dataframe row index, not the value of the 'id' column.
            NOTE(review): this assumes `input_df` has a default 0..n-1 index whose
            order matches the rows of `similarity_matrix` - confirm for other inputs.
        similarity_matrix: square matrix where entry [i][j] is the similarity
            between rows i and j
        top_n: number of similar products to print (the query itself is never counted)
    Returns:
        None; the query description and its neighbours are printed
    """
    print (f'Document: {input_df.loc[p_id, "description"]}')
    print ('\n')
    print ('Similar Documents:')
    scores = similarity_matrix[p_id]
    ranked_ix = np.argsort(scores)[::-1]
    # Remove the query row *before* truncating. Truncating first meant that only
    # top_n - 1 neighbours were shown whenever the query ranked in its own top_n.
    similar_ix = ranked_ix[ranked_ix != p_id][:top_n]
    for ix in similar_ix:
        print('\n')
        print (f'Document: {input_df.loc[ix, "description"]}')
        print (f'Cosine Similarity : {scores[ix]}')

In [81]:
most_similar(p_data, 1, cos_similarity_matrix)

Document: Active sport boxer briefs - Skinning up Glory requires enough movement without your boxers deciding to poach their own route. The form-fitting Active Sport Boxer Briefs are made from breathable 93% polyester (71% recycled) fabric that's fast-wicking, dries quickly and has 7% spandex for stretch; the seamless waistband and soft leg edges won't roll or bind. The gusseted, flat-sewn 6" inseam (size M) is offset to prevent inner-thigh chafe. Fly-free with a smooth front panel. Recyclable through the Common Threads Recycling Program.<br><br><b>Details:</b><ul> <li>"Stretch mesh provides support, open-weave mesh for airflow, wicks efficiently, dries fast"</li> <li>Seamless construction</li> <li>"Flat-sewn, gusseted inseam is set forward to prevent inner-thigh chafe"</li> <li>Fly-free support</li> <li>"Inseam (size M) is 6"""</li></ul><br><br><b>Fabric: </b>"4.6-oz 93% polyester (71% recycled)/7% spandex, with moisture-wicking performance. Recyclable through the Common Threads Recyc

#### Products with a higher cosine similarity score are more similar to each other

### References

* G. Linden, B. Smith and J. York, "Amazon.com recommendations: item-to-item collaborative filtering," in IEEE Internet Computing, vol. 7, no. 1, pp. 76-80, Jan.-Feb. 2003, doi: 10.1109/MIC.2003.1167344.
* Goldberg, Yoav, and Omer Levy. "word2vec Explained: deriving Mikolov et al.'s negative-sampling word-embedding method." arXiv preprint arXiv:1402.3722 (2014).
* https://towardsdatascience.com/introduction-to-nlp-part-1-preprocessing-text-in-python-8f007d44ca96     
* https://www.analyticsvidhya.com/blog/2019/07/how-to-build-recommendation-system-word2vec-python/
* https://radimrehurek.com/gensim/models/word2vec.html
* https://www.kdnuggets.com/2020/08/content-based-recommendation-system-word-embeddings.html
* https://towardsdatascience.com/calculating-document-similarities-using-bert-and-other-models-b2c1a29c9630