In [35]:
import numpy as np
import pandas as pd
import re
import nltk
import string
import joblib
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim import models
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models.phrases import Phrases, Phraser
from gensim.models import FastText

In [36]:
data = pd.read_csv('data/flipkart-data.csv')

In [37]:
data.sample(2)

Unnamed: 0,uniq_id,crawl_timestamp,product_url,product_name,product_category_tree,pid,retail_price,discounted_price,image,is_FK_Advantage_product,description,product_rating,overall_rating,brand,product_specifications
15989,09e38163979db5514e8c5b84fcdb6ecf,2015-12-29 05:07:38 +0000,http://www.flipkart.com/destudio-large-wall-st...,DeStudio Large WALL STICKER Sticker,"[""Home Decor & Festive Needs >> Wall Decor & C...",STIEYZ5XTSCXZHWS,1998.0,699.0,"[""http://img6a.flixcart.com/image/sticker/a/g/...",False,Buy DeStudio Large WALL STICKER Sticker for Rs...,No rating available,No rating available,DeStudio,"{""product_specification""=>[{""key""=>""Sales Pack..."
13720,c340914b07f03c05731f9541d77b1dcb,2016-03-03 14:52:00 +0000,http://www.flipkart.com/nino-bambino-full-slee...,Nino Bambino Full Sleeve Polka Print Baby Girl...,"[""Clothing >> Kids' Clothing >> Infants Wear >...",SWSEG87FPRCXGZDP,849.0,849.0,"[""http://img5a.flixcart.com/image/sweatshirt/h...",False,Key Features of Nino Bambino Full Sleeve Polka...,No rating available,No rating available,Nino Bambino,"{""product_specification""=>[{""key""=>""Knit Type""..."


* # Working Data Function

In [38]:
def working_data(data):

    def normalize_text(text):
        text = text.lower()  # Convert text to lowercase
        text = re.sub('https?://\S+|www\.\S+', '', text)  # Remove URLs
        text = re.sub('[^A-Za-z0-9]+', ' ', text)  # Remove non-alphanumeric characters (special chars)
        text = re.sub('\n', ' ', text)  # Remove newlines
        text = re.sub(' +', ' ', text)  # Remove extra spaces
        text = text.strip()  # Remove leading/trailing spaces
        return text
       
    def repair(text):
        text = str(text)
        pattern = re.compile('[\([{})\]]')
        text= pattern.sub(r'',text)
        text = text.replace('"','')
        text = text.replace(',','')
        text = text.replace('&','')
        pattern = re.compile('>>')
        return pattern.sub(r'',text)
    
    def remove_stopwords(text):
        stop_words = set(stopwords.words('english'))  # Use set for faster lookups
        words = text.split()
        filtered_sentence = ' '.join([word for word in words if word not in stop_words])
        return filtered_sentence
    
    def remove_punctuation(text):
        table = str.maketrans('', '', string.punctuation)  # Create translation table
        words = text.split()
        filtered_sentence = ' '.join([word.translate(table) for word in words])  # Efficient punctuation removal
        return filtered_sentence
    
    def stemming(text):
        text = str(text)
        stemmer = PorterStemmer()
        return " ".join([stemmer.stem(word) for word in text.split()])
    
    def lemmatizing(text):
        text = str(text)
        lemmet = WordNetLemmatizer()
        return " ".join([lemmet.lemmatize(word) for word in text.split()])

    print("step 1 : Cleaning product_category_tree.....")
    data['product_category_tree'] = data['product_category_tree'].apply(repair)
    
    print("step 2 : Cleaning description.....")
    data['description'] = data['description'].apply(repair)

    print("step 3 : Creating 'desc' column.....")
    data['desc'] = data['product_category_tree']+data['description']
    
    print("step 4 : Normalizing 'desc' column.....")
    data['desc'] = data['desc'].apply(normalize_text)

    print("step 5 : stopwords removal 'desc'.....")
    data['desc'] = data['desc'].apply(remove_stopwords)

    print("step 6: Removing punctuation 'desc'.....")
    data['desc'] = data['desc'].apply(remove_punctuation)

    print("step 7 : Stemming 'desc' column.....")
    data['desc'] = data['desc'].apply(stemming)
    
    print("step 8 : Lemmatizing 'desc' column.....")
    data['desc'] = data['desc'].apply(lemmatizing)
    
    print("step 9 : Dropping unnecessary columns.....")
    data = data.drop(['uniq_id', 'crawl_timestamp', 'product_url', 'retail_price',
                       'discounted_price','image', 'is_FK_Advantage_product', 'product_rating',
                       'overall_rating','brand','product_specifications','product_category_tree','description'],axis=1)
    
    print("step 10 : Dropping null values.....")
    data.dropna(inplace=True)
    
    print("step 11 : ReCreating 'pid' column.....")
    data['pid'] = range(1, 20001)
    

    return data

In [39]:
working_data = working_data(data)

step 1 : Cleaning product_category_tree.....
step 2 : Cleaning description.....
step 3 : Creating 'desc' column.....
step 4 : Normalizing 'desc' column.....
step 5 : stopwords removal 'desc'.....
step 6: Removing punctuation 'desc'.....
step 7 : Stemming 'desc' column.....
step 8 : Lemmatizing 'desc' column.....
step 9 : Dropping unnecessary columns.....
step 10 : Dropping null values.....
step 11 : ReCreating 'pid' column.....


* ### Bag of Words function

In [40]:
def bag_of_words(data):
    print("Creating bag of words matrix.....")
    bow_vectorizer = CountVectorizer(stop_words='english')
    bow_matrix = bow_vectorizer.fit_transform(data['desc']).toarray()
    return bow_matrix, bow_vectorizer

* ### TF-IDF function

In [41]:
def tfidf(data):
    print("Creating tf-idf matrix.....")
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(data['desc']).toarray()
    return tfidf_matrix, tfidf_vectorizer

* ### Word2Vec function

In [42]:
def word_to_vector(data):
    print("Creating word2vec matrix.....")
    tokenized_sentences = data['desc'].apply(lambda x: x.split())
    vector_size = 100  # Size of the word vectors
    w2v_model = models.Word2Vec(sentences=tokenized_sentences, vector_size=vector_size, window=5, min_count=1, workers=4)
    sentence_embeddings = []
    for sentence in tokenized_sentences:
        vectors = [w2v_model.wv[word] for word in sentence if word in w2v_model.wv]
        if vectors:
            sentence_embeddings.append(np.mean(vectors, axis=0))
        else:
            sentence_embeddings.append(np.zeros(vector_size))  # fallback for empty sentences
    w2v_matrix = np.array(sentence_embeddings)
    return w2v_matrix, w2v_model

* ### GloVe function

In [44]:
def glove(data, glove_file_path):
    max_words = 20000
    max_len = 3440
    embedding_dim = 100

    def tokenize_text(text, max_words=20000):
        print('.....Tokenizing text...')
        tokenizer = Tokenizer(oov_token='<OOV>')
        tokenizer.fit_on_texts(text)
        sequences = tokenizer.texts_to_sequences(text)
        return sequences, tokenizer

    def pad_sequences_data(sequences, maxlen=200):
        print('.....Padding sequences...')
        padded_sequences = pad_sequences(sequences,maxlen=maxlen, padding='post', truncating='post')
        return padded_sequences

    def load_glove_embeddings(glove_file_path):
        print('.....Loading GloVe embeddings...')
        embeddings_index = {}
        with open(glove_file_path, 'r', encoding='utf8') as f:
            for line in f:
                values = line.strip().split()
                word = values[0]
                vector = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = vector
        return embeddings_index
    
    def glove_matrix(embedding_dim, tokenizer, embeddings_index):
        print('......Creating embedding matrix...')
        word_index = tokenizer.word_index
        embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
        for word, i in word_index.items():
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
            else:
                embedding_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dim,))
                
        return embedding_matrix
    
    sequences, tokenizer = tokenize_text(data['desc'], max_words)
    print("--" * 20)
    padded_sequences = pad_sequences_data(sequences, max_len)
    print("--" * 20)
    embeddings_index = load_glove_embeddings(glove_file_path)
    print("--" * 20)
    embedding_matrix = glove_matrix(embedding_dim, tokenizer, embeddings_index)
    return padded_sequences, embedding_matrix, tokenizer


* ### FastText function

In [45]:
def train_fasttext(data):
    print("Creating FastText matrix.....")
    sentences = [row.split() for row in data['desc']]
    phrases = Phrases(sentences = sentences, min_count = 30, progress_per = 10000)
    sentences = phrases[sentences]
    ft_model = FastText(vector_size=100, window = 5, min_count = 5, workers = 4, min_n = 1, max_n = 4)
    ft_model.build_vocab(sentences)
    ft_model.train(sentences, total_examples = ft_model.corpus_count, epochs = 20)
    path = 'models/FastText.joblib'
    joblib.dump(ft_model, path)


In [46]:
train_fasttext(working_data)

Creating FastText matrix.....


* ### Similarity-Matrix function

In [47]:
def similarity_matrix(matrix):
    print("Calculating similarity matrix.....")
    similarity_matrix = cosine_similarity(matrix)
    return similarity_matrix

In [48]:
w2v_matrix, w2v_model = word_to_vector(working_data)
bow_matrix, bow_vectorizer = bag_of_words(working_data)
tfidf_matrix, tfidf_vectorizer = tfidf(working_data)
padded, glove_matrix, glove_tokenizer = glove(working_data, 'data/glove.6B.100d.txt')
fasttext_matrix = joblib.load('models/FastText.joblib').wv.vectors

Creating word2vec matrix.....
Creating bag of words matrix.....
Creating tf-idf matrix.....
.....Tokenizing text...
----------------------------------------
.....Padding sequences...
----------------------------------------
.....Loading GloVe embeddings...
----------------------------------------
......Creating embedding matrix...


In [49]:
similarity_matrix_bow = similarity_matrix(bow_matrix)
similarity_matrix_tfidf = similarity_matrix(tfidf_matrix)
similarity_matrix_w2v = similarity_matrix(w2v_matrix)
similarity_matrix_glove = similarity_matrix(glove_matrix)
similarity_matrix_fasttext = similarity_matrix(fasttext_matrix)

Calculating similarity matrix.....
Calculating similarity matrix.....
Calculating similarity matrix.....
Calculating similarity matrix.....
Calculating similarity matrix.....


* ### Recommend function

In [50]:
def recommend_products(product,similarity_model):
    product_index = data[data['product_name'] == product].index[0]
    distances = similarity_model[product_index]
    product_list = sorted(list(enumerate(distances)),reverse=True,key=lambda x: x[1])[1:6]
    output={}
    name_list = []
    sim_list = []
    for x in product_list:
        prod_name = data.iloc[x[0]].product_name
        name_list.append(prod_name)
        prod_sim = np.round((x[1]*100),2)
        sim_list.append(prod_sim)
        
    print('Checked Product :::::   ',product)    
    name_list=np.array(name_list)
    sim_list=np.array(sim_list)
    dat = np.reshape([[name_list],[sim_list]],(2,5))
    df = pd.DataFrame(dat.T,columns=['Recommended Product','Similarity(%age)'],index=[0,1,2,3,4])
    return df

* ### Recommendations output

In [52]:
product = data.iloc[1].product_name
# recommend_products(product,similarity_matrix_bow)
# recommend_products(product,similarity_matrix_tfidf)
# recommend_products(product,similarity_matrix_w2v)
# recommend_products(product,similarity_matrix_glove)
recommend_products(product,similarity_matrix_fasttext)

Checked Product :::::    FabHomeDecor Fabric Double Sofa Bed


Unnamed: 0,Recommended Product,Similarity(%age)
0,MASARA Solid Women's Straight Kurta,63.5
1,Ploomz Women's Push-up Bra,50.27
2,Tatvaarts Tribal Danda Lady Showpiece - 48.2...,49.42
3,Legrand Legrand Myrius 673010 16A Indicator Wh...,49.36
4,Sportking Women's Leggings,49.24
