In [16]:
import numpy as np
import pandas as pd
import re
import nltk
import string
import joblib
import pickle
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim import models
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models.phrases import Phrases, Phraser
from gensim.models import FastText
import bz2file as bz2
import os

In [2]:
data = pd.read_csv('../data/flipkart-data.csv')

In [3]:
data.sample()

Unnamed: 0,uniq_id,crawl_timestamp,product_url,product_name,product_category_tree,pid,retail_price,discounted_price,image,is_FK_Advantage_product,description,product_rating,overall_rating,brand,product_specifications
12389,6f3eb7b1b19064d97243d555ede3d57b,2015-12-30 00:17:46 +0000,http://www.flipkart.com/stilestreet-women-s-so...,Stilestreet Women's Solid Casual Shirt,"[""Clothing >> Women's Clothing >> Western Wear...",SHTE5D78BGJFE53R,695.0,349.0,"[""http://img5a.flixcart.com/image/shirt/m/z/z/...",False,Stilestreet Women's Solid Casual Shirt - Buy Y...,No rating available,No rating available,Regular,"{""product_specification""=>[{""key""=>""Pattern"", ..."


In [4]:
data.columns

Index(['uniq_id', 'crawl_timestamp', 'product_url', 'product_name',
       'product_category_tree', 'pid', 'retail_price', 'discounted_price',
       'image', 'is_FK_Advantage_product', 'description', 'product_rating',
       'overall_rating', 'brand', 'product_specifications'],
      dtype='object')

* ## Working Data Function

In [5]:
def working_data(data):

    def normalize_text(text):
        text = text.lower()  # Convert text to lowercase
        text = re.sub('https?://\S+|www\.\S+', '', text)  # Remove URLs
        text = re.sub('[^A-Za-z0-9]+', ' ', text)  # Remove non-alphanumeric characters (special chars)
        text = re.sub('\n', ' ', text)  # Remove newlines
        text = re.sub(' +', ' ', text)  # Remove extra spaces
        text = text.strip()  # Remove leading/trailing spaces
        return text
       
    def repair(text):
        text = str(text)
        pattern = re.compile('[\([{})\]]')
        text= pattern.sub(r'',text)
        text = text.replace('"','')
        text = text.replace(',','')
        text = text.replace('&','')
        pattern = re.compile('>>')
        return pattern.sub(r'',text)
    
    def remove_stopwords(text):
        stop_words = set(stopwords.words('english'))  # Use set for faster lookups
        words = text.split()
        filtered_sentence = ' '.join([word for word in words if word not in stop_words])
        return filtered_sentence
    
    def remove_punctuation(text):
        table = str.maketrans('', '', string.punctuation)  # Create translation table
        words = text.split()
        filtered_sentence = ' '.join([word.translate(table) for word in words])  # Efficient punctuation removal
        return filtered_sentence
    
    def stemming(text):
        text = str(text)
        stemmer = PorterStemmer()
        return " ".join([stemmer.stem(word) for word in text.split()])
    
    def lemmatizing(text):
        text = str(text)
        lemmet = WordNetLemmatizer()
        return " ".join([lemmet.lemmatize(word) for word in text.split()])

    print("step 1 : Cleaning product_category_tree.....")
    data['product_category_tree'] = data['product_category_tree'].apply(repair)
    
    print("step 2 : Cleaning description.....")
    data['description'] = data['description'].apply(repair)

    print("step 3 : Creating 'desc' column.....")
    data['desc'] = data['product_category_tree']+data['description']
    
    print("step 4 : Normalizing 'desc' column.....")
    data['desc'] = data['desc'].apply(normalize_text)

    print("step 5 : stopwords removal 'desc'.....")
    data['desc'] = data['desc'].apply(remove_stopwords)

    print("step 6: Removing punctuation 'desc'.....")
    data['desc'] = data['desc'].apply(remove_punctuation)

    print("step 7 : Stemming 'desc' column.....")
    data['desc'] = data['desc'].apply(stemming)
    
    print("step 8 : Lemmatizing 'desc' column.....")
    data['desc'] = data['desc'].apply(lemmatizing)
    
    print("step 9 : Dropping unnecessary columns.....")
    data = data.drop(['uniq_id', 'crawl_timestamp', 'product_url', 'retail_price',
                       'discounted_price','image', 'is_FK_Advantage_product', 'product_rating',
                       'overall_rating','brand','product_specifications','product_category_tree','description'],axis=1)
    
    print("step 10 : Dropping null values.....")
    data.dropna(inplace=True)
    
    print("step 11 : ReCreating 'pid' column.....")
    data['pid'] = range(1, 20001)
    

    return data

In [6]:
working_data = working_data(data)

step 1 : Cleaning product_category_tree.....
step 2 : Cleaning description.....
step 3 : Creating 'desc' column.....
step 4 : Normalizing 'desc' column.....
step 5 : stopwords removal 'desc'.....
step 6: Removing punctuation 'desc'.....
step 7 : Stemming 'desc' column.....
step 8 : Lemmatizing 'desc' column.....
step 9 : Dropping unnecessary columns.....
step 10 : Dropping null values.....
step 11 : ReCreating 'pid' column.....


In [7]:
working_data.sample(5)

Unnamed: 0,product_name,pid,desc
706,Navaksha Necktie Men's Combo,707,cloth men cloth navaksha men clothingnavaksha ...
19853,Wallmantra Extra Large Vinyl Sticker Sticker,19854,babi care babi kid gift sticker wallmantra sti...
14172,"Orange and Orchid Solid, Printed Men's Round N...",14173,cloth men cloth shirt orang orchid shirtsorang...
17798,Antshrike Men's Pyjama,17799,cloth men cloth inner wear sleep wear pyjama l...
3216,Quilt India Floral Cushions Cover,3217,home furnish cushion pillow cover cushion cove...


* ## Word2Vec function

In [8]:
def word_to_vector(data):
    print("Creating word2vec matrix.....")
    tokenized_sentences = data['desc'].apply(lambda x: x.split())
    vector_size = 100  # Size of the word vectors
    w2v_model = models.Word2Vec(sentences=tokenized_sentences, vector_size=vector_size, window=5, min_count=1, workers=4)
    sentence_embeddings = []
    for sentence in tokenized_sentences:
        vectors = [w2v_model.wv[word] for word in sentence if word in w2v_model.wv]
        if vectors:
            sentence_embeddings.append(np.mean(vectors, axis=0))
        else:
            sentence_embeddings.append(np.zeros(vector_size))  # fallback for empty sentences
    w2v_matrix = np.array(sentence_embeddings)
    return w2v_matrix, w2v_model

In [9]:
w2v_matrix, w2v_model = word_to_vector(working_data)

Creating word2vec matrix.....


In [10]:
w2v_matrix.shape

(20000, 100)

* ## Similarity-Matrix function

In [11]:
def similarity_matrix(matrix):
    print("Calculating similarity matrix.....")
    similarity_matrix = cosine_similarity(matrix)
    return similarity_matrix

In [12]:
similarity_matrix_w2v = similarity_matrix(w2v_matrix)

Calculating similarity matrix.....


* ## Recommend function

In [13]:
def recommend_products(product,similarity_model):
    product_index = data[data['product_name'] == product].index[0]
    distances = similarity_model[product_index]
    product_list = sorted(list(enumerate(distances)),reverse=True,key=lambda x: x[1])[1:6]
    output={}
    name_list = []
    sim_list = []
    for x in product_list:
        prod_name = data.iloc[x[0]].product_name
        name_list.append(prod_name)
        prod_sim = np.round((x[1]*100),2)
        sim_list.append(prod_sim)
        
    print('Checked Product :::::   ',product)    
    name_list=np.array(name_list)
    sim_list=np.array(sim_list)
    dat = np.reshape([[name_list],[sim_list]],(2,5))
    df = pd.DataFrame(dat.T,columns=['Recommended Product','Similarity(%age)'],index=[0,1,2,3,4])
    return df

* ## Recommended output

In [14]:
product = data.iloc[100].product_name
recommend_products(product,similarity_matrix_w2v)

Checked Product :::::    Rorlig RR-028 Expedition Analog Watch  - For Men, Boys


Unnamed: 0,Recommended Product,Similarity(%age)
0,Rorlig RR-030 Essentials Analog Watch - For M...,99.9
1,Fastrack 9912PP15 Tees Analog Watch - For Men...,93.68
2,Nexus NX_7557 Analog Watch - For Women,92.34
3,"Escort E-1700-906_Blk Analog Watch - For Men,...",91.33
4,Ridas 1825_black Sports Analog Watch - For Men,91.18


* ## Export Model

In [15]:
pickle.dump(similarity_matrix_w2v,open('../models/similarity_w2v.pkl','wb'))

In [17]:
print(os.path.getsize("../models/similarity_w2v.pkl"))

1600000163


In [20]:
ofile = bz2.BZ2File("../models/similarity_w2v",'wb')
pickle.dump(similarity_matrix_w2v,ofile)
ofile.close()

In [21]:
print(os.path.getsize("../models/similarity_w2v"))

1397074182
