In [17]:
import numpy as np
import pandas as pd
import re
import nltk
import string
import joblib
import pickle
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim import models
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models.phrases import Phrases, Phraser
from gensim.models import FastText
import bz2file as bz2
import os

In [3]:
data = pd.read_csv('../data/flipkart-data.csv')

In [4]:
data.sample()

Unnamed: 0,uniq_id,crawl_timestamp,product_url,product_name,product_category_tree,pid,retail_price,discounted_price,image,is_FK_Advantage_product,description,product_rating,overall_rating,brand,product_specifications
766,5c572e1ca891dc7cfe6dd5f59be770c8,2016-01-06 18:20:45 +0000,http://www.flipkart.com/my-little-joy-casual-s...,My Little Joy Casual Short Sleeve Printed Baby...,"[""Baby Care >> Infant Wear >> Baby Girls' Clot...",TOPEBKYKHZ74DT6U,599.0,299.0,"[""http://img6a.flixcart.com/image/top/3/y/f/24...",False,My Little Joy Casual Short Sleeve Printed Baby...,No rating available,No rating available,,"{""product_specification""=>[{""key""=>""Ideal For""..."


In [5]:
data.columns

Index(['uniq_id', 'crawl_timestamp', 'product_url', 'product_name',
       'product_category_tree', 'pid', 'retail_price', 'discounted_price',
       'image', 'is_FK_Advantage_product', 'description', 'product_rating',
       'overall_rating', 'brand', 'product_specifications'],
      dtype='object')

* ## Working Data Function

In [6]:
def working_data(data):

    def normalize_text(text):
        text = text.lower()  # Convert text to lowercase
        text = re.sub('https?://\S+|www\.\S+', '', text)  # Remove URLs
        text = re.sub('[^A-Za-z0-9]+', ' ', text)  # Remove non-alphanumeric characters (special chars)
        text = re.sub('\n', ' ', text)  # Remove newlines
        text = re.sub(' +', ' ', text)  # Remove extra spaces
        text = text.strip()  # Remove leading/trailing spaces
        return text
       
    def repair(text):
        text = str(text)
        pattern = re.compile('[\([{})\]]')
        text= pattern.sub(r'',text)
        text = text.replace('"','')
        text = text.replace(',','')
        text = text.replace('&','')
        pattern = re.compile('>>')
        return pattern.sub(r'',text)
    
    def remove_stopwords(text):
        stop_words = set(stopwords.words('english'))  # Use set for faster lookups
        words = text.split()
        filtered_sentence = ' '.join([word for word in words if word not in stop_words])
        return filtered_sentence
    
    def remove_punctuation(text):
        table = str.maketrans('', '', string.punctuation)  # Create translation table
        words = text.split()
        filtered_sentence = ' '.join([word.translate(table) for word in words])  # Efficient punctuation removal
        return filtered_sentence
    
    def stemming(text):
        text = str(text)
        stemmer = PorterStemmer()
        return " ".join([stemmer.stem(word) for word in text.split()])
    
    def lemmatizing(text):
        text = str(text)
        lemmet = WordNetLemmatizer()
        return " ".join([lemmet.lemmatize(word) for word in text.split()])

    print("step 1 : Cleaning product_category_tree.....")
    data['product_category_tree'] = data['product_category_tree'].apply(repair)
    
    print("step 2 : Cleaning description.....")
    data['description'] = data['description'].apply(repair)

    print("step 3 : Creating 'desc' column.....")
    data['desc'] = data['product_category_tree']+data['description']
    
    print("step 4 : Normalizing 'desc' column.....")
    data['desc'] = data['desc'].apply(normalize_text)

    print("step 5 : stopwords removal 'desc'.....")
    data['desc'] = data['desc'].apply(remove_stopwords)

    print("step 6: Removing punctuation 'desc'.....")
    data['desc'] = data['desc'].apply(remove_punctuation)

    print("step 7 : Stemming 'desc' column.....")
    data['desc'] = data['desc'].apply(stemming)
    
    print("step 8 : Lemmatizing 'desc' column.....")
    data['desc'] = data['desc'].apply(lemmatizing)
    
    print("step 9 : Dropping unnecessary columns.....")
    data = data.drop(['uniq_id', 'crawl_timestamp', 'product_url', 'retail_price',
                       'discounted_price','image', 'is_FK_Advantage_product', 'product_rating',
                       'overall_rating','brand','product_specifications','product_category_tree','description'],axis=1)
    
    print("step 10 : Dropping null values.....")
    data.dropna(inplace=True)
    
    print("step 11 : ReCreating 'pid' column.....")
    data['pid'] = range(1, 20001)
    

    return data

In [7]:
working_data = working_data(data)

step 1 : Cleaning product_category_tree.....
step 2 : Cleaning description.....
step 3 : Creating 'desc' column.....
step 4 : Normalizing 'desc' column.....
step 5 : stopwords removal 'desc'.....
step 6: Removing punctuation 'desc'.....
step 7 : Stemming 'desc' column.....
step 8 : Lemmatizing 'desc' column.....
step 9 : Dropping unnecessary columns.....
step 10 : Dropping null values.....
step 11 : ReCreating 'pid' column.....


In [8]:
working_data.sample(5)

Unnamed: 0,product_name,pid,desc
19145,Toons Printed Baby Boy's Round Neck T-Shirt,19146,babi care infant wear babi boy cloth polo shir...
11771,Shashvat Jewels Silver Necklace,11772,jewelleri necklac chain necklacesshashvat jewe...
3549,Rockmantra Left 4 Dead Ceramic Mug,3550,kitchen dine coffe mug rockmantra coffe mugsro...
7614,H D ENTERPRISE 750 ml Cooking Oil Dispenser,7615,kitchen dine contain bottl contain jar oil dis...
15196,Pilot Roller Ball Pen,15197,pen stationeri pen roller ball pen pilot rolle...


* ## TF-IDF function

In [9]:
def tfidf(data):
    print("Creating tf-idf matrix.....")
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(data['desc']).toarray()
    return tfidf_matrix, tfidf_vectorizer

In [10]:
tfidf_matrix, tfidf_vectorizer = tfidf(working_data)

Creating tf-idf matrix.....


In [11]:
tfidf_matrix.shape

(20000, 27405)

* ## Similarity-Matrix function

In [12]:
def similarity_matrix(matrix):
    print("Calculating similarity matrix.....")
    similarity_matrix = cosine_similarity(matrix)
    return similarity_matrix

In [13]:
similarity_matrix_tfidf = similarity_matrix(tfidf_matrix)

Calculating similarity matrix.....


* ## Recommend function

In [14]:
def recommend_products(product,similarity_model):
    product_index = data[data['product_name'] == product].index[0]
    distances = similarity_model[product_index]
    product_list = sorted(list(enumerate(distances)),reverse=True,key=lambda x: x[1])[1:6]
    output={}
    name_list = []
    sim_list = []
    for x in product_list:
        prod_name = data.iloc[x[0]].product_name
        name_list.append(prod_name)
        prod_sim = np.round((x[1]*100),2)
        sim_list.append(prod_sim)
        
    print('Checked Product :::::   ',product)    
    name_list=np.array(name_list)
    sim_list=np.array(sim_list)
    dat = np.reshape([[name_list],[sim_list]],(2,5))
    df = pd.DataFrame(dat.T,columns=['Recommended Product','Similarity(%age)'],index=[0,1,2,3,4])
    return df

* ## Recommended output

In [15]:
product = data.iloc[100].product_name
recommend_products(product,similarity_matrix_tfidf)

Checked Product :::::    Rorlig RR-028 Expedition Analog Watch  - For Men, Boys


Unnamed: 0,Recommended Product,Similarity(%age)
0,Rorlig RR-030 Essentials Analog Watch - For M...,95.33
1,Luba ghk54 Stylo Analog Watch - For Women,46.38
2,Ridas 1825_black Sports Analog Watch - For Men,44.8
3,Times 123B0123 Sports Analog Watch - For Boys,44.12
4,Times SD_183 Casual Analog Watch - For Women,41.13


* ## Export Model

In [16]:
pickle.dump(similarity_matrix_tfidf,open('../models/similarity_tfidf.pkl','wb'))

In [18]:
print(os.path.getsize("../models/similarity_tfidf.pkl"))

3200000163


In [21]:
ofile = bz2.BZ2File("../models/similarity_tfidf",'wb')
pickle.dump(similarity_matrix_tfidf,ofile)
ofile.close()

In [22]:
print(os.path.getsize("../models/similarity_tfidf"))

2301939417
