In [6]:
import numpy as np
import pandas as pd
import re
import nltk
import string
import joblib
import pickle
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim import models
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models.phrases import Phrases, Phraser
from gensim.models import FastText
import bz2file as bz2
import os

In [7]:
# Load the raw product catalogue; the path is relative to the notebook's working directory.
data = pd.read_csv('../data/flipkart-data.csv')

In [8]:
data.sample()

Unnamed: 0,uniq_id,crawl_timestamp,product_url,product_name,product_category_tree,pid,retail_price,discounted_price,image,is_FK_Advantage_product,description,product_rating,overall_rating,brand,product_specifications
5484,458c697fe5824552890b0e674cf5c109,2016-05-26 23:54:46 +0000,http://www.flipkart.com/ball-ibctdc-160-2gb-dd...,I Ball IBCTDC 160/2gb/DDR2 with Dual Core 2 RA...,"[""Computers >> Computer Peripherals >> CPU >> ...",CPUEJ2CMBFHTTHSJ,12999.0,6990.0,"[""http://img6a.flixcart.com/image/cpu/h/s/j/i-...",False,Specifications of I Ball IBCTDC 160/2gb/DDR2 w...,No rating available,No rating available,I Ball,"{""product_specification""=>[{""key""=>""Processor ..."


In [9]:
data.columns

Index(['uniq_id', 'crawl_timestamp', 'product_url', 'product_name',
       'product_category_tree', 'pid', 'retail_price', 'discounted_price',
       'image', 'is_FK_Advantage_product', 'description', 'product_rating',
       'overall_rating', 'brand', 'product_specifications'],
      dtype='object')

* ## Working Data Function

In [10]:
def working_data(data):
    """Build the cleaned text frame that the vectorizers are fitted on.

    The category tree and the description are stripped of brackets/quotes,
    concatenated into a new ``desc`` column, normalized (lowercase, URLs and
    non-alphanumerics removed), filtered for English stop words, stemmed and
    finally lemmatized.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw product frame. Must contain ``product_category_tree``,
        ``description`` and every column listed in the drop step below.
        NOTE: the cleaning steps write back into this frame (the two source
        columns are overwritten and ``desc`` is added) before the reduced
        frame is derived from it.

    Returns
    -------
    pandas.DataFrame
        Frame without the dropped columns, without rows containing nulls, and
        with ``pid`` renumbered 1..N in row order.
    """
    # Build the expensive / reusable helpers once instead of once per row.
    stop_words = set(stopwords.words('english'))  # set for O(1) membership tests
    stemmer = PorterStemmer()
    lemmet = WordNetLemmatizer()
    punctuation_table = str.maketrans('', '', string.punctuation)
    bracket_pattern = re.compile(r'[\([{})\]]')

    def normalize_text(text):
        """Lowercase, drop URLs, and reduce everything non-alphanumeric to single spaces."""
        text = text.lower()
        # Raw strings: '\S' in a plain literal is an invalid escape sequence.
        text = re.sub(r'https?://\S+|www\.\S+', '', text)
        text = re.sub(r'[^A-Za-z0-9]+', ' ', text)
        text = re.sub('\n', ' ', text)
        text = re.sub(r' +', ' ', text)
        return text.strip()

    def repair(text):
        """Strip brackets, quotes, commas, ampersands and '>>' separators."""
        # NOTE(review): str() turns a missing value (NaN) into the literal
        # text 'nan', which then flows into 'desc' - confirm this is intended.
        text = bracket_pattern.sub('', str(text))
        for unwanted in ('"', ',', '&'):
            text = text.replace(unwanted, '')
        return text.replace('>>', '')

    def remove_stopwords(text):
        """Drop English stop words from a whitespace-separated string."""
        return ' '.join(word for word in text.split() if word not in stop_words)

    def remove_punctuation(text):
        """Delete every ``string.punctuation`` character from each word."""
        return ' '.join(word.translate(punctuation_table) for word in text.split())

    def stemming(text):
        """Apply Porter stemming word by word."""
        return " ".join(stemmer.stem(word) for word in str(text).split())

    def lemmatizing(text):
        """Apply WordNet lemmatization word by word."""
        return " ".join(lemmet.lemmatize(word) for word in str(text).split())

    print("step 1 : Cleaning product_category_tree.....")
    data['product_category_tree'] = data['product_category_tree'].apply(repair)

    print("step 2 : Cleaning description.....")
    data['description'] = data['description'].apply(repair)

    print("step 3 : Creating 'desc' column.....")
    data['desc'] = data['product_category_tree'] + data['description']

    print("step 4 : Normalizing 'desc' column.....")
    data['desc'] = data['desc'].apply(normalize_text)

    print("step 5 : stopwords removal 'desc'.....")
    data['desc'] = data['desc'].apply(remove_stopwords)

    print("step 6: Removing punctuation 'desc'.....")
    data['desc'] = data['desc'].apply(remove_punctuation)

    print("step 7 : Stemming 'desc' column.....")
    data['desc'] = data['desc'].apply(stemming)

    print("step 8 : Lemmatizing 'desc' column.....")
    data['desc'] = data['desc'].apply(lemmatizing)

    print("step 9 : Dropping unnecessary columns.....")
    data = data.drop(['uniq_id', 'crawl_timestamp', 'product_url', 'retail_price',
                      'discounted_price', 'image', 'is_FK_Advantage_product', 'product_rating',
                      'overall_rating', 'brand', 'product_specifications', 'product_category_tree',
                      'description'], axis=1)

    print("step 10 : Dropping null values.....")
    data.dropna(inplace=True)

    print("step 11 : ReCreating 'pid' column.....")
    # Size the ids from the rows that actually remain; a fixed range(1, 20001)
    # fails with a length mismatch for any frame that is not exactly 20000 rows.
    data['pid'] = range(1, len(data) + 1)

    return data

In [11]:
# NOTE(review): this rebinds the name `working_data` from the function to the
# returned DataFrame, so re-running this cell without re-running the cell that
# defines the function fails ('DataFrame' object is not callable).
working_data = working_data(data)

step 1 : Cleaning product_category_tree.....
step 2 : Cleaning description.....
step 3 : Creating 'desc' column.....
step 4 : Normalizing 'desc' column.....
step 5 : stopwords removal 'desc'.....
step 6: Removing punctuation 'desc'.....
step 7 : Stemming 'desc' column.....
step 8 : Lemmatizing 'desc' column.....
step 9 : Dropping unnecessary columns.....
step 10 : Dropping null values.....
step 11 : ReCreating 'pid' column.....


In [12]:
working_data.sample(5)

Unnamed: 0,product_name,pid,desc
12577,Kiosha Women's Solid Casual Shirt,12578,cloth woman cloth western wear shirt top tunic...
821,Parron Men's Solid Casual Shirt,822,cloth men cloth shirt casual parti wear shirt ...
580,Fundoo T Full Sleeve Solid Men's Sweatshirt,581,cloth men cloth winter season wear sweatshirt ...
14227,Ocean Race Solid Men's Round Neck T-Shirt,14228,cloth men cloth shirt ocean race shirtsocean r...
8382,Amscan 110215 Plastic Weight Sled,8383,sport fit track field amscan track fieldamscan...


* ## Bag of Words function

In [13]:
def bag_of_words(data, dense=True):
    """Fit a bag-of-words model on ``data['desc']``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the preprocessed text in a ``desc`` column.
    dense : bool, default True
        When True (the original behaviour) the count matrix is converted to a
        dense NumPy array. Pass False to keep the SciPy sparse matrix that the
        vectorizer produces; this avoids allocating documents x vocabulary
        cells, and ``cosine_similarity`` accepts sparse input.

    Returns
    -------
    tuple
        ``(bow_matrix, bow_vectorizer)`` - the document-term counts and the
        fitted ``CountVectorizer``.
    """
    print("Creating bag of words matrix.....")
    bow_vectorizer = CountVectorizer(stop_words='english')
    bow_matrix = bow_vectorizer.fit_transform(data['desc'])
    if dense:
        bow_matrix = bow_matrix.toarray()
    return bow_matrix, bow_vectorizer

In [14]:
bow_matrix, bow_vectorizer = bag_of_words(working_data)

Creating bag of words matrix.....


In [15]:
bow_matrix.shape

(20000, 27405)

* ## Similarity-Matrix function

In [16]:
def similarity_matrix(matrix):
    """Announce the computation, then return ``cosine_similarity(matrix)``."""
    print("Calculating similarity matrix.....")
    return cosine_similarity(matrix)

In [17]:
similarity_matrix_bow = similarity_matrix(bow_matrix)

Calculating similarity matrix.....


* ## Recommend function

In [18]:
def recommend_products(product, similarity_model, top_n=5, catalog=None):
    """Return the products most similar to ``product``.

    Parameters
    ----------
    product : str
        Exact ``product_name`` to look up. If several rows share the name the
        first one is used.
    similarity_model : array-like, shape (n_products, n_products)
        Pairwise similarity scores; row/column ``i`` must correspond to the
        ``i``-th row (by position) of the catalog.
    top_n : int, default 5
        Maximum number of recommendations to return.
    catalog : pandas.DataFrame, optional
        Frame with a ``product_name`` column. Defaults to the notebook-level
        ``data`` frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Recommended Product'`` and ``'Similarity(%age)'`` (score in
        percent, rounded to two decimals), best match first, indexed 0..k-1.

    Raises
    ------
    IndexError
        If ``product`` is not present in the catalog.
    """
    if catalog is None:
        catalog = data  # notebook-level product frame

    names = catalog['product_name'].to_numpy()

    # Work with the row *position*, not the index label: the similarity
    # matrix is positional, and labels differ once the index is not 0..n-1.
    matches = np.flatnonzero(names == product)
    if matches.size == 0:
        raise IndexError(f"product {product!r} not found in catalog")
    product_index = int(matches[0])

    scores = np.asarray(similarity_model[product_index], dtype=float).ravel()

    # Stable descending order, then drop the query row explicitly. Slicing
    # off the first entry is not safe: a duplicate product with a tied score
    # can sort ahead of the query item, which would then be recommended.
    ranked = np.argsort(-scores, kind='stable')
    ranked = ranked[ranked != product_index][:top_n]

    print('Checked Product :::::   ', product)
    # Building from a dict keeps the scores numeric and works for any number
    # of results (no fixed (2, 5) reshape).
    return pd.DataFrame({
        'Recommended Product': names[ranked],
        'Similarity(%age)': np.round(scores[ranked] * 100, 2),
    })

* ## Recommended output

In [19]:
# Demo: take the product name at row position 100 and list its closest
# matches according to the bag-of-words similarity matrix.
product = data.iloc[100].product_name
recommend_products(product,similarity_matrix_bow)

Checked Product :::::    Rorlig RR-028 Expedition Analog Watch  - For Men, Boys


Unnamed: 0,Recommended Product,Similarity(%age)
0,Rorlig RR-030 Essentials Analog Watch - For M...,97.69
1,Ridas 1825_black Sports Analog Watch - For Men,59.93
2,"Texus TXMW93 Black Analog Watch - For Men, Boys",52.96
3,Desire PT-362 Analog Watch - For Men,52.64
4,Fastrack 9912PP15 Tees Analog Watch - For Men...,52.31


* ## Export Model

In [20]:
# Write the similarity matrix as an uncompressed pickle. The context manager
# closes (and flushes) the file even if pickling fails; the bare open() call
# previously left the handle to be closed by garbage collection.
with open('../models/similarity_bow.pkl', 'wb') as model_file:
    pickle.dump(similarity_matrix_bow, model_file)

In [21]:
print(os.path.getsize("../models/similarity_bow.pkl"))

3200000163


In [22]:
# Write a bz2-compressed copy of the pickled matrix. Using `with` guarantees
# the compressed stream is closed even if pickle.dump raises part-way.
with bz2.BZ2File("../models/similarity_bow", 'wb') as ofile:
    pickle.dump(similarity_matrix_bow, ofile)
 

In [24]:
print(os.path.getsize("../models/similarity_bow"))

930622822
