## Import all the necessary packages

In [1]:
# import packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
from bs4 import BeautifulSoup
import nltk
import math
import time
import re
import os

warnings.filterwarnings("ignore")

<p> We have a csv file which consists of all informations about the products.</p>

In [2]:
# loading data using pandas 'read_csv' 
data = pd.read_csv('flipkart_com-ecommerce_sample.csv')

In [3]:
# print no. of data points and features
print('Number of data points : ', data.shape[0],  
     '  Number of features/variables: ', data.shape[1])

Number of data points :  20000   Number of features/variables:  15


<p> Each item has 15 features in raw dataset.</p>

In [4]:
# Print column or feature names 
data.columns

Index(['uniq_id', 'crawl_timestamp', 'product_url', 'product_name',
       'product_category_tree', 'pid', 'retail_price', 'discounted_price',
       'image', 'is_FK_Advantage_product', 'description', 'product_rating',
       'overall_rating', 'brand', 'product_specifications'],
      dtype='object')

Out of these 15 features, we will be using only 5 features.
  1. pid 
  2. product_name 
  3. product_category_tree
  4. image
  5. description


In [5]:
# considering the above mentioned categories 
data = data[['pid', 'product_name', 'product_category_tree', 'image', 'description']]

In [6]:
# print top rows in table
data.head()

Unnamed: 0,pid,product_name,product_category_tree,image,description
0,SRTEH2FF9KEDEFGF,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...","[""http://img5a.flixcart.com/image/short/u/4/a/...",Key Features of Alisha Solid Women's Cycling S...
1,SBEEH3QGU7MFYJFY,FabHomeDecor Fabric Double Sofa Bed,"[""Furniture >> Living Room Furniture >> Sofa B...","[""http://img6a.flixcart.com/image/sofa-bed/j/f...",FabHomeDecor Fabric Double Sofa Bed (Finish Co...
2,SHOEH4GRSUBJGZXE,AW Bellies,"[""Footwear >> Women's Footwear >> Ballerinas >...","[""http://img5a.flixcart.com/image/shoe/7/z/z/r...",Key Features of AW Bellies Sandals Wedges Heel...
3,SRTEH2F6HUZMQ6SJ,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...","[""http://img5a.flixcart.com/image/short/6/2/h/...",Key Features of Alisha Solid Women's Cycling S...
4,PSOEH3ZYDMSYARJ5,Sicons All Purpose Arnica Dog Shampoo,"[""Pet Supplies >> Grooming >> Skin & Coat Care...","[""http://img5a.flixcart.com/image/pet-shampoo/...",Specifications of Sicons All Purpose Arnica Do...


### Basic stats for the feature: product_name

In [7]:
# print stats of the feature: product_name
print(data['product_name'].describe())

count                                          20000
unique                                         12676
top       TheLostPuppy Back Cover for Apple iPad Air
freq                                             134
Name: product_name, dtype: object


### Basic stats for the feature: product_category_tree

In [8]:
# print stats of the feature: product_category_tree
print(data['product_category_tree'].describe())

count                                                20000
unique                                                6466
top       ["Jewellery >> Necklaces & Chains >> Necklaces"]
freq                                                  1567
Name: product_category_tree, dtype: object


In [9]:
# save data files to pickel file
data.to_pickle('pickels/20k_data');

We save data files at every major step in our processing in "pickle" files. If you are stuck anywhere (or) if some code takes too long to run on your laptop, you may use the pickle files we give you to speed things up.

### Remove near duplicate items

Understand about duplicates.

In [10]:
# find number of products that have duplicate product_name.
print(sum(data.duplicated('product_name')))

7324


In [11]:
# find number of products that have duplicate description.
print(sum(data.duplicated('description')))

2460


In [12]:
# Remove all products with very few words in product_name
data_sorted = data[data['product_name'].apply(lambda x: len(x.split())> 3)]
print("After removal of products with short title:", data_sorted.shape[0])

After removal of products with short title: 18057


In [13]:
# Sort whole data based on product_name (alphabetical order of title)
data_sorted.sort_values('product_name', inplace=True, ascending=False)
data_sorted.head()

Unnamed: 0,pid,product_name,product_category_tree,image,description
16644,UMBEGBPZVYPXJ62X,Tarkan Unique Style-2016 Umbrella,"[""Pens & Stationery >> School Supplies >> Umbr...","[""http://img5a.flixcart.com/image/umbrella/6/2...",Key Features of Tarkan Unique Style-2016 Umbr...
15557,ACCEGP55JR26GS4U,zaidis data_cable USB Cable,"[""Mobiles & Accessories >> Mobile Accessories ...","[""http://img6a.flixcart.com/image/data-cable/u...",Key Features of zaidis data_cable USB Cable Co...
16123,CWRE84EDSKFE8KF5,zDelhi.com Car Washer Z1 Ultra High Pressure W...,"[""Automotive >> Car Accessories >> Car Care >>...","[""http://img6a.flixcart.com/image/car-pressure...",zDelhi.com Car Washer Z1 Ultra High Pressure W...
5503,HJREGVMCEGGUDUE5,"youniqueshop Plastic, Stainless Steel Hand Juicer","[""Kitchen & Dining >> Kitchen Tools >> Kitchen...","[""http://img6a.flixcart.com/image/hand-juicer/...","youniqueshop Plastic, Stainless Steel Hand Jui..."
7642,SLIEHUFTZFVFDGEZ,xy decor Cotton Sofa Cover,"[""xy decor Cotton Sofa Cover (white Pack of 6)""]","[""http://img5a.flixcart.com/image/slipcover/g/...",xy decor Cotton Sofa Cover (white Pack of 6) P...


In [14]:
# Deduplication of entries
final = data_sorted.drop_duplicates(subset= {"pid","product_name","product_category_tree", "image", "description"}, keep='first', inplace=False)
final.shape

(18057, 5)

In [15]:
#checking to see how much % of data still remains
(final['pid'].size*1.0)/(data['pid'].size*1.0)*100

90.28500000000001

In [16]:
# Drop the rows where at least one element is missing.
final = final.dropna()
final.shape

(18054, 5)

In [17]:
# save data to pickle file
final.to_pickle('pickels/18k_data')

In [18]:
# read data from pickle file
data = pd.read_pickle('pickels/18k_data')

#printing some random descriptions
desc_1300 = data['description'].values[1300]
print(desc_1300)

Voylla Artificial Classic Plain Alloy Necklace - Buy Voylla Artificial Classic Plain Alloy Necklace only for Rs. 399 from Flipkart.com. Only Genuine Products. 30 Day Replacement Guarantee. Free Shipping. Cash On Delivery!


## Separating the primary category from the product category tree

In [19]:
data['product_category_tree'] = data['product_category_tree'].apply(lambda x : x.split('>>')[0][2:].strip())

In [20]:
# print primary category 
data['product_category_tree']

16644                                 Pens & Stationery
15557                             Mobiles & Accessories
16123                                        Automotive
5503                                   Kitchen & Dining
7642     xy decor Cotton Sofa Cover (white Pack of 6)"]
                              ...                      
16404                                          Footwear
14590                                          Clothing
14587                                          Clothing
15053                                          Clothing
8770                                           Clothing
Name: product_category_tree, Length: 18054, dtype: object

### Categorization

In [21]:
# top 5 category
top_five_category = list(data.groupby('product_category_tree').count().sort_values('product_name', ascending=False).head(5).index)

In [22]:
# considering only top 5 categories
processed_data = data[data['product_category_tree'].isin (top_five_category)][['pid', 'product_name', 'product_category_tree', 'image', 'description']]

# Text pre-processing

In [23]:
from bs4 import BeautifulSoup

# Utility function for removing english contractions
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    
    #general
    phrase = re.sub(r"n\'t", "not", phrase)
    phrase = re.sub(r"\'re", "are", phrase)
    phrase = re.sub(r"\'s", "is", phrase)
    phrase = re.sub(r"\'d", "wpould", phrase)
    phrase = re.sub(r"\'ll", "will", phrase)
    phrase = re.sub(r"\'t", "not", phrase)
    phrase = re.sub(r"\'ve", "have", phrase)
    phrase = re.sub(r"\'m", "am", phrase)
    return phrase    

def preprocessing(str_arg):
    sentence = BeautifulSoup(str_arg, 'lxml').get_text()  #remove all tags
    sentence = decontracted(sentence)             #remove english contractions
    sentence = re.sub("\S*\d\S*", "", sentence).strip()  #remove words with numbers
    sentence = re.sub('[^A-Za-z]+', ' ',sentence)        # remove spacial character
    sentence=sentence.lower() #converting the cleaned string to lower case
    print(sentence)
    print("*****")
    return sentence # Returning the preprocessed string in tokenized form
processed_data['description'] = processed_data['description'].astype('str').apply(preprocessing)

key features of zaidis data cable usb cable compatible devices mobile tablet cable form factor round cable length m cable speed mbps specifications of zaidis data cable usb cable white in the box sales package data cable general brand zaidis suitable for all android smartphones mobile cable length m model data cable cable type mbps speed compatible devices mobile tablet type usb cable cable round part number connector micro usb connector usb color white
*****
zdelhi com car washer ultra high pressure washer price rs the portable automatic car washer is an amazing high pressure product for cleaning around a car without moving yourself from the distance of feet it can be used for gardening agriculture weed killers pest control nursery sanitary construction and so much more the washing brush with a internal fan nozzle can spray water while brushing the car the hose with strong power gun for powerful water pressure you can take the product in your car anywhere you go its portable and can b

In [24]:
from sklearn import preprocessing

# categories list
category = list(processed_data['product_category_tree'].unique())
# Printing the list of top 5 categories
print(category)

# encoding product category
label = preprocessing.LabelEncoder()
encoded_category = label.fit_transform(processed_data['product_category_tree'])
processed_data['product_category_tree'] = encoded_category

['Mobiles & Accessories', 'Automotive', 'Home Decor & Festive Needs', 'Clothing', 'Jewellery']


In [25]:
# print processed data 
processed_data['product_category_tree']

15557    4
16123    0
17905    2
17910    2
9873     1
        ..
12539    1
14590    1
14587    1
15053    1
8770     1
Name: product_category_tree, Length: 11848, dtype: int32

## Splitting data into train and test data

In [26]:
from sklearn.model_selection  import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report


X = processed_data['description']
y = processed_data['product_category_tree']

# Splitting data into train data and test data
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2)

## Featurization: BAG OF WORDS

In [27]:
#BoW
count_vect = CountVectorizer() #in scikit-learn
count_vect.fit(processed_data['description'])
print("some feature names ", count_vect.get_feature_names()[:10])
print('='*50)

final_counts = count_vect.transform(processed_data['description'])
print("the type of count vectorizer ",type(final_counts))
print("the shape of out text BOW vectorizer ",final_counts.get_shape())
print("the number of unique words ", final_counts.get_shape()[1])

some feature names  ['aa', 'aaa', 'aadi', 'aadishwar', 'aadivasi', 'aadyaa', 'aahana', 'aahna', 'aahnafashion', 'aaishwarya']
the type of count vectorizer  <class 'scipy.sparse.csr.csr_matrix'>
the shape of out text BOW vectorizer  (11848, 11955)
the number of unique words  11955


## Bi-Grams, tri-gram and n-Grams.


In [28]:
#bi-gram, tri-gram and n-gram
count_vect = CountVectorizer(ngram_range=(1,2), min_df=10)
final_bigram_counts = count_vect.fit_transform(processed_data['description'])
print("the type of count vectorizer ",type(final_bigram_counts))
print("the shape of out text BOW vectorizer ",final_bigram_counts.get_shape())
print("the number of unique words including both unigrams and bigrams ", final_bigram_counts.get_shape()[1])

the type of count vectorizer  <class 'scipy.sparse.csr.csr_matrix'>
the shape of out text BOW vectorizer  (11848, 9822)
the number of unique words including both unigrams and bigrams  9822


## TF - IDF

In [29]:
# tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf_vect = TfidfVectorizer(ngram_range=(1,2), min_df=10)
tf_idf_vect.fit(processed_data['description'])
print("some sample features(unique words in the corpus)",tf_idf_vect.get_feature_names()[0:10])
print('='*50)

final_tf_idf = tf_idf_vect.transform(processed_data['description'])
print("the type of count vectorizer ",type(final_tf_idf))
print("the shape of out text TFIDF vectorizer ",final_tf_idf.get_shape())
print("the number of unique words including both unigrams and bigrams ", final_tf_idf.get_shape()[1])

some sample features(unique words in the corpus) ['aaishwarya', 'aaradhi', 'aaradhi divya', 'able', 'able to', 'abode', 'abode order', 'about', 'about casual', 'about dailyobjects']
the type of count vectorizer  <class 'scipy.sparse.csr.csr_matrix'>
the shape of out text TFIDF vectorizer  (11848, 9822)
the number of unique words including both unigrams and bigrams  9822


## Applying Naive Bayes on BOW

In [32]:
count_vect = CountVectorizer(stop_words = 'english')
final_Xtr=count_vect.fit_transform(X_train)
final_Xtest=count_vect.transform(X_test)

mnb = MultinomialNB()
mnb.fit(final_Xtr,y_train)
print("Train score", mnb.score(final_Xtr,y_train))
print("Test score", mnb.score(final_Xtest,y_test))

predict = mnb.predict(final_Xtest)
# Print classification report 
print(classification_report(y_test, predict))

Train score 0.9910318632622916
Test score 0.990295358649789
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       195
           1       1.00      0.99      1.00      1167
           2       0.98      0.99      0.98       177
           3       0.98      1.00      0.99       623
           4       1.00      0.93      0.97       208

    accuracy                           0.99      2370
   macro avg       0.99      0.98      0.98      2370
weighted avg       0.99      0.99      0.99      2370



## Applying Naive Bayes on TF-IDF

In [33]:
final_Xtr_tfidf = tf_idf_vect.fit_transform(X_train)
final_Xtest_tfidf = tf_idf_vect.transform(X_test)

mnb2 = MultinomialNB()
mnb2.fit(final_Xtr_tfidf,y_train)
print("Train score", mnb2.score(final_Xtr_tfidf,y_train))
print("Test score", mnb2.score(final_Xtest_tfidf,y_test))

predict2 = mnb2.predict(final_Xtest_tfidf)
# Print classification report 
print(classification_report(y_test, predict2))

Train score 0.9898712808609411
Test score 0.9936708860759493
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       195
           1       1.00      1.00      1.00      1167
           2       0.99      0.99      0.99       177
           3       0.98      1.00      0.99       623
           4       1.00      0.96      0.98       208

    accuracy                           0.99      2370
   macro avg       0.99      0.99      0.99      2370
weighted avg       0.99      0.99      0.99      2370



## Comparing models score using Prettytable Library

In [34]:
from prettytable import PrettyTable    
x = PrettyTable()
x.field_names = ["Vectorizer", "train_score", "test_score"]
x.add_row(["BOW", 0.991, 0.990])
x.add_row(["TF-IDF", 0.989, 0.993])
print(x)

+------------+-------------+------------+
| Vectorizer | train_score | test_score |
+------------+-------------+------------+
|    BOW     |    0.991    |    0.99    |
|   TF-IDF   |    0.989    |   0.993    |
+------------+-------------+------------+


</p> From, the above we can conclude that BOW vectorizer is more apt and have more train_score and test_score than tf-idf vectorizer.</p>

## Predict product category 

In [35]:
label.inverse_transform(mnb.predict(count_vect.transform(["bedsheet"])))

array(['Clothing'], dtype=object)

In [38]:
# insert a product to get the category of that item.
arg = input()
label.inverse_transform(mnb.predict(count_vect.transform([arg])))

Bracelets


array(['Jewellery'], dtype=object)

In [39]:
# insert a product description to get the category of that item.
arg = input()
label.inverse_transform(mnb.predict(count_vect.transform([arg])))

Allen Solly Girl's Shift Dress - Buy Blue Allen Solly Girl's Shift Dress For Only Rs. 1499 Online in India. Shop Online For Apparels.


array(['Clothing'], dtype=object)