# Importing Libraries
------------

In [1]:
import pandas as pd
from gensim.utils import simple_preprocess
import csv
import fasttext

# Reading Data
------

In [2]:
df = pd.read_csv('flipkart.csv').rename(columns = {'product_category_tree':'category'})

In [3]:
df.head()

Unnamed: 0,uniq_id,crawl_timestamp,product_url,product_name,category,pid,retail_price,discounted_price,image,is_FK_Advantage_product,description,product_rating,overall_rating,brand,product_specifications
0,c2d766ca982eca8304150849735ffef9,2016-03-25 22:59:23 +0000,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",SRTEH2FF9KEDEFGF,999.0,379.0,"[""http://img5a.flixcart.com/image/short/u/4/a/...",False,Key Features of Alisha Solid Women's Cycling S...,No rating available,No rating available,Alisha,"{""product_specification""=>[{""key""=>""Number of ..."
1,7f7036a6d550aaa89d34c77bd39a5e48,2016-03-25 22:59:23 +0000,http://www.flipkart.com/fabhomedecor-fabric-do...,FabHomeDecor Fabric Double Sofa Bed,"[""Furniture >> Living Room Furniture >> Sofa B...",SBEEH3QGU7MFYJFY,32157.0,22646.0,"[""http://img6a.flixcart.com/image/sofa-bed/j/f...",False,FabHomeDecor Fabric Double Sofa Bed (Finish Co...,No rating available,No rating available,FabHomeDecor,"{""product_specification""=>[{""key""=>""Installati..."
2,f449ec65dcbc041b6ae5e6a32717d01b,2016-03-25 22:59:23 +0000,http://www.flipkart.com/aw-bellies/p/itmeh4grg...,AW Bellies,"[""Footwear >> Women's Footwear >> Ballerinas >...",SHOEH4GRSUBJGZXE,999.0,499.0,"[""http://img5a.flixcart.com/image/shoe/7/z/z/r...",False,Key Features of AW Bellies Sandals Wedges Heel...,No rating available,No rating available,AW,"{""product_specification""=>[{""key""=>""Ideal For""..."
3,0973b37acd0c664e3de26e97e5571454,2016-03-25 22:59:23 +0000,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",SRTEH2F6HUZMQ6SJ,699.0,267.0,"[""http://img5a.flixcart.com/image/short/6/2/h/...",False,Key Features of Alisha Solid Women's Cycling S...,No rating available,No rating available,Alisha,"{""product_specification""=>[{""key""=>""Number of ..."
4,bc940ea42ee6bef5ac7cea3fb5cfbee7,2016-03-25 22:59:23 +0000,http://www.flipkart.com/sicons-all-purpose-arn...,Sicons All Purpose Arnica Dog Shampoo,"[""Pet Supplies >> Grooming >> Skin & Coat Care...",PSOEH3ZYDMSYARJ5,220.0,210.0,"[""http://img5a.flixcart.com/image/pet-shampoo/...",False,Specifications of Sicons All Purpose Arnica Do...,No rating available,No rating available,Sicons,"{""product_specification""=>[{""key""=>""Pet Type"",..."


# Data Preprocessing

In [4]:
df.drop('retail_price', axis=1, inplace = True)
df.drop('discounted_price', axis=1, inplace = True)
df.drop('is_FK_Advantage_product', axis=1, inplace = True)
df.drop('product_rating', axis=1, inplace = True)
df.drop('overall_rating', axis=1, inplace = True)
df.drop('brand', axis=1, inplace = True)
df.drop('uniq_id', axis=1, inplace = True)
df.drop('crawl_timestamp', axis=1, inplace = True)
df.drop('product_url', axis=1, inplace = True)
df.drop('pid', axis=1, inplace = True)
df.drop('image', axis=1, inplace = True)
df.drop('product_specifications', axis=1, inplace = True)
df.drop('product_name', axis=1, inplace = True)

#### Using only the description coloumn to predict the category. Leaving out other coloumns for faster processing

In [5]:
len(df['category'])

20000

In [6]:
for i in range(len(df['category'])):
    df['category'][i] = df['category'][i][2:-2]

In [7]:
for i in range(len(df['category'])):
    df['category'][i] = df['category'][i].split(">>")[0]

In [8]:
df.tail()

Unnamed: 0,category,description
19995,Baby Care,Buy WallDesign Small Vinyl Sticker for Rs.730 ...
19996,Baby Care,Buy Wallmantra Large Vinyl Stickers Sticker fo...
19997,Baby Care,Buy Elite Collection Medium Acrylic Sticker fo...
19998,Baby Care,Buy Elite Collection Medium Acrylic Sticker fo...
19999,Baby Care,Buy Elite Collection Medium Acrylic Sticker fo...


In [9]:
len(df['description'].unique())

17540

#### Checking the object type of description

In [10]:
df['description']

0        Key Features of Alisha Solid Women's Cycling S...
1        FabHomeDecor Fabric Double Sofa Bed (Finish Co...
2        Key Features of AW Bellies Sandals Wedges Heel...
3        Key Features of Alisha Solid Women's Cycling S...
4        Specifications of Sicons All Purpose Arnica Do...
                               ...                        
19995    Buy WallDesign Small Vinyl Sticker for Rs.730 ...
19996    Buy Wallmantra Large Vinyl Stickers Sticker fo...
19997    Buy Elite Collection Medium Acrylic Sticker fo...
19998    Buy Elite Collection Medium Acrylic Sticker fo...
19999    Buy Elite Collection Medium Acrylic Sticker fo...
Name: description, Length: 20000, dtype: object

#### Filling out any NaN as it is considered a float and would hinder with our further data cleaning

In [11]:
df['description'] = df['description'].fillna('').astype(str)

In [12]:
df.iloc[:,1]

0        Key Features of Alisha Solid Women's Cycling S...
1        FabHomeDecor Fabric Double Sofa Bed (Finish Co...
2        Key Features of AW Bellies Sandals Wedges Heel...
3        Key Features of Alisha Solid Women's Cycling S...
4        Specifications of Sicons All Purpose Arnica Do...
                               ...                        
19995    Buy WallDesign Small Vinyl Sticker for Rs.730 ...
19996    Buy Wallmantra Large Vinyl Stickers Sticker fo...
19997    Buy Elite Collection Medium Acrylic Sticker fo...
19998    Buy Elite Collection Medium Acrylic Sticker fo...
19999    Buy Elite Collection Medium Acrylic Sticker fo...
Name: description, Length: 20000, dtype: object

#### Using 'simple_preprocess' to remove unwanted symbols, punctuation and convert text to lower case for uniformity

In [13]:
df.iloc[:,1] = df.iloc[:,1].apply(lambda x: ' '.join(simple_preprocess(x)))

In [14]:
df.head()

Unnamed: 0,category,description
0,Clothing,key features of alisha solid women cycling sho...
1,Furniture,fabhomedecor fabric double sofa bed finish col...
2,Footwear,key features of aw bellies sandals wedges heel...
3,Clothing,key features of alisha solid women cycling sho...
4,Pet Supplies,specifications of sicons all purpose arnica do...


#### Prefixing all out category names with __label__ as a prerequisite for the 'fasttext' library to run

In [15]:
df.iloc[:, 0]

0            Clothing 
1           Furniture 
2            Footwear 
3            Clothing 
4        Pet Supplies 
             ...      
19995       Baby Care 
19996       Baby Care 
19997       Baby Care 
19998       Baby Care 
19999       Baby Care 
Name: category, Length: 20000, dtype: object

In [16]:
df.iloc[:, 0] = df.iloc[:, 0].apply(lambda x: '__label__' + x)

In [17]:
df.tail()

Unnamed: 0,category,description
19995,__label__Baby Care,buy walldesign small vinyl sticker for rs onli...
19996,__label__Baby Care,buy wallmantra large vinyl stickers sticker fo...
19997,__label__Baby Care,buy elite collection medium acrylic sticker fo...
19998,__label__Baby Care,buy elite collection medium acrylic sticker fo...
19999,__label__Baby Care,buy elite collection medium acrylic sticker fo...


#### Replacing whitespaces with '-' to not loose the complete category name in the prediction stage

In [18]:
df.category = df.category.str.replace(' ', '-')

### Segmenting data into test and train sets

In [19]:
df_2 = df[17501:-1]
df = df[0:17501]

### Exporting to csv

In [20]:
df[['category', 'description']].to_csv('train_M.txt', 
                                          index = False, 
                                          sep = ' ',
                                          header = None, 
                                          quoting = csv.QUOTE_NONE, 
                                          quotechar = "", 
                                          escapechar = " ")

df_2[['category', 'description']].to_csv('test_M.txt', 
                                     index = False, 
                                     sep = ' ',
                                     header = None, 
                                     quoting = csv.QUOTE_NONE, 
                                     quotechar = "", 
                                     escapechar = " ")

# Modelling and evaluation
--------------

In [26]:
model = fasttext.train_supervised('train_M.txt', wordNgrams = 1 ,epoch=70, lr=0.3)

In [27]:
model.test('test_M.txt') 

(2435, 0.7831622176591376, 0.7831622176591376)

### The model yields values for precision at one as 0.783 and recall of one as 0.783 for a test set of 2435

#### Checking the models predictions. Note that the array value denotes the confidence of the model (the probability of the prediction to be correct according to the model)

In [28]:
for i in range(0,5):
    print(model.predict(df_2.iloc[i, 1]))

(('__label__Jewellery-',), array([0.99997842]))
(('__label__Jewellery-',), array([0.99997842]))
(('__label__Home-Furnishing-',), array([0.99760324]))
(('__label__Mobiles-&-Accessories-',), array([0.99987459]))
(('__label__Mobiles-&-Accessories-',), array([0.99999785]))


In [29]:
df_2.head()

Unnamed: 0,category,description
17501,__label__Jewellery-,disney brass cubic zirconia rhodium bracelet b...
17502,__label__Jewellery-,disney brass cubic zirconia rhodium bracelet b...
17503,__label__Home-Furnishing-,rustic india geometric cushions cover pack of ...
17504,__label__Mobiles-&-Accessories-,theskinmantra sleeve for all versions of apple...
17505,__label__Mobiles-&-Accessories-,thelostpuppy back cover for apple ipad air mul...


# Saving the Model
------------

In [30]:
model.save_model('text_class.bin')