# Importing Libraries
------------

In [23]:
import pandas as pd
from gensim.utils import simple_preprocess
import csv
import fasttext
from nltk.tokenize import word_tokenize,RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import time

# Reading Data
------

In [24]:
df = pd.read_csv('set.csv')

In [25]:
df['description'] = df['description'].fillna('').astype(str)

In [26]:
df.head()

Unnamed: 0,category,description
0,__label__Clothing-,key features of alisha solid women cycling sho...
1,__label__Furniture-,fabhomedecor fabric double sofa bed finish col...
2,__label__Footwear-,key features of aw bellies sandals wedges heel...
3,__label__Clothing-,key features of alisha solid women cycling sho...
4,__label__Pet-Supplies-,specifications of sicons all purpose arnica do...


# Data Preprocessing
-------------

#### Splitting the entries to remove the stopwords (removing useless words from our data)

In [27]:
for i in range(len(df['description'])):
    df['description'][i] = df['description'][i].split()

In [28]:
sw = set(stopwords.words('english'))

In [29]:
def remove_stopwords(text, stopwords):
    useful = [w for w in text if w not in stopwords]
    return useful

In [30]:
for i in range(len(df['description'])):
    df['description'][i] = remove_stopwords(df['description'][i], sw)
    df['description'][i] = " ".join(df['description'][i])

In [32]:
df.tail()

Unnamed: 0,category,description
19995,__label__Baby-Care-,buy walldesign small vinyl sticker rs online w...
19996,__label__Baby-Care-,buy wallmantra large vinyl stickers sticker rs...
19997,__label__Baby-Care-,buy elite collection medium acrylic sticker rs...
19998,__label__Baby-Care-,buy elite collection medium acrylic sticker rs...
19999,__label__Baby-Care-,buy elite collection medium acrylic sticker rs...


### Segmenting data into test and train sets

In [33]:
df_2 = df[17501:-1]
df = df[0:17501]

### Exporting to csv

In [34]:
df[['category', 'description']].to_csv('train_M.txt', 
                                          index = False, 
                                          sep = ' ',
                                          header = None, 
                                          quoting = csv.QUOTE_NONE, 
                                          quotechar = "", 
                                          escapechar = " ")

df_2[['category', 'description']].to_csv('test_M.txt', 
                                     index = False, 
                                     sep = ' ',
                                     header = None, 
                                     quoting = csv.QUOTE_NONE, 
                                     quotechar = "", 
                                     escapechar = " ")

# Modelling and evaluation
--------------

In [37]:
start = time.time()
model = fasttext.train_supervised('train_M.txt', wordNgrams = 3 ,epoch=100, lr=0.3)
end = time.time()
print(end-start)

101.18481254577637


In [38]:
model.test('test_M.txt')

(2435, 0.8008213552361396, 0.8008213552361396)

### The model yields values for precision at one as 0.8008 and recall of one as 0.8008 for a test set of 2435 when fed with updated data with stopwords removed

#### Checking the models predictions. Note that the array value denotes the confidence of the model (the probability of the prediction to be correct according to the model)

In [39]:
for i in range(0,5):
    print(model.predict(df_2.iloc[i, 1]))

(('__label__Jewellery-',), array([0.99997759]))
(('__label__Jewellery-',), array([0.99997759]))
(('__label__Home-Furnishing-',), array([0.92187881]))
(('__label__Mobiles-&-Accessories-',), array([0.99994421]))
(('__label__Mobiles-&-Accessories-',), array([1.00000954]))


In [40]:
df_2.head()

Unnamed: 0,category,description
17501,__label__Jewellery-,disney brass cubic zirconia rhodium bracelet b...
17502,__label__Jewellery-,disney brass cubic zirconia rhodium bracelet b...
17503,__label__Home-Furnishing-,rustic india geometric cushions cover pack cm ...
17504,__label__Mobiles-&-Accessories-,theskinmantra sleeve versions apple ipad multi...
17505,__label__Mobiles-&-Accessories-,thelostpuppy back cover apple ipad air multico...


# Saving the Model
------------

In [41]:
model.save_model('text_class_stopwords.bin')