In [55]:
import pandas as pd

df= pd.read_csv("ecommerce_dataset.csv", names=["category", "description"], header=None)
print(df.shape)
df.head()

(50425, 2)


Unnamed: 0,category,description
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


**DROP NA VALUES**


In [56]:
df.dropna(inplace=True)
df.shape

(50424, 2)

In [57]:
df.category.unique()

array(['Household', 'Books', 'Clothing & Accessories', 'Electronics'],
      dtype=object)

In [58]:
df.category.replace("Clothing & Accessories", "Clothing_Accessories", inplace=True)

In [59]:
df.category.unique()

array(['Household', 'Books', 'Clothing_Accessories', 'Electronics'],
      dtype=object)

When you train a fasttext model, it expects labels to be specified with label prefix. We will just create a third column in the dataframe that has label as well as the product description

In [60]:
df['category'] = '__label__' + df['category'].astype(str)
df.head(5)

Unnamed: 0,category,description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...


In [61]:
df['category_description'] = df['category'] + ' ' + df['description']
df.head(3)

Unnamed: 0,category,description,category_description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...,__label__Household Paper Plane Design Framed W...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",__label__Household SAF 'Floral' Framed Paintin...
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...,__label__Household SAF 'UV Textured Modern Art...


**Pre-processing**
1.Remove punctuation
2.Remove extra space
3.Make the entire sentence lower case


In [62]:
import re

def preprocess(text):
    text = re.sub(r'[^\w\s\']',' ', text)
    text = re.sub(' +', ' ', text)
    return text.strip().lower() 

In [63]:
df['category_description'] = df['category_description'].map(preprocess)
df.head()

Unnamed: 0,category,description,category_description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...,__label__household paper plane design framed w...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",__label__household saf 'floral' framed paintin...
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...,__label__household saf 'uv textured modern art...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1...",__label__household saf flower print framed pai...
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...,__label__household incredible gifts india wood...


**Train Test Split**

In [64]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2)

In [65]:
train.shape, test.shape

((40339, 3), (10085, 3))

In [66]:
train.to_csv("ecommerce.train", columns=["category_description"], index=False, header=False)
test.to_csv("ecommerce.test", columns=["category_description"], index=False, header=False)

**Model Training**

In [67]:
import fasttext

model = fasttext.train_supervised(input="ecommerce.train")
model.test("ecommerce.test")

Read 4M words
Number of words:  79258
Number of labels: 4
Progress: 100.0% words/sec/thread: 5510863 lr:  0.000000 avg.loss:  0.180216 ETA:   0h 0m 0s


(10085, 0.9681705503222607, 0.9681705503222607)

In [68]:
model.predict("wintech assemble desktop pc cpu 500 gb sata hdd 4 gb ram intel c2d processor 3")

(('__label__electronics',), array([0.98623329]))

In [69]:
model.predict("ockey men's cotton t shirt fabric details 80 cotton 20 polyester super combed cotton rich fabric")

(('__label__clothing_accessories',), array([1.00001001]))

**Prediction**

In [70]:
model.predict("think and grow rich deluxe edition")

(('__label__books',), array([1.00000989]))

In [71]:
model.get_nearest_neighbors("painting")

[(0.9991868734359741, 'pln002'),
 (0.9991723895072937, "5'0"),
 (0.9991723895072937, 'valueunderbed'),
 (0.9991723895072937, 'legour'),
 (0.9991723895072937, 'bedstyleadd'),
 (0.9991723895072937, 'storageunderbed'),
 (0.9991310834884644, '13ft'),
 (0.999090313911438, 'slates'),
 (0.999090313911438, 'verendah'),
 (0.9990630149841309, "54''")]

In [72]:
model.get_nearest_neighbors("sony")

[(0.9992678761482239, 'kobo\xa0'),
 (0.9992678761482239, 'arua'),
 (0.9990842342376709, 'systene'),
 (0.9990738034248352, 'multipleframe'),
 (0.9990738034248352, 'resolutionat'),
 (0.9990738034248352, 'breathtakingbeauty'),
 (0.9990738034248352, 'inchlcd'),
 (0.9990738034248352, 'speedrating'),
 (0.9990738034248352, '90mins'),
 (0.9990738034248352, '1080p60fps')]

In [73]:
model.predict("wintech assemble desktop pc cpu 500 gb sata hdd 4 gb ram intel c2d processor 3")

(('__label__electronics',), array([0.98623329]))

In [74]:
model.get_nearest_neighbors("sony")

[(0.9992678761482239, 'kobo\xa0'),
 (0.9992678761482239, 'arua'),
 (0.9990842342376709, 'systene'),
 (0.9990738034248352, 'multipleframe'),
 (0.9990738034248352, 'resolutionat'),
 (0.9990738034248352, 'breathtakingbeauty'),
 (0.9990738034248352, 'inchlcd'),
 (0.9990738034248352, 'speedrating'),
 (0.9990738034248352, '90mins'),
 (0.9990738034248352, '1080p60fps')]