In [3]:
import pandas as pd
import fasttext
# Load the raw CSV. header=None means the first line of the file is treated as
# data, so the two column names are supplied explicitly.
df = pd.read_csv(
    "ecommerce_dataset.csv",
    header=None,
    names=["category", "description"],
)
print(df.shape)  # printed explicitly so it shows alongside the preview below
df.head(3)

(50425, 2)


Unnamed: 0,category,description
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...


In [6]:
# Class balance: number of rows per category.
df["category"].value_counts()

category
Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8671
Name: count, dtype: int64

In [7]:
# (rows, columns) of the frame at this point, for comparison with later cells.
df.shape

(50425, 2)

In [9]:
# Drop every row that has a missing value in any column. The result is bound
# back to `df` instead of using inplace=True, so the frame is not mutated
# behind the back of cells that already displayed it.
df = df.dropna()
df.shape

(50424, 2)

In [11]:
# Series.replace returns a new Series, so the result has to be assigned back.
# Without the assignment the "&" and the spaces stay in the category name, and
# the later text cleanup would break it into several tokens instead of a
# single class label.
df["category"] = df["category"].replace("Clothing & Accessories", "Clothing_Accessories")
df["category"]

0          Household
1          Household
2          Household
3          Household
4          Household
            ...     
50420    Electronics
50421    Electronics
50422    Electronics
50423    Electronics
50424    Electronics
Name: category, Length: 50424, dtype: object

In [12]:
# Distinct category names currently present in the frame.
df["category"].unique()

array(['Household', 'Books', 'Clothing_Accessories', 'Electronics'],
      dtype=object)

In [13]:
# Prefix every category with "__label__" so it can serve as the class token
# at the start of each training line.
df['category'] = [f'__label__{name}' for name in df['category'].astype(str)]
df.head(5)

Unnamed: 0,category,description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...


In [14]:
# Combine label and text into one column of the form "<label> <description>".
labels, descriptions = df['category'], df['description']
df['category_description'] = labels + ' ' + descriptions
df.head(3)

Unnamed: 0,category,description,category_description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...,__label__Household Paper Plane Design Framed W...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",__label__Household SAF 'Floral' Framed Paintin...
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...,__label__Household SAF 'UV Textured Modern Art...


In [15]:
import re

# Scratch example of the cleanup steps on one messy title:
#   1. replace everything except word characters, whitespace and apostrophes
#      with a space,
#   2. squeeze runs of spaces down to a single space,
# then trim and lowercase for display.
text = "  VIKI's | Bookcase/Bookshelf (3-Shelf/Shelve, White) | ? . hi"
for pattern in (r'[^\w\s\']', ' +'):
    text = re.sub(pattern, ' ', text)
text.strip().lower()

"viki's bookcase bookshelf 3 shelf shelve white hi"

In [16]:
def preprocess(text):
    """Normalize a raw string into a single lowercase line of tokens.

    Every character that is not a word character, whitespace or an
    apostrophe is replaced by a space. Runs of whitespace (spaces, tabs,
    newlines) are then collapsed to one space, and the result is stripped
    and lowercased. Underscores count as word characters, so a leading
    ``__label__`` prefix is kept intact.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lowercased string.
    """
    text = re.sub(r'[^\w\s\']', ' ', text)
    # Collapse *all* whitespace, not only spaces: an embedded newline or tab
    # would otherwise survive and spread one example over several lines when
    # the text is later written out one example per line.
    text = re.sub(r'\s+', ' ', text)
    return text.strip().lower()

In [17]:
# Run the cleanup over every combined "label + description" string.
df['category_description'] = [preprocess(entry) for entry in df['category_description']]
df.head()

Unnamed: 0,category,description,category_description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...,__label__household paper plane design framed w...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",__label__household saf 'floral' framed paintin...
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...,__label__household saf 'uv textured modern art...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1...",__label__household saf flower print framed pai...
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...,__label__household incredible gifts india wood...


In [19]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# shuffle deterministic, so the same rows land in each split on every run.
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [20]:
# Sizes of the two splits as ((train rows, cols), (test rows, cols)).
train.shape, test.shape

((40339, 3), (10085, 3))

In [21]:
# fastText trains from plain-text files, so only the combined
# "label + description" column is written, without index or header row.
for split_frame, out_path in ((train, "ecommerce.train"), (test, "ecommerce.test")):
    split_frame.to_csv(out_path, columns=["category_description"], index=False, header=False)

In [22]:
# Train a supervised fastText classifier on the training file (no
# hyperparameters are overridden), then evaluate it on the held-out file.
# Per the fastText Python docs, test() returns
# (number of examples, precision@1, recall@1).
model = fasttext.train_supervised(input="ecommerce.train")
model.test("ecommerce.test")

(10084, 0.9674732249107497, 0.9674732249107497)

In [23]:
# Spot check: classify a hand-written, lowercase, punctuation-free description
# of a desktop PC. The result is (labels, probabilities).
model.predict("wintech assemble desktop pc cpu 500 gb sata hdd 4 gb ram intel c2d processor 3")

(('__label__electronics',), array([0.99153197]))

In [24]:
# Spot check with a t-shirt description.
# NOTE(review): the recorded probability is marginally above 1.0 — presumably a
# numerical artifact of fastText's output; treat it as ~1.0.
model.predict("ockey men's cotton t shirt fabric details 80 cotton 20 polyester super combed cotton rich fabric")

(('__label__clothing_accessories',), array([1.00001001]))

In [25]:
# Spot check with a short book title.
model.predict("think and grow rich deluxe edition")

(('__label__books',), array([1.00000966]))

In [29]:
# List the words the model considers closest to "painting", as
# (similarity, word) pairs.
# NOTE(review): the recorded neighbours look unrelated to the query —
# presumably because the word vectors were learned only as a by-product of the
# classification task; confirm before relying on them.
model.get_nearest_neighbors("painting")

[(0.9974862337112427, 'vacuum'),
 (0.9974687695503235, 'security'),
 (0.9967640042304993, 'temperature'),
 (0.9965460300445557, 'steam'),
 (0.9955511093139648, 'machine'),
 (0.9954201579093933, 'extended'),
 (0.9953441619873047, 'upright'),
 (0.9952775239944458, 'lint'),
 (0.9952573776245117, 'safety'),
 (0.9952205419540405, 'folding')]

In [30]:
# Same nearest-neighbour inspection for the brand name "sony"; results are
# (similarity, word) pairs.
model.get_nearest_neighbors("sony")

[(0.9982205033302307, 'gaming'),
 (0.9981942772865295, 'external'),
 (0.9979347586631775, 'devices'),
 (0.9977317452430725, '15'),
 (0.9976882338523865, 'binoculars'),
 (0.9976642727851868, 'jbl'),
 (0.9974424839019775, 'ceiling'),
 (0.996640682220459, 'glossy'),
 (0.9966025948524475, 'ram'),
 (0.9965906143188477, 'gps')]