# Text Classification using fastText

In [1]:
import pandas as pd

# Load the raw data. header=None tells pandas the file has no header row, so
# the two column names are supplied explicitly.
df = pd.read_csv(
    "ecommerce_dataset.csv",
    header=None,
    names=["category", "description"],
)
df.shape

(50425, 2)

In [2]:
# Preview the first rows to check that both columns were read as expected.
df.head()

Unnamed: 0,category,description
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [3]:
# Class balance: number of rows per category.
df["category"].value_counts()

Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8671
Name: category, dtype: int64

In [4]:
# Remove rows that have a missing value in any column; df is modified in place.
df.dropna(inplace=True)

In [5]:
# Re-check the row count after dropping incomplete rows.
df.shape

(50424, 2)

In [6]:
# fastText tokenises on whitespace, so a label has to be a single token:
# rename "Clothing & Accessories" to an underscore-joined form.
# The result is assigned back to the column instead of calling
# `df.category.replace(..., inplace=True)`: that chained in-place form works
# on a temporary Series, raises a FutureWarning in recent pandas, and leaves
# `df` untouched once Copy-on-Write is enabled.
df["category"] = df["category"].replace("Clothing & Accessories", "Clothing_Accessories")
df["category"].unique()

array(['Household', 'Books', 'Clothing_Accessories', 'Electronics'],
      dtype=object)

In [7]:
# Prepend the "__label__" marker to every category value.
# NOTE: re-running this cell prefixes the values a second time.
df['category'] = "__label__" + df['category'].astype(str)

In [8]:
# Confirm that every category now carries the "__label__" prefix.
df["category"].unique()

array(['__label__Household', '__label__Books',
       '__label__Clothing_Accessories', '__label__Electronics'],
      dtype=object)

In [10]:
# Build one text line per row: the label, a single space, then the description.
df['category_description'] = df['category'] + " " + df['description']
df.head()

Unnamed: 0,category,description,category_description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...,__label__Household Paper Plane Design Framed W...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",__label__Household SAF 'Floral' Framed Paintin...
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...,__label__Household SAF 'UV Textured Modern Art...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1...",__label__Household SAF Flower Print Framed Pai...
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...,__label__Household Incredible Gifts India Wood...


In [11]:
# Show the full combined line for the row labelled 0 (before any cleaning).
df.loc[0, "category_description"]

'__label__Household Paper Plane Design Framed Wall Hanging Motivational Office Decor Art Prints (8.7 X 8.7 inch) - Set of 4 Painting made up in synthetic frame with uv textured print which gives multi effects and attracts towards it. This is an special series of paintings which makes your wall very beautiful and gives a royal touch. This painting is ready to hang, you would be proud to possess this unique painting that is a niche apart. We use only the most modern and efficient printing technology on our prints, with only the and inks and precision epson, roland and hp printers. This innovative hd printing technique results in durable and spectacular looking prints of the highest that last a lifetime. We print solely with top-notch 100% inks, to achieve brilliant and true colours. Due to their high level of uv resistance, our prints retain their beautiful colours for many years. Add colour and style to your living space with this digitally printed painting. Some are for pleasure and so

In [14]:
import re

# Sample string with punctuation and repeated spaces, used to try out the
# cleaning regexes below.
text = "  VIKI's | Bookcase/Bookshelf (3-Shelf/Shelve, white) | ? . hi"

In [17]:
# Replace every character that is not a word character, whitespace or an
# apostrophe with a space.
re.sub(r"[^\w\s']", ' ', text)

"  VIKI's   Bookcase Bookshelf  3 Shelf Shelve  white        hi"

In [18]:
# Collapse each run of spaces into a single space. This is applied to the
# original `text`, not to the result of the previous cell, so the punctuation
# is still present in the output.
re.sub(r' +', ' ', text)

" VIKI's | Bookcase/Bookshelf (3-Shelf/Shelve, white) | ? . hi"

In [19]:
def preprocess(text):
    """Normalise a raw string into a single lower-cased line of tokens.

    Every character that is not a word character, whitespace or an apostrophe
    is replaced by a space; each run of whitespace is then collapsed to one
    space, and the result is stripped and lower-cased.

    Whitespace runs are matched as a class (not just the literal space) so
    that tabs and line breaks inside a description are flattened as well.
    fastText reads one example per line, so an embedded newline would
    otherwise split a record in the training file.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lower-cased, single-line string.
    """
    text = re.sub(r"[^\w\s']", ' ', text)
    # \s+ (rather than ' +') also folds tabs, newlines and carriage returns.
    text = re.sub(r'\s+', ' ', text)
    return text.strip().lower()

In [20]:
# Try the cleaning function on the sample string.
preprocess(text)

"viki's bookcase bookshelf 3 shelf shelve white hi"

In [21]:
# Clean every combined line. The "__label__" prefix is kept (underscores are
# word characters) but, like the rest of the line, it is lower-cased.
df["category_description"] = df["category_description"].apply(preprocess)

In [22]:
# Preview the frame after cleaning the combined column.
df.head()

Unnamed: 0,category,description,category_description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...,__label__household paper plane design framed w...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",__label__household saf 'floral' framed paintin...
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...,__label__household saf 'uv textured modern art...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1...",__label__household saf flower print framed pai...
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...,__label__household incredible gifts india wood...


In [23]:
# Show the full cleaned line for the row labelled 0.
df.loc[0, "category_description"]

'__label__household paper plane design framed wall hanging motivational office decor art prints 8 7 x 8 7 inch set of 4 painting made up in synthetic frame with uv textured print which gives multi effects and attracts towards it this is an special series of paintings which makes your wall very beautiful and gives a royal touch this painting is ready to hang you would be proud to possess this unique painting that is a niche apart we use only the most modern and efficient printing technology on our prints with only the and inks and precision epson roland and hp printers this innovative hd printing technique results in durable and spectacular looking prints of the highest that last a lifetime we print solely with top notch 100 inks to achieve brilliant and true colours due to their high level of uv resistance our prints retain their beautiful colours for many years add colour and style to your living space with this digitally printed painting some are for pleasure and some for eternal bli

## Training Model

In [24]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The split is seeded so that
# Restart & Run All reproduces the same train/test files and score, and it is
# stratified on the label so both parts keep the same category proportions.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["category"],
)

In [25]:
# Row and column counts of the two splits.
train.shape, test.shape

((40339, 3), (10085, 3))

In [26]:
# Preview the training split.
train.head()

Unnamed: 0,category,description,category_description
10455,__label__Household,Ramkuwar International Silicone Cooking Tongs，...,__label__household ramkuwar international sili...
7131,__label__Household,Obsessions Polyester Vivid Hema Textile Shower...,__label__household obsessions polyester vivid ...
43131,__label__Electronics,Philips BT50B Portable Wireless Bluetooth Spea...,__label__electronics philips bt50b portable wi...
37562,__label__Clothing_Accessories,MomToBe Women's Cotton Blue & Cream Maternity ...,__label__clothing_accessories momtobe women's ...
10878,__label__Household,MCP Non-contact Digital Laser Infrared Thermom...,__label__household mcp non contact digital las...


In [27]:
# Preview the held-out split.
test.head()

Unnamed: 0,category,description,category_description
50007,__label__Electronics,Zoook Rocker M2-Mean Machine 5-in-1 Hi-Fi Blue...,__label__electronics zoook rocker m2 mean mach...
15287,__label__Household,Eureka Forbes Mini Wet and Dry Vacuum Cleaner ...,__label__household eureka forbes mini wet and ...
4046,__label__Household,Decals Design 'Modern Elegant Ganesha God' Wal...,__label__household decals design 'modern elega...
2646,__label__Household,Nilkamal Leo Computer Trolley/Table (Brown) A ...,__label__household nilkamal leo computer troll...
25894,__label__Books,Canoeing the Congo: The First Source-to-Sea De...,__label__books canoeing the congo the first so...


In [28]:
# Write only the combined "label + text" column of each split, one row per
# line, with no index and no header row.
train[["category_description"]].to_csv("ecommerce.train", index=False, header=False)
test[["category_description"]].to_csv("ecommerce.test", index=False, header=False)

In [29]:
import fasttext

# Train a supervised classifier on the training file; only `input` is passed,
# so every other hyperparameter keeps its library default.
model = fasttext.train_supervised(input="ecommerce.train")
# Evaluate on the held-out file. Per the fastText documentation the returned
# tuple is (number of examples, precision@1, recall@1).
model.test("ecommerce.test")

(10085, 0.9696579077838374, 0.9696579077838374)

In [30]:
# Spot-check with a lower-cased, punctuation-free query written in the same
# shape as the cleaned training text.
model.predict("wintech assemble desktop pc cpu 500 gb sata hdd 4 gb ram intel c2d processor 3")

(('__label__electronics',), array([0.98638427]))

In [31]:
# Spot-check with a clothing description.
# NOTE(review): "ockey" looks like a truncated "jockey" — confirm the intended query.
model.predict("ockey men's cotton t shirt fabric details 80 cotton 20 polyester super combed cotton rich fabric")

(('__label__clothing_accessories',), array([1.00001001]))

In [32]:
# Spot-check with a short book title.
model.predict("think and grow rich deluxe edition")

(('__label__books',), array([1.00000989]))

In [33]:
# List the words whose vectors are closest to "painting" in the trained model.
# NOTE(review): these vectors come from a model trained only on the
# classification task above, so the neighbours may not be semantically
# related — confirm before relying on them.
model.get_nearest_neighbors("painting")

[(0.9987680315971375, 'xchristmas'),
 (0.9987680315971375, 'treetop'),
 (0.9987680315971375, 'bowknots'),
 (0.9987680315971375, '45x24x24cm'),
 (0.9987592697143555, 'carbonized'),
 (0.9987393617630005, '706'),
 (0.9987353682518005, 'creativevia'),
 (0.9987335801124573, '870'),
 (0.9987170100212097, 'gc2040'),
 (0.9987160563468933, 'decorativea')]

In [34]:
# Same nearest-neighbour lookup for a brand name.
model.get_nearest_neighbors("sony")

[(0.9993220567703247, 'invitations'),
 (0.9993042945861816, '49066352'),
 (0.9993042945861816, '9999525002'),
 (0.9992957711219788, 'pctuner'),
 (0.9992957711219788, 'rootkits'),
 (0.9992955923080444, 'portals'),
 (0.9992955923080444, 'adwares'),
 (0.9992955923080444, 'executable'),
 (0.9992955923080444, 'initiate'),
 (0.9992955923080444, 'loggers')]

In [35]:
# Same nearest-neighbour lookup for a common product word.
model.get_nearest_neighbors("digital")

[(0.993916392326355, 'filter'),
 (0.9936544895172119, 'original'),
 (0.9936010241508484, 'hd'),
 (0.993452787399292, 'moisturizer'),
 (0.9932772517204285, 'allows'),
 (0.9930575489997864, 'exceeded'),
 (0.9921830892562866, 'receipts'),
 (0.9921542406082153, 'e394'),
 (0.9916547536849976, '5580'),
 (0.9913280010223389, 'negate')]

In [37]:
# The training text was lower-cased by `preprocess`, so a capitalised query
# can never match a vocabulary entry and every similarity comes back as 0.0.
# Normalise the query the same way as the training data before the lookup.
model.get_nearest_neighbors(preprocess("Bangalore"))

[(0.0, 'to'),
 (0.0, 'and'),
 (0.0, 'a'),
 (0.0, 'with'),
 (0.0, 'for'),
 (0.0, 'is'),
 (0.0, '</s>'),
 (0.0, 'ifci'),
 (0.0, 'upb'),
 (0.0, '66pcs')]