<a href="https://colab.research.google.com/github/saddarudin/google_colab/blob/main/nlp_fastText_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

df = pd.read_csv('Ecommerce_data.csv')
print(df.shape)
df.head()

(24000, 2)


Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24000 entries, 0 to 23999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    24000 non-null  object
 1   label   24000 non-null  object
dtypes: object(2)
memory usage: 375.1+ KB


In [3]:
df.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Household,6000
Electronics,6000
Clothing & Accessories,6000
Books,6000


In [4]:
df.label.replace("Clothing & Accessories", "Clothing_Accessories", inplace=True)
df.label.value_counts()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.label.replace("Clothing & Accessories", "Clothing_Accessories", inplace=True)


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Household,6000
Electronics,6000
Clothing_Accessories,6000
Books,6000


## FastText expects the file to be like this:
### __ label __household Paper Plane design...
### __ label __electronics Apple iPhone 6S...




In [5]:
df['label'] = "__label__"+df['label'].astype(str)
df.head()

Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,__label__Household
1,"Contrast living Wooden Decorative Box,Painted ...",__label__Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,__label__Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,__label__Clothing_Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,__label__Clothing_Accessories


In [6]:
df['category_desctiption'] = df['label']+' '+df['Text']
df.head()

Unnamed: 0,Text,label,category_desctiption
0,Urban Ladder Eisner Low Back Study-Office Comp...,__label__Household,__label__Household Urban Ladder Eisner Low Bac...
1,"Contrast living Wooden Decorative Box,Painted ...",__label__Household,__label__Household Contrast living Wooden Deco...
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,__label__Electronics,__label__Electronics IO Crest SY-PCI40010 PCI ...
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,__label__Clothing_Accessories,__label__Clothing_Accessories ISAKAA Baby Sock...
4,Indira Designer Women's Art Mysore Silk Saree ...,__label__Clothing_Accessories,__label__Clothing_Accessories Indira Designer ...


In [7]:
import re

text = "VIKI's | Bookcase/Bookshelf (3-shelf/Shelve, White) | ? . hi"

## Anything that is not a word and neither space means it should be special character so to represent it in regular expression we use
### ^\w\s

In [8]:
text = re.sub(r'[^\w\s\']',' ',text)
text

"VIKI's   Bookcase Bookshelf  3 shelf Shelve  White        hi"

In [9]:
text = re.sub(r" +",' ',text)
text

"VIKI's Bookcase Bookshelf 3 shelf Shelve White hi"

In [10]:
text = text.strip().lower()

In [11]:
text

"viki's bookcase bookshelf 3 shelf shelve white hi"

In [12]:
def preprocess(text):
  text = re.sub(r'[^\w\s\']',' ',text)
  text = re.sub(r" +",' ',text)
  return text.strip().lower()

In [13]:
preprocess("VIKI's | Bookcase/Bookshelf (3-shelf/Shelve, White) | ? . hi")

"viki's bookcase bookshelf 3 shelf shelve white hi"

In [14]:
df['category_desctiption'] = df.category_desctiption.apply(preprocess)
df.head()

Unnamed: 0,Text,label,category_desctiption
0,Urban Ladder Eisner Low Back Study-Office Comp...,__label__Household,__label__household urban ladder eisner low bac...
1,"Contrast living Wooden Decorative Box,Painted ...",__label__Household,__label__household contrast living wooden deco...
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,__label__Electronics,__label__electronics io crest sy pci40010 pci ...
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,__label__Clothing_Accessories,__label__clothing_accessories isakaa baby sock...
4,Indira Designer Women's Art Mysore Silk Saree ...,__label__Clothing_Accessories,__label__clothing_accessories indira designer ...


In [15]:
from sklearn.model_selection import train_test_split

train,test = train_test_split(df,test_size=0.20)
train.shape

(19200, 3)

In [16]:
train.head()

Unnamed: 0,Text,label,category_desctiption
6871,Xs and Os Women Halter Neck Babydoll Lingerie ...,__label__Clothing_Accessories,__label__clothing_accessories xs and os women ...
3854,COCO CHANEL Men's Combo of Cotton Silk Necktie...,__label__Clothing_Accessories,__label__clothing_accessories coco chanel men'...
16265,Store2508® Moisture Absorber Room Dehumidifier...,__label__Household,__label__household store2508 moisture absorber...
1138,Your Prime Minister is Dead,__label__Books,__label__books your prime minister is dead
3433,Essays for Civil and Judicial Services Examin...,__label__Books,__label__books essays for civil and judicial s...


In [17]:
test.head()

Unnamed: 0,Text,label,category_desctiption
18649,Edup 802.11n Wireless Adapter with 6dbi Extern...,__label__Electronics,__label__electronics edup 802 11n wireless ada...
5223,KYLON Black And Red Bowtie For Kids It can be ...,__label__Clothing_Accessories,__label__clothing_accessories kylon black and ...
2691,LealDealz Premium Cotton Loafer Socks with Ant...,__label__Clothing_Accessories,__label__clothing_accessories lealdealz premiu...
11371,New Saraswati Health and Physical Education Cl...,__label__Books,__label__books new saraswati health and physic...
17320,Puma Men's Track Jacket,__label__Clothing_Accessories,__label__clothing_accessories puma men's track...


In [18]:
train.to_csv('ecommerce.txt',columns=['category_desctiption'],header=None,index=False)

In [19]:
test.to_csv('ecommerce_test.txt',columns=['category_desctiption'],header=None,index=False)

In [20]:
!pip install fasttext
import fasttext



In [21]:
model = fasttext.train_supervised(input='ecommerce.txt')

In [22]:
model.test("ecommerce_test.txt")

(4800, 0.9683333333333334, 0.9683333333333334)

### model.test returns (test_data_size, precision, recall)

### Since fasttext is incompatible with current version of numpy so it is not printing the array so I'm downgrading numpy

In [23]:
!pip install numpy==1.24.4



In [24]:
prediction= model.predict("wintech assemble desktop pc cpu 500 gb sata hdd 4 gb ram intel c2d processor 3")

In [25]:
prediction

(('__label__electronics',), array([0.99861699]))

In [26]:
model.predict('Think and Grow Rich')

(('__label__books',), array([1.00000966]))

In [27]:
model.get_nearest_neighbors("sony")

[(0.9989221096038818, 'sata3'),
 (0.9989211559295654, 'coushioned'),
 (0.9989211559295654, 'elv'),
 (0.9989211559295654, '43mm'),
 (0.9989211559295654, 'cc3'),
 (0.9989211559295654, 'labelled'),
 (0.9989157915115356, 'attackers'),
 (0.9989157915115356, 'trackmylaptop'),
 (0.9989131689071655, 'nat'),
 (0.9989099502563477, 'winxp')]

In [28]:
model.get_nearest_neighbors("painting")

[(0.9957024455070496, 'oven'),
 (0.9956722855567932, 'inchpackage'),
 (0.9956415295600891, 'homebig'),
 (0.995618462562561, 'bormioli'),
 (0.995618462562561, 'dinners'),
 (0.9956150650978088, 'hereby'),
 (0.9956116676330566, 'sham'),
 (0.9955703020095825, 'bagsthink'),
 (0.9955605268478394, 'stadiums'),
 (0.9954984188079834, '35313')]