In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd

In [2]:
# Toy corpus: five short sentences used for the TF-IDF demonstration.
corpus = [
    'The cat sat on the mat.',
    'The dog chased the ball.',
    'The bird flew away.',
    'The sun is shining brightly.',
    'I love to eat pizza.',
]

In [4]:
# Learn the vocabulary and IDF weights from the toy corpus and keep the
# transformed corpus in `transformed`.
v = TfidfVectorizer()
transformed = v.fit_transform(corpus)

# vocabulary_ maps each learned term to its column index.
print(v.vocabulary_)

{'the': 17, 'cat': 4, 'sat': 14, 'on': 12, 'mat': 11, 'dog': 6, 'chased': 5, 'ball': 1, 'bird': 2, 'flew': 8, 'away': 0, 'sun': 16, 'is': 9, 'shining': 15, 'brightly': 3, 'love': 10, 'to': 18, 'eat': 7, 'pizza': 13}


In [8]:
# Print every vocabulary term next to its inverse-document-frequency weight.
# get_feature_names_out() lists the terms in column order, so it lines up
# element-for-element with idf_.
feature_names = v.get_feature_names_out()

for term, idf in zip(feature_names, v.idf_):
    print(f"{term}:{idf}")

away:2.09861228866811
ball:2.09861228866811
bird:2.09861228866811
brightly:2.09861228866811
cat:2.09861228866811
chased:2.09861228866811
dog:2.09861228866811
eat:2.09861228866811
flew:2.09861228866811
is:2.09861228866811
love:2.09861228866811
mat:2.09861228866811
on:2.09861228866811
pizza:2.09861228866811
sat:2.09861228866811
shining:2.09861228866811
sun:2.09861228866811
the:1.1823215567939547
to:2.09861228866811


In [44]:
# Load the e-commerce dataset, labelling its two columns via names=, and
# discard rows that have no text in `content`.
# NOTE(review): the path is machine-specific; point it at your copy of the CSV.
df = pd.read_csv(
    '/home/siva/Downloads/ecommerceDataset.csv',
    names=['category', 'content'],
).dropna(subset=['content'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50424 entries, 0 to 50424
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  50424 non-null  object
 1   content   50424 non-null  object
dtypes: object(2)
memory usage: 1.2+ MB


In [45]:
# Number of rows per category (shows how imbalanced the classes are).
df['category'].value_counts()

category
Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8670
Name: count, dtype: int64

In [47]:
# Balance the classes by down-sampling every category to the size of the
# smallest one. The size is derived from the data instead of being hard-coded,
# so the cell keeps working if the dataset changes.
n_per_class = int(df['category'].value_counts().min())

# Draw each category's sample with the same seed, in order of first
# appearance, and concatenate once rather than growing a frame inside a loop.
df_sampled = pd.concat(
    [
        df[df['category'] == value].sample(n_per_class, random_state=2023)
        for value in df['category'].unique()
    ],
    axis=0,
)
df_sampled

Unnamed: 0,category,content
18926,Household,"iBELL Electric Screwdriver Model: SD12-75, Dia..."
6637,Household,TIED RIBBONS Anti Slip Shelf Liner Mat for Kit...
14244,Household,"Ramakada Make Up and Cosmetic Set Suitcase, Du..."
1073,Household,Commercial Special 6/10 ft- Bamboo Roll Up Bli...
1455,Household,Tayyaba Enterprises Pure Sheesham Wooden Rocki...
...,...,...
44219,Electronics,Teconica GM-360 Mini Rugby Bluetooth Wireless ...
46141,Electronics,Osaka S-Type Bracket Elinchrom S Mount Holder ...
41341,Electronics,"rts Esynic 3.5"" USB External-Floppy Disk Drive..."
44913,Electronics,Rhobos Mini Music Clip Mp3 Player with Sd Card...


In [48]:
# Encode the category names as integers and store them in a new
# `category_encoded` column next to the original labels.
label_encoder = LabelEncoder()
category_codes = label_encoder.fit_transform(df_sampled['category'])
df_sampled['category_encoded'] = category_codes
df_sampled.info()

<class 'pandas.core.frame.DataFrame'>
Index: 34680 entries, 18926 to 40529
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   category          34680 non-null  object
 1   content           34680 non-null  object
 2   category_encoded  34680 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.1+ MB


In [49]:
# Hold out 20% of the balanced data for evaluation, stratified on the category
# so both splits keep the same class proportions. random_state is fixed (same
# seed as the sampling step) so the split, and therefore the reported metrics,
# are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_sampled['content'],
    df_sampled['category'],
    test_size=0.2,
    stratify=df_sampled['category'],
    random_state=2023,
)

# TF-IDF features fed into a k-nearest-neighbours classifier, both with
# their default settings.
clf = Pipeline([
    ('tf-idf', TfidfVectorizer()),
    ('knn', KNeighborsClassifier()),
])
clf.fit(X_train, y_train)

In [None]:
# Predict the held-out texts and print the classification report.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)