In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus of seven short sentences used to inspect how TF-IDF weights terms.
# Five of them share the "<company> is announcing new <product> tomorrow"
# template, so the shared words appear in many documents; the first and last
# sentences are about eating and repeat some words within a single document.
corpus = [
    "Thor eating pizza, Loki is eating pizza, Ironman ate pizza already",
    "Apple is announcing new iphone tomorrow",
    "Tesla is announcing new model-3 tomorrow",
    "Google is announcing new pixel-6 tomorrow",
    "Microsoft is announcing new surface tomorrow",
    "Amazon is announcing new eco-dot tomorrow",
    "I am eating biryani and you are eating grapes"
]

In [3]:
# Fit a TF-IDF vectorizer with default settings on the toy corpus and keep the
# resulting document-term matrix so individual rows can be inspected later.
v = TfidfVectorizer()
transform_output = v.fit_transform(corpus)
# vocabulary_ maps each learned (lower-cased) term to its integer index.
# Note in the printed mapping that "model-3", "pixel-6" and "eco-dot" were split
# on the hyphen and the single-character pieces ("3", "6") are absent.
print(v.vocabulary_)


{'thor': 25, 'eating': 10, 'pizza': 22, 'loki': 17, 'is': 16, 'ironman': 15, 'ate': 7, 'already': 0, 'apple': 5, 'announcing': 4, 'new': 20, 'iphone': 14, 'tomorrow': 26, 'tesla': 24, 'model': 19, 'google': 12, 'pixel': 21, 'microsoft': 18, 'surface': 23, 'amazon': 2, 'eco': 11, 'dot': 9, 'am': 1, 'biryani': 8, 'and': 3, 'you': 27, 'are': 6, 'grapes': 13}


In [10]:
# Print the IDF weight next to every vocabulary term. Terms that occur in many
# documents (e.g. "is", "new") end up with a lower IDF than terms that occur in
# only one document.
idf_weights = v.idf_
index_of_term = v.vocabulary_
for term in v.get_feature_names_out():
  print(idf_weights[index_of_term[term]], term)

2.386294361119891 already
2.386294361119891 am
2.386294361119891 amazon
2.386294361119891 and
1.2876820724517808 announcing
2.386294361119891 apple
2.386294361119891 are
2.386294361119891 ate
2.386294361119891 biryani
2.386294361119891 dot
1.9808292530117262 eating
2.386294361119891 eco
2.386294361119891 google
2.386294361119891 grapes
2.386294361119891 iphone
2.386294361119891 ironman
1.1335313926245225 is
2.386294361119891 loki
2.386294361119891 microsoft
2.386294361119891 model
1.2876820724517808 new
2.386294361119891 pixel
2.386294361119891 pizza
2.386294361119891 surface
2.386294361119891 tesla
2.386294361119891 thor
1.2876820724517808 tomorrow
2.386294361119891 you


In [4]:
# The same IDF weights as one array; position i belongs to the term whose
# vocabulary_ index is i.
v.idf_

array([2.38629436, 2.38629436, 2.38629436, 2.38629436, 1.28768207,
       2.38629436, 2.38629436, 2.38629436, 2.38629436, 2.38629436,
       1.98082925, 2.38629436, 2.38629436, 2.38629436, 2.38629436,
       2.38629436, 1.13353139, 2.38629436, 2.38629436, 2.38629436,
       1.28768207, 2.38629436, 2.38629436, 2.38629436, 2.38629436,
       2.38629436, 1.28768207, 2.38629436])

In [11]:
# Show the first two raw documents for reference.
corpus[:2]

['Thor eating pizza, Loki is eating pizza, Ironman ate pizza already',
 'Apple is announcing new iphone tomorrow']

In [13]:
# Dense view of the TF-IDF rows for the first two documents; each position is
# the weight of the term with that vocabulary_ index (0 where the term is absent).
transform_output.toarray()[:2]

array([[0.24266547, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.24266547, 0.        , 0.        ,
        0.40286636, 0.        , 0.        , 0.        , 0.        ,
        0.24266547, 0.11527033, 0.24266547, 0.        , 0.        ,
        0.        , 0.        , 0.72799642, 0.        , 0.        ,
        0.24266547, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.30652086,
        0.5680354 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.5680354 ,
        0.        , 0.26982671, 0.        , 0.        , 0.        ,
        0.30652086, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.30652086, 0.        ]])

In [16]:
import pandas as pd
# Load the e-commerce product descriptions; the path is relative to the
# notebook's working directory.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# row index that was written into the CSV - consider index_col=0 if it is not needed.
df = pd.read_csv('Ecommerce_data.csv')
# Preview the first rows to confirm the columns used below (Text, label).
df.head()

Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [17]:
# Check class balance: count how many rows each category label has.
df['label'].value_counts()

Household                 6000
Electronics               6000
Clothing & Accessories    6000
Books                     6000
Name: label, dtype: int64

In [18]:
# Encode each category name as an integer class id for the classifier.
# Any label not listed here would be mapped to NaN by Series.map.
label_to_id = {
    'Household': 0,
    'Books': 1,
    'Electronics': 2,
    'Clothing & Accessories': 3,
}
df['label_new'] = df['label'].map(label_to_id)

In [19]:
# Sanity-check the encoding: the per-id counts should match the per-label
# counts, which also confirms that no label was left unmapped.
df['label_new'].value_counts()

0    6000
2    6000
3    6000
1    6000
Name: label_new, dtype: int64

In [26]:
from sklearn.model_selection import train_test_split

# Split the raw product text and its integer labels into train and test sets.
# Stratifying on the label keeps the class proportions the same in both parts,
# and the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df['Text'], df['label_new'],
    test_size=0.2,  # 20% samples will go to test dataset
    random_state=2022,
    stratify=df['label_new'],
)

In [23]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [24]:
# Two-stage text classifier: TF-IDF features followed by k-nearest-neighbours,
# both with their default settings.
pipeline_steps = [
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('knn', KNeighborsClassifier()),
]
clf = Pipeline(steps=pipeline_steps)

In [27]:
# Fit the whole pipeline (vectorizer, then classifier) on the training split.
clf.fit(x_train,y_train)

In [None]:
import spacy
# Load spaCy's small English pipeline once; preprocess() calls it for every text.
# The 'en_core_web_sm' model package must be installed in the kernel's environment.
nlp = spacy.load('en_core_web_sm')
def preprocess(text):
  """Return `text` reduced to the lemmas of its informative tokens.

  The text is run through the module-level spaCy pipeline `nlp`. Tokens that
  are punctuation or stop words are dropped, and the lemma of every remaining
  token is kept.

  Args:
    text: The raw string to clean.

  Returns:
    The kept lemmas joined by single spaces, or an empty string if every
    token was filtered out.
  """
  # A token must be dropped when it is punctuation OR a stop word. The previous
  # condition, `not (is_punct and is_stop)`, only dropped tokens that were both
  # at once, so in practice nothing was ever removed.
  kept_lemmas = [
      token.lemma_
      for token in nlp(text)
      if not (token.is_punct or token.is_stop)
  ]
  return " ".join(kept_lemmas)

# Store the cleaned version of every description in a new column.
# This runs the spaCy pipeline once per row, so expect it to be slow on the
# full dataset.
df['preprocess_txt'] =  df['Text'].apply(preprocess)

In [None]:
from sklearn.model_selection import train_test_split

# Re-split the data using the preprocessed (lemmatized) text as the features
# and the integer class ids as the target. The earlier version passed the raw
# text as features and the preprocessed text as the target, which turned
# y_train / y_test into strings instead of labels.
# The random_state and stratify column match the raw-text split, so both
# splits should partition the rows the same way. Any pipeline fitted on the
# raw-text split has to be fitted again on these arrays before it is evaluated.
x_train, x_test, y_train, y_test = train_test_split(
    df.preprocess_txt,  # features: cleaned text
    df.label_new,       # target: integer class ids
    test_size=0.2, # 20% samples will go to test dataset
    random_state=2022,
    stratify=df.label_new
)

In [29]:
# Predict the class of every test document and print the classification report
# comparing the predictions against the true labels.
# NOTE(review): clf is fitted in an earlier cell; if x_train / x_test have been
# re-split since then, refit clf first so the model and the test data come from
# the same split. The saved output shows this cell was interrupted before it
# finished, so no report is recorded.
y_pred = clf.predict(x_test)
print(classification_report(y_test,y_pred))

KeyboardInterrupt: ignored