<a href="https://colab.research.google.com/github/saddarudin/google_colab/blob/main/nlp_tf_idf_vectorizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [35]:
# Toy corpus of eight short sentences used to demonstrate TF-IDF below.
# Five of them share the "<company> is announcing new <product> tomorrow" pattern,
# so the words in that pattern occur in many documents.
corpus = [
    "Thor eating pizza, Loki is eating pizza, Ironman ate pizza already",
    "Apple is announcing new iphone tomorrow",
    "Tesla is announcing new model-3 tomorrow",
    "Google is announcing new pixel-6 tomorrow",
    "Microsoft is announcing new surface tomorrow",
    "Amazon is announcing new eco-dot tomorrow",
    "I am eating biryani and you are eating grapes",
    "something is amazing"
]

In [36]:
# Fit a TF-IDF vectorizer with default settings on the toy corpus and keep
# the transformed documents for inspection in later cells.
v = TfidfVectorizer()
transformed_output = v.fit_transform(corpus)
# Show the learned mapping from token to column index.
v.vocabulary_

{'thor': 27,
 'eating': 11,
 'pizza': 23,
 'loki': 18,
 'is': 17,
 'ironman': 16,
 'ate': 8,
 'already': 0,
 'apple': 6,
 'announcing': 5,
 'new': 21,
 'iphone': 15,
 'tomorrow': 28,
 'tesla': 26,
 'model': 20,
 'google': 13,
 'pixel': 22,
 'microsoft': 19,
 'surface': 25,
 'amazon': 3,
 'eco': 12,
 'dot': 10,
 'am': 1,
 'biryani': 9,
 'and': 4,
 'you': 29,
 'are': 7,
 'grapes': 14,
 'something': 24,
 'amazing': 2}

In [37]:
# Print every vocabulary term next to its learned IDF weight.
# The feature-name array is ordered by column index, so it lines up
# element-for-element with `v.idf_` and can be walked in lockstep.
all_feature_names = v.get_feature_names_out()

for word, idf_score in zip(all_feature_names, v.idf_):
  print(f"{word} {idf_score}")

already 2.504077396776274
am 2.504077396776274
amazing 2.504077396776274
amazon 2.504077396776274
and 2.504077396776274
announcing 1.4054651081081644
apple 2.504077396776274
are 2.504077396776274
ate 2.504077396776274
biryani 2.504077396776274
dot 2.504077396776274
eating 2.09861228866811
eco 2.504077396776274
google 2.504077396776274
grapes 2.504077396776274
iphone 2.504077396776274
ironman 2.504077396776274
is 1.1177830356563834
loki 2.504077396776274
microsoft 2.504077396776274
model 2.504077396776274
new 1.4054651081081644
pixel 2.504077396776274
pizza 2.504077396776274
something 2.504077396776274
surface 2.504077396776274
tesla 2.504077396776274
thor 2.504077396776274
tomorrow 1.4054651081081644
you 2.504077396776274


In [38]:
# Peek at the first two documents of the corpus.
corpus[:2]

['Thor eating pizza, Loki is eating pizza, Ironman ate pizza already',
 'Apple is announcing new iphone tomorrow']

In [39]:
# Dense array view of the TF-IDF rows for the first two documents.
transformed_output[:2].toarray()

array([[0.24247317, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.24247317, 0.        ,
        0.        , 0.40642288, 0.        , 0.        , 0.        ,
        0.        , 0.24247317, 0.10823643, 0.24247317, 0.        ,
        0.        , 0.        , 0.        , 0.7274195 , 0.        ,
        0.        , 0.        , 0.24247317, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.31652498, 0.5639436 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.5639436 , 0.        , 0.25173606, 0.        , 0.        ,
        0.        , 0.31652498, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.31652498, 0.        ]])

In [40]:
import pandas as pd

# Load the dataset from a CSV given by a relative path, then preview the first rows.
df = pd.read_csv('Ecommerce_data.csv')
df.head()

Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [41]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(24000, 2)

In [42]:
# Count rows per category to check how balanced the classes are.
df.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Household,6000
Electronics,6000
Clothing & Accessories,6000
Books,6000


In [43]:
# Integer code for each product category.
labels = {'Household':0, 'Electronics':1, 'Clothing & Accessories':2, 'Books':3}

# Series.map turns every value that is not a key of `labels` into NaN, so
# re-running this cell on the already-encoded column would wipe out all labels.
# Encode only while the column is not fully encoded yet, which keeps the cell
# safe to execute more than once.
already_encoded = df['label'].isin(list(labels.values())).all()
if not already_encoded:
  encoded = df['label'].map(labels)
  # Validate before overwriting the column: a category missing from `labels`
  # should fail here rather than leak NaN into the stratified split later on.
  unknown = df.loc[encoded.isna(), 'label'].unique()
  assert len(unknown) == 0, f"labels missing from mapping: {list(unknown)}"
  df['label'] = encoded

df.head()

Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,0
1,"Contrast living Wooden Decorative Box,Painted ...",0
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,1
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,2
4,Indira Designer Women's Art Mysore Silk Saree ...,2


In [44]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both partitions, and the fixed seed makes
# the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df['Text'],
    df['label'],
    test_size=0.20,
    random_state=2022,
    stratify=df['label'],
)

(x_train.shape, x_test.shape)

((19200,), (4800,))

In [45]:
# Per-class row counts in the training labels.
y_train.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,4800
2,4800
3,4800
1,4800


In [46]:
# Per-class row counts in the test labels.
y_test.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,1200
2,1200
3,1200
1,1200


In [47]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [48]:
# Two-stage text classifier: raw text -> TF-IDF features -> k-nearest neighbours.
clf = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer()),
    ('KNN', KNeighborsClassifier()),
])

# Fit on the training split, predict the held-out split, and report
# per-class precision / recall / F1.
y_pred = clf.fit(x_train, y_train).predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.96      0.95      1200
           1       0.96      0.97      0.97      1200
           2       0.97      0.98      0.98      1200
           3       0.98      0.95      0.96      1200

    accuracy                           0.97      4800
   macro avg       0.97      0.97      0.97      4800
weighted avg       0.97      0.97      0.97      4800



In [49]:
import spacy

# Load the `en_core_web_sm` spaCy pipeline once; preprocess() calls it for every text.
nlp = spacy.load('en_core_web_sm')

def preprocess(text):
  """Return `text` reduced to the lemmas of its non-stop-word, non-punctuation tokens.

  The text is run through the module-level `nlp` pipeline; tokens flagged as
  stop words or punctuation are dropped, and the lemmas of the remaining
  tokens are joined with single spaces.
  """
  kept_lemmas = [
      token.lemma_
      for token in nlp(text)
      if not (token.is_stop or token.is_punct)
  ]
  return " ".join(kept_lemmas)