In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

# corpus = [
#     "Thor eating pizza, Loki is eating pizza, Ironman ate pizza already",
#     "Apple is announcing new iphone tomorrow",
#     "Tesla is announcing new model-3 tomorrow",
#     "Google is announcing new pixel-6 tomorrow",
#     "Microsoft is announcing new surface tomorrow",
#     "Amazon is announcing new eco-dot tomorrow",
#     "I am eating biryani and you are eating grapes"
# ]

corpus=['talha thor hammad']

In [23]:
v = TfidfVectorizer()
v.fit(corpus)
transform_output = v.transform(corpus)
print(v.vocabulary_)

{'talha': 1, 'thor': 2, 'hammad': 0}


In [24]:
v.transform(['Talha eat pizza']).toarray()

array([[0., 1., 0.]])

In [25]:
import pandas as pd

#read the data into a pandas dataframe
df = pd.read_csv("Ecommerce_data.csv")
print(df.shape)
df.head(5)

(24000, 2)


Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [26]:
#check the distribution of labels 
df['label'].value_counts()

Household                 6000
Electronics               6000
Clothing & Accessories    6000
Books                     6000
Name: label, dtype: int64

In [27]:
df['label_num'] = df['label'].map({
    'Household' : 0, 
    'Books': 1, 
    'Electronics': 2, 
    'Clothing & Accessories': 3
})

#checking the results 
df.head(5)

Unnamed: 0,Text,label,label_num
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,0
1,"Contrast living Wooden Decorative Box,Painted ...",Household,0
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,2
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,3
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,3


In [28]:
df.shape

(24000, 3)

In [29]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df.Text, 
    df.label_num, 
    test_size=0.2, # 20% samples will go to test dataset
    random_state=2022,
    stratify=df.label_num
)

In [40]:
X_test

20706    Lal Haveli Designer Handmade Patchwork Decorat...
19166    GOTOTOP Classical Retro Cotton & PU Leather Ne...
15209    FabSeasons Camouflage Polyester Multi Function...
2462     Indian Superfoods: Change the Way You Eat Revi...
6621     Milton Marvel Insulated Steel Casseroles, Juni...
                               ...                        
15690    OYO BABY Waterproof Bed Protector Dry Sheet -L...
17711    Asus Gaming FX570UD-E4168T 15.6-inch Laptop (8...
17906    Dennis Lingo Men's Cotton Casual Full Sleeves ...
19585    ALC Creation Women's Crepe A-Line Kurti This i...
7865              Goosebumps Most Wanted #04: The Haunter 
Name: Text, Length: 4800, dtype: object

In [44]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

#1. create a pipeline object
clf = Pipeline([
     ('vectorizer_tfidf',TfidfVectorizer()),    
     ('Multi NB', MultinomialNB())         
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))
v.get_feature_names_out()

              precision    recall  f1-score   support

           0       0.92      0.96      0.94      1200
           1       0.98      0.92      0.95      1200
           2       0.97      0.97      0.97      1200
           3       0.97      0.99      0.98      1200

    accuracy                           0.96      4800
   macro avg       0.96      0.96      0.96      4800
weighted avg       0.96      0.96      0.96      4800



array(['hammad', 'talha', 'thor'], dtype=object)

In [50]:
# Access the TfidfVectorizer component of your pipeline
tfidf_vectorizer = clf.named_steps['vectorizer_tfidf']

# Get the feature names (vocabulary)
vocabulary = tfidf_vectorizer.get_feature_names_out()

# Print the vocabulary
print(vocabulary[:-10])
vocabulary.shape


['00' '000' '0000' ... 'மல' 'யலம' 'ரச']


(47580,)

In [32]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9602083333333333

In [33]:
clf.score(X_test, y_test)

0.9602083333333333

In [34]:
X_test[:5]

20706    Lal Haveli Designer Handmade Patchwork Decorat...
19166    GOTOTOP Classical Retro Cotton & PU Leather Ne...
15209    FabSeasons Camouflage Polyester Multi Function...
2462     Indian Superfoods: Change the Way You Eat Revi...
6621     Milton Marvel Insulated Steel Casseroles, Juni...
Name: Text, dtype: object

In [35]:
X_test[:5][6621]

'Milton Marvel Insulated Steel Casseroles, Junior Gift Set, 3 Pieces, Pink Color Name:Pink    Multipurpose 3 Piece Casserole Set for Your Regular Use : Keep your freshly cooked food warmer for longer with the Milton casserole set of three. The other two casserole pieces of 1000 ml and 1500 ml, respectively can be used to store larger portions of food. This set is ideal for modern families. Manage time efficiently by preparing all the dishes in advance and having it steaming hot out of the casserole whenever you want.   Top Quality Insulation to keep Food Warm for Longer : These Milton casseroles are specially engineered so that the food you store inside is warmer and fresher for longer. The inner sides of the casseroles are made of quality steel while the outer body is made from high quality plastic, making the containers lightweight and efficient.   Easy to Maintain and Store Away : The simple streamlined designs of these casseroles make it easy for you to clean them. Available in an 

In [36]:
y_test[:5]

20706    0
19166    2
15209    3
2462     1
6621     3
Name: label_num, dtype: int64

In [37]:
y_pred[:5]

array([0, 2, 3, 1, 0], dtype=int64)

In [38]:
# Sample predicted values
# y_pred = [2]

# # Loop through each predicted value and apply the conditions
# for val in y_pred:
#     if val == 0:
#         print('Household')
#     elif val == 1:
#         print('Books')
#     elif val == 2:
#         print('Electronics')
#     else:
#         print('Clothing & Accessories')
