In [1]:
import pandas as pd

# Load the dataset; the path is relative, so the CSV must sit in the
# notebook's working directory.
df = pd.read_csv("Ecommerce_data.csv")

In [2]:
# Preview the first rows to see which columns the file provides.
df.head()

Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [3]:
# Count rows per category to check class balance before splitting.
df.label.value_counts()

label
Household                 6000
Electronics               6000
Clothing & Accessories    6000
Books                     6000
Name: count, dtype: int64

In [4]:
# Encode each category name as an integer class id in a new `label_num` column.
df['label_num'] = df['label'].map(
    {'Household': 0, 'Electronics': 1, 'Clothing & Accessories': 2, 'Books': 3}
)

In [5]:
# Confirm the new `label_num` column lines up with `label`.
df.head()

Unnamed: 0,Text,label,label_num
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,0
1,"Contrast living Wooden Decorative Box,Painted ...",Household,0
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,1
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,2
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,2


In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Hold out 20% of the rows for evaluation. Stratifying on the class id keeps
# the class proportions the same in both partitions; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['Text'], df['label_num'],
    test_size=0.2, random_state=1999, stratify=df['label_num'],
)

In [8]:
# Sanity-check the partition sizes produced by the split.
print(X_train.shape,X_test.shape)

(19200,) (4800,)


In [9]:
# Verify that stratification kept the classes balanced in the training partition.
y_train.value_counts()

label_num
0    4800
3    4800
2    4800
1    4800
Name: count, dtype: int64

In [10]:
# Verify that stratification kept the classes balanced in the test partition.
y_test.value_counts()

label_num
0    1200
1    1200
2    1200
3    1200
Name: count, dtype: int64

In [11]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
# Baseline model on the raw text: TF-IDF features feeding a k-nearest-neighbours
# classifier, both constructed with their default settings.
clf = Pipeline(steps=[
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('knn', KNeighborsClassifier()),
])

In [13]:
# Fit the whole pipeline (vectorizer, then classifier) on the training partition only.
clf.fit(X_train, y_train)

In [14]:
# Predict class ids for the held-out texts; the bare name on the last line
# displays the resulting array.
y_pred = clf.predict(X_test)
y_pred

array([0, 1, 1, ..., 2, 2, 2], dtype=int64)

In [15]:
# Per-class precision / recall / F1 of the raw-text model on the held-out set.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1200
           1       0.96      0.97      0.97      1200
           2       0.97      0.98      0.98      1200
           3       0.97      0.95      0.96      1200

    accuracy                           0.97      4800
   macro avg       0.97      0.97      0.97      4800
weighted avg       0.97      0.97      0.97      4800



In [16]:
# Spot-check: the first five true labels ...
y_test[:5]

6556     0
22921    1
21024    1
14040    1
15434    2
Name: label_num, dtype: int64

In [17]:
# ... against the first five predicted labels.
y_pred[:5]

array([0, 1, 1, 1, 2], dtype=int64)

In [18]:
import spacy

In [19]:
# Load the small English pipeline once at module level; `preprocess` reuses it.
# Only lemmas, stop-word flags and punctuation flags are read from the tokens,
# so the named-entity recogniser is switched off to avoid running it on every
# document (re-enable it if entity annotations are ever needed).
# The parser is deliberately left enabled: spaCy's English pipelines can use
# dependency parses to refine POS tags, which the lemmatizer relies on, so
# disabling it could change the lemmas.
nlp = spacy.load('en_core_web_sm', disable=['ner'])

def preprocess(text):
    """Return `text` reduced to its lemmas, joined by single spaces.

    The text is run through the module-level spaCy pipeline ``nlp``. Tokens
    flagged as stop words or punctuation are dropped; every remaining token
    contributes its ``lemma_``.
    """
    lemmas = [
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct)
    ]
    return " ".join(lemmas)



In [20]:
# Store the cleaned, lemmatised version of every description in a new column.
# NOTE(review): this runs the spaCy pipeline once per row; expect it to be slow
# on the full dataset.
df['Preprocessed_txt'] = df['Text'].apply(preprocess)

In [21]:
# Compare `Text` with `Preprocessed_txt` side by side to sanity-check the cleaning.
df.head()

Unnamed: 0,Text,label,label_num,Preprocessed_txt
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,0,Urban Ladder Eisner low Study Office Computer ...
1,"Contrast living Wooden Decorative Box,Painted ...",Household,0,contrast live Wooden Decorative Box Painted Bo...
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,1,IO Crest SY PCI40010 PCI raid Host Controller ...
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,2,ISAKAA Baby Socks bear 8 Years- Pack 4 6 8 12 ...
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,2,Indira Designer Women Art Mysore Silk Saree Bl...


In [22]:
# Re-split using the preprocessed text as the feature column. The test size,
# seed and stratification column are the same as for the raw-text split, and
# the previous X_/y_ variables are overwritten.
X_train, X_test, y_train, y_test = train_test_split(
    df['Preprocessed_txt'], df['label_num'],
    test_size=0.2, random_state=1999, stratify=df['label_num'],
)

In [23]:
# Fresh, unfitted TF-IDF + k-nearest-neighbours pipeline (default settings)
# for the preprocessed text; rebinding `clf` discards the earlier fitted model.
clf = Pipeline(steps=[
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('knn', KNeighborsClassifier()),
])

In [24]:
# Fit the pipeline on the preprocessed training texts.
clf.fit(X_train, y_train)

In [25]:
# Predict class ids for the preprocessed held-out texts and display them.
y_pred = clf.predict(X_test)
y_pred

array([0, 1, 1, ..., 2, 2, 2], dtype=int64)

In [26]:
# Per-class metrics for the preprocessed-text model, for comparison with the
# raw-text report printed earlier in the notebook.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.96      0.96      1200
           1       0.96      0.97      0.97      1200
           2       0.97      0.98      0.98      1200
           3       0.98      0.95      0.96      1200

    accuracy                           0.97      4800
   macro avg       0.97      0.97      0.97      4800
weighted avg       0.97      0.97      0.97      4800

