In [1]:
!pip install pandas



In [2]:
import pandas as pd
# Read the e-commerce CSV (path is relative to the notebook's working directory).
df = pd.read_csv("./data/Ecommerce_data.csv")

In [3]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(24000, 2)

In [5]:
# Check class balance: number of rows per label.
df["label"].value_counts()

label
Household                 6000
Electronics               6000
Clothing & Accessories    6000
Books                     6000
Name: count, dtype: int64

In [6]:
# Encode the category names as integer class ids for the classifier.
label_to_id = {
    "Household": 0,
    "Books": 1,
    "Electronics": 2,
    "Clothing & Accessories": 3,
}
# Re-running a plain .map() on already-encoded labels would silently turn every
# value into NaN, so only encode while the column does not yet hold the ids.
if not df["label"].isin(list(label_to_id.values())).all():
    encoded = df["label"].map(label_to_id)
    if encoded.isna().any():
        # Fail loudly instead of carrying NaN labels into the train/test split.
        unknown = df.loc[encoded.isna(), "label"].unique().tolist()
        raise ValueError(f"Unmapped label values: {unknown}")
    df["label"] = encoded
df.head()

Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,0
1,"Contrast living Wooden Decorative Box,Painted ...",0
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,2
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,3
4,Indira Designer Women's Art Mysore Silk Saree ...,3


In [7]:
!pip install scikit-learn spacy



In [8]:
from sklearn.model_selection import train_test_split
# Stratified 80/20 split of the raw text, so each class keeps the same share
# in the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    df["Text"],
    df["label"],
    test_size=0.2,
    random_state=2022,  # fixed seed for a reproducible split
    stratify=df["label"],
)

In [9]:
# Number of training samples after the split.
X_train.shape

(19200,)

In [10]:
# Per-class counts in the training labels (confirms the stratified split).
y_train.value_counts()

label
0    4800
2    4800
3    4800
1    4800
Name: count, dtype: int64

In [11]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

# Baseline: TF-IDF features computed on the raw text, classified with
# k-nearest neighbours (both with library defaults).
clf = Pipeline(
    steps=[
        ("vectorizer_tfidf", TfidfVectorizer()),
        ("KNN", KNeighborsClassifier()),
    ]
).fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.96      0.95      1200
           1       0.97      0.95      0.96      1200
           2       0.97      0.97      0.97      1200
           3       0.97      0.98      0.97      1200

    accuracy                           0.96      4800
   macro avg       0.96      0.96      0.96      4800
weighted avg       0.96      0.96      0.96      4800



In [12]:
# First five true labels of the test split.
y_test.head(5)

20706    0
19166    2
15209    3
2462     1
6621     3
Name: label, dtype: int64

In [13]:
# First five predicted labels, to compare against the true labels of the test split.
y_pred[:5]

array([0, 2, 3, 1, 0], dtype=int64)

Now using spaCy to preprocess the text (stop-word and punctuation removal, lemmatization)

In [14]:
import spacy
nlp = spacy.load("en_core_web_sm")


def preprocess(text):
    """Return `text` with stop words and punctuation removed and the rest lemmatized.

    The text is run through the spaCy pipeline held in the module-level `nlp`;
    the lemmas of the remaining tokens are joined with single spaces.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)

In [15]:
# Build the cleaned-text column; preprocess() (and thus the spaCy pipeline)
# is called once per row.
df["preprocessed_txt"] = df["Text"].map(preprocess)

In [16]:
# Compare the original text with its preprocessed version on the first five rows.
df.head(n=5)

Unnamed: 0,Text,label,preprocessed_txt
0,Urban Ladder Eisner Low Back Study-Office Comp...,0,Urban Ladder Eisner low Study Office Computer ...
1,"Contrast living Wooden Decorative Box,Painted ...",0,contrast live Wooden Decorative Box Painted Bo...
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,2,IO Crest SY PCI40010 PCI raid Host Controller ...
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,3,ISAKAA Baby Socks bear 8 Years- Pack 4 6 8 12 ...
4,Indira Designer Women's Art Mysore Silk Saree ...,3,Indira Designer Women Art Mysore Silk Saree Bl...


In [17]:
# Same stratified split as before (same seed and test size), but on the
# preprocessed text. This rebinds X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    df["preprocessed_txt"],
    df["label"],
    test_size=0.2,  # 20% of the samples go to the test set
    random_state=2022,
    stratify=df["label"],
)

In [18]:
# Same TF-IDF + KNN pipeline, now trained on the spaCy-preprocessed text.
# This rebinds `clf` and `y_pred` from the raw-text baseline.
clf = Pipeline(
    steps=[
        ("vectorizer_tfidf", TfidfVectorizer()),
        ("KNN", KNeighborsClassifier()),
    ]
).fit(X_train, y_train)

# Evaluate on the held-out split of the preprocessed text.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.96      0.95      1200
           1       0.97      0.95      0.96      1200
           2       0.97      0.97      0.97      1200
           3       0.98      0.98      0.98      1200

    accuracy                           0.96      4800
   macro avg       0.96      0.96      0.96      4800
weighted avg       0.96      0.96      0.96      4800



In [19]:
# Example product title used for a quick manual prediction check below.
text="Kaffe Fassett's Sew Simple Quilts & Patchworks: 17 Designs Using Kaffe Fassett's Artisan Fabrics Paperback – July 9, 2019"

In [20]:
# The classifier was fitted on preprocessed text, so clean the sample the same way.
preprocess_text = preprocess(text)

In [21]:
# Display the cleaned sample text.
preprocess_text

'Kaffe Fassett Sew Simple Quilts Patchworks 17 design Kaffe Fassett Artisan Fabrics Paperback July 9 2019'

In [22]:
# predict() expects an iterable of documents, hence the one-element list.
# Note: this rebinds y_pred, replacing the test-set predictions.
y_pred = clf.predict([preprocess_text])
y_pred

array([1], dtype=int64)

In [24]:
import pickle
from pathlib import Path

# Save (not load) the fitted pipeline to a pickle file.
# `clf` here is the pipeline trained on spaCy-preprocessed text, so text must be
# passed through preprocess() before calling predict() on the reloaded model.
model_path = Path("model/ecommerce.pkl")
# Opening a file for writing does not create missing directories.
model_path.parent.mkdir(parents=True, exist_ok=True)
with model_path.open("wb") as f:
    pickle.dump(clf, f)