In [1]:
#Problem Statement: Given a description about a product sold on e-commerce website, classify it in one of the 4 categories
#Electronics (0), Households (1), Books (2), or Clothing & Accessories (3) products
#Import Necessary Libraries
import pandas as pd
import numpy as np
import nltk
import string
from nltk.corpus import stopwords, words
import re
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('Ecommerce Dataset.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Class,Reviews
0,0,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
1,1,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
2,2,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
3,3,Household,Incredible Gifts India Wooden Happy Birthday U...
4,4,Household,Pitaara Box Romantic Venice Canvas Painting 6m...


In [4]:
df = df.loc[:, ["Class", "Reviews"]]

In [5]:
df.head()

Unnamed: 0,Class,Reviews
0,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
1,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
2,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
3,Household,Incredible Gifts India Wooden Happy Birthday U...
4,Household,Pitaara Box Romantic Venice Canvas Painting 6m...


In [6]:
df['Reviews'] = df['Reviews'].str.replace(r'\W',' ') # remove the regular expression

In [7]:
df.head()

Unnamed: 0,Class,Reviews
0,Household,SAF Floral Framed Painting Wood 30 inch x ...
1,Household,SAF UV Textured Modern Art Print Framed Pain...
2,Household,SAF Flower Print Framed Painting Synthetic 1...
3,Household,Incredible Gifts India Wooden Happy Birthday U...
4,Household,Pitaara Box Romantic Venice Canvas Painting 6m...


In [8]:
x = df[['Reviews']]
y = df[['Class']]

In [9]:
x.head()

Unnamed: 0,Reviews
0,SAF Floral Framed Painting Wood 30 inch x ...
1,SAF UV Textured Modern Art Print Framed Pain...
2,SAF Flower Print Framed Painting Synthetic 1...
3,Incredible Gifts India Wooden Happy Birthday U...
4,Pitaara Box Romantic Venice Canvas Painting 6m...


In [10]:
y = y.replace(['Household', 'Books', 'Electronics','Clothing & Accessories'], [1,2,0,3])

In [11]:
y.head()

Unnamed: 0,Class
0,1
1,1
2,1
3,1
4,1


In [12]:
x['Reviews'] = x['Reviews'].str.lower()

In [13]:
x['Reviews'].head()

0    saf  floral  framed painting  wood  30 inch x ...
1    saf  uv textured modern art print framed  pain...
2    saf flower print framed painting  synthetic  1...
3    incredible gifts india wooden happy birthday u...
4    pitaara box romantic venice canvas painting 6m...
Name: Reviews, dtype: object

In [14]:
x['Reviews'][2]

'saf flower print framed painting  synthetic  13 5 inch x 22 inch  uv textured  set of 3  sanfsw4951  color multicolor                                                                                                              size 35 cm x 50 cm x 2 cm   a beautiful painting involves the action or skill of using paint in the right manner  hence  the end product will be a picture that can speak a thousand words they say  arts have been in trend for quite some time now  it can give different viewer different meanings style and design the saf wood matte painting with frame is quite abstract and mysteriously beautiful  the painting has a nice frame to it  you can gift this to a family or a friend  the painting has various forms of certain figures on it as seen in the image  you can add a good set of lights to the place where the painting is and the decor will give a different feel and look to the place  quality and durability the painting has a matte finish and includes a good quality fra

In [15]:
x['Reviews'] = x['Reviews'].str.strip()

In [17]:
x['Reviews'] = x['Reviews'].values.astype('U')

In [18]:
x['Reviews'][1]

'saf  uv textured modern art print framed  painting  synthetic  35 cm x 50 cm x 3 cm  set of 3  color multicolor                                                                                                              size 35 cm x 50 cm x 3 cm   overview a beautiful painting involves the action or skill of using paint in the right manner  hence  the end product will be a picture that can speak a thousand words they say  arts have been in trend for quite some time now  it can give different viewer different meanings style and design the saf wood matte abstract painting with frame is quite abstract and mysteriously beautiful  the painting has a nice frame to it  you can gift this to a family or a friend  the painting has various forms of certain figures on it as seen in the image  you can add a good set of lights to the place where the painting is and the decor will give a different feel and look to the place  quality and durability the painting has a matte finish and includes a good

In [20]:
#Text Preprocessing
def clean_text_data(text):
    remove_punc1 = [char for char in text if char not in string.punctuation]
    remove_punc2 = ''.join(remove_punc1) #sentence without punctuation 
    
    stop_words_remove = [word for word in remove_punc2.split() if word.lower() not in stopwords.words('english') ]
    return stop_words_remove

In [22]:
abc=x['Reviews'].head(5).apply(clean_text_data)

In [23]:
abc

0    [saf, floral, framed, painting, wood, 30, inch...
1    [saf, uv, textured, modern, art, print, framed...
2    [saf, flower, print, framed, painting, synthet...
3    [incredible, gifts, india, wooden, happy, birt...
4    [pitaara, box, romantic, venice, canvas, paint...
Name: Reviews, dtype: object

In [25]:
#Feature extraction
tfidf

<5x5 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [28]:
tfidf = TfidfVectorizer(analyzer=clean_text_data).fit_transform(x['Reviews'])
tfidf

<50428x78718 sparse matrix of type '<class 'numpy.float64'>'
	with 2909369 stored elements in Compressed Sparse Row format>

# Split Dataset

In [33]:
y.head()

Unnamed: 0,Class
0,1
1,1
2,1
3,1
4,1


In [38]:
x = pd.DataFrame(tfidf)

In [39]:
x.head()

Unnamed: 0,0
0,"(0, 51682)\t0.12983810369293472\n (0, 33537..."
1,"(0, 13116)\t0.05175879731211598\n (0, 19956..."
2,"(0, 2730)\t0.03481214760312151\n (0, 61864)..."
3,"(0, 66858)\t0.0768872618688117\n (0, 46826)..."
4,"(0, 74493)\t0.054543641411634064\n (0, 5487..."


In [40]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=.30)

In [41]:
xtrain.head()

Unnamed: 0,0
13237,"(0, 71999)\t0.3814323882199868\n (0, 75879)..."
4908,"(0, 31130)\t0.10574085325416183\n (0, 77977..."
44503,"(0, 60693)\t0.21607265749423485\n (0, 2631)..."
49866,"(0, 27679)\t0.2886225755686193\n (0, 53119)..."
8612,"(0, 27568)\t0.1298460279965181\n (0, 27142)..."


In [45]:
ytrain.head()

Unnamed: 0,Class
13237,1
4908,1
44503,0
49866,0
8612,1


In [42]:
# Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB

In [43]:
mul = MultinomialNB()

In [None]:
mul.fit(xtrain, ytrain)

In [None]:
mul.score(xtest, ytest)

In [None]:
# Bernoulli Naive Bayes
from sklearn.naive_bayes import BernoulliNB

In [None]:
ber = BernoulliNB()

In [46]:
ber.fit(xtrain, ytrain)

In [None]:
ber..score(xtest, ytest)