In [1]:
#import all the libraries needed
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
#nltk.download()
from gensim import corpora, models
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

In [2]:
##get the dataset 
data = pd.read_csv("DataSetMaster.csv")

In [3]:
#populate the stoplist from the nltk corpus
stoplist = set(stopwords.words("english"))

In [4]:
#Data preparation and cleaning
for review in data.values:
        # Remove punctuations
        
        review[0] = re.sub(r'[^a-zA-Z]', ' ', review[0])
        # To lowercase
        review[0] = review[0].lower()
        # Remove stop words
        texts = [word for word in review[0].lower().split() if word not in stoplist]
        try:
            #review[0].append(' '.join(texts))
            review[0] = ' '.join(texts)
        except:
            pass

In [5]:
# split the dataset into training and testing data
train, test = train_test_split(data, test_size = 0.3)# splitting into test and train data
X_train = train['Review']
y_train = train['Class']
X_test = test['Review']
y_test = test['Class']

In [6]:
# populating categories(target variable) from the dataset
categories = data['Class'].unique()

In [25]:
# dummy classifier for baseline modeling

dummy_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', DummyClassifier()),
])
dummy_clf = dummy_clf.fit(X_train, y_train)
dummy_predicted = dummy_clf.predict(X_test)
np.mean(dummy_predicted == y_test)

0.0446969696969697

In [7]:
# decision tree classifier

dt_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))),
                     ('tfidf', TfidfTransformer()),
                     ('clf',  DecisionTreeClassifier(max_depth=100)),
])
dt_clf = dt_clf.fit(X_train, y_train)
dt_predicted = dt_clf.predict(X_test)
np.mean(dt_predicted == y_test) 

0.64136363636363636

In [None]:
DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_split=1e-07, class_weight=None, presort=False)

In [55]:
#Metrics for Decision Tree Classifier
print(metrics.classification_report(y_test, dt_predicted,
    target_names=categories))

                          precision    recall  f1-score   support

                    baby       0.79      0.78      0.78       310
                  Beauty       0.79      0.64      0.71       312
        Apps for Android       0.26      0.76      0.39       292
            digitalmusic       0.77      0.74      0.76       314
         Office Products       0.73      0.65      0.69       298
            Pet Supplies       0.71      0.69      0.70       275
              Automobile       0.64      0.56      0.60       295
                 grocery       0.66      0.64      0.65       272
             Movies & TV       0.72      0.62      0.67       301
            ToysAndGames       0.75      0.59      0.66       298
       patio&lawn&garden       0.75      0.73      0.74       289
Tools & Home Improvement       0.84      0.78      0.81       320
             CD & Vinyl        0.81      0.73      0.77       301
       SportsAndOutdoors       0.56      0.48      0.52       296
         

In [8]:
# SGD Classifier

sgd_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, n_iter=5)),
])
sgd_clf = sgd_clf.fit(X_train, y_train)
sgd_predicted = sgd_clf.predict(X_test)
np.mean(sgd_predicted == y_test) 

0.84727272727272729

In [58]:
#Metrics for SGD Classifier
print(metrics.classification_report(y_test, sgd_predicted,
    target_names=categories))

                          precision    recall  f1-score   support

                    baby       0.88      0.96      0.92       310
                  Beauty       0.95      0.77      0.85       312
        Apps for Android       0.82      0.79      0.81       292
            digitalmusic       0.85      0.94      0.89       314
         Office Products       0.92      0.78      0.84       298
            Pet Supplies       0.75      0.87      0.81       275
              Automobile       0.88      0.81      0.85       295
                 grocery       0.86      0.80      0.83       272
             Movies & TV       0.84      0.81      0.83       301
            ToysAndGames       0.87      0.84      0.85       298
       patio&lawn&garden       0.87      0.91      0.89       289
Tools & Home Improvement       0.89      0.91      0.90       320
             CD & Vinyl        0.85      0.95      0.90       301
       SportsAndOutdoors       0.88      0.72      0.79       296
         

In [14]:
# Multinomial NB Classifier

mnb_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,3))),
                     ('tfidf', TfidfTransformer()),
                     ('clf',  MultinomialNB()),
])
mnb_clf = mnb_clf.fit(X_train, y_train)
mnb_predicted = mnb_clf.predict(X_test)
np.mean(mnb_predicted == y_test) 

0.86015151515151511

In [60]:
#Metrics for Multinomial NB  Classifier
print(metrics.classification_report(y_test, mnb_predicted,
    target_names=categories))

                          precision    recall  f1-score   support

                    baby       0.98      0.92      0.95       310
                  Beauty       0.99      0.68      0.80       312
        Apps for Android       0.89      0.78      0.83       292
            digitalmusic       0.88      0.94      0.91       314
         Office Products       0.91      0.95      0.93       298
            Pet Supplies       0.73      0.89      0.80       275
              Automobile       0.96      0.83      0.89       295
                 grocery       0.66      0.89      0.76       272
             Movies & TV       0.90      0.81      0.85       301
            ToysAndGames       0.88      0.87      0.87       298
       patio&lawn&garden       0.93      0.91      0.92       289
Tools & Home Improvement       0.94      0.91      0.93       320
             CD & Vinyl        0.96      0.94      0.95       301
       SportsAndOutdoors       0.87      0.79      0.83       296
         

In [10]:
# Random Forest Classifier

rf_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))),
                     ('tfidf', TfidfTransformer()),
                     ('clf',  RandomForestClassifier(n_estimators=300, max_depth=None,
                                                     min_samples_split=50, random_state=0)),
])
rf_clf = rf_clf.fit(X_train, y_train)
rf_predicted = rf_clf.predict(X_test)
np.mean(rf_predicted == y_test) 

0.84499999999999997

In [62]:
#Metrics for Random Forest  Classifier
print(metrics.classification_report(y_test, rf_predicted,
    target_names=categories))

                          precision    recall  f1-score   support

                    baby       0.88      0.93      0.90       310
                  Beauty       0.95      0.79      0.86       312
        Apps for Android       0.71      0.77      0.74       292
            digitalmusic       0.85      0.92      0.88       314
         Office Products       0.89      0.81      0.85       298
            Pet Supplies       0.77      0.85      0.81       275
              Automobile       0.71      0.78      0.74       295
                 grocery       0.83      0.73      0.78       272
             Movies & TV       0.82      0.81      0.81       301
            ToysAndGames       0.86      0.78      0.82       298
       patio&lawn&garden       0.88      0.90      0.89       289
Tools & Home Improvement       0.88      0.88      0.88       320
             CD & Vinyl        0.86      0.92      0.89       301
       SportsAndOutdoors       0.89      0.59      0.71       296
         

In [11]:
#Logistic Regression

lr_clf = Pipeline([('vect', CountVectorizer(tokenizer=LemmaTokenizer(),ngram_range=(1,2))),
                     ('tfidf', TfidfTransformer()),
                     ('clf',  LogisticRegression(penalty='l2', dual=False, tol=0.0001, 
                                                 C=1.0, fit_intercept=True, intercept_scaling=1, 
                                                 solver='liblinear', max_iter=100, multi_class='ovr',
                                                 verbose=0, warm_start=False, n_jobs=1)),
])
lr_clf = lr_clf.fit(X_train, y_train)
lr_predicted = lr_clf.predict(X_test)
np.mean(lr_predicted == y_test)

0.87575757575757573

In [64]:
#Metrics for Logistic Regression  Classifier
print(metrics.classification_report(y_test, lr_predicted,
    target_names=categories))

                          precision    recall  f1-score   support

                    baby       0.94      0.96      0.95       310
                  Beauty       0.98      0.84      0.90       312
        Apps for Android       0.78      0.88      0.83       292
            digitalmusic       0.93      0.93      0.93       314
         Office Products       0.93      0.91      0.92       298
            Pet Supplies       0.82      0.86      0.84       275
              Automobile       0.88      0.88      0.88       295
                 grocery       0.86      0.87      0.87       272
             Movies & TV       0.85      0.85      0.85       301
            ToysAndGames       0.91      0.86      0.88       298
       patio&lawn&garden       0.94      0.90      0.92       289
Tools & Home Improvement       0.96      0.90      0.93       320
             CD & Vinyl        0.97      0.94      0.95       301
       SportsAndOutdoors       0.84      0.86      0.85       296
         

In [1]:
######### TESTING BELOW ######### BINARY-------

In [89]:
lr_clf = Pipeline([('vect', CountVectorizer()),
                     ('clf',  LogisticRegression()),
])
lr_clf = lr_clf.fit(X_train, y_train)
lr_predicted = lr_clf.predict(X_test)
np.mean(lr_predicted == y_test)

0.8937878787878788