# Research Question : Implementing Machine Learning models for Review Classification

In [1]:
# Loading Necessary Libraries

%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from pandas.core import datetools
from nltk.tokenize import RegexpTokenizer
import statsmodels.api as sm
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import *
import re
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, SimpleRNN
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.spatial.distance import cosine
from scipy.sparse.linalg import svds
from nltk.stem import WordNetLemmatizer
from scipy.sparse import csc_matrix
import time

  
Using TensorFlow backend.


In [2]:
#Clothing_ID = 862

# corpus_pandas_dataFrame = pd.read_csv("../input/Womens Clothing E-Commerce Reviews.csv")
# corpus_with_particular_ID = corpus_pandas_dataFrame[corpus_pandas_dataFrame["Clothing ID"] == Clothing_ID]
# corpus_with_particular_ID = corpus_with_particular_ID.dropna(subset=['Review Text'])
# review_text = corpus_with_particular_ID["Review Text"]
# label_dataset = corpus_with_particular_ID["Recommended IND"]

corpus_pandas_dataFrame = pd.read_csv("../input/Womens Clothing E-Commerce Reviews.csv")
entire_corpus = corpus_pandas_dataFrame.dropna(subset=['Review Text'])
review_text = entire_corpus["Review Text"]
label_dataset = entire_corpus["Recommended IND"]
#print(review_text)

X_set = list() # This will be a list of strings, where each string will be a "document"
y_set = list()

## Pre-processing Corpus
for each_review,label in zip(review_text,label_dataset):
    preprocessed_review = each_review.lower() # Lower-casing each document
    preprocessed_review = re.sub(r'[^A-Za-z ]', '', preprocessed_review) # removing Punctuations from each document
    X_set.append(preprocessed_review)
    y_set.append(label)

labeled_X_y_set = list(zip(X_set,y_set))
print("Length of Labeled Dataset :",len(labeled_X_y_set))#,"\n",labeled_X_y_set) # print the pre-processed corpus

#-----------------------------------------------------------------------------------------------
                    ## Stemming and Lemmatization - Applying to the pre-processed corpus
#-----------------------------------------------------------------------------------------------
lemmatizer = WordNetLemmatizer()
for each_document_index in range(len(X_set)):
    list_of_words = X_set[each_document_index].strip().split()
    
    for each_word_index in range(len(list_of_words)) :
        list_of_words[each_word_index] = lemmatizer.lemmatize(list_of_words[each_word_index])
    
    X_set[each_document_index] = ' '.join(list_of_words)


#-----------------------------------------------------------------------------------------------
                                # STOPWORDS Removal from the Corpus #
#-----------------------------------------------------------------------------------------------
stopWords = stopwords.words('english')

## Pre-processing Stopwords
for stopWord_index in range(len(stopWords)):
    stopWords[stopWord_index] = stopWords[stopWord_index].lower() # Lower-casing
    stopWords[stopWord_index] = re.sub(r'[^A-Za-z ]','',stopWords[stopWord_index]) # removing Punctuations

vectorizer = CountVectorizer(min_df=0, stop_words=stopWords, strip_accents='ascii')
docs_tf = vectorizer.fit_transform(X_set)
vocabulary_terms = vectorizer.get_feature_names()
docs_query_tf = vectorizer.transform(X_set) 
transformer = TfidfTransformer(smooth_idf = False)
tfidf = transformer.fit_transform(docs_query_tf.toarray())
#print(vocabulary_terms)

U, s, V = svds(tfidf.T)
K = 2 # number of components

docs_rep = np.dot(np.diag(s[-K:]), V[-K:, :]).T # D x K matrix 
terms_rep = np.dot(U[:,-K:], np.diag(s[-K:])) # V x K matrix
print(docs_rep)

Length of Labeled Dataset : 22641
[[-0.00475422 -0.05088965]
 [ 0.08621034 -0.22376388]
 [ 0.05883809 -0.20286163]
 ...
 [-0.00090118 -0.29795682]
 [ 0.14559843 -0.2644313 ]
 [ 0.09049927 -0.13752938]]


In [3]:
entire_corpus.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22641 entries, 0 to 23485
Data columns (total 11 columns):
Unnamed: 0                 22641 non-null int64
Clothing ID                22641 non-null int64
Age                        22641 non-null int64
Title                      19675 non-null object
Review Text                22641 non-null object
Rating                     22641 non-null int64
Recommended IND            22641 non-null int64
Positive Feedback Count    22641 non-null int64
Division Name              22628 non-null object
Department Name            22628 non-null object
Class Name                 22628 non-null object
dtypes: int64(6), object(5)
memory usage: 2.1+ MB


In [4]:
entire_corpus.describe()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Rating,Recommended IND,Positive Feedback Count
count,22641.0,22641.0,22641.0,22641.0,22641.0,22641.0
mean,11740.849035,919.332362,43.280376,4.183561,0.818868,2.630582
std,6781.957509,202.266874,12.32698,1.115762,0.385136,5.786164
min,0.0,1.0,18.0,1.0,0.0,0.0
25%,5872.0,861.0,34.0,4.0,1.0,0.0
50%,11733.0,936.0,41.0,5.0,1.0,1.0
75%,17621.0,1078.0,52.0,5.0,1.0,3.0
max,23485.0,1205.0,99.0,5.0,1.0,122.0


# Machine Learning Models

## Logistic Regression

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(docs_rep, y_set, test_size=0.33,random_state=42)
model = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial').fit(X_train, y_train)
print("Accuracy of the Logistic Regression Model :",model.score(X_test,y_test))


Accuracy of the Logistic Regression Model : 0.8154443254817987


## Support Vector Machines(SVM)

In [7]:
from sklearn.svm import SVC

C = 1.0 # SVM regularization parameter
X_train, X_test, y_train, y_test = train_test_split(docs_rep, y_set, test_size=0.33,random_state=42)
svc = SVC(kernel='linear', C=1,gamma='auto').fit(X_train, y_train)

print("Accuracy of the SVM Model :",svc.score(X_test, y_test))


Accuracy of the SVM Model : 0.8155781584582441


## Logistic Regression vs. SVM

We discern that both the classification methods end up displaying **approximately same score(accuracy)**.
- *Accuracy of the Logistic Regression Model : 0.8154443254817987*
- *Accuracy of the SVM Model : 0.8155781584582441*