In [26]:
import numpy as np
import pandas as pd
import re, string, random, pickle, nltk, scipy

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from collections import Counter

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk import FreqDist
from nltk.classify import ClassifierI
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.tokenize import word_tokenize

In [2]:
stop_words = list(set(stopwords.words('english')))
text_train = pd.read_csv(r"datasets/review_text_train.csv", index_col = False, delimiter = ',', header=0)
meta_train = pd.read_csv(r"datasets/review_meta_train.csv", index_col = False, delimiter = ',', header=0)
text_test = pd.read_csv(r"datasets/review_text_test.csv", index_col = False, delimiter = ',', header=0)

In [13]:
from nltk.corpus import wordnet


In [17]:
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,"N": wordnet.NOUN,"V": wordnet.VERB,"R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

In [38]:
all_reviews1 = []
all_words = []
allowed_p_o_s = ["J","V","R"]
lemmatizer = WordNetLemmatizer()
l=0

for i in range(0,text_train.shape[0]):
    text = text_train.at[i,'review']
    rating = meta_train.at[i,'rating']
    new_text = ''
    
    cleaned_text = re.sub(r'[^(a-zA-Z)\s]',' ', text)
    tokenized_text = word_tokenize(cleaned_text)
    
    text_w_o_stop = [word for word in tokenized_text if not word in stop_words]

    for token, tag in nltk.pos_tag(text_w_o_stop):
        p_o_s = get_wordnet_pos(token)
        token = lemmatizer.lemmatize(token, p_o_s)
        if tag[0] in allowed_p_o_s:
            new_text+=token.lower()
            new_text+=' '
    tup = (new_text,rating)
    all_reviews1.append(tup)
    if l%1000==0:
        print(l)
    l+=1

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000


In [41]:
all_reviews1[:2]

[('excellent fantastic well know know make comfortable great cheese great great salad decor new old minor right ',
  5),
 ('kill yet difficult get self aspect go early last team yap sit right outside bad get perfect miserable want atmosphere hold go loud want hit ',
  1)]

In [42]:
all_words = nltk.FreqDist(all_words)
word_features = list(all_words.keys())[:5000]
random.shuffle(all_reviews1)

all_reviews = [a[0] for a in all_reviews1]
all_ratings = [a[1] for a in all_reviews1]

lim = int(len(all_reviews)*.8)
my_reviews_train_clean = all_reviews[:lim]
my_reviews_test_clean = all_reviews[lim:]
my_target_train = all_ratings[:lim]
my_target_test = all_ratings[lim:]

In [43]:
print('k')

k


In [44]:
my_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
my_vectorizer.fit(my_reviews_train_clean)
X_all = my_vectorizer.transform(all_reviews)
my_test_reviews = list(text_test['review'])
X_test_all = my_vectorizer.transform(my_test_reviews)
X = my_vectorizer.transform(my_reviews_train_clean)
X_test = my_vectorizer.transform(my_reviews_test_clean)

In [45]:
class EnsembleClassifier(ClassifierI):
    
    def __init__(self, *classifiers):
        self._classifiers = classifiers
    
    def classify(self, given_features):
        classifier_votes = []
        for c in self._classifiers:
            vote = int(c.predict(given_features))
            classifier_votes.append(vote)
        return Counter(classifier_votes).most_common(1)[0][0]


In [46]:
MNB_clf = MultinomialNB(alpha=0.5, fit_prior=False)
MNB_clf.fit(X, my_target_train)

BNB_clf = BernoulliNB(alpha = 0.001)
BNB_clf.fit(X, my_target_train)

LogReg_clf = LogisticRegression(C=0.5)
LogReg_clf.fit(X, my_target_train)

SGD_clf = SGDClassifier(loss='perceptron')
SGD_clf.fit(X, my_target_train)

SVC_clf = LinearSVC(C=0.05)
SVC_clf.fit(X, my_target_train)



LinearSVC(C=0.05, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [47]:
classifiers = [MNB_clf, BNB_clf, LogReg_clf, SGD_clf, SVC_clf]
for model in classifiers:
    name = str(model).split('(')[0]
    predictions = model.predict(X_test)
    print ("Accuracy for %s model = %s" % (name, accuracy_score(my_target_test, predictions)))
    print(precision_recall_fscore_support(my_target_test, predictions))
    print(confusion_matrix(my_target_test, predictions))
    print('\n')

Accuracy for MultinomialNB model = 0.7686141788386177
(array([0.65492958, 0.62357414, 0.79649797]), array([0.20993228, 0.37992278, 0.9623323 ]), array([0.31794872, 0.47216891, 0.87159715]), array([ 443, 1295, 3876]))
[[  93  183  167]
 [  17  492  786]
 [  32  114 3730]]


Accuracy for BernoulliNB model = 0.7452796579978624
(array([0.81415929, 0.60291439, 0.75949111]), array([0.20767494, 0.25559846, 0.97033024]), array([0.33093525, 0.35900217, 0.85206162]), array([ 443, 1295, 3876]))
[[  92  106  245]
 [  18  331  946]
 [   3  112 3761]]


Accuracy for LogisticRegression model = 0.8314926968293552
(array([0.78647687, 0.73102786, 0.85880708]), array([0.49887133, 0.58764479, 0.95098039]), array([0.61049724, 0.6515411 , 0.90254652]), array([ 443, 1295, 3876]))
[[ 221  104  118]
 [  46  761  488]
 [  14  176 3686]]


Accuracy for SGDClassifier model = 0.8156394727467047
(array([0.6909621 , 0.71298174, 0.84924154]), array([0.53498871, 0.54285714, 0.93885449]), array([0.60305344, 0.61639632,

In [None]:
ensemble_clf = EnsembleClassifier(MNB_clf, BNB_clf, LogReg_clf, SGD_clf, SVC_clf)
ensemble_preds = [ensemble_clf.classify(features) for features in X_test]

In [76]:
print ("Accuracy for model = %s" % (accuracy_score(my_target_test, ensemble_preds)))
print(precision_recall_fscore_support(my_target_test, ensemble_preds))
print('\n')

Accuracy for model = 0.8845742785892412
(array([0.9009009 , 0.82309125, 0.89897789]), array([0.65359477, 0.69442262, 0.97424008]), array([0.75757576, 0.75330209, 0.93509705]), array([ 459, 1273, 3882]))


