## Importing necessary libraries and modules

In [24]:
import pandas as pd
import os
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords  
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier,LogisticRegression
from sklearn import model_selection, naive_bayes 
from sklearn.svm import SVC
from sklearn.tree import ExtraTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

## Creating the dataframe 

In [2]:
#appending the positive reviews into the dataframe df_pos
data_pos = []
all_files = os.listdir("positive_reviews/")
for x in all_files:
    path = os.path.join("positive_reviews",x)
    with open(path,"r",encoding="utf8") as f:
        a = []
        a.append(str(f.readline()))
        a.append("1")
        data_pos.append(a)
df_pos = pd.DataFrame(data_pos, columns = ['text', 'label'])

#appending the negative reviews into the dataframe df_neg
data_neg = []
all_files = os.listdir("negative_reviews/")
for x in all_files:
    path = os.path.join("negative_reviews",x)
    with open(path,"r",encoding="utf8") as f:
        a = []
        a.append(str(f.readline()))
        a.append("0")
        data_neg.append(a)
df_neg = pd.DataFrame(data_neg, columns = ['text','label'])

df = pd.concat([df_pos,df_neg]).sort_index(kind='merge') #merging df_pos and df_neg into df
df['index'] = np.arange(len(df)) #creating a new index column
df= df.set_index('index')

## Pre-Processing

In [3]:
#Remove blank rows if any.
df['text'].dropna(inplace=True)
#Change all the text to lower case to avoid ambiguity
df['text_mod'] = [entry.lower() for entry in df['text']]
#Tokenization : In this each entry in the corpus will be broken into set of words
df['text_mod']= [word_tokenize(entry) for entry in df['text_mod']]
#Remove Stop words, Numeric and perfom Word Stemming/Lemmatization 
#Stopwords are commonly used words like "to,the,in" which can be removed from Corpus and doesn't affect the accuracy much.
#Lemmatization is the process of grouping together the different inflected forms of a word so they can be analysed as 
#a single item. Lemmatization is similar to stemming but it brings context to the words. So it links words with similar 
#meaning to one word.Like "Saying,said,says" reduced to "say"
#POS_tag determines the Part of Speech to which the word belong.
tag_map = defaultdict(lambda : wn.NOUN) #default is NOUN
tag_map['J'] = wn.ADJ #Adjective
tag_map['V'] = wn.VERB #Verb
tag_map['R'] = wn.ADV #Adverb
for index,entry in enumerate(df['text_mod']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) / Verb(V) / Adjective(J) / Adverb(R)
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_fin = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_fin)
    # The final processed set of words for each iteration will be stored in 'text_final'
    df.loc[index,'text_final'] = str(Final_words)

## Train Test Split

In [4]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(df['text_final'],df['label'],test_size=0.2,random_state=42)
target_names = ['pos', 'neg']

## Logistic Regression

In [5]:
# Pipeline - Sequentially apply a list of transforms and a final estimator. Intermediate steps of the pipeline must be
# ‘transforms’,that is, they must implement fit and transform methods. The final estimator only needs to implement fit.
# Count Vectorizer - Convert a collection of text documents to a matrix of token counts. This implementation produces a 
# sparse representation of the counts
# Tfidf : Term Frequency - Inverse Document Frequency

# Logistic Regression
text_clf_logreg = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),('clf_logreg', LogisticRegression(random_state=42,max_iter=200)),])
text_clf_logreg = text_clf_logreg.fit(Train_X, Train_Y)
predicted_logreg = text_clf_logreg.predict(Test_X)
print("---Logistic Regression Classification Report---")
print(classification_report(Test_Y, predicted_logreg, target_names=target_names))

---Logistic Regression Classification Report---
              precision    recall  f1-score   support

         pos       0.90      0.87      0.89      2492
         neg       0.88      0.90      0.89      2508

    accuracy                           0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000



## SGD (Stochastic Gradient Descent) Classifier

In [6]:
text_clf_sgd = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf-sgd', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=100, random_state=42)),])
text_clf_sgd = text_clf_sgd.fit(Train_X, Train_Y)
predicted_sgd = text_clf_sgd.predict(Test_X)
print("---SGD Classification Report---")
print(classification_report(Test_Y, predicted_sgd, target_names=target_names))

---SGD Classification Report---
              precision    recall  f1-score   support

         pos       0.91      0.77      0.84      2492
         neg       0.80      0.93      0.86      2508

    accuracy                           0.85      5000
   macro avg       0.86      0.85      0.85      5000
weighted avg       0.86      0.85      0.85      5000



## Multinomial Naive Bayes

In [7]:
text_clf_nb = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),('clf', MultinomialNB()),])
text_clf_nb = text_clf_nb.fit(Train_X, Train_Y)
predicted_nb = text_clf_nb.predict(Test_X)
print("---Naive Bayes Classification Report---")
print(classification_report(Test_Y, predicted_nb, target_names=target_names))

---Naive Bayes Classification Report---
              precision    recall  f1-score   support

         pos       0.85      0.88      0.87      2492
         neg       0.87      0.85      0.86      2508

    accuracy                           0.86      5000
   macro avg       0.86      0.86      0.86      5000
weighted avg       0.86      0.86      0.86      5000



## SVM (Linear Kernel)

In [8]:
text_clf_svm = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf-svm', SVC(C=1.0, kernel='linear', degree=3, gamma='auto')),])
text_clf_svm = text_clf_svm.fit(Train_X, Train_Y)
predicted_svm = text_clf_svm.predict(Test_X)
print("---SVM Classification Report---")
print(classification_report(Test_Y, predicted_svm, target_names=target_names))

---SVM Classification Report---
              precision    recall  f1-score   support

         pos       0.89      0.88      0.89      2492
         neg       0.88      0.90      0.89      2508

    accuracy                           0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000



## Decision Tree

In [11]:
text_clf_dt = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),('clf-dt', DecisionTreeClassifier()), ])
text_clf_dt = text_clf_dt.fit(Train_X, Train_Y)
predict_dt = text_clf_dt.predict(Test_X)
print("---Decision Tree Classification Report---")
print(classification_report(Test_Y, predict_dt, target_names=target_names))

---Decision Tree Classification Report---
              precision    recall  f1-score   support

         pos       0.72      0.71      0.71      2492
         neg       0.71      0.72      0.72      2508

    accuracy                           0.71      5000
   macro avg       0.71      0.71      0.71      5000
weighted avg       0.71      0.71      0.71      5000



## Extra Tree 

In [15]:
text_clf_etc = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),('clf-etc', ExtraTreeClassifier()), ])
text_clf_etc = text_clf_etc.fit(Train_X, Train_Y)
predict_etc = text_clf_etc.predict(Test_X)
print("---Extra Tree Classification Report---")
print(classification_report(Test_Y, predict_etc, target_names=target_names))

---ExtraTreeClassifier Classification Report---
              precision    recall  f1-score   support

         pos       0.64      0.63      0.64      2492
         neg       0.64      0.65      0.64      2508

    accuracy                           0.64      5000
   macro avg       0.64      0.64      0.64      5000
weighted avg       0.64      0.64      0.64      5000



## Multi Layer Perceptron

In [17]:
text_clf_mlp = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),('clf-mlp', MLPClassifier()), ])
text_clf_mlp = text_clf_mlp.fit(Train_X, Train_Y)
predict_mlp = text_clf_mlp.predict(Test_X)
print("---Multi Layer Perceptron Classification Report---")
print(classification_report(Test_Y, predict_mlp, target_names=target_names))

---Multi Layer Perceptron Classification Report---
              precision    recall  f1-score   support

         pos       0.88      0.87      0.88      2492
         neg       0.87      0.88      0.88      2508

    accuracy                           0.88      5000
   macro avg       0.88      0.88      0.88      5000
weighted avg       0.88      0.88      0.88      5000



## Ridge Classifier

In [19]:
text_clf_rc = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),('clf-rc', RidgeClassifier()), ])
text_clf_rc = text_clf_rc.fit(Train_X, Train_Y)
predict_rc = text_clf_rc.predict(Test_X)
print("---Ridge Classifier Classification Report---")
print(classification_report(Test_Y, predict_rc, target_names=target_names))

---Ridge Classifier Classification Report---
              precision    recall  f1-score   support

         pos       0.89      0.88      0.89      2492
         neg       0.88      0.89      0.89      2508

    accuracy                           0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000



## AdaBoost 

In [21]:
text_clf_adb = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),('clf-adb', AdaBoostClassifier(n_estimators=100)), ])
text_clf_adb = text_clf_adb.fit(Train_X, Train_Y)
predict_adb = text_clf_adb.predict(Test_X)
print("---AdaBoost Classifier Classification Report---")
print(classification_report(Test_Y, predict_adb, target_names=target_names))

---AdaBoost Classifier Classification Report---
              precision    recall  f1-score   support

         pos       0.84      0.80      0.82      2492
         neg       0.81      0.85      0.83      2508

    accuracy                           0.82      5000
   macro avg       0.83      0.82      0.82      5000
weighted avg       0.83      0.82      0.82      5000



## GradientBoost

In [23]:
text_clf_gb = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),('clf-gb', GradientBoostingClassifier()), ])
text_clf_gb = text_clf_gb.fit(Train_X, Train_Y)
predict_gb = text_clf_gb.predict(Test_X)
print("---GradientBoost Classifier Classification Report---")
print(classification_report(Test_Y, predict_gb, target_names=target_names))

---GradientBoost Classifier Classification Report---
              precision    recall  f1-score   support

         pos       0.83      0.74      0.78      2492
         neg       0.77      0.85      0.81      2508

    accuracy                           0.80      5000
   macro avg       0.80      0.80      0.80      5000
weighted avg       0.80      0.80      0.80      5000



## Random Forest

In [25]:
text_clf_rf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),('clf-rf', RandomForestClassifier(max_depth=2)), ])
text_clf_rf = text_clf_rf.fit(Train_X, Train_Y)
predict_rf = text_clf_rf.predict(Test_X)
print("---Random Forest Classification Report---")
print(classification_report(Test_Y, predict_rf, target_names=target_names))

---Random Forest Classification Report---
              precision    recall  f1-score   support

         pos       0.76      0.73      0.75      2492
         neg       0.74      0.77      0.76      2508

    accuracy                           0.75      5000
   macro avg       0.75      0.75      0.75      5000
weighted avg       0.75      0.75      0.75      5000

