### Group Details

Group ID - 112
Dataset Name - Movie Review - (Data-Review_Polarity)




### Import required libraries and ignore the warning messages.

In [1]:
# If any of the required libraries is missing, install it here.

# pip install nltk

In [2]:
import tarfile 
import nltk
import pandas as pd
from nltk.stem.wordnet import WordNetLemmatizer       #lemmatization
from nltk.corpus.reader import NOUN, VERB, ADJ, ADV
from collections import defaultdict
from nltk.corpus import wordnet

from nltk.tokenize import word_tokenize
import string #punctuation
from nltk.corpus import stopwords
import os #read documents
import re #url

from sklearn.model_selection import KFold
from sklearn.model_selection import ShuffleSplit
from random import shuffle
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import GaussianNB             #Import Gaussian Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics                            #Import scikit-learn metrics module for accuracy calculation
from sklearn.ensemble import RandomForestClassifier    #RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier #GradientBoostingClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression

from sklearn import metrics
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Download every NLTK resource this notebook relies on, not only the stop words, so that
# it also runs on a machine where NLTK data has never been installed:
#   - stopwords                      -> stopwords.words(...)
#   - punkt / punkt_tab              -> word_tokenize
#   - wordnet / omw-1.4              -> WordNetLemmatizer
#   - averaged_perceptron_tagger(_eng) -> nltk.pos_tag
# NOTE(review): which of the legacy / newer resource ids is actually needed depends on the
# installed NLTK version - both are requested here; confirm against your environment.
for nltk_resource in (
    'stopwords',
    'punkt',
    'punkt_tab',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(nltk_resource)

# The reviews are written in English, so the English stop-word list is used.
stop_en = stopwords.words("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\P10506243\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Translation table for str.translate: every character of string.punctuation maps to None,
# i.e. it is deleted from the translated string.
punctuation_translator = str.maketrans(dict.fromkeys(string.punctuation))

### Extract the data

In [5]:
## open file 
#import tarfile 
#file = tarfile.open('review_polarity.tar.gz') 
# extracting file 
#file.extractall('./Extracted Data') 
#file.close()

### Read and clean the data

- Apply tokenization (it splits the review text into individual words).
- Remove punctuation, very short tokens (two characters or fewer), stop words and web links.
- Apply lemmatization (the process of converting a word to its base form).
- Apply POS tagging (a POS tagger assigns grammatical information to each word of the sentence; the lemmatizer uses these tags).

In [6]:
def read_document_and_remove_messy_text(path, stop_words, punctuation_translator):
    """Read one review file and return its cleaned, lemmatized tokens.

    Processing steps, in order:
      1. read the file and decode it to text,
      2. delete web links (anything matching ``http\\S+``),
      3. tokenize with ``word_tokenize``,
      4. strip punctuation from every token and drop tokens of two characters or fewer,
      5. drop stop words,
      6. POS-tag the remaining tokens and lemmatize each one with the matching WordNet tag.

    Args:
        path: Path of the review file to read.
        stop_words: Iterable of words to discard. It is now honoured; the previous
            implementation ignored it and always read the global ``stop_en``.
        punctuation_translator: Table for ``str.translate`` that removes punctuation.

    Returns:
        list[str]: The lemmas of the tokens that survived the cleaning.
    """
    # Decode the bytes explicitly. The previous `str(content)` call tokenized the *repr* of
    # the bytes object, so the text started with "b'" and every line break became the two
    # literal characters "\" + "n", which ended up glued to the following word.
    # The `with` block also guarantees the file handle is closed.
    with open(path, 'rb') as file:
        content = file.read().decode('utf-8', errors='replace')

    # Remove links on the raw text: once tokens are split and punctuation is stripped a URL
    # is no longer recognisable, and the old per-token substitution could leave empty strings.
    content = re.sub(r"http\S+", "", content)

    stop_word_set = set(stop_words)  # O(1) membership tests
    cleaned_words = []
    for word in word_tokenize(content):
        word = word.translate(punctuation_translator)
        # Skip empty / very short leftovers and stop words.
        if len(word) > 2 and word not in stop_word_set:
            cleaned_words.append(word)

    # Map the first letter of a Penn Treebank tag to the WordNet POS constant;
    # any other tag falls back to NOUN.
    lemmatizer = WordNetLemmatizer()
    tag_map = defaultdict(lambda: wordnet.NOUN)
    tag_map['N'] = wordnet.NOUN
    tag_map['J'] = wordnet.ADJ
    tag_map['V'] = wordnet.VERB
    tag_map['R'] = wordnet.ADV

    # POS tagging marks each token with its part of speech so the lemmatizer can pick the
    # right base form.
    tags = nltk.pos_tag(cleaned_words)
    lemmas = [lemmatizer.lemmatize(token, tag_map[tag[0]]) for token, tag in tags]

    return lemmas


In [None]:
# Read each and every file, and pass the review data for cleansing purpose.

def clean_all_documents(folder_path, stop_words, punctuation_translator):
    """Clean every review file found directly inside ``folder_path``.

    Args:
        folder_path: Directory whose entries are the review files.
        stop_words: Stop-word collection forwarded to the per-document cleaner.
        punctuation_translator: ``str.translate`` table forwarded to the per-document cleaner.

    Returns:
        list[list[str]]: One list of lemmas per file, ordered by file name.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the document order - and
    # therefore the membership of each cross-validation fold - identical between runs.
    file_names = sorted(os.listdir(folder_path))
    return [
        read_document_and_remove_messy_text(
            os.path.join(folder_path, file_name), stop_words, punctuation_translator
        )
        for file_name in file_names
    ]

# Clean both halves of the corpus: the "neg" folder first, then the "pos" folder.
negative_docs, positive_docs = (
    clean_all_documents("Extracted Data/txt_sentoken/neg", stop_en, punctuation_translator),
    clean_all_documents("Extracted Data/txt_sentoken/pos", stop_en, punctuation_translator),
)

In [None]:
# Turn every cleaned review back into a single space-separated string.
# Negative reviews come first and positive reviews follow them; the cell that later splits
# the feature matrix relies on this order.
all_documents_string_array = [' '.join(tokens) for tokens in negative_docs]
all_documents_string_array.extend(' '.join(tokens) for tokens in positive_docs)

# Total number of reviews in the corpus (shown as the cell output).
len(all_documents_string_array)

### Convert a collection of text documents to a matrix of token counts

It is used to transform a given text into a vector on the basis of the frequency (count) of each word that occurs in the entire text.
This implementation produces a sparse representation of the counts using scipy.sparse.csr_matrix.

In [None]:

# The vocabulary is capped at the MAX_FEATURES most frequent terms of the whole corpus.
# binary=True is passed as well, so the vectorizer is configured for presence/absence
# output rather than raw counts.
MAX_FEATURES = 5000
vectorizer = CountVectorizer(binary=True, max_features=MAX_FEATURES)
def vectorize_occurences(corpus, vectorizer):
    """Fit ``vectorizer`` on ``corpus`` and return the matrix it produces.

    Args:
        corpus: Iterable of documents (one string per document).
        vectorizer: Object exposing ``fit_transform``; it is fitted in place.

    Returns:
        Whatever ``vectorizer.fit_transform(corpus)`` returns.
    """
    return vectorizer.fit_transform(corpus)

  

### Transformation

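We use scikit-learn's `TfidfTransformer`. Note that the code below passes `use_idf=False`, so only the normalized term-frequency (tf) weighting is applied; the inverse-document-frequency part described here is not used.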

Tf means term-frequency while tf-idf means term-frequency times inverse document-frequency. This is a common term weighting scheme in information retrieval, that has also found good use in document classification.

The goal of using tf-idf instead of the raw frequencies of occurrence of a token in a given document is to scale down the impact of tokens that occur very frequently in a given corpus and that are hence empirically less informative than features that occur in a small fraction of the training corpus.

In [None]:
# Convert a document-term matrix into its normalized term-frequency representation.
def calculate_frequencies(corpus):
    """Return the term-frequency transform of a document-term matrix.

    The transformer is created with ``use_idf=False``, so no inverse-document-frequency
    weighting is applied. It is fitted on ``corpus`` and then applied to that same matrix.

    Args:
        corpus: Document-term matrix (despite the name, not raw text).

    Returns:
        The transformed matrix.
    """
    return TfidfTransformer(use_idf=False).fit(corpus).transform(corpus)

# Build the bag-of-words representation of the reviews cleaned in the previous step:
#   1. bag_of_words_occurences  - document-term matrix produced by the vectorizer
#   2. bag_of_words_frequencies - that matrix after the term-frequency transform
# Rows keep the order of all_documents_string_array (negative reviews first).
bag_of_words_occurences = vectorize_occurences(all_documents_string_array, vectorizer)
bag_of_words_frequencies = calculate_frequencies(bag_of_words_occurences)
# print(bag_of_words_frequencies)

In [None]:
# Build the class labels and split the feature matrix back into its two classes.
# Label 0 marks a negative review, label 1 a positive review.

number_of_negative_documents = len(negative_docs)
number_of_positive_documents = len(positive_docs)
negative_labels = np.zeros(number_of_negative_documents, dtype=int)
positive_labels = np.ones(number_of_positive_documents, dtype=int)

# Kept for backward compatibility: the original cell left this name holding the positive count.
number_of_documents_for_each_class = number_of_positive_documents

# Every review must have exactly one row in the feature matrix.
assert bag_of_words_frequencies.shape[0] == (
    number_of_negative_documents + number_of_positive_documents
), "feature matrix rows do not match the number of reviews"

# The corpus was assembled negatives-first, so the split point is the NEGATIVE count.
# (The previous code sliced at the positive count, which is only correct when both
# classes happen to contain the same number of reviews.)
negative_bow = bag_of_words_frequencies[:number_of_negative_documents]
positive_bow = bag_of_words_frequencies[number_of_negative_documents:]


print('Negative Bag of words')
print(negative_bow)

print('\nPositive Bag of words')
print(positive_bow)

### Dataset details:
The data consists of 1000 positive reviews, labelled with class value 1, and 1000 negative reviews, labelled with class value 0.


### Multinomial Classifier

In [None]:
# Multinomial Naive Bayes model shared by all folds; it is re-fitted on every call below.
gnb = MultinomialNB()
def multinomial_calssifier(features_training_set, labels_training_set, features_test_set, labels_test_set, average_accuracy, fold):
    """Fit the shared Multinomial Naive Bayes model on one fold and report the results.

    Prints the fold accuracy, the running mean accuracy, the classification report and the
    confusion matrix, then saves actual vs. predicted labels to an Excel file named after
    the fold.

    Args:
        features_training_set: Feature rows used for fitting.
        labels_training_set: Class labels of the training rows.
        features_test_set: Feature rows to predict.
        labels_test_set: True class labels of the test rows.
        average_accuracy: Sum of the accuracies of the folds processed so far.
        fold: 1-based number of the current fold (divisor for the running mean).

    Returns:
        ``average_accuracy`` plus this fold's accuracy (a running sum, not a mean).
    """
    gnb.fit(features_training_set, labels_training_set)
    predicted_labels = gnb.predict(features_test_set)

    # Score once and reuse the value for both the printout and the running sum.
    fold_accuracy = metrics.accuracy_score(labels_test_set, predicted_labels)
    accuracy_sum = average_accuracy + fold_accuracy

    print("Multinomial Naive Bayes Accuracy:", fold_accuracy)
    print("Average accuracy: ", accuracy_sum/fold)
    print("")
    print(metrics.classification_report(labels_test_set, predicted_labels))
    print("")
    print("Confusion Matrix: ")
    print(metrics.confusion_matrix(labels_test_set, predicted_labels))
    print("")

    # Persist the per-review outcome of this fold.
    output_file = 'multinomial_calssifier '+ str(fold)+'.xlsx'
    fold_results = pd.DataFrame({'Actual': labels_test_set, 'Predicted': predicted_labels})
    fold_results.to_excel(output_file)
    return accuracy_sum

In [None]:
# 5-fold cross-validation. The fold indices are computed from negative_bow and reused
# unchanged for positive_bow and for both label arrays.
kfold = KFold(n_splits=5)
fold, average_accuracy = 0, 0

for fold, (train, test) in enumerate(kfold.split(negative_bow), start=1):
    # Features as dense nested lists: negative rows first, positive rows second.
    features_training_set = negative_bow[train].toarray().tolist() + positive_bow[train].toarray().tolist()
    features_test_set = negative_bow[test].toarray().tolist() + positive_bow[test].toarray().tolist()

    # Labels, stacked in the same negative-then-positive order as the features.
    labels_training_set = negative_labels[train].tolist() + positive_labels[train].tolist()
    labels_test_set = negative_labels[test].tolist() + positive_labels[test].tolist()

    # average_accuracy carries the running sum of fold accuracies from call to call.
    average_accuracy = multinomial_calssifier(features_training_set, labels_training_set, features_test_set, labels_test_set, average_accuracy, fold)

print('---------- Completed Multinomial Calssifier --------')

### Random Forest Classifier

In [None]:
def random_forest_classifier(features_training_set, labels_training_set, features_test_set, labels_test_set, average_accuracy, fold):
    """Fit a 1000-tree random forest on one fold and report the results.

    Prints the fold accuracy, the running mean accuracy, the classification report and the
    confusion matrix, then saves actual vs. predicted labels to an Excel file named after
    the fold.

    Args:
        features_training_set: Feature rows used for fitting.
        labels_training_set: Class labels of the training rows.
        features_test_set: Feature rows to predict.
        labels_test_set: True class labels of the test rows.
        average_accuracy: Sum of the accuracies of the folds processed so far.
        fold: 1-based number of the current fold (divisor for the running mean).

    Returns:
        ``average_accuracy`` plus this fold's accuracy (a running sum, not a mean).
    """
    # A fixed seed makes the forest - and therefore the reported accuracy - reproducible
    # between runs, consistent with the logistic regression model in this notebook.
    # TODO: recursive feature elimination around the forest was considered but is not used.
    clf = RandomForestClassifier(n_estimators=1000, random_state=0)
    clf = clf.fit(features_training_set, labels_training_set)
    pred = clf.predict(features_test_set)

    # Score once and reuse the value for both the printout and the running sum.
    fold_accuracy = metrics.accuracy_score(labels_test_set, pred)
    print("Random Forest Classifier Accuracy:", fold_accuracy)
    average_accuracy += fold_accuracy
    print("Average accuracy: ", average_accuracy/fold)
    print("")
    print(metrics.classification_report(labels_test_set, pred))
    print("")
    print("Confusion Matrix: ")
    print(metrics.confusion_matrix(labels_test_set, pred))
    print("")

    # Persist the per-review outcome of this fold (local renamed so the builtin `dict`
    # is no longer shadowed).
    fold_results = pd.DataFrame({'Actual': labels_test_set, 'Predicted': pred})
    fold_results.to_excel('random_forest_classifier '+ str(fold)+'.xlsx')
    return average_accuracy

In [None]:
# 5-fold cross-validation. The fold indices are computed from negative_bow and reused
# unchanged for positive_bow and for both label arrays.
kfold = KFold(n_splits=5)
fold, average_accuracy = 0, 0

for fold, (train, test) in enumerate(kfold.split(negative_bow), start=1):
    # Features as dense nested lists: negative rows first, positive rows second.
    features_training_set = negative_bow[train].toarray().tolist() + positive_bow[train].toarray().tolist()
    features_test_set = negative_bow[test].toarray().tolist() + positive_bow[test].toarray().tolist()

    # Labels, stacked in the same negative-then-positive order as the features.
    labels_training_set = negative_labels[train].tolist() + positive_labels[train].tolist()
    labels_test_set = negative_labels[test].tolist() + positive_labels[test].tolist()

    # average_accuracy carries the running sum of fold accuracies from call to call.
    average_accuracy = random_forest_classifier(features_training_set, labels_training_set, features_test_set, labels_test_set, average_accuracy, fold)

print('---------- Completed Random Forest Classifier --------')

### Gradient Boosting Classifier

In [None]:
def gradient_boosting_classifier(features_training_set, labels_training_set, features_test_set, labels_test_set, average_accuracy, fold):
    """Fit a 100-stage gradient boosting model on one fold and report the results.

    Prints the fold accuracy, the running mean accuracy, the classification report and the
    confusion matrix, then saves actual vs. predicted labels to an Excel file named after
    the fold. (Author's original note on this model: "so-so".)

    Args:
        features_training_set: Feature rows used for fitting.
        labels_training_set: Class labels of the training rows.
        features_test_set: Feature rows to predict.
        labels_test_set: True class labels of the test rows.
        average_accuracy: Sum of the accuracies of the folds processed so far.
        fold: 1-based number of the current fold (divisor for the running mean).

    Returns:
        ``average_accuracy`` plus this fold's accuracy (a running sum, not a mean).
    """
    # A fixed seed keeps the fitted model reproducible between runs, consistent with the
    # logistic regression model in this notebook.
    clf = GradientBoostingClassifier(n_estimators=100, random_state=0)
    clf = clf.fit(features_training_set, labels_training_set)
    pred = clf.predict(features_test_set)

    # Score once and reuse the value for both the printout and the running sum.
    fold_accuracy = metrics.accuracy_score(labels_test_set, pred)
    print("Gradient Boosting Classifier Accuracy:", fold_accuracy)
    average_accuracy += fold_accuracy
    print("Average accuracy: ", average_accuracy/fold)
    print("")
    print(metrics.classification_report(labels_test_set, pred))
    print("")
    print("Confusion Matrix: ")
    print(metrics.confusion_matrix(labels_test_set, pred))
    print("")

    # Persist the per-review outcome of this fold (local renamed so the builtin `dict`
    # is no longer shadowed).
    fold_results = pd.DataFrame({'Actual': labels_test_set, 'Predicted': pred})
    fold_results.to_excel('gradient_boosting_classifier '+ str(fold)+'.xlsx')
    return average_accuracy

In [None]:
# 5-fold cross-validation. The fold indices are computed from negative_bow and reused
# unchanged for positive_bow and for both label arrays.
kfold = KFold(n_splits = 5)
fold = 0; average_accuracy = 0;

for train, test in kfold.split(negative_bow):
    fold = fold+1
    features_training_set = negative_bow[train].toarray().tolist()  # negative training rows
    features_training_set += positive_bow[train].toarray().tolist() # positive training rows

    features_test_set = negative_bow[test].toarray().tolist()       # negative test rows
    features_test_set += positive_bow[test].toarray().tolist()      # positive test rows

    labels_training_set = negative_labels[train].tolist()           # training labels, negatives
    labels_training_set += positive_labels[train].tolist()          # training labels, positives

    labels_test_set = negative_labels[test].tolist()                # test labels, negatives
    labels_test_set += positive_labels[test].tolist()               # test labels, positives

    # average_accuracy carries the running sum of fold accuracies from call to call.
    average_accuracy = gradient_boosting_classifier(features_training_set, labels_training_set, features_test_set, labels_test_set, average_accuracy ,fold)

# Printed once, after the last fold. It used to be indented into the loop body and was
# therefore emitted after every fold, unlike the other classifiers' loops.
print('---------- Completed Gradient Boosting Classifier --------')

### Logistic Regression Classifier

In [None]:
def logistic_regression_classifier(features_training_set, labels_training_set, features_test_set, labels_test_set, average_accuracy, fold):
    """Fit a logistic regression model on one fold and report the results.

    Prints the fold accuracy, the running mean accuracy, the classification report and the
    confusion matrix, then saves actual vs. predicted labels to an Excel file named after
    the fold.

    Args:
        features_training_set: Feature rows used for fitting.
        labels_training_set: Class labels of the training rows.
        features_test_set: Feature rows to predict.
        labels_test_set: True class labels of the test rows.
        average_accuracy: Sum of the accuracies of the folds processed so far.
        fold: 1-based number of the current fold (divisor for the running mean).

    Returns:
        ``average_accuracy`` plus this fold's accuracy (a running sum, not a mean).
    """
    # NOTE(review): `multi_class` is reported as deprecated in recent scikit-learn
    # releases - confirm against the installed version before upgrading.
    model = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
    model = model.fit(features_training_set, labels_training_set)
    predicted_labels = model.predict(features_test_set)

    # Score once and reuse the value for both the printout and the running sum.
    fold_accuracy = metrics.accuracy_score(labels_test_set, predicted_labels)
    accuracy_sum = average_accuracy + fold_accuracy

    print("Logistic Regression Classifier Accuracy:", fold_accuracy)
    print("Average accuracy: ", accuracy_sum/fold)
    print("")
    print(metrics.classification_report(labels_test_set, predicted_labels))
    print("")
    print("Confusion Matrix: ")
    print(metrics.confusion_matrix(labels_test_set, predicted_labels))
    print("")

    # Persist the per-review outcome of this fold.
    output_file = 'logistic_regression_classifier '+ str(fold)+'.xlsx'
    fold_results = pd.DataFrame({'Actual': labels_test_set, 'Predicted': predicted_labels})
    fold_results.to_excel(output_file)

    return accuracy_sum

In [None]:
# 5-fold cross-validation. The fold indices are computed from negative_bow and reused
# unchanged for positive_bow and for both label arrays.
kfold = KFold(n_splits=5)
fold, average_accuracy = 0, 0

for fold, (train, test) in enumerate(kfold.split(negative_bow), start=1):
    # Features as dense nested lists: negative rows first, positive rows second.
    features_training_set = negative_bow[train].toarray().tolist() + positive_bow[train].toarray().tolist()
    features_test_set = negative_bow[test].toarray().tolist() + positive_bow[test].toarray().tolist()

    # Labels, stacked in the same negative-then-positive order as the features.
    labels_training_set = negative_labels[train].tolist() + positive_labels[train].tolist()
    labels_test_set = negative_labels[test].tolist() + positive_labels[test].tolist()

    # average_accuracy carries the running sum of fold accuracies from call to call.
    average_accuracy = logistic_regression_classifier(features_training_set, labels_training_set, features_test_set, labels_test_set, average_accuracy, fold)

print('---------- Completed Logistic Regression Classifier --------')

### SVM Classifier

In [None]:
def svm_classifier(features_training_set, labels_training_set, features_test_set, labels_test_set, average_accuracy, fold):
    """Fit an ``svm.SVC(gamma='scale')`` model on one fold and report the results.

    Prints the fold accuracy, the running mean accuracy, the classification report and the
    confusion matrix, then saves actual vs. predicted labels to an Excel file named after
    the fold.

    Args:
        features_training_set: Feature rows used for fitting.
        labels_training_set: Class labels of the training rows.
        features_test_set: Feature rows to predict.
        labels_test_set: True class labels of the test rows.
        average_accuracy: Sum of the accuracies of the folds processed so far.
        fold: 1-based number of the current fold (divisor for the running mean).

    Returns:
        ``average_accuracy`` plus this fold's accuracy (a running sum, not a mean).
    """
    model = svm.SVC(gamma='scale')
    model = model.fit(features_training_set, labels_training_set)
    predicted_labels = model.predict(features_test_set)

    # Score once and reuse the value for both the printout and the running sum.
    fold_accuracy = metrics.accuracy_score(labels_test_set, predicted_labels)
    accuracy_sum = average_accuracy + fold_accuracy

    print("SVM Classifier Accuracy:", fold_accuracy)
    print("Average accuracy: ", accuracy_sum/fold)
    print("")
    print(metrics.classification_report(labels_test_set, predicted_labels))
    print("")
    print("Confusion Matrix: ")
    print(metrics.confusion_matrix(labels_test_set, predicted_labels))
    print("")

    # Persist the per-review outcome of this fold.
    output_file = 'svm_classifier '+ str(fold)+'.xlsx'
    fold_results = pd.DataFrame({'Actual': labels_test_set, 'Predicted': predicted_labels})
    fold_results.to_excel(output_file)
    return accuracy_sum

In [None]:
# 5-fold cross-validation. The fold indices are computed from negative_bow and reused
# unchanged for positive_bow and for both label arrays.
kfold = KFold(n_splits=5)
fold, average_accuracy = 0, 0

for fold, (train, test) in enumerate(kfold.split(negative_bow), start=1):
    # Features as dense nested lists: negative rows first, positive rows second.
    features_training_set = negative_bow[train].toarray().tolist() + positive_bow[train].toarray().tolist()
    features_test_set = negative_bow[test].toarray().tolist() + positive_bow[test].toarray().tolist()

    # Labels, stacked in the same negative-then-positive order as the features.
    labels_training_set = negative_labels[train].tolist() + positive_labels[train].tolist()
    labels_test_set = negative_labels[test].tolist() + positive_labels[test].tolist()

    # average_accuracy carries the running sum of fold accuracies from call to call.
    average_accuracy = svm_classifier(features_training_set, labels_training_set, features_test_set, labels_test_set, average_accuracy, fold)

print('---------- Completed SVM Classifier --------')

### SVM Linear Classifier

In [None]:
def svm_linear_classifier(features_training_set, labels_training_set, features_test_set, labels_test_set, average_accuracy, fold):
    """Fit a linear SVM (``svm.LinearSVC``) on one fold and report the results.

    Prints the fold accuracy, the running mean accuracy, the classification report and the
    confusion matrix, then saves actual vs. predicted labels to an Excel file named after
    the fold.

    Args:
        features_training_set: Feature rows used for fitting.
        labels_training_set: Class labels of the training rows.
        features_test_set: Feature rows to predict.
        labels_test_set: True class labels of the test rows.
        average_accuracy: Sum of the accuracies of the folds processed so far.
        fold: 1-based number of the current fold (divisor for the running mean).

    Returns:
        ``average_accuracy`` plus this fold's accuracy (a running sum, not a mean).
    """
    # A fixed seed keeps the fitted model reproducible between runs, consistent with the
    # logistic regression model in this notebook.
    clf = svm.LinearSVC(random_state=0)
    clf = clf.fit(features_training_set, labels_training_set)
    pred = clf.predict(features_test_set)

    # Score once and reuse the value for both the printout and the running sum.
    fold_accuracy = metrics.accuracy_score(labels_test_set, pred)
    print("Linear SVM Classifier Accuracy:", fold_accuracy)
    average_accuracy += fold_accuracy
    print("Average accuracy: ", average_accuracy/fold)
    print("")
    print(metrics.classification_report(labels_test_set, pred))
    print("")
    print("Confusion Matrix: ")
    print(metrics.confusion_matrix(labels_test_set, pred))
    print("")

    # Persist the per-review outcome of this fold (local renamed so the builtin `dict`
    # is no longer shadowed).
    fold_results = pd.DataFrame({'Actual': labels_test_set, 'Predicted': pred})
    fold_results.to_excel('svm_linear_classifier '+ str(fold)+'.xlsx')
    return average_accuracy

In [None]:
# 5-fold cross-validation. The fold indices are computed from negative_bow and reused
# unchanged for positive_bow and for both label arrays.
kfold = KFold(n_splits=5)
fold, average_accuracy = 0, 0

for fold, (train, test) in enumerate(kfold.split(negative_bow), start=1):
    # Features as dense nested lists: negative rows first, positive rows second.
    features_training_set = negative_bow[train].toarray().tolist() + positive_bow[train].toarray().tolist()
    features_test_set = negative_bow[test].toarray().tolist() + positive_bow[test].toarray().tolist()

    # Labels, stacked in the same negative-then-positive order as the features.
    labels_training_set = negative_labels[train].tolist() + positive_labels[train].tolist()
    labels_test_set = negative_labels[test].tolist() + positive_labels[test].tolist()

    # average_accuracy carries the running sum of fold accuracies from call to call.
    average_accuracy = svm_linear_classifier(features_training_set, labels_training_set, features_test_set, labels_test_set, average_accuracy, fold)

print('---------- Completed SVM Linear Classifier --------')

### Create Output Data

Comparing the models above, the Multinomial classifier performs best, reaching an accuracy of up to 86%. Let's create the output file using that model.