In [1]:
import nltk
import pandas as pd
from nltk.stem.wordnet import WordNetLemmatizer       #lemmatization
from nltk.corpus.reader import NOUN, VERB, ADJ, ADV
from collections import defaultdict
from nltk.corpus import wordnet

from nltk.tokenize import word_tokenize
import string #punctuation
from nltk.corpus import stopwords
import os #read documents
import re #url

from sklearn.model_selection import KFold
from sklearn.model_selection import ShuffleSplit
from random import shuffle
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import GaussianNB             #Import Gaussian Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics                            #Import scikit-learn metrics module for accuracy calculation
from sklearn.ensemble import RandomForestClassifier    #RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier #GradientBoostingClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression

from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import RFE              #Recursive Feature Elimination
from sklearn import metrics

# English stop-word list taken from the NLTK stopwords corpus.
stop_en = stopwords.words("english")
# Translation table for str.translate that deletes every character in string.punctuation.
punctuation_translator = {ord(symbol): None for symbol in string.punctuation}

In [2]:
def read_document_and_remove_messy_text(path, stop_words, punctuation_translator):
    """Read one text document and return its cleaned, lemmatized tokens.

    Pipeline: decode file -> drop URLs -> tokenize -> strip punctuation ->
    drop tokens of length <= 2 -> drop stop words -> POS-tag -> lemmatize.

    Args:
        path: Path of the text file to read.
        stop_words: Iterable of words to discard after punctuation stripping.
        punctuation_translator: Table for ``str.translate`` that deletes
            punctuation characters.

    Returns:
        List of lemma strings in document order.
    """
    # Read bytes and decode explicitly. Calling str() on a bytes object yields its
    # repr ("b'...'" with literal "\n" escapes), which pollutes the tokens.
    # The `with` block guarantees the file handle is closed.
    with open(path, 'rb') as file:
        content = file.read().decode('utf-8', errors='replace')

    # URLs have to be removed from the raw text: once the text is tokenized and
    # punctuation is stripped, the pattern can no longer match a real URL.
    content = re.sub(r"http\S+", "", content)

    # Honour the caller-supplied stop words; a set makes each lookup O(1).
    stop_word_set = set(stop_words)

    cleaned_words = []
    for word in word_tokenize(content):
        word = word.translate(punctuation_translator)
        # Skip empty strings and very short (irrelevant) tokens, then stop words.
        if len(word) > 2 and word not in stop_word_set:
            cleaned_words.append(word)

    # Map the first letter of the POS tag to a WordNet POS; anything else is treated as a noun.
    lemmatizer = WordNetLemmatizer()
    tag_map = defaultdict(lambda: wordnet.NOUN)
    tag_map['N'] = wordnet.NOUN
    tag_map['J'] = wordnet.ADJ
    tag_map['V'] = wordnet.VERB
    tag_map['R'] = wordnet.ADV

    tags = nltk.pos_tag(cleaned_words)
    lemmas = [lemmatizer.lemmatize(token, tag_map[tag[0]]) for token, tag in tags]

    return lemmas


In [None]:
def clean_all_documents(folder_path, stop_words, punctuation_translator):
    """Clean every file found directly inside ``folder_path``.

    Args:
        folder_path: Directory that contains one text document per file.
        stop_words: Forwarded to ``read_document_and_remove_messy_text``.
        punctuation_translator: Forwarded to ``read_document_and_remove_messy_text``.

    Returns:
        List with one token list per file, ordered by file name.
    """
    documents = []
    # os.listdir returns entries in arbitrary order; sorting keeps the document
    # order (and therefore the cross-validation folds) reproducible between runs.
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        if not os.path.isfile(file_path):
            # Sub-directories cannot be opened as documents, so skip them.
            continue
        documents.append(
            read_document_and_remove_messy_text(file_path, stop_words, punctuation_translator)
        )
    return documents

# Clean both class folders with the same stop words and punctuation table.
negative_docs, positive_docs = [
    clean_all_documents(class_folder, stop_en, punctuation_translator)
    for class_folder in ("txt_sentoken/neg", "txt_sentoken/pos")
]

In [None]:
# One space-joined string per document: all negative documents first, then all positive ones.
all_documents_string_array = [
    ' '.join(document_tokens)
    for document_tokens in negative_docs + positive_docs
]

# Vocabulary size cap for the bag-of-words model (play with this value).
MAX_FEATURES = 5000
vectorizer = CountVectorizer(binary=True, max_features=MAX_FEATURES)
def vectorize_occurences(corpus, vectorizer):
    """Fit ``vectorizer`` on ``corpus`` and return the transformed corpus.

    Args:
        corpus: Collection of documents handed to ``vectorizer.fit_transform``.
        vectorizer: Object exposing a ``fit_transform`` method.

    Returns:
        Whatever ``vectorizer.fit_transform(corpus)`` returns.
    """
    return vectorizer.fit_transform(corpus)

def calculate_frequencies(corpus):
    """Turn an occurrence matrix into term frequencies.

    A ``TfidfTransformer`` with ``use_idf=False`` is fitted on ``corpus`` and
    then applied to that same ``corpus``.

    Args:
        corpus: Occurrence matrix (e.g. the output of ``vectorize_occurences``).

    Returns:
        The transformed matrix.
    """
    frequency_transformer = TfidfTransformer(use_idf=False)
    frequency_transformer.fit(corpus)
    return frequency_transformer.transform(corpus)

# Occurrence matrix first, then its term-frequency version.
# Reference: https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
bag_of_words_occurences = vectorize_occurences(corpus=all_documents_string_array, vectorizer=vectorizer)
bag_of_words_frequencies = calculate_frequencies(corpus=bag_of_words_occurences)

In [None]:
# Label vectors: 0 for every negative document, 1 for every positive document.
# NOTE(review): both vectors are sized from negative_docs, so this assumes the
# two classes contain the same number of documents.
number_of_documents_for_each_class = len(negative_docs)
negative_labels = np.zeros(number_of_documents_for_each_class, dtype=int)
positive_labels = np.ones(number_of_documents_for_each_class, dtype=int)

# Rows follow corpus order: the negative documents come first, the positive ones after.
negative_bow = bag_of_words_frequencies[:number_of_documents_for_each_class]
positive_bow = bag_of_words_frequencies[number_of_documents_for_each_class:]

In [None]:
# Module-level Multinomial Naive Bayes estimator, fitted again on every call below.
gnb = MultinomialNB()           #good
def multinomial_calssifier(features_training_set, labels_training_set, features_test_set, labels_test_set, average_accuracy, fold):
    """Fit the shared Multinomial Naive Bayes model on one fold and report its scores.

    Args:
        features_training_set: Feature rows used for fitting.
        labels_training_set: Labels matching ``features_training_set``.
        features_test_set: Feature rows used for prediction.
        labels_test_set: True labels matching ``features_test_set``.
        average_accuracy: Sum of the accuracies of the folds evaluated so far.
        fold: 1-based number of the current fold; divisor of the printed average.

    Returns:
        ``average_accuracy`` plus the accuracy obtained on this fold.
    """
    gnb.fit(features_training_set, labels_training_set)
    predictions = gnb.predict(features_test_set)

    # How often is the classifier correct on this fold?
    fold_accuracy = metrics.accuracy_score(labels_test_set, predictions)
    accuracy_sum = average_accuracy + fold_accuracy

    print("Multinomial Naive Bayes Accuracy:", fold_accuracy)
    print("Average accuracy: ", accuracy_sum/fold)
    print("")
    print(metrics.classification_report(labels_test_set, predictions))
    print("")
    print("Confusion Matrix: ")
    print(metrics.confusion_matrix(labels_test_set, predictions))
    print("")
    return accuracy_sum

In [None]:
def random_forest_classifier(features_training_set, labels_training_set, features_test_set, labels_test_set, average_accuracy, fold):   #best
    """Fit a random forest on one fold and report its scores.

    Args:
        features_training_set: Feature rows used for fitting.
        labels_training_set: Labels matching ``features_training_set``.
        features_test_set: Feature rows used for prediction.
        labels_test_set: True labels matching ``features_test_set``.
        average_accuracy: Sum of the accuracies of the folds evaluated so far.
        fold: 1-based number of the current fold; divisor of the printed average.

    Returns:
        ``average_accuracy`` plus the accuracy obtained on this fold.
    """
    # A fixed random_state makes the forest (and so the reported scores)
    # reproducible between runs, matching the seeded logistic regression model.
    clf = RandomForestClassifier(n_estimators=1000, random_state=0)
    # TODO: consider wrapping the estimator in RFE (recursive feature elimination).
    clf.fit(features_training_set, labels_training_set)
    pred = clf.predict(features_test_set)

    # Compute the fold accuracy once and reuse it for printing and accumulation.
    fold_accuracy = metrics.accuracy_score(labels_test_set, pred)
    average_accuracy += fold_accuracy

    print("Random Forest Classifier Accuracy:", fold_accuracy)
    print("Average accuracy: ", average_accuracy/fold)
    print("")
    print(metrics.classification_report(labels_test_set, pred))
    print("")
    return average_accuracy

In [None]:
def gradient_boosting_classifier(features_training_set, labels_training_set, features_test_set, labels_test_set, average_accuracy, fold):  # so-so
    """Fit a gradient boosting classifier on one fold and report its scores.

    Args:
        features_training_set: Feature rows used for fitting.
        labels_training_set: Labels matching ``features_training_set``.
        features_test_set: Feature rows used for prediction.
        labels_test_set: True labels matching ``features_test_set``.
        average_accuracy: Sum of the accuracies of the folds evaluated so far.
        fold: 1-based number of the current fold; divisor of the printed average.

    Returns:
        ``average_accuracy`` plus the accuracy obtained on this fold.
    """
    # A fixed random_state keeps the fitted model reproducible between runs,
    # matching the seeded logistic regression model.
    clf = GradientBoostingClassifier(n_estimators=100, random_state=0)
    clf.fit(features_training_set, labels_training_set)
    pred = clf.predict(features_test_set)

    # Compute the fold accuracy once and reuse it for printing and accumulation.
    fold_accuracy = metrics.accuracy_score(labels_test_set, pred)
    average_accuracy += fold_accuracy

    print("Gradient Boosting Classifier Accuracy:", fold_accuracy)
    print("Average accuracy: ", average_accuracy/fold)
    print("")
    print(metrics.classification_report(labels_test_set, pred))
    print("")
    return average_accuracy

In [None]:
def svm_classifier(features_training_set, labels_training_set, features_test_set, labels_test_set, average_accuracy, fold):  #best
    """Fit an ``svm.SVC(gamma='scale')`` model on one fold and report its scores.

    Args:
        features_training_set: Feature rows used for fitting.
        labels_training_set: Labels matching ``features_training_set``.
        features_test_set: Feature rows used for prediction.
        labels_test_set: True labels matching ``features_test_set``.
        average_accuracy: Sum of the accuracies of the folds evaluated so far.
        fold: 1-based number of the current fold; divisor of the printed average.

    Returns:
        ``average_accuracy`` plus the accuracy obtained on this fold.
    """
    model = svm.SVC(gamma='scale').fit(features_training_set, labels_training_set)
    predictions = model.predict(features_test_set)

    fold_accuracy = metrics.accuracy_score(labels_test_set, predictions)
    accuracy_sum = average_accuracy + fold_accuracy

    print("SVM Classifier Accuracy:", fold_accuracy)
    print("Average accuracy: ", accuracy_sum/fold)
    print("")
    print(metrics.classification_report(labels_test_set, predictions))
    print("")
    return accuracy_sum

In [None]:
def svm_linear_classifier(features_training_set, labels_training_set, features_test_set, labels_test_set, average_accuracy, fold): #good
    """Fit an ``svm.LinearSVC`` model on one fold and report its scores.

    Args:
        features_training_set: Feature rows used for fitting.
        labels_training_set: Labels matching ``features_training_set``.
        features_test_set: Feature rows used for prediction.
        labels_test_set: True labels matching ``features_test_set``.
        average_accuracy: Sum of the accuracies of the folds evaluated so far.
        fold: 1-based number of the current fold; divisor of the printed average.

    Returns:
        ``average_accuracy`` plus the accuracy obtained on this fold.
    """
    model = svm.LinearSVC().fit(features_training_set, labels_training_set)
    predictions = model.predict(features_test_set)

    fold_accuracy = metrics.accuracy_score(labels_test_set, predictions)
    accuracy_sum = average_accuracy + fold_accuracy

    print("Linear SVM Classifier Accuracy:", fold_accuracy)
    print("Average accuracy: ", accuracy_sum/fold)
    print("")
    print(metrics.classification_report(labels_test_set, predictions))
    print("")
    return accuracy_sum

In [None]:
def logistic_regression_classifier(features_training_set, labels_training_set, features_test_set, labels_test_set, average_accuracy, fold):
    """Fit a logistic regression model on one fold and report its scores.

    Args:
        features_training_set: Feature rows used for fitting.
        labels_training_set: Labels matching ``features_training_set``.
        features_test_set: Feature rows used for prediction.
        labels_test_set: True labels matching ``features_test_set``.
        average_accuracy: Sum of the accuracies of the folds evaluated so far.
        fold: 1-based number of the current fold; divisor of the printed average.

    Returns:
        ``average_accuracy`` plus the accuracy obtained on this fold.
    """
    # NOTE(review): `multi_class` appears to be deprecated in recent scikit-learn
    # releases -- confirm against the installed version.
    model = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial')
    model = model.fit(features_training_set, labels_training_set)
    predictions = model.predict(features_test_set)

    fold_accuracy = metrics.accuracy_score(labels_test_set, predictions)
    accuracy_sum = average_accuracy + fold_accuracy

    print("Logistic Regression Classifier Accuracy:", fold_accuracy)
    print("Average accuracy: ", accuracy_sum/fold)
    print("")
    print(metrics.classification_report(labels_test_set, predictions))
    print("")
    return accuracy_sum

### Multinomial Classifier

In [None]:
# 5-fold cross-validation: averaging the accuracy over several train/test splits
# gives a more reliable estimate than a single split.
kfold = KFold(n_splits = 5)
average_accuracy = 0   # running sum of per-fold accuracies; the classifier helper prints sum/fold

# Fold indices are computed on the negative rows and reused for the positive rows,
# so both classes must contain the same number of documents.
for fold, (train, test) in enumerate(kfold.split(negative_bow), start=1):
    # Stack the dense per-class slices with NumPy instead of round-tripping them
    # through nested Python lists, which is far slower and more memory hungry.
    features_training_set = np.vstack((negative_bow[train].toarray(), positive_bow[train].toarray()))
    features_test_set = np.vstack((negative_bow[test].toarray(), positive_bow[test].toarray()))

    labels_training_set = np.concatenate((negative_labels[train], positive_labels[train]))
    labels_test_set = np.concatenate((negative_labels[test], positive_labels[test]))

    average_accuracy = multinomial_calssifier(features_training_set, labels_training_set, features_test_set, labels_test_set, average_accuracy ,fold)

    print('---------- Completed Multinomial Calssifier --------')

### Random Forest Classifier

In [None]:
# 5-fold cross-validation: averaging the accuracy over several train/test splits
# gives a more reliable estimate than a single split.
kfold = KFold(n_splits = 5)
average_accuracy = 0   # running sum of per-fold accuracies; the classifier helper prints sum/fold

# Fold indices are computed on the negative rows and reused for the positive rows,
# so both classes must contain the same number of documents.
for fold, (train, test) in enumerate(kfold.split(negative_bow), start=1):
    # With 5 folds each split trains on 4/5 and tests on 1/5 of every class.
    # Stack the dense per-class slices with NumPy instead of round-tripping them
    # through nested Python lists, which is far slower and more memory hungry.
    features_training_set = np.vstack((negative_bow[train].toarray(), positive_bow[train].toarray()))
    features_test_set = np.vstack((negative_bow[test].toarray(), positive_bow[test].toarray()))

    labels_training_set = np.concatenate((negative_labels[train], positive_labels[train]))
    labels_test_set = np.concatenate((negative_labels[test], positive_labels[test]))

    # The classifier must actually run, otherwise the "Completed" banner below
    # would be printed without any model being trained or evaluated.
    # NOTE(review): this cell can be slow -- confirm the runtime is acceptable.
    average_accuracy = random_forest_classifier(features_training_set, labels_training_set, features_test_set, labels_test_set, average_accuracy ,fold)

    print('---------- Completed Random Forest Classifier --------')

### Gradient Boosting Classifier

In [None]:
# 5-fold cross-validation: averaging the accuracy over several train/test splits
# gives a more reliable estimate than a single split.
kfold = KFold(n_splits = 5)
average_accuracy = 0   # running sum of per-fold accuracies; the classifier helper prints sum/fold

# Fold indices are computed on the negative rows and reused for the positive rows,
# so both classes must contain the same number of documents.
for fold, (train, test) in enumerate(kfold.split(negative_bow), start=1):
    # With 5 folds each split trains on 4/5 and tests on 1/5 of every class.
    # Stack the dense per-class slices with NumPy instead of round-tripping them
    # through nested Python lists, which is far slower and more memory hungry.
    features_training_set = np.vstack((negative_bow[train].toarray(), positive_bow[train].toarray()))
    features_test_set = np.vstack((negative_bow[test].toarray(), positive_bow[test].toarray()))

    labels_training_set = np.concatenate((negative_labels[train], positive_labels[train]))
    labels_test_set = np.concatenate((negative_labels[test], positive_labels[test]))

    average_accuracy = gradient_boosting_classifier(features_training_set, labels_training_set, features_test_set, labels_test_set, average_accuracy ,fold)

    print('---------- Completed Gradient Boosting Classifier --------')

### Logistic Regression Classifier

In [None]:
# 5-fold cross-validation: averaging the accuracy over several train/test splits
# gives a more reliable estimate than a single split.
kfold = KFold(n_splits = 5)
average_accuracy = 0   # running sum of per-fold accuracies; the classifier helper prints sum/fold

# Fold indices are computed on the negative rows and reused for the positive rows,
# so both classes must contain the same number of documents.
for fold, (train, test) in enumerate(kfold.split(negative_bow), start=1):
    # With 5 folds each split trains on 4/5 and tests on 1/5 of every class.
    # Stack the dense per-class slices with NumPy instead of round-tripping them
    # through nested Python lists, which is far slower and more memory hungry.
    features_training_set = np.vstack((negative_bow[train].toarray(), positive_bow[train].toarray()))
    features_test_set = np.vstack((negative_bow[test].toarray(), positive_bow[test].toarray()))

    labels_training_set = np.concatenate((negative_labels[train], positive_labels[train]))
    labels_test_set = np.concatenate((negative_labels[test], positive_labels[test]))

    average_accuracy = logistic_regression_classifier(features_training_set, labels_training_set, features_test_set, labels_test_set, average_accuracy ,fold)

    print('---------- Completed Logistic Regression Classifier --------')

### SVM Classifier

In [None]:
# 5-fold cross-validation: averaging the accuracy over several train/test splits
# gives a more reliable estimate than a single split.
kfold = KFold(n_splits = 5)
average_accuracy = 0   # running sum of per-fold accuracies; the classifier helper prints sum/fold

# Fold indices are computed on the negative rows and reused for the positive rows,
# so both classes must contain the same number of documents.
for fold, (train, test) in enumerate(kfold.split(negative_bow), start=1):
    # With 5 folds each split trains on 4/5 and tests on 1/5 of every class.
    # Stack the dense per-class slices with NumPy instead of round-tripping them
    # through nested Python lists, which is far slower and more memory hungry.
    features_training_set = np.vstack((negative_bow[train].toarray(), positive_bow[train].toarray()))
    features_test_set = np.vstack((negative_bow[test].toarray(), positive_bow[test].toarray()))

    labels_training_set = np.concatenate((negative_labels[train], positive_labels[train]))
    labels_test_set = np.concatenate((negative_labels[test], positive_labels[test]))

    average_accuracy = svm_classifier(features_training_set, labels_training_set, features_test_set, labels_test_set, average_accuracy ,fold)

    print('---------- Completed SVM Classifier --------')

### SVM Linear Classifier

In [None]:
# 5-fold cross-validation: averaging the accuracy over several train/test splits
# gives a more reliable estimate than a single split.
kfold = KFold(n_splits = 5)
average_accuracy = 0   # running sum of per-fold accuracies; the classifier helper prints sum/fold

# Fold indices are computed on the negative rows and reused for the positive rows,
# so both classes must contain the same number of documents.
for fold, (train, test) in enumerate(kfold.split(negative_bow), start=1):
    # Stack the dense per-class slices with NumPy instead of round-tripping them
    # through nested Python lists, which is far slower and more memory hungry.
    features_training_set = np.vstack((negative_bow[train].toarray(), positive_bow[train].toarray()))
    features_test_set = np.vstack((negative_bow[test].toarray(), positive_bow[test].toarray()))

    labels_training_set = np.concatenate((negative_labels[train], positive_labels[train]))
    labels_test_set = np.concatenate((negative_labels[test], positive_labels[test]))

    average_accuracy = svm_linear_classifier(features_training_set, labels_training_set, features_test_set, labels_test_set, average_accuracy ,fold)

    print('---------- Completed SVM Linear Classifier --------')