In [1]:
from __future__ import division
import pandas as pd
import sys
from sys import argv
import re
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
%matplotlib inline
import time

In [2]:
def open_bow_df(bow_name):
    
    bow_df_file = pd.read_csv(bow_name, sep = "\t")
    
    return bow_df_file

In [31]:
def split_train_test(bow_df):

    data = bow_df[["Real_ID_x", "Real_ID_y", "Text"]]
    labels = bow_df["Mode"]

    test_s = 0.3 #input("What is the size of test? (0-1, float): ")
    ran_state = 1993 #input("Set random state (int): ")

    data_train, data_test, labels_train, labels_test = train_test_split(data, labels, test_size=test_s, random_state=ran_state)
    data_train, data_test, labels_train, labels_test = data_train.reset_index(drop=True), data_test.reset_index(drop=True), labels_train.reset_index(drop=True), labels_test.reset_index(drop=True) 

    return data_train, data_test, labels_train, labels_test

In [32]:
def texts_to_words( raw_text ):
	# Function to convert a raw text to a string of words
	# The input is a single string (a raw text), and 
	# the output is a single string (a preprocessed text)
	#
	# 2. Remove non-letters        
	letters_only = re.sub("[^a-zA-Z0-9]", " ", raw_text) 
	#
	# 3. Convert to lower case, split into individual words
	words = letters_only.lower().split()                             
	#
	# 4. In Python, searching a set is much faster than searching
	#   a list, so convert the stop words to a set
	stops = set(stopwords.words("english"))                  
	# 
	# 5. Remove stop words
	meaningful_words = [w for w in words if not w in stops]   
	#
	# 6. Join the words back into one string separated by space, 
	# and return the result.
	return( " ".join( meaningful_words ))

In [33]:
def bag_of_words_and_prediction(bow_df, feature_count):

    data_train, data_test, labels_train, labels_test = split_train_test(bow_df)

    # Get the number of reviews based on the dataframe column size
    num_texts = data_train["Text"].size

    # Initialize an empty list to hold the clean reviews
    print "Cleaning and parsing the training set article sentences...\n"
    clean_train_texts = []
    for i in xrange( 0, num_texts ):
        # If the index is evenly divisible by 1000, print a message
        # if( (i+1)%100 == 0 ):
            # print "Texts %d of %d\n" % ( i+1, num_texts )                                                                    
        clean_train_texts.append( texts_to_words( data_train["Text"][i] ))

    print "Creating the bag of words...\n"
    # Initialize the "CountVectorizer" object, which is scikit-learn's
    # bag of words tool.  
    vectorizer = CountVectorizer(analyzer = "word", tokenizer = None, preprocessor = None, stop_words = None, max_features = feature_count) 

    # fit_transform() does two functions: First, it fits the model
    # and learns the vocabulary; second, it transforms our training data
    # into feature vectors. The input to fit_transform should be a list of 
    # strings.
    train_data_features = vectorizer.fit_transform(clean_train_texts)

    # Numpy arrays are easy to work with, so convert the result to an 
    # array
    train_data_features = train_data_features.toarray()

    num_texts = len(data_test["Text"])
    clean_test_texts = [] 

    print "Cleaning and parsing the test set movie reviews...\n"
    for i in xrange(0,num_texts):
        # if( (i+1) % 1000 == 0 ):
            # print "Review %d of %d\n" % (i+1, num_texts)
        clean_texts = texts_to_words( data_test["Text"][i] )
        clean_test_texts.append( clean_texts )

    # Get a bag of words for the test set, and convert to a numpy array
    test_data_features = vectorizer.transform(clean_test_texts)
    test_data_features = test_data_features.toarray()

    print "Done! Produced train, test split with ", feature_count, " features"

    return train_data_features, test_data_features, labels_train, labels_test

In [34]:
def get_accuracy(l_new, l_te):
    """Calculates the accuracy of predicted labels, based on the given labels

    INPUT: New(Predicted) Labels, Test Labels

    OUTPUT: Accuracy in percent """

    acc = 0
    for i in range(len(l_te)):
        if l_new[i] == l_te[i]:
            acc += 1
    acc = acc/len(l_te)

    return 1-acc

In [35]:
def linear_reg_model(train_features, test_features, labels_train, labels_test):
    from sklearn.linear_model import LinearRegression
    start_time = time.time()
    
    print "Training Linear Regression model... "
    linear = LinearRegression(normalize = True)
    
    model = linear.fit(train_features, labels_train)
    
    print "Predicting based on model... "
    result = model.predict(test_features)
    
    print "Calculating error..."
    error = get_accuracy(result, labels_test)
    
    print "The error of linear regression (normalized) ", error
    
    print "Done"
    
    finish_time = time.time() - start_time
    print "This took %s seconds" %(finish_time)
    return result, error, finish_time

In [36]:
def logistic_reg_model(train_features, test_features, labels_train, labels_test):
    from sklearn.linear_model import LogisticRegression
    start_time = time.time()
    
    print "Training Logistic Regression model... "
    logistic = LogisticRegression()
    
    model = logistic.fit(train_features, labels_train)
    
    print "Predicting based on model... "
    result = model.predict(test_features)
    
    print "Calculating error..."
    error = get_accuracy(result, labels_test)
    
    print "The error of logistic regression ", error
    
    print "Done"
    
    finish_time = time.time() - start_time
    print "This took %s seconds" %(finish_time)    
    return result, error, finish_time

In [37]:
def SVM_model(train_features, test_features, labels_train, labels_test):
    from sklearn import svm
    start_time = time.time()
    print "Training Support Vector Machines model... "
    SVM = svm.SVC()
    
    model = SVM.fit(train_features, labels_train)
    
    print "Predicting based on model... "
    result = model.predict(test_features)
    
    print "Calculating error..."
    error = get_accuracy(result, labels_test)
    
    print "The error of SVM ", error
    
    print "Done"
    
    finish_time = time.time() - start_time
    print "This took %s seconds" %(finish_time)
    return result, error, finish_time

In [38]:
def random_forest_model(train_features, test_features, labels_train, labels_test):
    from sklearn.ensemble import RandomForestClassifier
    start_time = time.time()
    print "Training the random forest..."
    forest = RandomForestClassifier(n_estimators = 100, random_state = 23) 

    # Fit the forest to the training set, using the bag of words as 
    # features and the sentiment labels as the response variable
    #
    # This may take a few minutes to run
    forest = forest.fit( train_features, labels_train)
    
    print "Predicting based on model..."
    result = forest.predict(test_features)
    
    print "Calculating error..."
    error = get_accuracy(result, labels_test)
    
    print "The error of random forest with 100 trees is ", error
    
    print "Done"
    
    finish_time = time.time() - start_time
    print "This took %s seconds" %(finish_time)
    return result, error, finish_time

In [39]:
def neural_network_model(train_features, test_features, labels_train, labels_test):
    from sklearn.neural_network import MLPClassifier
    start_time = time.time()
    print "Training neural network..."
    clf = MLPClassifier(solver='adam', alpha=1e-5, random_state=1)

    NN = clf.fit(train_data_features, labels_train)
    
    print "Predicting based on model..."
    result = clf.predict(test_data_features)
    
    print "Calculating error..."
    error = get_accuracy(result, labels_test)
    
    print "The error of Neural Network with ", error
    
    print "Done"
    
    finish_time = time.time() - start_time
    print "This took %s seconds" %(finish_time)
    return result, error, finish_time

In [40]:
def Gaussian_Naive_Bayes_model(train_features, test_features, labels_train, labels_test):
    from sklearn.naive_bayes import GaussianNB
    start_time = time.time()
    print "Training Gaussian Naive Bayes model... "
    GNB = GaussianNB()
    
    model = GNB.fit(train_features, labels_train)
    
    print "Predicting based on model... "
    result = model.predict(test_features)
    
    print "Calculating error..."
    error = get_accuracy(result, labels_test)
    
    print "The error of Gaussian Naive Bayes ", error
    
    print "Done"
    
    finish_time = time.time() - start_time
    print "This took %s seconds" %(finish_time)
    return result, error, finish_time

In [41]:
def KNN_model(train_features, test_features, labels_train, labels_test):
    from sklearn.neighbors import KNeighborsClassifier
    start_time = time.time()
    print "Training KNN model... "
    KNN = KNeighborsClassifier(n_neighbors = 5)
    
    model = KNN.fit(train_features, labels_train)
    
    print "Predicting based on model... "
    result = model.predict(test_features)
    
    print "Calculating error..."
    error = get_accuracy(result, labels_test)
    
    print "The error of KNN ", error
    
    print "Done"
    
    finish_time = time.time() - start_time
    print "This took %s seconds" %(finish_time)
    return result, error, finish_time

In [42]:
bag_of_words_df = open_bow_df("./Results/Bag_of_Words_df.tsv")

In [43]:
col_names = ["Calssification Model", "Feature Count", "Error", "Time"]
model_df = pd.DataFrame(columns = col_names)

In [44]:
train_data_features, test_data_features, labels_train, labels_test = bag_of_words_and_prediction(bag_of_words_df, 1000)
_, ran_for_error_1000, ran_for_time_1000 = random_forest_model(train_data_features, test_data_features, labels_train, labels_test)
_, NN_error_1000, NN_time_1000 = neural_network_model(train_data_features, test_data_features, labels_train, labels_test)
_, linear_error_1000, linear_time_1000 = linear_reg_model(train_data_features, test_data_features, labels_train, labels_test)
_, logistic_error_1000, logistic_time_1000 = logistic_reg_model(train_data_features, test_data_features, labels_train, labels_test)
_, SVM_error_1000, SVM_time_1000 = SVM_model(train_data_features, test_data_features, labels_train, labels_test)
_, GNB_error_1000, GNB_time_1000 = Gaussian_Naive_Bayes_model(train_data_features, test_data_features, labels_train, labels_test)
_, KNN_error_1000, KNN_time_1000 = KNN_model(train_data_features, test_data_features, labels_train, labels_test)
model_df.loc[0] = ["Random Forest (100 trees, random state = 23)", 1000, ran_for_error_1000, ran_for_time_1000]
model_df.loc[1] = ["Neural Network", 1000, NN_error_1000, NN_time_1000]
model_df.loc[2] = ["Linear Regression", 1000, linear_error_1000, linear_time_1000]
model_df.loc[3] = ["Logistic Regression", 1000, logistic_error_1000, logistic_time_1000]
model_df.loc[4] = ["Support Vector Machines", 1000, SVM_error_1000, SVM_time_1000]
model_df.loc[5] = ["Gaussian Naive Bayes", 1000, GNB_error_1000, GNB_time_1000]
model_df.loc[6] = ["K Nearest Neighbor (5)", 1000, KNN_error_1000, KNN_time_1000]
model_df.to_csv("Model_df.tsv", sep = '\t', index = False)
del train_data_features, test_data_features, labels_train, labels_test

Cleaning and parsing the training set article sentences...

Creating the bag of words...

Cleaning and parsing the test set movie reviews...

Done! Produced train, test split with  1000  features
Training the random forest...
Predicting based on model...
Calculating error...
The error of random forest with 100 trees is  0.162581445858
Done
This took 36.5700960159 seconds
Training neural network...
Predicting based on model...
Calculating error...
The error of Neural Network with  0.177060709484
Done
This took 27.4707429409 seconds
Training Linear Regression model... 
Predicting based on model... 
Calculating error...
The error of linear regression (normalized)  1.0
Done
This took 3.02640604973 seconds
Training Logistic Regression model... 
Predicting based on model... 
Calculating error...
The error of logistic regression  0.189368083566
Done
This took 3.09586000443 seconds
Training Support Vector Machines model... 
Predicting based on model... 
Calculating error...
The error of SVM  0

In [45]:
train_data_features, test_data_features, labels_train, labels_test = bag_of_words_and_prediction(bag_of_words_df, 2000)
_, ran_for_error_2000, ran_for_time_2000 = random_forest_model(train_data_features, test_data_features, labels_train, labels_test)
_, NN_error_2000, NN_time_2000 = neural_network_model(train_data_features, test_data_features, labels_train, labels_test)
_, linear_error_2000, linear_time_2000 = linear_reg_model(train_data_features, test_data_features, labels_train, labels_test)
_, logistic_error_2000, logistic_time_2000 = logistic_reg_model(train_data_features, test_data_features, labels_train, labels_test)
_, SVM_error_2000, SVM_time_2000 = SVM_model(train_data_features, test_data_features, labels_train, labels_test)
_, GNB_error_2000, GNB_time_2000 = Gaussian_Naive_Bayes_model(train_data_features, test_data_features, labels_train, labels_test)
_, KNN_error_2000, KNN_time_2000 = KNN_model(train_data_features, test_data_features, labels_train, labels_test)
model_df.loc[7] = ["Random Forest (100 trees, random state = 23)", 2000, ran_for_error_2000, ran_for_time_2000]
model_df.loc[8] = ["Neural Network", 2000, NN_error_2000, NN_time_2000]
model_df.loc[9] = ["Linear Regression", 2000, linear_error_2000, linear_time_2000]
model_df.loc[10] = ["Logistic Regression", 2000, logistic_error_2000, logistic_time_2000]
model_df.loc[11] = ["Support Vector Machines", 2000, SVM_error_2000, SVM_time_2000]
model_df.loc[12] = ["Gaussian Naive Bayes", 2000, GNB_error_2000, GNB_time_2000]
model_df.loc[13] = ["K Nearest Neighbor (5)", 2000, KNN_error_2000, KNN_time_2000]
model_df.to_csv("Model_df.tsv", sep = '\t', index = False)
del train_data_features, test_data_features, labels_train, labels_test

Cleaning and parsing the training set article sentences...

Creating the bag of words...

Cleaning and parsing the test set movie reviews...

Done! Produced train, test split with  2000  features
Training the random forest...
Predicting based on model...
Calculating error...
The error of random forest with 100 trees is  0.163615678974
Done
This took 71.2901079655 seconds
Training neural network...
Predicting based on model...
Calculating error...
The error of Neural Network with  0.182231875065
Done
This took 98.7518360615 seconds
Training Linear Regression model... 
Predicting based on model... 
Calculating error...
The error of linear regression (normalized)  1.0
Done
This took 13.6381788254 seconds
Training Logistic Regression model... 
Predicting based on model... 
Calculating error...
The error of logistic regression  0.186472230841
Done
This took 1.86303782463 seconds
Training Support Vector Machines model... 
Predicting based on model... 
Calculating error...
The error of SVM  0

In [46]:
train_data_features, test_data_features, labels_train, labels_test = bag_of_words_and_prediction(bag_of_words_df, 3000)
_, ran_for_error_3000, ran_for_time_3000 = random_forest_model(train_data_features, test_data_features, labels_train, labels_test)
_, NN_error_3000, NN_time_3000 = neural_network_model(train_data_features, test_data_features, labels_train, labels_test)
_, linear_error_3000, linear_time_3000 = linear_reg_model(train_data_features, test_data_features, labels_train, labels_test)
_, logistic_error_3000, logistic_time_3000 = logistic_reg_model(train_data_features, test_data_features, labels_train, labels_test)
_, SVM_error_3000, SVM_time_3000 = SVM_model(train_data_features, test_data_features, labels_train, labels_test)
_, GNB_error_3000, GNB_time_3000 = Gaussian_Naive_Bayes_model(train_data_features, test_data_features, labels_train, labels_test)
_, KNN_error_3000, KNN_time_3000 = KNN_model(train_data_features, test_data_features, labels_train, labels_test)
model_df.loc[14] = ["Random Forest (100 trees, random state = 23)", 3000, ran_for_error_3000, ran_for_time_3000]
model_df.loc[15] = ["Neural Network", 3000, NN_error_3000, NN_time_3000]
model_df.loc[16] = ["Linear Regression", 3000, linear_error_3000, linear_time_3000]
model_df.loc[17] = ["Logistic Regression", 3000, logistic_error_3000, logistic_time_3000]
model_df.loc[18] = ["Support Vector Machines", 3000, SVM_error_3000, SVM_time_3000]
model_df.loc[19] = ["Gaussian Naive Bayes", 3000, GNB_error_3000, GNB_time_3000]
model_df.loc[20] = ["K Nearest Neighbor (5)", 3000, KNN_error_3000, KNN_time_3000]
model_df.to_csv("Model_df.tsv", sep = '\t', index = False)
del train_data_features, test_data_features, labels_train, labels_test

Cleaning and parsing the training set article sentences...

Creating the bag of words...

Cleaning and parsing the test set movie reviews...

Done! Produced train, test split with  3000  features
Training the random forest...
Predicting based on model...
Calculating error...
The error of random forest with 100 trees is  0.160099286379
Done
This took 99.9864411354 seconds
Training neural network...
Predicting based on model...
Calculating error...
The error of Neural Network with  0.179336022339
Done
This took 116.081527948 seconds
Training Linear Regression model... 
Predicting based on model... 
Calculating error...
The error of linear regression (normalized)  1.0
Done
This took 31.860863924 seconds
Training Logistic Regression model... 
Predicting based on model... 
Calculating error...
The error of logistic regression  0.187196194022
Done
This took 2.18341302872 seconds
Training Support Vector Machines model... 
Predicting based on model... 
Calculating error...
The error of SVM  0.

In [47]:
train_data_features, test_data_features, labels_train, labels_test = bag_of_words_and_prediction(bag_of_words_df, 4000)
_, ran_for_error_4000, ran_for_time_4000 = random_forest_model(train_data_features, test_data_features, labels_train, labels_test)
_, NN_error_4000, NN_time_4000 = neural_network_model(train_data_features, test_data_features, labels_train, labels_test)
_, linear_error_4000, linear_time_4000 = linear_reg_model(train_data_features, test_data_features, labels_train, labels_test)
_, logistic_error_4000, logistic_time_4000 = logistic_reg_model(train_data_features, test_data_features, labels_train, labels_test)
_, SVM_error_4000, SVM_time_4000 = SVM_model(train_data_features, test_data_features, labels_train, labels_test)
_, GNB_error_4000, GNB_time_4000 = Gaussian_Naive_Bayes_model(train_data_features, test_data_features, labels_train, labels_test)
_, KNN_error_4000, KNN_time_4000 = KNN_model(train_data_features, test_data_features, labels_train, labels_test)
model_df.loc[21] = ["Random Forest (100 trees, random state = 23)", 4000, ran_for_error_4000, ran_for_time_4000]
model_df.loc[22] = ["Neural Network", 4000, NN_error_4000, NN_time_4000]
model_df.loc[23] = ["Linear Regression", 4000, linear_error_4000, linear_time_4000]
model_df.loc[24] = ["Logistic Regression", 4000, logistic_error_4000, logistic_time_4000]
model_df.loc[25] = ["Support Vector Machines", 4000, SVM_error_4000, SVM_time_4000]
model_df.loc[26] = ["Gaussian Naive Bayes", 4000, GNB_error_4000, GNB_time_4000]
model_df.loc[27] = ["K Nearest Neighbor (5)", 4000, KNN_error_4000, KNN_time_4000]
model_df.to_csv("Model_df.tsv", sep = '\t', index = False)
del train_data_features, test_data_features, labels_train, labels_test

Cleaning and parsing the training set article sentences...

Creating the bag of words...

Cleaning and parsing the test set movie reviews...

Done! Produced train, test split with  4000  features
Training the random forest...
Predicting based on model...
Calculating error...
The error of random forest with 100 trees is  0.161236942807
Done
This took 135.078104019 seconds
Training neural network...
Predicting based on model...
Calculating error...
The error of Neural Network with  0.183679801427
Done
This took 151.013469934 seconds
Training Linear Regression model... 
Predicting based on model... 
Calculating error...
The error of linear regression (normalized)  1.0
Done
This took 73.6077618599 seconds
Training Logistic Regression model... 
Predicting based on model... 
Calculating error...
The error of logistic regression  0.185437997725
Done
This took 2.41880202293 seconds
Training Support Vector Machines model... 
Predicting based on model... 
Calculating error...
The error of SVM  0

In [48]:
train_data_features, test_data_features, labels_train, labels_test = bag_of_words_and_prediction(bag_of_words_df, 5000)
_, ran_for_error_5000, ran_for_time_5000 = random_forest_model(train_data_features, test_data_features, labels_train, labels_test)
_, NN_error_5000, NN_time_5000 = neural_network_model(train_data_features, test_data_features, labels_train, labels_test)
_, linear_error_5000, linear_time_5000 = linear_reg_model(train_data_features, test_data_features, labels_train, labels_test)
_, logistic_error_5000, logistic_time_5000 = logistic_reg_model(train_data_features, test_data_features, labels_train, labels_test)
_, SVM_error_5000, SVM_time_5000 = SVM_model(train_data_features, test_data_features, labels_train, labels_test)
_, GNB_error_5000, GNB_time_5000 = Gaussian_Naive_Bayes_model(train_data_features, test_data_features, labels_train, labels_test)
_, KNN_error_5000, KNN_time_5000 = KNN_model(train_data_features, test_data_features, labels_train, labels_test)
model_df.loc[28] = ["Random Forest (100 trees, random state = 23)", 5000, ran_for_error_5000, ran_for_time_5000]
model_df.loc[29] = ["Neural Network", 5000, NN_error_5000, NN_time_5000]
model_df.loc[30] = ["Linear Regression", 5000, linear_error_5000, linear_time_5000]
model_df.loc[31] = ["Logistic Regression", 5000, logistic_error_5000, logistic_time_5000]
model_df.loc[32] = ["Support Vector Machines", 5000, SVM_error_5000, SVM_time_5000]
model_df.loc[33] = ["Gaussian Naive Bayes", 5000, GNB_error_5000, GNB_time_5000]
model_df.loc[34] = ["K Nearest Neighbor (5)", 5000, KNN_error_5000, KNN_time_5000]
model_df.to_csv("Model_df.tsv", sep = '\t', index = False)
del train_data_features, test_data_features, labels_train, labels_test

Cleaning and parsing the training set article sentences...

Creating the bag of words...

Cleaning and parsing the test set movie reviews...

Done! Produced train, test split with  5000  features
Training the random forest...
Predicting based on model...
Calculating error...
The error of random forest with 100 trees is  0.15823766677
Done
This took 175.911018848 seconds
Training neural network...
Predicting based on model...
Calculating error...
The error of Neural Network with  0.185334574413
Done
This took 183.063563108 seconds
Training Linear Regression model... 
Predicting based on model... 
Calculating error...
The error of linear regression (normalized)  1.0
Done
This took 120.363672972 seconds
Training Logistic Regression model... 
Predicting based on model... 
Calculating error...
The error of logistic regression  0.180473678767
Done
This took 2.56825780869 seconds
Training Support Vector Machines model... 
Predicting based on model... 
Calculating error...
The error of SVM  0.

In [49]:
train_data_features, test_data_features, labels_train, labels_test = bag_of_words_and_prediction(bag_of_words_df, 6000)
_, ran_for_error_6000, ran_for_time_6000 = random_forest_model(train_data_features, test_data_features, labels_train, labels_test)
_, NN_error_6000, NN_time_6000 = neural_network_model(train_data_features, test_data_features, labels_train, labels_test)
_, linear_error_6000, linear_time_6000 = linear_reg_model(train_data_features, test_data_features, labels_train, labels_test)
_, logistic_error_6000, logistic_time_6000 = logistic_reg_model(train_data_features, test_data_features, labels_train, labels_test)
_, SVM_error_6000, SVM_time_6000 = SVM_model(train_data_features, test_data_features, labels_train, labels_test)
_, GNB_error_6000, GNB_time_6000 = Gaussian_Naive_Bayes_model(train_data_features, test_data_features, labels_train, labels_test)
_, KNN_error_6000, KNN_time_6000 = KNN_model(train_data_features, test_data_features, labels_train, labels_test)
model_df.loc[35] = ["Random Forest (100 trees, random state = 23)", 6000, ran_for_error_6000, ran_for_time_6000]
model_df.loc[36] = ["Neural Network", 6000, NN_error_6000, NN_time_4000]
model_df.loc[37] = ["Linear Regression", 6000, linear_error_6000, linear_time_6000]
model_df.loc[38] = ["Logistic Regression", 6000, logistic_error_6000, logistic_time_6000]
model_df.loc[39] = ["Support Vector Machines", 6000, SVM_error_6000, SVM_time_6000]
model_df.loc[40] = ["Gaussian Naive Bayes", 6000, GNB_error_6000, GNB_time_6000]
model_df.loc[41] = ["K Nearest Neighbor (5)", 6000, KNN_error_6000, KNN_time_6000]
model_df.to_csv("Model_df.tsv", sep = '\t', index = False)
del train_data_features, test_data_features, labels_train, labels_test

Cleaning and parsing the training set article sentences...

Creating the bag of words...

Cleaning and parsing the test set movie reviews...

Done! Produced train, test split with  6000  features
Training the random forest...
Predicting based on model...
Calculating error...
The error of random forest with 100 trees is  0.161754059365
Done
This took 240.822900057 seconds
Training neural network...
Predicting based on model...
Calculating error...
The error of Neural Network with  0.177888095977
Done
This took 339.812659979 seconds
Training Linear Regression model... 
Predicting based on model... 
Calculating error...
The error of linear regression (normalized)  1.0
Done
This took 265.331923962 seconds
Training Logistic Regression model... 
Predicting based on model... 
Calculating error...
The error of logistic regression  0.180473678767
Done
This took 3.36719989777 seconds
Training Support Vector Machines model... 
Predicting based on model... 
Calculating error...
The error of SVM  0

In [None]:
train_data_features, test_data_features, labels_train, labels_test = bag_of_words_and_prediction(bag_of_words_df, 7000)
_, ran_for_error_7000, ran_for_time_7000 = random_forest_model(train_data_features, test_data_features, labels_train, labels_test)
_, NN_error_7000, NN_time_7000 = neural_network_model(train_data_features, test_data_features, labels_train, labels_test)
_, linear_error_7000, linear_time_7000 = linear_reg_model(train_data_features, test_data_features, labels_train, labels_test)
_, logistic_error_7000, logistic_time_7000 = logistic_reg_model(train_data_features, test_data_features, labels_train, labels_test)
_, SVM_error_7000, SVM_time_7000 = SVM_model(train_data_features, test_data_features, labels_train, labels_test)
_, GNB_error_7000, GNB_time_7000 = Gaussian_Naive_Bayes_model(train_data_features, test_data_features, labels_train, labels_test)
_, KNN_error_7000, KNN_time_7000 = KNN_model(train_data_features, test_data_features, labels_train, labels_test)
model_df.loc[42] = ["Random Forest (100 trees, random state = 23)", 7000, ran_for_error_7000, ran_for_time_7000]
model_df.loc[43] = ["Neural Network", 7000, NN_error_7000, NN_time_7000]
model_df.loc[44] = ["Linear Regression", 7000, linear_error_7000, linear_time_7000]
model_df.loc[45] = ["Logistic Regression", 7000, logistic_error_7000, logistic_time_7000]
model_df.loc[46] = ["Support Vector Machines", 7000, SVM_error_7000, SVM_time_7000]
model_df.loc[47] = ["Gaussian Naive Bayes", 7000, GNB_error_7000, GNB_time_7000]
model_df.loc[48] = ["K Nearest Neighbor (5)", 7000, KNN_error_7000, KNN_time_7000]
model_df.to_csv("Model_df.tsv", sep = '\t', index = False)
del train_data_features, test_data_features, labels_train, labels_test

Cleaning and parsing the training set article sentences...

Creating the bag of words...

Cleaning and parsing the test set movie reviews...

Done! Produced train, test split with  7000  features
Training the random forest...
Predicting based on model...
Calculating error...
The error of random forest with 100 trees is  0.159995863068
Done
This took 267.895978928 seconds
Training neural network...
Predicting based on model...
Calculating error...
The error of Neural Network with  0.180059985521
Done
This took 408.439840078 seconds
Training Linear Regression model... 
Predicting based on model... 
Calculating error...
The error of linear regression (normalized)  1.0
Done
This took 512.624369144 seconds
Training Logistic Regression model... 
Predicting based on model... 
Calculating error...
The error of logistic regression  0.180783948702
Done
This took 4.40998387337 seconds
Training Support Vector Machines model... 
Predicting based on model... 
Calculating error...
The error of SVM  0

In [None]:
train_data_features, test_data_features, labels_train, labels_test = bag_of_words_and_prediction(bag_of_words_df, 8000)
_, ran_for_error_8000, ran_for_time_8000 = random_forest_model(train_data_features, test_data_features, labels_train, labels_test)
_, NN_error_8000, NN_time_8000 = neural_network_model(train_data_features, test_data_features, labels_train, labels_test)
_, linear_error_8000, linear_time_8000 = linear_reg_model(train_data_features, test_data_features, labels_train, labels_test)
_, logistic_error_8000, logistic_time_8000 = logistic_reg_model(train_data_features, test_data_features, labels_train, labels_test)
_, SVM_error_8000, SVM_time_8000 = SVM_model(train_data_features, test_data_features, labels_train, labels_test)
_, GNB_error_8000, GNB_time_8000 = Gaussian_Naive_Bayes_model(train_data_features, test_data_features, labels_train, labels_test)
_, KNN_error_8000, KNN_time_8000 = KNN_model(train_data_features, test_data_features, labels_train, labels_test)
model_df.loc[49] = ["Random Forest (100 trees, random state = 23)", 8000, ran_for_error_8000, ran_for_time_8000]
model_df.loc[50] = ["Neural Network", 8000, NN_error_8000, NN_time_8000]
model_df.loc[51] = ["Linear Regression", 8000, linear_error_8000, linear_time_8000]
model_df.loc[52] = ["Logistic Regression", 8000, logistic_error_8000, logistic_time_8000]
model_df.loc[53] = ["Support Vector Machines", 8000, SVM_error_8000, SVM_time_8000]
model_df.loc[54] = ["Gaussian Naive Bayes", 8000, GNB_error_8000, GNB_time_8000]
model_df.loc[55] = ["K Nearest Neighbor (5)", 8000, KNN_error_8000, KNN_time_8000]
model_df.to_csv("Model_df.tsv", sep = '\t', index = False)
del train_data_features, test_data_features, labels_train, labels_test

Cleaning and parsing the training set article sentences...

Creating the bag of words...

Cleaning and parsing the test set movie reviews...

Done! Produced train, test split with  8000  features
Training the random forest...


In [None]:
train_data_features, test_data_features, labels_train, labels_test = bag_of_words_and_prediction(bag_of_words_df, 9000)
_, ran_for_error_9000, ran_for_time_9000 = random_forest_model(train_data_features, test_data_features, labels_train, labels_test)
_, NN_error_9000, NN_time_9000 = neural_network_model(train_data_features, test_data_features, labels_train, labels_test)
_, linear_error_9000, linear_time_9000 = linear_reg_model(train_data_features, test_data_features, labels_train, labels_test)
_, logistic_error_9000, logistic_time_9000 = logistic_reg_model(train_data_features, test_data_features, labels_train, labels_test)
_, SVM_error_9000, SVM_time_9000 = SVM_model(train_data_features, test_data_features, labels_train, labels_test)
_, GNB_error_9000, GNB_time_9000 = Gaussian_Naive_Bayes_model(train_data_features, test_data_features, labels_train, labels_test)
_, KNN_error_9000, KNN_time_9000 = KNN_model(train_data_features, test_data_features, labels_train, labels_test)
model_df.loc[56] = ["Random Forest (100 trees, random state = 23)", 9000, ran_for_error_9000, ran_for_time_9000]
model_df.loc[57] = ["Neural Network", 9000, NN_error_9000, NN_time_9000]
model_df.loc[58] = ["Linear Regression", 9000, linear_error_9000, linear_time_9000]
model_df.loc[59] = ["Logistic Regression", 9000, logistic_error_9000, logistic_time_9000]
model_df.loc[60] = ["Support Vector Machines", 9000, SVM_error_9000, SVM_time_9000]
model_df.loc[61] = ["Gaussian Naive Bayes", 9000, GNB_error_9000, GNB_time_9000]
model_df.loc[62] = ["K Nearest Neighbor (5)", 9000, KNN_error_9000, KNN_time_9000]
model_df.to_csv("Model_df.tsv", sep = '\t', index = False)
del train_data_features, test_data_features, labels_train, labels_test

In [None]:
train_data_features, test_data_features, labels_train, labels_test = bag_of_words_and_prediction(bag_of_words_df, 10000)
_, ran_for_error_10000, ran_for_time_10000 = random_forest_model(train_data_features, test_data_features, labels_train, labels_test)
_, NN_error_10000, NN_time_10000 = neural_network_model(train_data_features, test_data_features, labels_train, labels_test)
_, linear_error_10000, linear_time_10000 = linear_reg_model(train_data_features, test_data_features, labels_train, labels_test)
_, logistic_error_10000, logistic_time_10000 = logistic_reg_model(train_data_features, test_data_features, labels_train, labels_test)
_, SVM_error_10000, SVM_time_10000 = SVM_model(train_data_features, test_data_features, labels_train, labels_test)
_, GNB_error_10000, GNB_time_10000 = Gaussian_Naive_Bayes_model(train_data_features, test_data_features, labels_train, labels_test)
_, KNN_error_10000, KNN_time_10000 = KNN_model(train_data_features, test_data_features, labels_train, labels_test)
model_df.loc[63] = ["Random Forest (100 trees, random state = 23)", 10000, ran_for_error_10000, ran_for_time_10000]
model_df.loc[64] = ["Neural Network", 10000, NN_error_10000, NN_time_10000]
model_df.loc[65] = ["Linear Regression", 10000, linear_error_10000, linear_time_10000]
model_df.loc[66] = ["Logistic Regression", 10000, logistic_error_10000, logistic_time_10000]
model_df.loc[67] = ["Support Vector Machines", 10000, SVM_error_10000, SVM_time_10000]
model_df.loc[68] = ["Gaussian Naive Bayes", 10000, GNB_error_10000, GNB_time_10000]
model_df.loc[69] = ["K Nearest Neighbor (5)", 10000, KNN_error_10000, KNN_time_10000]
model_df.to_csv("Model_df.tsv", sep = '\t', index = False)
del train_data_features, test_data_features, labels_train, labels_test

In [None]:
print "Done!"