In [2]:
import math
import numpy as np
import pandas as pd
import os
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier

# Reading Data

In [3]:
def segmentWords(s): 
    return s.split()

def readFile(fileName):
    # Function for reading file
    # input: filename as string
    # output: contents of file as list containing single words
    contents = []
    f = open(fileName)
    for line in f:
        contents.append(line)
    f.close()
    result = segmentWords('\n'.join(contents))
    return result

#### Create a Dataframe containing the counts of each word in a file

In [4]:
d = []

for c in os.listdir("data_training"):
    directory = "data_training/" + c
    for file in os.listdir(directory):
        words = readFile(directory + "/" + file)
        e = {x:words.count(x) for x in words}
        e['__FileID__'] = ...
        e['__CLASS__'] = ...
        # Added following line to include a column with the review's label (pos or neg)
        e['__Label__'] = c
        d.append(e)

Create a dataframe from d - make sure to fill all the nan values with zeros.

References:

https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.fillna.html


In [5]:
# Create the dataframe for the dataset
df = pd.DataFrame(d)

In [6]:
# Clean the NaN values from data
df_clean = df.fillna(0)
df_clean.shape

(1600, 45674)

#### Split data into training and validation set 

* Sample 80% of your dataframe to be the training data

* Let the remaining 20% be the validation data (you can filter out the indicies of the original dataframe that weren't selected for the training data)

References:

https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sample.html
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop.html

In [7]:
# Split into train and test sets
data_train = df_clean.sample(frac=0.8, random_state=33)
data_test = df_clean.drop(data_train.index, axis=0)
print(data_train.shape, data_test.shape)

(1280, 45674) (320, 45674)


* Split the dataframe for both training and validation data into x and y dataframes - where y contains the labels and x contains the words

References:

https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop.html

In [8]:
# Split into x and y training and test sets
train_x, train_y = data_train.drop(['__Label__', '__FileID__', '__CLASS__'], axis=1), data_train[['__Label__']]
test_x, test_y = data_test.drop(['__Label__', '__FileID__', '__CLASS__'], axis=1), data_test[['__Label__']]
train_y, test_y = np.reshape(train_y, 1280), np.reshape(test_y, 320)

# Logistic Regression

#### Basic Logistic Regression
* Use sklearn's linear_model.LogisticRegression() to create your model.
* Fit the data and labels with your model.
* Score your model with the same data and labels.

References:

http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [9]:
# Create Logistic model and fit the training data
model = LogisticRegression()
model.fit(train_x, train_y)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [10]:
# Predict values for test set
pred_y = model.predict(test_x)

In [11]:
# Score to see the accuracy of our model
print(model.score(test_x, test_y))

0.8375


#### Changing Parameters

In [None]:
# We can change the regularization term to learn how it affects the results based on the following lambda values
lambdas = [0.001, 0.01, 0.1, 1, 5, 10, 100, 0.05]

for lamb in lambdas:
    model = LogisticRegression('l2', C=lamb)
    model.fit(train_x, train_y)
    pred_y = model.predict(test_x)
    score = model.score(test_x, test_y)
    print(lamb, score)

  y = column_or_1d(y, warn=True)


0.001 0.7625
0.01 0.834375


#### Feature Selection
* In the backward stepsize selection method, you can remove coefficients and the corresponding x columns, where the coefficient is more than a particular amount away from the mean - you can choose how far from the mean is reasonable.

References:

https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html#
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sample.html
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop.html
http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.where.html
https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.std.html
https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.mean.html

In [407]:
# from sklearn.feature_selection import chi2

# while True:
#     scores, pvalues = chi2(train_x, train_y)
#     pvalues = [p for p in pvalues if p == p]
#     maxp = max(pvalues)
#     print(maxp, len(train_x.columns))
#     if maxp < 0.05:
#         break
#     feature_removal = []
#     threshold = maxp - 0.05
#     for i in range(len(pvalues)):
#         if pvalues[i] > threshold:
#             feature_removal.append(i)
#     for feature in feature_removal:
#         train_x = train_x.drop(train_x.columns[feature], axis=1)

In [421]:
# Split into x and y training and test sets
train_x, train_y = data_train.drop(['__Label__', '__FileID__', '__CLASS__'], axis=1), data_train[['__Label__']]
test_x, test_y = data_test.drop(['__Label__', '__FileID__', '__CLASS__'], axis=1), data_test[['__Label__']]
train_y, test_y = np.reshape(train_y, 1280), np.reshape(test_y, 320)

In [422]:
model = LogisticRegression(C=0.05)
model.fit(train_x, train_y)
weights = model.coef_
avg_weight = np.mean(weights)
std_weight = np.std(weights)

stds = np.arange(1, 4.4, 0.05)
max_score = 0
std_num = 1
for x in stds:
    feature_removal = []
    for i in range(len(weights[0])):
        if (weights[0][i] > avg_weight + x*std_weight) or (weights[0][i]  < avg_weight - x*std_weight):
            feature_removal.append(train_x.columns[i])

    new_train_x = train_x.drop(feature_removal, axis=1)
    new_test_x = test_x.drop(feature_removal, axis=1)

    model = LogisticRegression(C=0.05)
    model.fit(new_train_x, train_y)
    pred_y = model.predict(new_test_x)
    score = model.score(new_test_x, test_y)
    if score > max_score:
        max_score = score
        std_val = x
        
print(max_score, x)

  y = column_or_1d(y, warn=True)


0.78125 4.35


How did you select which features to remove? Why did that reduce overfitting?

#### In this step, we first tried to implement backward feature selection using the method of iteratively removing the features with highest p-value until we have reach an optimum point decided by us. We chose p-values less than 5% are optimal and once there are no more p-values greater than or equal to 5%, we are done with our feature selection. However, this approach is very time intensive and even then, calculating p-values iteratively is not entirely reliable since once you remove one feature, the p-values for the remaining features could change in the next iteration.

#### The second method, which worked a lot better, follows the method of removing all features with insignificant weights. Insignificance for a particular weight is determined by number of standard deviations that weight is from the average of all weights. After testing a large range of distances from the mean, the best value we found was dropping all features with weights more than 4 standard deviations from the mean which resulted in an accuracy score of 78%. The accuracy of 78% was achieved by dropping all weights more than 0.7 std dev's from the mean as well which shows how much the model can change with regards to feature selection.

# Single Decision Tree

#### Basic Decision Tree

* Initialize your model as a decision tree with sklearn.
* Fit the data and labels to the model.

References:

http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html


#### Changing Parameters
* To test out which value is optimal for a particular parameter, you can either loop through various values or look into sklearn.model_selection.GridSearchCV

References:


http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

How did you choose which parameters to change and what value to give to them? Feel free to show a plot.

Why is a single decision tree so prone to overfitting?

# Random Forest Classifier

#### Basic Random Forest

* Use sklearn's ensemble.RandomForestClassifier() to create your model.
* Fit the data and labels with your model.
* Score your model with the same data and labels.

References:

http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html


#### Changing Parameters

What parameters did you choose to change and why?

How does a random forest classifier prevent overfitting better than a single decision tree?