**HEADER**

In [241]:
# Filename:     hw9_sxp170022.ipynb
# Date:         4/17/21
# Author:       Sanjeev Penupala
# Email:        sanjeev.penupala@utdallas.edu
# Course:       CS 4395.0W1
# Copyright     2021, All Rights Reserved
#
# Description:
#
#       A multi-class classification problem on the Federalist papers using different
#       classification techniques
#

**IMPORTS**

In [242]:
%%capture
from nltk.corpus import stopwords
from pprint import pprint
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.neural_network import MLPClassifier
import nltk
import pandas as pd
import warnings

# Suppress all warnings for the remainder of the notebook.
# NOTE(review): this is a blanket filter, so any warnings raised by later cells
# are hidden as well.
warnings.filterwarnings('ignore')
# Download the NLTK 'stopwords' corpus, which Step 3 reads via stopwords.words('english').
nltk.download('stopwords')

**STEP 1**

Read in the csv file using pandas. Convert the author column to categorical data.
Display the first few rows. Display the counts by author.

In [243]:
# Load the papers, then swap the author names for integer category codes.
df = pd.read_csv('federalist.csv')
author_categorical = df['author'].astype('category')

# code -> author name lookup, so the integer labels can be decoded later
encoding = {code: name for code, name in enumerate(author_categorical.cat.categories)}
df['author'] = author_categorical.cat.codes

In [244]:
# Show the integer code -> author name mapping built when the author column was encoded
pprint(encoding)

{0: 'HAMILTON',
 1: 'HAMILTON AND MADISON',
 2: 'HAMILTON OR MADISON',
 3: 'JAY',
 4: 'MADISON'}


In [245]:
df.head()  # first few rows; 'author' now holds the integer category codes, not the names

Unnamed: 0,author,text
0,0,FEDERALIST. No. 1 General Introduction For the...
1,3,FEDERALIST No. 2 Concerning Dangers from Forei...
2,3,FEDERALIST No. 3 The Same Subject Continued (C...
3,3,FEDERALIST No. 4 The Same Subject Continued (C...
4,3,FEDERALIST No. 5 The Same Subject Continued (C...


In [246]:
df['author'].value_counts()  # number of papers per author code (decode with `encoding`)

0    49
4    15
2    11
3     5
1     3
Name: author, dtype: int64

**STEP 2**

Divide into train and test, with 80% in train. Use random state 1234. 
Display the shape of train and test.

In [247]:
# Hold out 20% of the papers for testing; the fixed seed keeps the split reproducible
X = df['text']
y = df['author']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    train_size=0.8,
    random_state=1234,
)

# Report how many documents ended up in each split
for label, split in (('Shape of Train Set:', X_train), ('Shape of Test Set: ', X_test)):
    print(label, split.shape)

Shape of Train Set: (66,)
Shape of Test Set:  (17,)


**STEP 3**

Process the text by removing stop words and performing tf-idf vectorization. Fit the vectorizer on the training data only, then apply it to both the train and test sets.
Output the training set shape and the test set shape.

In [248]:
# Remove stop words and apply TF-IDF vectorization (no cap on vocabulary size).
#
# The corpus reader is reached through `nltk.corpus` instead of the imported name
# `stopwords`, because this cell rebinds `stopwords` to the word list itself;
# reading the imported name here would raise AttributeError when the cell is re-run.
# A list (not a set) is built because recent scikit-learn releases validate
# `stop_words` and accept only 'english', a list, or None.
stopwords = sorted(set(nltk.corpus.stopwords.words('english')))
vectorizer = TfidfVectorizer(stop_words=stopwords)

vec_X_train = vectorizer.fit_transform(X_train)  # fit on the train split only, then transform it
vec_X_test = vectorizer.transform(X_test)        # transform the test split with the fitted vectorizer

print('train size:', vec_X_train.shape)
print('test size:', vec_X_test.shape)

train size: (66, 7876)
test size: (17, 7876)


**STEP 4**

Try a Bernoulli Naïve Bayes model. What is your accuracy on the test set?

In [249]:
# Baseline: Bernoulli Naive Bayes fitted on the vectorized training split
clf = BernoulliNB().fit(vec_X_train, y_train)  # fit() returns the fitted estimator
predicted = clf.predict(vec_X_test)            # predicted author codes for the test split

In [250]:
# Evaluation of `predicted` (from the preceding cell) against the held-out test labels.
# zero_division=0 states explicitly that precision/F1 for a class that was never
# predicted is reported as 0, instead of relying on the notebook-wide warning
# filter to hide the undefined-metric warning.
print("Model Accuracy:", accuracy_score(y_test, predicted), '\n')
print("Classification Report:\n", classification_report(y_test, predicted, zero_division=0), '\n')
print("Confusion Matrix: [Actual Value X Predicted Value]\n", confusion_matrix(y_test, predicted))

Model Accuracy: 0.5882352941176471 

Classifcation Report:
               precision    recall  f1-score   support

           0       0.59      1.00      0.74        10
           2       0.00      0.00      0.00         3
           3       0.00      0.00      0.00         2
           4       0.00      0.00      0.00         2

    accuracy                           0.59        17
   macro avg       0.15      0.25      0.19        17
weighted avg       0.35      0.59      0.44        17
 

Confusion Matrix: [Actual Value X Predicted Value]
 [[10  0  0  0]
 [ 3  0  0  0]
 [ 2  0  0  0]
 [ 2  0  0  0]]


**STEP 5**

The results from step 4 will be disappointing. The classifier just guessed the
predominant class, Hamilton, every time. 

Looking at the train data shape above, there are 7876 unique words in the vocabulary. This may be too much, and many of those words may not be helpful. 


*   Redo the vectorization with the max_features option set to 1000, so that only the 1000 most frequent terms are kept. In addition to single words, add bigrams as features.
*   Try Naïve Bayes again on the new train/test vectors and compare your results.

In [251]:
# Remove stop words, add bigrams, limit the vocabulary to 1000 features, then apply
# TF-IDF vectorization.
#
# `stopwords` may be a set at this point; recent scikit-learn releases validate
# `stop_words` and accept only 'english', a list, or None, so it is converted to a
# list here. sorted() works for either a set or a list and gives a stable order.
vectorizer = TfidfVectorizer(stop_words=sorted(stopwords), max_features=1000, ngram_range=(1, 2))

vec_X_train = vectorizer.fit_transform(X_train)  # fit on the train split only, then transform it
vec_X_test = vectorizer.transform(X_test)        # transform the test split with the fitted vectorizer

print('train size:', vec_X_train.shape)
print('test size:', vec_X_test.shape)

train size: (66, 1000)
test size: (17, 1000)


In [252]:
# Bernoulli Naive Bayes again, this time on the 1000-feature unigram+bigram vectors
clf = BernoulliNB()
predicted = clf.fit(vec_X_train, y_train).predict(vec_X_test)  # fit, then label the test split

In [253]:
# Evaluation of `predicted` (from the preceding cell) against the held-out test labels.
# zero_division=0 states explicitly that precision/F1 for a class that was never
# predicted is reported as 0, instead of relying on the notebook-wide warning
# filter to hide the undefined-metric warning.
print("Model Accuracy:", accuracy_score(y_test, predicted), '\n')
print("Classification Report:\n", classification_report(y_test, predicted, zero_division=0), '\n')
print("Confusion Matrix: [Actual Value X Predicted Value]\n", confusion_matrix(y_test, predicted))

Model Accuracy: 0.9411764705882353 

Classifcation Report:
               precision    recall  f1-score   support

           0       0.91      1.00      0.95        10
           2       1.00      1.00      1.00         3
           3       1.00      0.50      0.67         2
           4       1.00      1.00      1.00         2

    accuracy                           0.94        17
   macro avg       0.98      0.88      0.90        17
weighted avg       0.95      0.94      0.93        17
 

Confusion Matrix: [Actual Value X Predicted Value]
 [[10  0  0  0]
 [ 0  3  0  0]
 [ 1  0  1  0]
 [ 0  0  0  2]]


**STEP 6**

Try logistic regression. Adjust at least one parameter in the LogisticRegression() model to see if you can improve results over having no parameters. What are your results?

In [254]:
# Logistic regression on the reduced TF-IDF vectors, with class_weight='balanced'
clf = LogisticRegression(
    solver='lbfgs',
    class_weight='balanced',
).fit(vec_X_train, y_train)
predicted = clf.predict(vec_X_test)  # predicted author codes for the test split

In [255]:
# Evaluation of `predicted` (from the preceding cell) against the held-out test labels.
# zero_division=0 states explicitly that precision/F1 for a class that was never
# predicted is reported as 0, instead of relying on the notebook-wide warning
# filter to hide the undefined-metric warning.
print("Model Accuracy:", accuracy_score(y_test, predicted), '\n')
print("Classification Report:\n", classification_report(y_test, predicted, zero_division=0), '\n')
print("Confusion Matrix: [Actual Value X Predicted Value]\n", confusion_matrix(y_test, predicted))

Model Accuracy: 0.7647058823529411 

Classifcation Report:
               precision    recall  f1-score   support

           0       0.83      1.00      0.91        10
           2       0.67      0.67      0.67         3
           3       1.00      0.50      0.67         2
           4       0.00      0.00      0.00         2

    accuracy                           0.76        17
   macro avg       0.62      0.54      0.56        17
weighted avg       0.73      0.76      0.73        17
 

Confusion Matrix: [Actual Value X Predicted Value]
 [[10  0  0  0]
 [ 0  2  0  1]
 [ 1  0  1  0]
 [ 1  1  0  0]]


**STEP 7**

Try a neural network. Try different topologies until you get good results. What is your final accuracy?

In [256]:
# Neural network: MLP with two hidden layers (100 units, then 10 units), seeded for repeatability
clf = MLPClassifier(
    hidden_layer_sizes=(100, 10),
    solver='lbfgs',
    alpha=1e-3,
    random_state=1234,
)
clf.fit(vec_X_train, y_train)
predicted = clf.predict(vec_X_test)  # predicted author codes for the test split

In [257]:
# Evaluation of `predicted` (from the preceding cell) against the held-out test labels.
# zero_division=0 states explicitly that precision/F1 for a class that was never
# predicted is reported as 0, instead of relying on the notebook-wide warning
# filter to hide the undefined-metric warning.
print("Model Accuracy:", accuracy_score(y_test, predicted), '\n')
print("Classification Report:\n", classification_report(y_test, predicted, zero_division=0), '\n')
print("Confusion Matrix: [Actual Value X Predicted Value]\n", confusion_matrix(y_test, predicted))

Model Accuracy: 0.8235294117647058 

Classifcation Report:
               precision    recall  f1-score   support

           0       0.83      1.00      0.91        10
           2       1.00      0.67      0.80         3
           3       0.00      0.00      0.00         2
           4       0.67      1.00      0.80         2

    accuracy                           0.82        17
   macro avg       0.62      0.67      0.63        17
weighted avg       0.75      0.82      0.77        17
 

Confusion Matrix: [Actual Value X Predicted Value]
 [[10  0  0  0]
 [ 0  2  0  1]
 [ 2  0  0  0]
 [ 0  0  0  2]]
