# Portfolio Assignment 3: Exploring NLTK
Author: Six Wires

Instructor: Mazidi

Subject: CS 4396

Date: November 6, 2022

In [None]:
import nltk
from nltk.corpus import wordnet as wn
import pandas as pd
nltk.download('stopwords')
nltk.download('sentiwordnet')
nltk.download('wordnet')
nltk.download('book')
nltk.download('punkt')
nltk.download('omw-1.4')

## Data Exploration
Below we will import and perform some basic data exploration on the articles of federation papers. 

In [None]:
# read in the data
df = pd.read_csv("federalist.csv")

# convert author column to categorical type
df["author"] = df.author.astype("category")
df.head()

Unnamed: 0,author,text
0,HAMILTON,FEDERALIST. No. 1 General Introduction For the...
1,JAY,FEDERALIST No. 2 Concerning Dangers from Forei...
2,JAY,FEDERALIST No. 3 The Same Subject Continued (C...
3,JAY,FEDERALIST No. 4 The Same Subject Continued (C...
4,JAY,FEDERALIST No. 5 The Same Subject Continued (C...


In [None]:
# get author counts from df
AUTHOR_COUNTS = {}

for index, row in df.iterrows():
  if row["author"] in AUTHOR_COUNTS.keys():
    AUTHOR_COUNTS[row["author"]] += 1
  else:
    AUTHOR_COUNTS[row["author"]] = 1

In [None]:
# print author counts
for author in AUTHOR_COUNTS.keys():
  print(f"{author}: {AUTHOR_COUNTS[author]}")

HAMILTON: 49
JAY: 5
MADISON: 15
HAMILTON AND MADISON: 3
HAMILTON OR MADISON: 11


## Setting up our model
### Cleaning the data

In [None]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

stopwords = set(stopwords.words('english'))
NBvectorizer = TfidfVectorizer(stop_words=stopwords)

### Splitting the data

In [None]:
X = df.text
y = df.author

In [None]:
# peek at x
X.head()

0    FEDERALIST. No. 1 General Introduction For the...
1    FEDERALIST No. 2 Concerning Dangers from Forei...
2    FEDERALIST No. 3 The Same Subject Continued (C...
3    FEDERALIST No. 4 The Same Subject Continued (C...
4    FEDERALIST No. 5 The Same Subject Continued (C...
Name: text, dtype: object

Break into train and test sets:

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1234)
X_train.shape

(66,)

In [None]:
# apply tfidf vectorizer
X_train = NBvectorizer.fit_transform(X_train)  # fit and transform the train data
X_test = NBvectorizer.transform(X_test)        # transform only the test data

### Shape analysis


In [None]:
print('train size:', X_train.shape)
print(X_train.toarray()[:5])

print('\ntest size:', X_test.shape)
print(X_test.toarray()[:5])


train size: (66, 7876)
[[0.         0.         0.02956872 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.03741484 0.         0.        ]]

test size: (17, 7876)
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.02314673 0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


### Setting up Naive Bayes model

In [None]:
from sklearn.naive_bayes import BernoulliNB

naive_bayes = BernoulliNB()
naive_bayes.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
# make predictions on the test data
pred = naive_bayes.predict(X_test)

print('accuracy score: ', accuracy_score(y_test, pred))

accuracy score:  0.5882352941176471


## Attempt 2
The accuracy is extremely low since there are a diverse set of words. We will repeat with only the top 1000 words

In [None]:
# create new vectors
vectorizer_max = TfidfVectorizer(stop_words=stopwords, max_features=1000)

# set up X and y
X = vectorizer_max.fit_transform(df.text)
y = df.author

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, train_size=0.8, random_state=1234)

In [None]:
from sklearn.naive_bayes import BernoulliNB

naive_bayes2 = BernoulliNB()
naive_bayes2.fit(X_train, y_train)

In [None]:
# make predictions on the test data
pred = naive_bayes2.predict(X_test)

print('accuracy score: ', accuracy_score(y_test, pred))

accuracy score:  0.8823529411764706


### Analysis:
The accuracy on the bernoulli naive bayes model produced much higher accuracy.

### Logistic Regression
We'll now attempt to perform the same predictions using logistic regression.

In [None]:
# import dependencies
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss

In [None]:
# set up X and y
X = df.text
y = df.author

# divide into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, train_size=0.8, random_state=1234)

# vectorizer
vectorizer = TfidfVectorizer(binary=True)
X_train = vectorizer.fit_transform(X_train)  # fit and transform the train data
X_test = vectorizer.transform(X_test)        # transform only the test data

#train
classifier = LogisticRegression(solver='lbfgs', class_weight='balanced')
classifier.fit(X_train, y_train)

# evaluate
pred = classifier.predict(X_test)
print('accuracy score: ', accuracy_score(y_test, pred))

accuracy score:  0.5882352941176471


For the accuracy of logistic regression, we got about the same as the classic multinomial naive bayes method.

### Attempt 2
Lets try this again with the multiclass parameter.

In [None]:
# set up X and y
X = df.text
y = df.author

# divide into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, train_size=0.8, random_state=1234)

# vectorizer
vectorizer = TfidfVectorizer(binary=True)
X_train = vectorizer.fit_transform(X_train)  # fit and transform the train data
X_test = vectorizer.transform(X_test)        # transform only the test data

#train
classifier = LogisticRegression(solver='lbfgs', multi_class='multinomial')
classifier.fit(X_train, y_train)

# evaluate
pred = classifier.predict(X_test)
print('accuracy score: ', accuracy_score(y_test, pred))

accuracy score:  0.5882352941176471


As we can see, this didn't improve the overall accuracy.

## Neural Networks
We'll now attempt to perform classification using neural networks

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss


pipe1 = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('neuralnet', MLPClassifier(solver='lbfgs', alpha=1e-5,
                   hidden_layer_sizes=(15, 7), random_state=1)),
         ])

pipe1.fit(X, y)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('neuralnet',
                 MLPClassifier(alpha=1e-05, hidden_layer_sizes=(15, 7),
                               random_state=1, solver='lbfgs'))])

In [None]:
# evaluate on test data
pred = pipe1.predict(X)

from sklearn import metrics

print("Confusion matrix:\n", metrics.confusion_matrix(y, pred))

import numpy as np
print("\nOverall accuracy: ", np.mean(pred==y))

Confusion matrix:
 [[49  0  0  0  0]
 [ 0  3  0  0  0]
 [ 0  0 11  0  0]
 [ 0  0  0  5  0]
 [ 0  0  0  0 15]]

Overall accuracy:  1.0


As we can see, this had the best accuracy out of all of the models.