# Author Classification

## import dependenceis

In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stopwords = stopwords.words('english')

## read data

In [3]:
texts = pd.read_csv('federalist.csv')
texts

Unnamed: 0,author,text
0,HAMILTON,FEDERALIST. No. 1 General Introduction For the...
1,JAY,FEDERALIST No. 2 Concerning Dangers from Forei...
2,JAY,FEDERALIST No. 3 The Same Subject Continued (C...
3,JAY,FEDERALIST No. 4 The Same Subject Continued (C...
4,JAY,FEDERALIST No. 5 The Same Subject Continued (C...
...,...,...
78,HAMILTON,FEDERALIST No. 79 The Judiciary Continued From...
79,HAMILTON,FEDERALIST No. 80 The Powers of the Judiciary ...
80,HAMILTON,"FEDERALIST. No. 81 The Judiciary Continued, an..."
81,HAMILTON,FEDERALIST No. 82 The Judiciary Continued From...


In [4]:
# convert author colum to label column
authors = set(texts['author'])
authors = {author : idx for author, idx in zip(authors, range(len(authors)))}
texts['author'] = texts['author'].map(lambda x: authors[x])
texts.head()

Unnamed: 0,author,text
0,3,FEDERALIST. No. 1 General Introduction For the...
1,4,FEDERALIST No. 2 Concerning Dangers from Forei...
2,4,FEDERALIST No. 3 The Same Subject Continued (C...
3,4,FEDERALIST No. 4 The Same Subject Continued (C...
4,4,FEDERALIST No. 5 The Same Subject Continued (C...


In [5]:
# count documents by author
texts['author'].value_counts()

3    49
0    15
1    11
4     5
2     3
Name: author, dtype: int64

## Split data into training and validation sets

In [6]:
train_texts, val_texts, train_labels, val_labels = train_test_split(texts['text'], texts['author'], test_size = 0.2, random_state = 1234)

In [7]:
print("Traing text shape", train_texts.shape)
print("Validataion text shape", val_texts.shape)

Traing text shape (66,)
Validataion text shape (17,)


In [9]:
print("Traing label shape", train_labels.shape)
print("Validataion label shape", val_labels.shape)

Traing label shape (66,)
Validataion label shape (17,)


## Text Processing

In [168]:
# initializer tf-idf vectorizor
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words = stopwords)
vectorizer.fit(train_texts)

TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...])

In [169]:
train_X = vectorizer.transform(train_texts)
val_X = vectorizer.transform(val_texts)

## Bernoulli Naiive Bayes

In [56]:
from sklearn.naive_bayes import BernoulliNB
nv = BernoulliNB()

In [57]:
# train Naiive Bayes
nv.fit(train_X, train_labels)

BernoulliNB()

In [58]:
# evaluate Naiive Bayes

# with training set
print("Training accuracy", nv.score(train_X, train_labels))

# with validation set
print("Training accuracy", nv.score(val_X, val_labels))

Training accuracy 0.7727272727272727
Training accuracy 0.5882352941176471


In [60]:
len(vectorizer.vocabulary_)

7876

#### Reduce max-features to 1000

In [299]:
vectorizer = TfidfVectorizer(stop_words = stopwords, 
                            max_features = 1000,
                            ngram_range = (1,2))
vectorizer.fit(train_texts)

TfidfVectorizer(max_features=1000,
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...])

In [300]:
train_X = vectorizer.transform(train_texts)
val_X = vectorizer.transform(val_texts)

In [301]:
vectorizer.vocabulary_

{'federalist': 364,
 'executive': 331,
 'department': 231,
 'new': 599,
 'york': 998,
 'packet': 641,
 'tuesday': 935,
 '1788': 0,
 'people': 659,
 'state': 864,
 'constitution': 187,
 'proposed': 725,
 'government': 409,
 'claims': 132,
 'next': 600,
 'attention': 79,
 'hardly': 426,
 'part': 644,
 'system': 890,
 'could': 202,
 'attended': 78,
 'greater': 415,
 'difficulty': 244,
 'perhaps': 661,
 'less': 520,
 'judgment': 484,
 'seem': 821,
 'taken': 892,
 'upon': 947,
 'opposition': 632,
 'intended': 473,
 'president': 697,
 'united': 943,
 'states': 866,
 'merely': 565,
 'full': 397,
 'establish': 305,
 'resources': 789,
 'even': 311,
 'authorities': 80,
 'magistrate': 540,
 'instances': 469,
 'governor': 411,
 'superior': 881,
 'dignity': 245,
 'king': 495,
 'great': 414,
 'britain': 97,
 'shown': 839,
 'us': 949,
 'foreign': 380,
 'scarcely': 814,
 'future': 401,
 'attempts': 76,
 'might': 568,
 'rather': 751,
 'said': 808,
 'object': 607,
 'render': 774,
 'necessary': 591,
 'ta

In [302]:
# retrain with 1000 max-features
nv = BernoulliNB()
# train Naiive Bayes
nv.fit(train_X, train_labels)

BernoulliNB()

In [303]:
# evaluate Naiive Bayes

# with training set
print("Training accuracy", nv.score(train_X, train_labels))

# with validation set
print("Training accuracy", nv.score(val_X, val_labels))

Training accuracy 1.0
Training accuracy 0.9411764705882353


## Logistic Regression

In [304]:
from sklearn.linear_model import LogisticRegression

In [305]:
class_count = texts['author'].value_counts()
class_count /= sum(class_count)
class_weight = {label : weight for label, weight in 
                zip(list(class_count.index), class_count)}

#### Base line

In [306]:
# no parameter
log_reg = LogisticRegression()

In [307]:
log_reg.fit(train_X, train_labels)

LogisticRegression()

In [308]:
# evaluate Naiive Bayes

# with training set
print("Training accuracy", log_reg.score(train_X, train_labels))

# with validation set
print("Validation accuracy", log_reg.score(val_X, val_labels))

Training accuracy 0.7121212121212122
Validation accuracy 0.5882352941176471


#### Tuning

In [309]:
log_reg = LogisticRegression(multi_class = 'multinomial',
                             C = 1,
                            class_weight = class_weight)

In [310]:
log_reg.fit(train_X, train_labels)

LogisticRegression(C=1,
                   class_weight={0: 0.18072289156626506, 1: 0.13253012048192772,
                                 2: 0.5903614457831325, 3: 0.060240963855421686,
                                 4: 0.03614457831325301},
                   multi_class='multinomial')

In [311]:
# evaluate Naiive Bayes

# with training set
print("Training accuracy", log_reg.score(train_X, train_labels))

# with validation set
print("Validation accuracy", log_reg.score(val_X, val_labels))

Training accuracy 0.5909090909090909
Validation accuracy 0.5882352941176471


Compared to Naiive Bayes, Logistic Regression does not reach the highest accuracy based on my experiments. However, my tuning for Logistic Regression succeeded in reducing overfitting.

## Neural Network

In [312]:
from sklearn.neural_network import MLPClassifier

In [313]:
clf = MLPClassifier(hidden_layer_sizes = (200, 50),
                   activation = 'relu', solver = 'lbfgs',
                    random_state = 1234, verbose = True,
                    validation_fraction = 0.1,
                    early_stopping = True, alpha = 10e-4,
                   tol = 10e-5, max_iter = 300)

In [314]:
clf.fit(train_X, train_labels)

MLPClassifier(alpha=0.001, early_stopping=True, hidden_layer_sizes=(200, 50),
              max_iter=300, random_state=1234, solver='lbfgs', verbose=True)

In [315]:
# evaluate MLP

# with training set
print("Training accuracy", clf.score(train_X, train_labels))

# with validation set
print("Validation accuracy", clf.score(val_X, val_labels))

Training accuracy 1.0
Validation accuracy 0.8235294117647058


MLP Classifier is likely to perform competently with Naiive Bayes. By adjusting solver = 'lbgs' due to the small dataset and lowering the L2 penalty, the validataion_accuracy reaches at 82%. 