In [None]:
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

import nltk
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import PlaintextCorpusReader

import plotly as py
import plotly.graph_objs as go
import matplotlib.pyplot as plt 
import plotly.express as pex
import holoviews as hv
hv.extension('bokeh')

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

## Viewing a pre-created matrix of function word frequencies

In [None]:
# Load the pre-computed word-frequency table from the Kaggle input directory
# and preview its first five rows.
csv_path = r'/kaggle/input/federalist-papers/fedPapers85.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Pie chart of how many rows carry each author label.
df['author'].value_counts().plot.pie()

In [None]:
# Some file names in the table were misspelled; rewrite them to the spellings
# below (presumably the names of the raw-text files read later -- confirm).
# NOTE: `replace` with a flat dict matches these values in every column.
filename_fixes = {
    'HM_fed_18.txt': 'Hamilton_and_Madion_fed_18.txt',
    'HM_fed_19.txt': 'Hamilton_and_Madion_fed_19.txt',
    'HM_fed_20.txt': 'Hamilton_and_Madion_fed_20.txt',
}
df = df.replace(filename_fixes)

## Creating new features from raw text files

In [None]:
# Build an NLTK plaintext corpus over the raw-papers directory; the '.*'
# pattern selects every file found under the root.
corpus_root = r'/kaggle/input/federalist-papers/FedPapersCorpus/FedPapersCorpus'
paper_corpus = PlaintextCorpusReader(corpus_root, '.*')

### Sentences, Words, Characters, Word Length, Sentence Length

In [None]:
# The corpus and the dataframe list the files in different orders, so each
# count is looked up per filename instead of relying on positional alignment.
# (Joining the dataframe against the corpus would allow a vectorized version.)
count_readers = {
    'chars': paper_corpus.raw,    # raw text   -> number of characters
    'words': paper_corpus.words,  # token view -> number of word tokens
    'sents': paper_corpus.sents,  # sentences  -> number of sentences
}
for col_name, read in count_readers.items():
    df[col_name] = [len(read(fileids=[fname])) for fname in df.filename]

# Derived averages: characters per word and words per sentence.
df['word_len'] = df['chars'] / df['words']
df['sent_len'] = df['words'] / df['sents']

## Exploratory Analysis

In [None]:
# Author labels, in the order the per-author plots iterate over them.
auths = [
    'Hamilton',
    'Madison',
    'Jay',
    'dispt',
]

In [None]:
# One histogram of paper length (word count) per author label on a 2x2 grid.
fig, ax = plt.subplots(nrows=2, ncols=2)
for panel, auth in zip(ax.flat, auths):
    author_rows = df[df.author == auth]
    panel.hist(author_rows.words)
    panel.set_title(auth + " paper length", weight='bold', size=12)
    panel.set_xlabel("Number of Words")
    panel.set_ylabel("Quantity of Papers")
    panel.set_xlim(0, 7000)  # same x-range on every panel
plt.tight_layout()

In [None]:
# One histogram of average sentence length per author label on a 2x2 grid.
fig, ax = plt.subplots(nrows=2, ncols=2)
for panel, auth in zip(ax.flat, auths):
    author_rows = df[df.author == auth]
    panel.hist(author_rows.sent_len)
    panel.set_title(auth + " avg sent length", weight='bold', size=12)
    panel.set_xlabel("Avg Sentence Length")
    panel.set_ylabel("Quantity of Papers")
    panel.set_xlim(25, 45)  # same x-range on every panel
plt.tight_layout()

In [None]:
# Render one word cloud per author label, built from the concatenated raw
# text of all files carrying that label.
for auth in auths:
    print(auth)
    author_files = df.loc[df.author == auth, 'filename'].tolist()
    text = paper_corpus.raw(fileids=author_files)
    wc = WordCloud().generate(text)
    plt.figure(figsize=(15, 10))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.show()

## K-means clustering with all function words

In [None]:
# Rescale every column except the author/filename identifiers with a
# default-configured MinMaxScaler, writing the scaled values back into `df`.
scaler = MinMaxScaler()
c = df.columns.drop(['author', 'filename'])
df[c] = scaler.fit_transform(df[c])

In [None]:
# Cluster the papers into four groups using every feature column.
#
# * `label` (written by the next cell) is excluded when present, so re-running
#   this cell does not feed the previous run's cluster ids back in as a feature.
# * `random_state` pins the centroid initialisation so the labels are the same
#   on every Restart & Run All.
# * `n_init` is set explicitly because its default differs between
#   scikit-learn releases.
vals = (
    df.drop(['author', 'filename'], axis=1)
      .drop(columns='label', errors='ignore')
      .values
)
doc_cluster = KMeans(n_clusters=4, n_init=10, random_state=0)
doc_cluster.fit(vals)
labs = doc_cluster.labels_                 # cluster id assigned to each row
centroids = doc_cluster.cluster_centers_   # one centroid per cluster

In [None]:
# Store each row's K-means cluster id in a `label` column (mutates `df` in
# place), then preview the first two rows.
df['label'] =  labs
df.head(2)

## Visualizing with Sankey

In [None]:
# Count how many rows fall into each (author, cluster label) pair.
# `groupby().size()` builds the counts directly; the previous approach added a
# helper column to a slice of `df`, which triggers SettingWithCopyWarning.
counter_df = (
    df.groupby(['author', 'label'])
      .size()
      .reset_index(name='count')
)
# Cast cluster ids to strings so they are treated as category names rather
# than numbers by the plot that consumes this table.
counter_df['label'] = counter_df['label'].astype(str)

In [None]:
# Sankey diagram of author -> cluster label flows, with edge width taken from
# the `count` column and edges coloured by author.
flow_dims = ["author", "label"]
plot = hv.Sankey(counter_df, kdims=flow_dims, vdims=["count"])
plot.opts(label_position='left', edge_color='author')

## Using Decision Trees to classify

In [None]:
# Split by authorship status: rows labelled 'dispt' are the ones to predict,
# all other rows are used for training.
test_df = df[df.author == 'dispt']
train_df = df[df.author != 'dispt']

# Columns that must not be used as model inputs: the target, the file
# identifier, and the K-means cluster id. The cluster id is an arbitrary
# integer fitted on all rows (including the disputed ones), so treating it as
# a numeric feature would leak information and add noise.
non_feature_cols = ['author', 'filename', 'label']
feature_cols = [col for col in df.columns if col not in non_feature_cols]

X_test = test_df[feature_cols]
X_train = train_df[feature_cols]
y_test = test_df.author
y_train = train_df.author

In [None]:
# Fit a decision tree on the papers of known authorship.
# `random_state` is fixed because the classifier breaks ties between equally
# good splits randomly; without it the tree (and the predictions for the
# disputed papers) can change from run to run.
paper_tree = DecisionTreeClassifier(random_state=0)
paper_tree.fit(X_train, y_train)

## Visualizing Decision Tree

In [None]:
# Draw the fitted tree on a large canvas.
fig = plt.figure(figsize=(25,20))
_ = tree.plot_tree(
    paper_tree,
    # Plain Python lists: some scikit-learn releases reject a pandas Index or
    # NumPy array for these two arguments during parameter validation.
    feature_names=X_train.columns.tolist(),
    # Take class names from the fitted model so they are guaranteed to be in
    # the same order the tree uses internally.
    class_names=paper_tree.classes_.tolist(),
)

In [None]:
# Predict an author for every row of the disputed set and display the
# predictions as a plain list.
y_pred = paper_tree.predict(X=X_test)
list(y_pred)

## Madison is historically credited as the author of the disputed papers!