# Corpora Clustering

We're trying to import text files from 7-8 different authors, each with roughly 10 texts, and we want to merge the corpuses to classify who is writing...

In [1]:
import spacy
import re
import os
import glob
import collections
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def cleanup_document(document):
    document = document.replace("\t", " ")
    document = document.replace("\n", ".")
    document = document.replace("\r", ".")
    document = document.replace("\r\n", ".")
    document = document.replace("\n\r", ".")
    document = document.replace("...", ".")
    document = document.replace("..", ". ")
    
    return document.split(".") # return all of the sentences in the corpus, each of which is a "document"

In [3]:
cwd = os.getcwd()

In [4]:
nlp = spacy.load("en")

In [5]:
directories = {
    "Abernathy": cwd + "/data/AmericanNationCorpus/data/written_2/non-fiction/OUP/Abernathy/",
    "Berk": cwd + "/data/AmericanNationCorpus/data/written_2/non-fiction/OUP/Berk/",
    "Castro": cwd + "/data/AmericanNationCorpus/data/written_2/non-fiction/OUP/Castro/",
    "Fletcher": cwd + "/data/AmericanNationCorpus/data/written_2/non-fiction/OUP/Fletcher/",
    "Kauffman": cwd + "/data/AmericanNationCorpus/data/written_2/non-fiction/OUP/Kauffman/",
    "Rybczynski": cwd + "/data/AmericanNationCorpus/data/written_2/non-fiction/OUP/Rybczynski/"
}

In [6]:
corpora = collections.defaultdict(list)

In [7]:
for name, directory in directories.items():
    os.chdir(directory)
    # use glob to find all text files, and load them in
    text_paths = glob.iglob("*.txt")
    
    for text_path in text_paths:
        
        document = open(text_path).read()
        clean_document = cleanup_document(document)
        
        corpora[name].extend(clean_document)
        
# go back to the original directory
os.chdir(cwd)

## Feature Engineering: TF-IDF
The next sections will create the TF-IDF matrix as our features

In [8]:
# convert our items into a dataframe
rows = []
for name, sentences in corpora.items():
    for sentence in sentences:
        rows.append({"source": name, "sentence": sentence})


sentences_df = pd.DataFrame(rows, columns=["sentence", "source"])

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer sentences
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )


# Applying the vectorizer
tfidf = vectorizer.fit_transform(sentences_df["sentence"])

In [30]:
len(sentences_df)

17164

In [10]:
# create a pandas df from this data, using the feature names
df = pd.DataFrame(tfidf.todense(), columns=vectorizer.get_feature_names())

In [11]:
df["CLASS_LABEL"] = sentences_df["source"]

In [12]:
# downsample to 50 features for computational efficiency
from sklearn.decomposition import PCA

downsampled_features = PCA(n_components=50).fit_transform(df.drop(columns=["CLASS_LABEL"]))

In [13]:
downsampled_df = pd.DataFrame(downsampled_features)

In [14]:
downsampled_df["CLASS_LABEL"] = df["CLASS_LABEL"]

In [15]:
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift
from sklearn.cluster import SpectralClustering
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split

## Cluster Time!

In the cells below we will use the following Clustering techniques to and compare their ARI scores to each other

- KMeans
- MeanShift
- SpectralClustering

In [16]:
features = downsampled_df.drop(columns=["CLASS_LABEL"])
target = downsampled_df["CLASS_LABEL"]

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.25)

### K-Means

In [17]:
k_means_clusterer = KMeans(n_clusters=len(set(downsampled_df["CLASS_LABEL"])), n_jobs=8)

In [18]:
# get the predictive data
k_means_data = k_means_clusterer.fit_predict(X_train)

In [19]:
k_means_score = adjusted_rand_score(y_train, k_means_data)
print("ARI Score (K-Means): {}".format(k_means_score))

ARI Score (K-Means): 0.0599961656125327


### Mean Shift

In [48]:
from sklearn.cluster import estimate_bandwidth

bandwidth = estimate_bandwidth(X_train, quantile=0.2, n_samples=500)


shift_clusterer = MeanShift(bandwidth=bandwidth, cluster_all=False, n_jobs=8)

In [49]:
shift_data = shift_clusterer.fit_predict(X_train)

In [42]:
# plt.scatter(X_train.values[:, 0], X_train.values[:, 1], c=y_train)
# plt.show()

print('Comparing the assigned categories to the ones in the data:')
print(pd.crosstab(y_train, shift_data))

Comparing the assigned categories to the ones in the data:
col_0           0
CLASS_LABEL      
Abernathy    2247
Berk         2195
Castro       2111
Fletcher     1820
Kauffman     3695
Rybczynski    805


In [22]:
shift_score = adjusted_rand_score(y_train, shift_data)
print("ARI Score (Mean Shift): {}".format(shift_score))

ARI Score (Mean Shift): 0.0


### Spectral Clustering

In [23]:
spectral_clusterer = SpectralClustering(len(set(downsampled_df["CLASS_LABEL"])), n_jobs=8)

In [24]:
spectral_data = spectral_clusterer.fit_predict(X_train)

In [33]:
spectral_score = adjusted_rand_score(y_train, spectral_data)
print("ARI Score (Spectral Clustering): {}".format(spectral_score))

ARI Score (Spectral Clustering): 0.08584146214367562


### Wait a sec, this is hilariously random

Maybe let's try a different technique

## Supervised Learning
In the next few cells we'll try to build a supervised model that performs using features that we select using an unsupervised technique (PCA)

In [26]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

In [62]:
model = GradientBoostingClassifier()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.7960848287112561