In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import scattertext as st
import re, io
from pprint import pprint
from scipy.stats import rankdata, hmean, norm
import spacy.en
import os, pkgutil, json, urllib
from urllib.request import urlopen
from IPython.display import IFrame
from IPython.core.display import display, HTML
from scattertext import CorpusFromPandas, produce_scattertext_explorer
display(HTML('<style>.container {width:98% !important; },</style>'))

In [2]:
# Parse with regex
# nlp = spacy.en.English()
nlp = st.whitespace_nlp_with_sentences

In [3]:
# Read in Data
spooky_df = pd.read_csv('train.csv')
spooky_df.iloc[0]

id                                                  id26305
text      This process, however, afforded me no means of...
author                                                  EAP
Name: 0, dtype: object

In [4]:
spooky_df.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [5]:
spooky_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19579 entries, 0 to 19578
Data columns (total 3 columns):
id        19579 non-null object
text      19579 non-null object
author    19579 non-null object
dtypes: object(3)
memory usage: 459.0+ KB


In [6]:
# Count nuber of excerpts from each author
print("Document Count")
print(spooky_df.groupby('author')['text'].count())
print("Word Count")
spooky_df.groupby('author').apply(lambda x: x.text.apply(lambda x: len(x.split())).sum())
spooky_df['parsed'] = spooky_df.text.apply(nlp)

Document Count
author
EAP    7900
HPL    5635
MWS    6044
Name: text, dtype: int64
Word Count


In [7]:
# Calculate a baseline classification score
baseline = spooky_df[spooky_df.author=='EAP'].count()/spooky_df.count()
baseline

id        0.403494
text      0.403494
author    0.403494
parsed    0.403494
dtype: float64

In [8]:
#Create a Pandas DataFrame based a scattertext parsed with regex
corpus = st.CorpusFromParsedDocuments(spooky_df, category_col='author', parsed_col='parsed').build()
term_freq_df = corpus.get_term_freq_df()
term_freq_df.head()

Unnamed: 0_level_0,EAP freq,HPL freq,MWS freq
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
this,1288,487,828
process,7,8,1
however,252,52,52
afforded,20,4,14
me,885,614,1471


In [9]:
#Relationship between words and individual authors
term_freq_df['EAP_precision'] = term_freq_df['EAP freq'] * 1./(term_freq_df['EAP freq'] + term_freq_df['HPL freq']+ term_freq_df['MWS freq'])
term_freq_df['EAP_recall'] = term_freq_df['EAP freq'] * 1./term_freq_df['EAP freq'].sum()
term_freq_df['EAP_f_score'] = term_freq_df.apply(lambda x: (hmean([x['EAP_precision'], x['EAP_recall']])
                                                                   if x['EAP_precision'] > 0 and x['EAP_recall'] > 0 
                                                                   else 0), axis=1)                                                        
term_freq_df['HPL_precision'] = term_freq_df['HPL freq'] * 1./(term_freq_df['HPL freq'] + term_freq_df['EAP freq'] + term_freq_df['MWS freq'])
term_freq_df['HPL_recall'] = term_freq_df['HPL freq'] * 1./term_freq_df['HPL freq'].sum()
term_freq_df['HPL_f_score'] = term_freq_df.apply(lambda x: (hmean([x['HPL_precision'], x['HPL_recall']])
                                                                   if x['HPL_precision'] > 0 and x['HPL_recall'] > 0 
                                                                   else 0), axis=1)                                                        
term_freq_df['MWS_precision'] = term_freq_df['MWS freq'] *1./(term_freq_df['MWS freq'] + term_freq_df['EAP freq'] + term_freq_df['HPL freq'])
term_freq_df['MWS_recall'] = term_freq_df['MWS freq'] * 1./term_freq_df['MWS freq'].sum()
term_freq_df['MWS_f_score'] = term_freq_df.apply(lambda x: (hmean([x['MWS_precision'], x['MWS_recall']])
                                                                   if x['MWS_precision'] > 0 and x['MWS_recall'] > 0 
                                                                   else 0), axis=1)                                                        
term_freq_df.sort_values(by='EAP_f_score', ascending=False).iloc[:10]

Unnamed: 0_level_0,EAP freq,HPL freq,MWS freq,EAP_precision,EAP_recall,EAP_f_score,HPL_precision,HPL_recall,HPL_f_score,MWS_precision,MWS_recall,MWS_f_score
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
the,14823,10897,9648,0.419108,0.037996,0.069675,0.308103,0.035415,0.063528,0.272789,0.029647,0.053482
of,8875,5834,6131,0.425864,0.022749,0.043191,0.279942,0.018961,0.035516,0.294194,0.01884,0.035412
and,5659,6075,6109,0.317155,0.014506,0.027743,0.34047,0.019744,0.037323,0.342375,0.018772,0.035593
to,4725,3241,4820,0.369545,0.012112,0.023455,0.25348,0.010533,0.020226,0.376975,0.014811,0.028503
a,4686,3295,2710,0.438313,0.012012,0.023383,0.308203,0.010709,0.020698,0.253484,0.008328,0.016125
in,4084,2727,2595,0.434191,0.010469,0.020444,0.289921,0.008863,0.0172,0.275888,0.007974,0.0155
i,3787,2707,4316,0.350324,0.009707,0.018891,0.250416,0.008798,0.016998,0.39926,0.013263,0.025673
of the,2842,1486,1217,0.512534,0.007285,0.014366,0.267989,0.00483,0.009488,0.219477,0.00374,0.007354
it,2308,1398,1180,0.47237,0.005916,0.011686,0.286124,0.004544,0.008945,0.241506,0.003626,0.007145
that,2306,2016,2089,0.359694,0.005911,0.011631,0.31446,0.006552,0.012837,0.325846,0.006419,0.012591


In [10]:
# Remove stop words by standardizing and normalizing 
def normcdf(x):
    return norm.cdf(x, x.mean(), x.std())
term_freq_df['EAP_precision_normcdf'] = normcdf(term_freq_df['EAP_precision'])
term_freq_df['EAP_recall_normcdf'] = normcdf(term_freq_df['EAP_recall'])
term_freq_df['EAP_scaled_f_score'] = hmean([term_freq_df['EAP_precision_normcdf'], term_freq_df['EAP_recall_normcdf']])
term_freq_df['HPL_precision_normcdf'] = normcdf(term_freq_df['HPL_precision'])
term_freq_df['HPL_recall_normcdf'] = normcdf(term_freq_df['HPL_recall'])
term_freq_df['HPL_scaled_f_score'] = hmean([term_freq_df['HPL_precision_normcdf'], term_freq_df['HPL_recall_normcdf']])
term_freq_df['MWS_precision_normcdf'] = normcdf(term_freq_df['MWS_precision'])
term_freq_df['MWS_recall_normcdf'] = normcdf(term_freq_df['MWS_recall'])
term_freq_df['MWS_scaled_f_score'] = hmean([term_freq_df['MWS_precision_normcdf'], term_freq_df['MWS_recall_normcdf']])


term_freq_df.sort_values(by='EAP_scaled_f_score', ascending=False).iloc[:10]

Unnamed: 0_level_0,EAP freq,HPL freq,MWS freq,EAP_precision,EAP_recall,EAP_f_score,HPL_precision,HPL_recall,HPL_f_score,MWS_precision,...,MWS_f_score,EAP_precision_normcdf,EAP_recall_normcdf,EAP_scaled_f_score,HPL_precision_normcdf,HPL_recall_normcdf,HPL_scaled_f_score,MWS_precision_normcdf,MWS_recall_normcdf,MWS_scaled_f_score
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
l,82,3,2,0.942529,0.00021,0.00042,0.034483,1e-05,1.9e-05,0.022989,...,1.2e-05,0.899855,0.964323,0.930974,0.252351,0.520896,0.339991,0.255813,0.507834,0.340237
madame,74,0,3,0.961039,0.00019,0.000379,0.0,0.0,0.0,0.038961,...,1.8e-05,0.906912,0.947792,0.926901,0.228011,0.484925,0.310177,0.267962,0.519512,0.353559
de,130,7,14,0.860927,0.000333,0.000666,0.046358,2.3e-05,4.5e-05,0.092715,...,8.6e-05,0.864088,0.99801,0.926233,0.261039,0.568543,0.357799,0.310865,0.64473,0.419475
balloon,72,1,3,0.947368,0.000185,0.000369,0.013158,3e-06,6e-06,0.039474,...,1.8e-05,0.901737,0.942827,0.921824,0.237139,0.496916,0.321061,0.268356,0.519512,0.353903
mr,167,28,11,0.81068,0.000428,0.000856,0.135922,9.1e-05,0.000182,0.053398,...,6.8e-05,0.838152,0.999896,0.911907,0.331143,0.789315,0.466552,0.279187,0.61151,0.383353
altogether,89,11,6,0.839623,0.000228,0.000456,0.103774,3.6e-05,7.1e-05,0.056604,...,3.7e-05,0.853461,0.975014,0.910197,0.305108,0.615212,0.407915,0.28171,0.554406,0.373589
dupin,58,0,0,1.0,0.000149,0.000297,0.0,0.0,0.0,0.0,...,0.0,0.920554,0.897062,0.908656,0.228011,0.484925,0.310177,0.238847,0.484467,0.319953
of course,97,15,7,0.815126,0.000249,0.000497,0.12605,4.9e-05,9.7e-05,0.058824,...,4.3e-05,0.840569,0.983807,0.906565,0.323053,0.660267,0.433839,0.283463,0.565958,0.377736
upon the,395,66,40,0.788423,0.001013,0.002022,0.131737,0.000215,0.000428,0.07984,...,0.000245,0.825695,1.0,0.904527,0.327703,0.974206,0.490434,0.30032,0.871329,0.446683
character,103,12,15,0.792308,0.000264,0.000528,0.092308,3.9e-05,7.8e-05,0.115385,...,9.2e-05,0.827912,0.988522,0.901117,0.296048,0.62665,0.402122,0.329811,0.655577,0.438846


In [11]:
# Calculate degree of similarity with distance from extrame corners as corner score.
term_freq_df['EAP_corner_score'] = corpus.get_corner_scores('EAP')
term_freq_df['HPL_corner_score'] = corpus.get_corner_scores('HPL')
term_freq_df['MWS_corner_score'] = corpus.get_corner_scores('MWS')
term_freq_df.sort_values(by='EAP_corner_score', ascending=False).iloc[:10]

Unnamed: 0_level_0,EAP freq,HPL freq,MWS freq,EAP_precision,EAP_recall,EAP_f_score,HPL_precision,HPL_recall,HPL_f_score,MWS_precision,...,EAP_scaled_f_score,HPL_precision_normcdf,HPL_recall_normcdf,HPL_scaled_f_score,MWS_precision_normcdf,MWS_recall_normcdf,MWS_scaled_f_score,EAP_corner_score,HPL_corner_score,MWS_corner_score
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
dupin,58,0,0,1.0,0.000149,0.000297,0.0,0.0,0.0,0.0,...,0.908656,0.228011,0.484925,0.310177,0.238847,0.484467,0.319953,0.946521,0.105044,0.109398
marie,50,0,0,1.0,0.000128,0.000256,0.0,0.0,0.0,0.0,...,0.88987,0.228011,0.484925,0.310177,0.238847,0.484467,0.319953,0.946518,0.105048,0.109401
jupiter,45,0,0,1.0,0.000115,0.000231,0.0,0.0,0.0,0.0,...,0.875601,0.228011,0.484925,0.310177,0.238847,0.484467,0.319953,0.946516,0.105051,0.109404
the automaton,41,0,0,1.0,0.000105,0.00021,0.0,0.0,0.0,0.0,...,0.862639,0.228011,0.484925,0.310177,0.238847,0.484467,0.319953,0.946514,0.105056,0.109407
monsieur,39,0,0,1.0,0.0001,0.0002,0.0,0.0,0.0,0.0,...,0.855614,0.228011,0.484925,0.310177,0.238847,0.484467,0.319953,0.946513,0.105058,0.109409
ellison,29,0,0,1.0,7.4e-05,0.000149,0.0,0.0,0.0,0.0,...,0.814649,0.228011,0.484925,0.310177,0.238847,0.484467,0.319953,0.946502,0.105081,0.109429
maelzel,29,0,0,1.0,7.4e-05,0.000149,0.0,0.0,0.0,0.0,...,0.814649,0.228011,0.484925,0.310177,0.238847,0.484467,0.319953,0.946502,0.105081,0.109429
bug,29,0,0,1.0,7.4e-05,0.000149,0.0,0.0,0.0,0.0,...,0.814649,0.228011,0.484925,0.310177,0.238847,0.484467,0.319953,0.946502,0.105081,0.109429
color,29,0,0,1.0,7.4e-05,0.000149,0.0,0.0,0.0,0.0,...,0.814649,0.228011,0.484925,0.310177,0.238847,0.484467,0.319953,0.946502,0.105081,0.109429
the prefect,28,0,0,1.0,7.2e-05,0.000144,0.0,0.0,0.0,0.0,...,0.809992,0.228011,0.484925,0.310177,0.238847,0.484467,0.319953,0.9465,0.105086,0.109433


In [12]:
#top 10 terms for each author
term_freq_df = corpus.get_term_freq_df()
term_freq_df['EAP Score'] = corpus.get_scaled_f_scores('EAP')
term_freq_df['HPL Score'] = corpus.get_scaled_f_scores('HPL')
term_freq_df['MWS Score'] = corpus.get_scaled_f_scores('MWS')

print("Top 10 Edgar Allen Poe terms")
pprint(list(term_freq_df.sort_values(by='EAP Score', ascending=False).index[:10]))
print("Top 10 HP Lovecraft terms")
pprint(list(term_freq_df.sort_values(by='HPL Score', ascending=False).index[:10]))
print("Top 10 Mary Shelley terms")
pprint(list(term_freq_df.sort_values(by='MWS Score', ascending=False).index[:10]))



Top 10 Edgar Allen Poe terms
['dupin',
 'madame',
 'l',
 'balloon',
 'marie',
 'jupiter',
 'the balloon',
 'the automaton',
 'monsieur',
 'automaton']
Top 10 HP Lovecraft terms
['gilman',
 'innsmouth',
 'arkham',
 'whateley',
 'later',
 'despite',
 'aout',
 'outside',
 'jermyn',
 'because of']
Top 10 Mary Shelley terms
['perdita',
 'adrian',
 'idris',
 'raymond',
 'windsor',
 'elizabeth',
 'misery',
 'miserable',
 'sister',
 'endeavoured']


In [13]:
html = produce_scattertext_explorer(corpus,
                                    category='EAP',
                                    category_name='EAP',
                                    not_category_name='HPL''MWS',
                                    width_in_pixels=1000,
                                    minimum_term_frequency=5,
                                    transform=st.Scalers.scale,
                                    metadata=spooky_df['author'])
file_name = 'output/spookyauthorsscattertext.html'
open(file_name, 'wb').write(html.encode('utf-8'))
IFrame(src=file_name, width = 1200, height=700)

In [14]:
# Evalate corpus based on log scales
html = produce_scattertext_explorer(corpus,
                                    category='EAP',
                                    category_name='EAP',
                                    not_category_name='HPL'or'MWS',
                                    width_in_pixels=1000,
                                    minimum_term_frequency=5,
                                    transform=st.Scalers.log_scale_standardize,
                                    metadata=spooky_df['author'])
file_name = 'output/spookyauthorsscattertextLog.html'
open(file_name, 'wb').write(html.encode('utf-8'))
IFrame(src=file_name, width = 1200, height=700)

In [38]:
# Rank terms based on percentile
html = produce_scattertext_explorer(corpus,
                                    category='EAP',
                                    category_name='EAP',
                                    not_category_name='HPL'or'MWS',
                                    width_in_pixels=1000,
                                    minimum_term_frequency=5,
                                    transform=st.Scalers.percentile,
                                    metadata=spooky_df['author'])
file_name = 'output/spookyauthorsscattertextpercentile.html'
open(file_name, 'wb').write(html.encode('utf-8'))
IFrame(src=file_name, width = 1200, height=700)

In [16]:
# Apply jitter function to evaluate grouped words for each position
html = produce_scattertext_explorer(corpus,
                                    category='EAP',
                                    category_name='EAP',
                                    not_category_name='HPL'or'MWS',
                                    width_in_pixels=1000,
                                    jitter=.1,
                                    minimum_term_frequency=5,
                                    transform=st.Scalers.log_scale_standardize,
                                    metadata=spooky_df['author'])
file_name = 'output/spookyauthorsscattertextLog.html'
open(file_name, 'wb').write(html.encode('utf-8'))
IFrame(src=file_name, width = 1200, height=700)

In [18]:
# Break ties alphabeticaly
html = produce_scattertext_explorer(corpus,
                                    category='EAP',
                                    category_name='EAP',
                                    not_category_name='HPL'or'MWS',
                                    width_in_pixels=1000,
                                    jitter=.1,
                                    minimum_term_frequency=5,
                                    metadata=spooky_df['author'],
                                    term_significance = st.LogOddsRatioUninformativeDirichletPrior())
file_name = 'output/spookyauthorsscattertextRankDefault.html'
open(file_name, 'wb').write(html.encode('utf-8'))
IFrame(src=file_name, width = 1200, height=700)

In [23]:
def scale(ar): 
    return (ar - ar.min()) / (ar.max() - ar.min())

def zero_centered_scale(ar):
    scores = np.zeros(len(ar))
    scores[ar > 0] = scale(ar[ar > 0])
    scores[ar < 0] = -scale(-ar[ar < 0])
    return (scores + 1) / 2.

frequencies_scaled = scale(np.log(spooky_df.sum(axis=1).values))

  # Remove the CWD from sys.path while we load stuff.
  
  


In [27]:
freq_df = corpus.get_term_freq_df().rename(columns={'EAP freq': 'y_EAP', 'HPL freq': 'y_HPL', 'MWS freq':'y_MWS'})
a_w = 0.01
y_i, y_j, y_k = freq_df['y_EAP'].values, freq_df['y_HPL'].values, freq_df['y_MWS'].values

In [28]:
n_i, n_j = y_i.sum(), y_j.sum()
a_0 = len(freq_df) * a_w
delta_i_j = (  np.log((y_i + a_w) / (n_i + a_0 - y_i - a_w))
                 - np.log((y_j + a_w) / (n_j + a_0 - y_j - a_w)))
var_delta_i_j = ( 1./(y_i + a_w) + 1./(y_i + a_0 - y_i - a_w)
                    + 1./(y_j + a_w) + 1./(n_j + a_0 - n_j - a_w))
zeta_i_j = delta_i_j/np.sqrt(var_delta_i_j)
max_abs_zeta = max(zeta_i_j.max(), -zeta_i_j.min())
zeta_scaled_for_charting = ((((zeta_i_j > 0).astype(float) * (zeta_i_j/max_abs_zeta))*0.5 + 0.5)
                            + ((zeta_i_j < 0).astype(float) * (zeta_i_j/max_abs_zeta) * 0.5))

In [34]:
# Visualize data using corner scores
corner_scores = corpus.get_corner_scores('EAP')
html = produce_scattertext_explorer(corpus,
                                    category='EAP',
                                    category_name='EAP',
                                    not_category_name='HPL' or 'MWS',
                                    minimum_term_frequency=5,
                                    width_in_pixels=1000,
                                    x_coords=frequencies_scaled,
                                    y_coords=corner_scores,
                                    scores=corner_scores,
                                    sort_by_dist=False,
                                    metadata=spooky_df['author'],
                                    x_label='Log Frequency',
                                    y_label='Corner Scores')
file_name = 'output/CornervsLog.html'
open(file_name, 'wb').write(html.encode('utf-8'))
IFrame(src=file_name, width = 1200, height=700)

CoordinatesNotRightException: Length of x_cords must be the same as the number of terms in the term_doc_matrix.