In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import scattertext as st
import re, io
from pprint import pprint
from scipy.stats import rankdata, hmean, norm
import spacy.en
import os, pkgutil, json, urllib
from urllib.request import urlopen
from IPython.display import IFrame
from IPython.core.display import display, HTML
from scattertext import CorpusFromPandas, produce_scattertext_explorer
display(HTML('<style>.container {width:98% !important; },</style>'))

In [2]:
# Parse with regex
nlp = st.whitespace_nlp_with_sentences

In [3]:
spooky_df = pd.read_csv('train.csv')

In [4]:
spooky_df.iloc[0]

id                                                  id26305
text      This process, however, afforded me no means of...
author                                                  EAP
Name: 0, dtype: object

In [5]:
spooky_df.describe()

Unnamed: 0,id,text,author
count,19579,19579,19579
unique,19579,19579,3
top,id10436,At length one of the most aged of the women sa...,EAP
freq,1,1,7900


In [6]:
print("Document Count")
print(spooky_df.groupby('author')['text'].count())
print("Word Count")
spooky_df.groupby('author').apply(lambda x: x.text.apply(lambda x: len(x.split())).sum())
spooky_df['parsed'] = spooky_df.text.apply(nlp)

Document Count
author
EAP    7900
HPL    5635
MWS    6044
Name: text, dtype: int64
Word Count


In [10]:
baseline = spooky_df[spooky_df.author=='EAP'].count()/spooky_df.count()
baseline

id        0.403494
text      0.403494
author    0.403494
parsed    0.403494
dtype: float64

In [11]:
spooky_df.head()

Unnamed: 0,id,text,author,parsed
0,id26305,"This process, however, afforded me no means of...",EAP,"This process, however, afforded me no means of..."
1,id17569,It never once occurred to me that the fumbling...,HPL,It never once occurred to me that the fumbling...
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,"In his left hand was a gold snuff box, from wh..."
3,id27763,How lovely is spring As we looked from Windsor...,MWS,How lovely is spring As we looked from Windsor...
4,id12958,"Finding nothing else, not even gold, the Super...",HPL,"Finding nothing else, not even gold, the Super..."


In [12]:
corpus = st.CorpusFromParsedDocuments(spooky_df, category_col='author', parsed_col='parsed').build()

In [15]:
corpus

<scattertext.ParsedCorpus.ParsedCorpus at 0x2008d275f28>

In [13]:
#defining term frequency within documents
term_freq_df = corpus.get_term_freq_df()

In [14]:
term_freq_df.head()

Unnamed: 0_level_0,EAP freq,HPL freq,MWS freq
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
this,1288,487,828
process,7,8,1
however,252,52,52
afforded,20,4,14
me,885,614,1471


In [27]:
def normcdf(x):
    return norm.cdf(x, x.mean(), x.std())
term_freq_df['EAP_precision_normcdf'] = normcdf(term_freq_df['EAP_precision'])
term_freq_df['EAP_freq_pct_normcdf'] = normcdf(term_freq_df['EAP_freq_pct'])
term_freq_df['EAP_scaled_f_score'] = hmean([term_freq_df['EAP_precision_normcdf'], term_freq_df['EAP_freq_pct_normcdf']])
term_freq_df['HPL_precision_normcdf'] = normcdf(term_freq_df['HPL_precision'])
term_freq_df['HPL_freq_pct_normcdf'] = normcdf(term_freq_df['HPL_freq_pct'])
term_freq_df['HPL_scaled_f_score'] = hmean([term_freq_df['HPL_precision_normcdf'], term_freq_df['HPL_freq_pct_normcdf']])
term_freq_df['MWS_precision_normcdf'] = normcdf(term_freq_df['MWS_precision'])
term_freq_df['MWS_freq_pct_normcdf'] = normcdf(term_freq_df['MWS_freq_pct'])
term_freq_df['MWS_scaled_f_score'] = hmean([term_freq_df['MWS_precision_normcdf'], term_freq_df['MWS_freq_pct_normcdf']])


term_freq_df.sort_values(by='MWS_scaled_f_score', ascending=False).iloc[:10]

Unnamed: 0_level_0,EAP freq,HPL freq,MWS freq,EAP_precision,HPL_precision,MWS_precision,EAP_freq_pct,EAP_hmean,HPL_freq_pct,HPL_hmean,...,MWS_hmean,EAP_precision_normcdf,EAP_freq_pct_normcdf,EAP_scaled_f_score,HPL_precision_normcdf,HPL_freq_pct_normcdf,HPL_scaled_f_score,MWS_precision_normcdf,MWS_freq_pct_normcdf,MWS_scaled_f_score
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
perdita,0,0,169,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.001038,0.206173,0.485743,0.289478,0.228011,0.484925,0.310177,0.948064,1.0,0.97334
adrian,0,0,141,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.000866,0.206173,0.485743,0.289478,0.228011,0.484925,0.310177,0.948064,0.999979,0.97333
idris,0,0,109,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.00067,0.206173,0.485743,0.289478,0.228011,0.484925,0.310177,0.948064,0.999194,0.972958
raymond,0,2,270,0.0,0.007353,0.992647,0.0,0.0,7e-06,1.3e-05,...,0.001658,0.206173,0.485743,0.289478,0.233087,0.50891,0.319733,0.946212,1.0,0.972363
misery,4,0,101,0.038095,0.0,0.961905,1e-05,2.1e-05,0.0,0.0,...,0.000621,0.231208,0.521517,0.32038,0.228011,0.484925,0.310177,0.937898,0.998247,0.967132
windsor,0,0,73,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.000449,0.206173,0.485743,0.289478,0.228011,0.484925,0.310177,0.948064,0.982105,0.964784
elizabeth,0,1,70,0.0,0.014085,0.985915,0.0,0.0,3e-06,6e-06,...,0.00043,0.206173,0.485743,0.289478,0.23779,0.496916,0.321657,0.944472,0.97786,0.960876
sister,3,3,81,0.034483,0.034483,0.931034,8e-06,1.5e-05,1e-05,1.9e-05,...,0.000498,0.228764,0.512578,0.316343,0.252351,0.520896,0.339991,0.928572,0.990192,0.958393
miserable,4,0,71,0.053333,0.0,0.946667,1e-05,2.1e-05,0.0,0.0,...,0.000436,0.241678,0.521517,0.330294,0.228011,0.484925,0.310177,0.93342,0.979361,0.955839
grief,7,2,81,0.077778,0.022222,0.9,1.8e-05,3.6e-05,7e-06,1.3e-05,...,0.000498,0.258998,0.548249,0.351801,0.243543,0.50891,0.329433,0.918149,0.990192,0.952811


In [28]:
term_freq_df['EAP_corner_score'] = corpus.get_corner_scores('EAP')
term_freq_df['HPL_corner_score'] = corpus.get_corner_scores('HPL')
term_freq_df['MWS_corner_score'] = corpus.get_corner_scores('MWS')
term_freq_df.sort_values(by='EAP_corner_score', ascending=False).iloc[:10]

Unnamed: 0_level_0,EAP freq,HPL freq,MWS freq,EAP_precision,HPL_precision,MWS_precision,EAP_freq_pct,EAP_hmean,HPL_freq_pct,HPL_hmean,...,EAP_scaled_f_score,HPL_precision_normcdf,HPL_freq_pct_normcdf,HPL_scaled_f_score,MWS_precision_normcdf,MWS_freq_pct_normcdf,MWS_scaled_f_score,EAP_corner_score,HPL_corner_score,MWS_corner_score
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
dupin,58,0,0,1.0,0.0,0.0,0.000149,0.000297,0.0,0.0,...,0.908656,0.228011,0.484925,0.310177,0.238847,0.484467,0.319953,0.946521,0.105044,0.109398
marie,50,0,0,1.0,0.0,0.0,0.000128,0.000256,0.0,0.0,...,0.88987,0.228011,0.484925,0.310177,0.238847,0.484467,0.319953,0.946518,0.105048,0.109401
jupiter,45,0,0,1.0,0.0,0.0,0.000115,0.000231,0.0,0.0,...,0.875601,0.228011,0.484925,0.310177,0.238847,0.484467,0.319953,0.946516,0.105051,0.109404
the automaton,41,0,0,1.0,0.0,0.0,0.000105,0.00021,0.0,0.0,...,0.862639,0.228011,0.484925,0.310177,0.238847,0.484467,0.319953,0.946514,0.105056,0.109407
monsieur,39,0,0,1.0,0.0,0.0,0.0001,0.0002,0.0,0.0,...,0.855614,0.228011,0.484925,0.310177,0.238847,0.484467,0.319953,0.946513,0.105058,0.109409
ellison,29,0,0,1.0,0.0,0.0,7.4e-05,0.000149,0.0,0.0,...,0.814649,0.228011,0.484925,0.310177,0.238847,0.484467,0.319953,0.946502,0.105081,0.109429
maelzel,29,0,0,1.0,0.0,0.0,7.4e-05,0.000149,0.0,0.0,...,0.814649,0.228011,0.484925,0.310177,0.238847,0.484467,0.319953,0.946502,0.105081,0.109429
bug,29,0,0,1.0,0.0,0.0,7.4e-05,0.000149,0.0,0.0,...,0.814649,0.228011,0.484925,0.310177,0.238847,0.484467,0.319953,0.946502,0.105081,0.109429
color,29,0,0,1.0,0.0,0.0,7.4e-05,0.000149,0.0,0.0,...,0.814649,0.228011,0.484925,0.310177,0.238847,0.484467,0.319953,0.946502,0.105081,0.109429
the prefect,28,0,0,1.0,0.0,0.0,7.2e-05,0.000144,0.0,0.0,...,0.809992,0.228011,0.484925,0.310177,0.238847,0.484467,0.319953,0.9465,0.105086,0.109433


In [29]:
#top 10 terms for each author
term_freq_df = corpus.get_term_freq_df()
term_freq_df['EAP Score'] = corpus.get_scaled_f_scores('EAP')
term_freq_df['HPL Score'] = corpus.get_scaled_f_scores('HPL')
term_freq_df['MWS Score'] = corpus.get_scaled_f_scores('MWS')

print("Top 10 Edgar Allen Poe terms")
pprint(list(term_freq_df.sort_values(by='EAP Score', ascending=False).index[:10]))
print("Top 10 HP Lovecraft terms")
pprint(list(term_freq_df.sort_values(by='HPL Score', ascending=False).index[:10]))
print("Top 10 Mary Shelley terms")
pprint(list(term_freq_df.sort_values(by='MWS Score', ascending=False).index[:10]))



Top 10 Edgar Allen Poe terms
['dupin',
 'madame',
 'l',
 'balloon',
 'marie',
 'jupiter',
 'the balloon',
 'the automaton',
 'monsieur',
 'automaton']
Top 10 HP Lovecraft terms
['gilman',
 'innsmouth',
 'arkham',
 'whateley',
 'later',
 'despite',
 'aout',
 'outside',
 'jermyn',
 'because of']
Top 10 Mary Shelley terms
['perdita',
 'adrian',
 'idris',
 'raymond',
 'windsor',
 'elizabeth',
 'misery',
 'miserable',
 'sister',
 'endeavoured']


In [34]:
html = produce_scattertext_explorer(corpus,
                                    category='EAP',
                                    category_name='EAP',
                                    not_category_name='HPL',
                                    width_in_pixels=1000,
                                    minimum_term_frequency=5,
                                    transform=st.Scalers.scale,
                                    metadata=spooky_df['author'])
file_name = 'output/spooky_authors_scattertext.html'
open(file_name, 'wb').write(html.encode('utf-8'))
IFrame(src=file_name, width = 1200, height=700)

FileNotFoundError: [Errno 2] No such file or directory: 'Data_Science/kaggle/spooky_authors/spookyauthorstoScattertextScale.html'