In [3]:
# Libraries used throughout the notebook
import nltk # the Natural Language Toolkit
import pandas as pd # dataframes

# Location of the training data, relative to this notebook
trainCsvPath = "../spooky/train.csv"

# Load the training data into a dataframe
texts = pd.read_csv(trainCsvPath)

# Approach 1
# Show the first few rows as a quick sanity check of the load
texts.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [4]:
# Group the rows by author so each author's text can be processed separately
byAuthor = texts.groupby("author")

### Tokenize the text (split it into individual words) and count the tokens

# One frequency distribution per author, keyed by the author label
wordFreqByAuthor = nltk.probability.ConditionalFreqDist()

# NOTE: the loop variable `name` is deliberately kept; a later cell reads it
# after the loop has finished.
for name, group in byAuthor:
    # Join every sentence by this author into one long string and lowercase it,
    # so that "The" and "the" are counted as the same word.
    corpus = group['text'].str.cat(sep = ' ').lower()

    # Split the text into individual tokens
    tokens = nltk.tokenize.word_tokenize(corpus)

    # Count the tokens and store the distribution under this author's label
    wordFreqByAuthor[name] = nltk.FreqDist(tokens)

# wordFreqByAuthor now maps each author to the frequency distribution of the
# tokens in that author's text.


In [5]:
# Display one author's frequency distribution. `name` is the loop variable left
# over from the `for name, group in byAuthor` loop, so this shows whichever
# author that loop visited last.
wordFreqByAuthor[name]

FreqDist({'how': 168,
          'lovely': 60,
          'is': 693,
          'spring': 28,
          'as': 1097,
          'we': 610,
          'looked': 75,
          'from': 968,
          'windsor': 73,
          'terrace': 4,
          'on': 1044,
          'the': 9657,
          'sixteen': 9,
          'fertile': 14,
          'counties': 3,
          'spread': 26,
          'beneath': 36,
          ',': 12045,
          'speckled': 3,
          'by': 995,
          'happy': 85,
          'cottages': 10,
          'and': 6122,
          'wealthier': 1,
          'towns': 18,
          'all': 608,
          'in': 2597,
          'former': 36,
          'years': 125,
          'heart': 265,
          'cheering': 3,
          'fair': 32,
          '.': 5708,
          'a': 2711,
          'youth': 48,
          'passed': 119,
          'solitude': 43,
          'my': 2659,
          'best': 44,
          'spent': 44,
          'under': 87,
          'your': 440,
          'gentle': 7

In [7]:
# Compare how often each author uses a few "spooky" words.
# FreqDist.freq(word) gives the word's share of all tokens counted for that
# author (0 if the author never used it), so the values are comparable across
# authors with different amounts of text.
wordsToCompare = ['blood', 'scream', 'fear']

for wordIndex, word in enumerate(wordsToCompare):
    # print a blank line between consecutive word groups (not before the first)
    if wordIndex > 0:
        print()

    # one "<word>: <author>" header line followed by the relative frequency
    for i in wordFreqByAuthor.keys():
        print(word + ": " + i)
        print(wordFreqByAuthor[i].freq(word))

blood: EAP
0.00014646397201676582
blood: HPL
0.00022992337803427008
blood: MWS
0.00022773011333545174

scream: EAP
1.7231055531384214e-05
scream: HPL
9.196935121370803e-05
scream: MWS
2.6480245736680435e-05

fear: EAP
0.00010338633318830528
fear: HPL
0.0005748084450856752
fear: MWS
0.0006196377502383222


In [8]:
# One way to guess authorship is to use the joint probability that each
# author used each word in a given sentence.

# first, let's start with a test sentence
testSentence = "It was a dark and stormy night."

# and then lowercase & tokenize our test sentence (same preprocessing as the
# training text, so the tokens are comparable)
preProcessedTestSentence = nltk.tokenize.word_tokenize(testSentence.lower())

# Collect one record per (author, word) pair in a plain list and build the
# dataframe once at the end. DataFrame.append was removed in pandas 2.0, and
# growing a dataframe row by row is quadratic anyway.
probabilityRows = []

# For each author...
for i in wordFreqByAuthor.keys():
    # for each word in our test sentence...
    for j in preProcessedTestSentence:
        # find out how frequently the author used that word
        wordFreq = wordFreqByAuthor[i].freq(j)
        # and add a very small amount to every prob. so none of them are 0
        # (a single unseen word would otherwise zero out the whole product)
        smoothedWordFreq = wordFreq + 0.000001
        # record the author, word and smoothed freq.
        probabilityRows.append({'author': i, 'word': j, 'probability': smoothedWordFreq})

# one row per (author, word) pair
testProbailities = pd.DataFrame(probabilityRows, columns = ['author','word','probability'])

# Joint probability that each author wrote the sentence: the product of that
# author's per-word probabilities. sort=False keeps the authors in the order
# they were added above. Selecting the 'probability' column by name avoids
# positional indexing into a label-indexed result.
# NOTE: multiplying many small probabilities can underflow to 0.0 for long
# sentences; summing log-probabilities would be needed in that case.
testProbailitiesByAuthor = (
    testProbailities
    .groupby('author', sort = False)['probability']
    .prod()
    .rename('jointProbability')
    .reset_index()
)

# and our winner is... (the author with the highest joint probability)
testProbailitiesByAuthor.loc[testProbailitiesByAuthor['jointProbability'].idxmax(),'author']


'HPL'