In [51]:
import os, csv, math, random
import pandas as pd
import numpy as np

from collections import Counter

cwd = os.getcwd()
print('Current working directory: ' + cwd + '\n')
      
relativepath = os.path.join('..', 'data', 'weekfour', 'poefic.csv')
poefic = pd.read_csv(relativepath)
poefic.head()

Current working directory: /Users/rdubnic2/Documents/lis590dsh/Code



Unnamed: 0,date,author,title,genre,reception,text
0,1908,"Robins, Elizabeth,",The convert,fiction,elite,"looked like decent artisans, but more who bore..."
1,1871,"Lytton, Edward Bulwer Lytton,",The coming race,fiction,elite,"called the "" Easy Time "" (with which what I ma..."
2,1872,"Butler, Samuel,","Erewhon, or, Over the range",fiction,elite,the curtain ; on this I let it drop and retrea...
3,1900,"Barrie, J. M.",Tommy and Grizel,fiction,elite,"at you !"" he said. ""Dear eyes, "" said she. ""Th..."
4,1873,"Ritchie, Anne Thackeray,",Old Kensington,fiction,elite,"furious; I have not dared tell her, poor creat..."


# Week 5 homework.

In class, we built a classifier that detected Trump's authorship of tweets from his account. Repeat that work for the poefic dataset. Build a classifier that distinguishes poetry from fiction. 

Almost all the code you need is in the notebook we used in class. Copy functions from that notebook and paste them here, altering them as necessary so that they use the metadata available in the poefic frame. Include a function that does five-fold crossvalidation. Play around with different settings of p (the number of features included in the model) to see how high you can get the accuracy.

Then, at the end of the notebook, write a short paragraph of commentary. How much accuracy do you get? Why do you think that accuracy for this classification task is higher or lower than it was on the Trump tweet data? (You might want to inspect the data itself, using Excel or a text editor.)

In [52]:
def poefic_test(a_data_frame, rowidx):
    if 'poetry' in a_data_frame['genre'][rowidx]:
        return 'poetry'
    elif 'fiction' in a_data_frame['genre'][rowidx]:
        return 'fiction'
    else:
        return 'other'
    
poefic_text = poefic['text']

genre = []
fold = []
for idx in poefic.index:
    genre.append(poefic_test(poefic, idx))
    fold.append(random.sample(list(range(5)), 1)[0])
genre = pd.Series(genre, index = poefic.index)
fold = pd.Series(fold, index = poefic.index)

pf_df = pd.concat([poefic_text, genre, fold], axis = 1)
pf_df.columns = ['text', 'genre', 'fold']

# limit the dataframe to columns with either android or iphone;
# exclude 'other'
pf_df = pf_df[(pf_df['genre'] == 'poetry') | (pf_df['genre'] == 'fiction')]
pf_df.head()

Unnamed: 0,text,genre,fold
0,"looked like decent artisans, but more who bore...",fiction,4
1,"called the "" Easy Time "" (with which what I ma...",fiction,1
2,the curtain ; on this I let it drop and retrea...,fiction,4
3,"at you !"" he said. ""Dear eyes, "" said she. ""Th...",fiction,1
4,"furious; I have not dared tell her, poor creat...",fiction,0


In [53]:
testset = pf_df[pf_df['fold'] == 4]
trainingset = pf_df[pf_df['fold'] != 4]
print('Training set includes ' + str(trainingset.shape[0]))
print('Test set includes ' + str(testset.shape[0]))

Training set includes 819
Test set includes 208


In [54]:
def tokenize(astring):
    ''' Breaks a string into words, and counts them.
    Designed so it strips punctuation and lowercases everything,
    but doesn't separate hashtags and at-signs.
    '''
    wordcounts = Counter()
    # create a counter to hold the counts
    
    tokens = astring.split()
    for t in tokens:
        word = t.strip(',.!?:;-—()<>[]/"\'').lower()
        wordcounts[word] += 1
        
    return wordcounts

def create_vocab(seq_of_strings, n):
    ''' Given a sequence of text snippets, this function
    returns the n most common words. We'll use this to
    create a limited 'vocabulary'.
    '''
    vocab = Counter()
    for astring in seq_of_strings:
        counts = tokenize(astring)
        vocab = vocab + counts
    topn = [x[0] for x in vocab.most_common(n)]
    return topn

In [55]:
print('android = poetry')
print('iphone = fiction')
print('more positive means more likely poetry')
print('more negative means more likely fiction')

def categorize(df, rowidx):
    if df.loc[rowidx, 'genre'] == 'poetry':
        return 'poetry'
    elif df.loc[rowidx, 'genre'] == 'fiction':
        return 'fiction'
    else:
        print('error: neither fiction nor poetry')
        return 'other'

def get_priors(df):
    source_counts = df.groupby('genre').count()['text']
    print(source_counts)
    poetry_odds = source_counts['poetry'] / source_counts['fiction']
    fiction_odds = source_counts['fiction'] / source_counts['poetry']
    return math.log(poetry_odds), math.log(fiction_odds)

def train_nb_model(df, p):
    vocab = create_vocab(df['text'], p)
    vocabset = set(vocab)
    # we make a set because membership-checking is faster
    # in sets; but we also hold onto the list, which is ordered
    
    poetry_prior, fiction_prior = get_priors(df)
    
    poetry_counts = Counter()
    fiction_counts = Counter()
    
    for i in df.index:
        work = df['text'][i]
        work_counts = tokenize(work)
        category = categorize(df, i)
        if category == 'fiction':
            fiction_counts = fiction_counts + work_counts
        elif category == 'poetry':
            poetry_counts = poetry_counts + work_counts
    
    # Now let's organize these Counters into a DataFrame
    
    fiction = pd.Series(1, index = vocab)
    poetry = pd.Series(1, index = vocab)
    # notice initializing to 1 -- Laplacian smoothing
    
    for word, count in poetry_counts.items():
        if word in vocabset:
            poetry[word] += count
    
    for word, count in fiction_counts.items():
        if word in vocabset:
            fiction[word] += count
    
    all_prob = (fiction + poetry) / (np.sum(fiction) + np.sum(poetry))
    
    fiction_prob = fiction / np.sum(fiction)
    poetry_prob = poetry / np.sum(poetry)
    
    # note that when we sum up the fiction and poetry
    # columns, we are also summing up all the Laplacian 1's
    # we initially added to them
    
    model = pd.concat([fiction, poetry, all_prob, 
                       fiction_prob, poetry_prob], axis = 1) 
        
    model.columns = ['fiction', 'poetry', 'all_prob', 'fiction_prob', 'poetry_prob']
    
    # The next step is unnecessary, and will not be found in
    # most published versions of naive Bayes. I'm providing it
    # because it may help you understand the logic of the
    # algorithm.
    
    model['fiction_norm'] = fiction_prob / all_prob
    model['poetry_norm'] = poetry_prob / all_prob
    
    
    model['log_fiction'] = [math.log(x) for x in model['fiction_norm']]
    model['log_poetry'] = [math.log(x) for x in model['poetry_norm']]
    return vocab, poetry_prior, fiction_prior, model

vocab, poetry_prior, fiction_prior, model = train_nb_model(trainingset, 75)
model.head()

# print(poetry_prior, fiction_prior)

android = poetry
iphone = fiction
more positive means more likely poetry
more negative means more likely fiction
genre
fiction    290
poetry     529
Name: text, dtype: int64


Unnamed: 0,fiction,poetry,all_prob,fiction_prob,poetry_prob,fiction_norm,poetry_norm,log_fiction,log_poetry
the,17489,37399,0.120523,0.100245,0.133114,0.831751,1.104477,-0.184222,0.099372
,12196,22072,0.075245,0.069906,0.078561,0.92904,1.044064,-0.073603,0.04312
and,10460,23659,0.074918,0.059955,0.08421,0.800279,1.12402,-0.222795,0.116912
of,9080,15490,0.053951,0.052045,0.055134,0.964687,1.021928,-0.035951,0.021691
to,9277,11732,0.046131,0.053175,0.041758,1.152678,0.905192,0.142088,-0.099608


In [75]:
pd.options.mode.chained_assignment = None

def apply_model(vocab, poetry_prior, fiction_prior, model, testset):
    right = 0
    wrong = 0
    vocabset = set(vocab)
    odds_poe = []
    odds_fic = []

    for i in testset.index:
        odds_poetry = poetry_prior
        odds_fiction = fiction_prior
        lit = testset['text'][i]
        lit_counts = tokenize(lit)
        for word, count in lit_counts.items():
            if word not in vocabset:
                continue
            odds_poetry += model.loc[word, 'log_poetry']
            odds_fiction += model.loc[word, 'log_fiction']
            
        if odds_poetry > odds_fiction:
            prediction = 'poetry'
        else:
            prediction = 'fiction'
        
        odds_poe.append(odds_poetry)
        odds_fic.append(odds_fiction)

        reality = categorize(testset, i)
        if reality != 'poetry' and reality != 'fiction':
            continue
        elif prediction == reality:
            right += 1
        else:
            wrong += 1

    print("Got " + str(right) + " rows right, and " + str(wrong) + " wrong.")
    accuracy = (right / (wrong + right)) * 100
    print("Accuracy was {0:.2f}%".format(accuracy))
    
    resultset = testset.copy()
    resultset['odds_poetry'] = odds_poe
    resultset['odds_fiction'] = odds_fic
    resultset = resultset.sort_values(by = 'odds_poetry')
    
    return resultset, accuracy

newtestset = apply_model(vocab, poetry_prior, fiction_prior, model, testset)

In [107]:
# newtestset

In [68]:
def five_fold_cross_valid(df, p):
    accuracies = []
    for i in range(5):
        pf_df_test_set = pf_df[pf_df['fold'] == i]
        pf_df_training_set = pf_df[pf_df['fold'] != i] 
        vocab, poetry_prior, fiction_prior, model = train_nb_model(pf_df_training_set, p)
        pf_df_test_set, accuracy = apply_model(vocab, poetry_prior, fiction_prior, model, testset)
        accuracies.append(accuracy)
    avg_acc = print('Average accuracy is ', round(sum(accuracies)/len(accuracies),2), '%')
    return avg_acc

In [106]:
five_fold_cross_valid(pf_df, 2000)

genre
fiction    273
poetry     531
Name: text, dtype: int64
Got 201 rows right, and 7 wrong.
Accuracy was 96.63%
genre
fiction    285
poetry     545
Name: text, dtype: int64
Got 201 rows right, and 7 wrong.
Accuracy was 96.63%
genre
fiction    293
poetry     529
Name: text, dtype: int64
Got 201 rows right, and 7 wrong.
Accuracy was 96.63%
genre
fiction    295
poetry     538
Name: text, dtype: int64
Got 201 rows right, and 7 wrong.
Accuracy was 96.63%
genre
fiction    290
poetry     529
Name: text, dtype: int64
Got 199 rows right, and 9 wrong.
Accuracy was 95.67%
Average accuracy is  96.44 %


## Brief Analysis of Results
This function is far more accurate than for the trump data (I was never able to get much above 75% accuracy, for some reason). After looking at the data, it's still not totally clear to me, but I can hazard a guess based on the age of the data. Both the poetry and fiction, being all from 1920 and before, are likely to be more homogenous within each category. Thought the size of the dataset is prohibitive to inspect each work individually, it's possible that this dataset is more rigid in structure and style than our modern writing could be. Perhaps even more impactful is the length of each work in the dataset. When classifying fiction or poetry on this data, we are given a whole lot more than 140 characters per example. Though the trump data has around 5000 entries and over 38,000 words, this data likely has over 11,000 more words in it, allowing for more training data for the algorithm. With p values in the range of 2,000, I was able to push average accuracy after the five-fold cross validation test to over 96%.