In [1]:
import json

f = open('data/vectorized_lines.json', 'r', encoding='utf-8')
vectorized_lines = json.load(f)
f.close()

print(len(vectorized_lines))
print(vectorized_lines[0])

58488
{'line_text': ['WAS', 'CAR', 'then', 'Crashawe', ';', 'or', 'WAS', 'Crashawe', 'CAR', ','], 'line_pos': ['vvd', 'nn1', 'av', 'nn1', '', 'cc', 'vvd', 'nn1', 'nn1', ''], 'line_arpabet': ['w/aa/z', 'k/aa/r', 'dh/eh/n', 'k/r/ae/sh/ey', '', 'ao/r', 'w/aa/z', 'k/r/ae/sh/ey', 'k/aa/r', ''], 'line_ipa': ['w/ɑ/z', 'k/ɑ/ɹ', 'ð/ɛ/n', 'k/ɹ/æ/ʃ/eɪ', '', 'ɔ/ɹ', 'w/ɑ/z', 'k/ɹ/æ/ʃ/eɪ', 'k/ɑ/ɹ', ''], 'line_hayes_features': ['011011001000100000011000001010/010011001000000001001000100010/100100101100000000000001000010', '010000100000100000000000000000/010011001000000001001000100010/100010001010000000001000000010', '100100101110000000000000000010/010010001001000000001000100010/100100100000000000101000000010', '010000100000100000000000000000/100010001010000000001000000010/010010001001000001001000100010/100000101110000000000001000000/010010001001010000001000101010', '', '011011001000000000011000100010/100010001010000000001000000010', '011011001000100000011000001010/010011001000000001001000100010/10010

In [2]:
lines_by_author = {}

for l in vectorized_lines:
    if l['author'] not in lines_by_author:
        lines_by_author[l['author']] = []
    lines_by_author[l['author']].append(l)
    
for author, lines in lines_by_author.items():
    print(author, len(lines))
    
author_names = sorted(list(lines_by_author.keys()))

print()
print(author_names)

Crashaw 8545
Donne 13346
Marvell 8418
Vaughan 8893
Rochester 3875
Herbert 5420
Herrick 9991

['Crashaw', 'Donne', 'Herbert', 'Herrick', 'Marvell', 'Rochester', 'Vaughan']


In [15]:
import numpy as np
import string
from nltk.corpus import stopwords

sw = set(stopwords.words('english'))

def lines_to_phoneme_features(lines, relative_frequency=False, consonants_only=False):
    
    all_lines_features = []
    
    for l in lines:
        
        l_features = [0 for a in range(0, 30)]
        n_features = 0
        
        hayes = '/'.join(l['line_hayes_features']).split('/')
        vc = '/'.join(l['line_v_c']).split('/')
        
        if len(hayes) == len(vc):
        
            for ha, h in enumerate(hayes):

                i_features = [int(c) for c in h]

                for a, i in enumerate(i_features):

                    if consonants_only == False or vc[ha] == 'C':

                        l_features[a] += i
                        n_features += i

            if relative_frequency == True:
                for a in range(0, len(l_features)):
                    l_features[a] = float(l_features[a]) / float(n_features)

            all_lines_features.append(l_features)

    return all_lines_features

# -----------------------------------------------------------------------------------

features_by_author = {}

for author, lines in lines_by_author.items():
    
    phoneme_features_by_author[author] = []
    
    for line in lines:
        
        try:
            features = []

            phonetic_data = lines_to_phoneme_features([line], 
                                                       relative_frequency=False,
                                                       consonants_only=False)
            features.append(phonetic_data[0])
                        
            features.append([w.strip().lower() for w in line['line_text'] 
                              if w.strip() > '' and w.strip().lower() not in string.punctuation and 
                                 w.strip().lower() not in sw])

            phoneme_features_by_author[author].append(features)
            
        except IndexError:
            pass
                        
print(len(phoneme_features_by_author))
print()
print(phoneme_features_by_author['Donne'][0])

7

[[17, 14, 7, 14, 15, 5, 19, 0, 24, 9, 8, 6, 8, 1, 1, 0, 0, 4, 4, 4, 19, 0, 0, 3, 10, 0, 4, 0, 23, 0], ['youth', 'strength', 'mirth', 'wit', 'time']]


In [19]:
import random
import numpy as np

from sklearn.naive_bayes import *
from sklearn import tree, svm
from sklearn.linear_model import *
from sklearn.ensemble import *
from sklearn.preprocessing import *
    
    
from gensim import corpora
from gensim.matutils import *

# -----------------------------------------------------------------------------------
        
for a in range(0, len(author_names)):
    
    for b in range(0, len(author_names)):
        
        if a == b:
            continue
            
        #if b != 0:
        #    continue
    
        print()
        print('----------------------------------------------')
        print()
        print(author_names[a], 'vs.', author_names[b])
        print()
        
        a_lines = phoneme_features_by_author[author_names[a]]
        b_lines = phoneme_features_by_author[author_names[b]]
        
        random.shuffle(a_lines)
        random.shuffle(b_lines)
        
        min_lines = len(a_lines)
        if len(b_lines) < min_lines:
            min_lines = len(b_lines)
        
        all_training_data = a_lines[:3000] + b_lines[:3000]
        all_testing_data = a_lines[3000:min_lines] + b_lines[3000: min_lines]
        
        training_labels = [author_names[a] for l in a_lines[:3000]] + \
                            [author_names[b] for l in b_lines[:3000]]
        testing_labels = [author_names[a] for l in a_lines[3000:min_lines]] + \
                            [author_names[b] for l in b_lines[3000:min_lines]]
            
        training_phone_data = [l[0] for l in all_training_data]
        testing_phone_data = [l[0] for l in all_testing_data]
        
        # ----------------------------------------------------------------------------

        clf = RandomForestClassifier().fit(training_phone_data, training_labels)
        
        score = clf.score(testing_phone_data, testing_labels)
        
        print('\t', 'phonemes RandomForestClassifier', score)
        
        # ----------------------------------------------------------------------------
        
        training_text_data = [l[1] for l in all_training_data]
        testing_text_data = [l[1] for l in all_testing_data]
        
        dictionary = corpora.Dictionary(training_text_data + testing_text_data)
        
        training_corpus = [dictionary.doc2bow(text) for text in training_text_data]
        testing_corpus = [dictionary.doc2bow(text) for text in testing_text_data]
        
        training_matrix = corpus2dense(training_corpus, len(dictionary))
        training_matrix = training_matrix.T
        
        testing_matrix = corpus2dense(testing_corpus, len(dictionary))
        testing_matrix = testing_matrix.T
        
        # ----------------------------------------------------------------------------

        clf = RandomForestClassifier().fit(training_matrix, training_labels)
        
        score = clf.score(testing_matrix, testing_labels)
        
        print('\t', 'text RandomForestClassifier', score)
        
        # ----------------------------------------------------------------------------
        
        
print()
print('ok!')


----------------------------------------------

Crashaw vs. Donne

	 phonemes RandomForestClassifier 0.7658250676284941
	 text RandomForestClassifier 0.7185752930568079

----------------------------------------------

Crashaw vs. Herbert

	 phonemes RandomForestClassifier 0.6495867768595042
	 text RandomForestClassifier 0.7256198347107438

----------------------------------------------

Crashaw vs. Herrick

	 phonemes RandomForestClassifier 0.630297565374211
	 text RandomForestClassifier 0.7192966636609558

----------------------------------------------

Crashaw vs. Marvell

	 phonemes RandomForestClassifier 0.6665741768405475
	 text RandomForestClassifier 0.6778579356270811

----------------------------------------------

Crashaw vs. Rochester

	 phonemes RandomForestClassifier 0.681350114416476
	 text RandomForestClassifier 0.717391304347826

----------------------------------------------

Crashaw vs. Vaughan

	 phonemes RandomForestClassifier 0.6412984670874662
	 text RandomForestC