In [1]:
import json

f = open('data/vectorized_lines.json', 'r', encoding='utf-8')
vectorized_lines = json.load(f)
f.close()

print(len(vectorized_lines))
print(vectorized_lines[0])

58488
{'line_text': ['WAS', 'CAR', 'then', 'Crashawe', ';', 'or', 'WAS', 'Crashawe', 'CAR', ','], 'line_pos': ['vvd', 'nn1', 'av', 'nn1', '', 'cc', 'vvd', 'nn1', 'nn1', ''], 'line_arpabet': ['w/aa/z', 'k/aa/r', 'dh/eh/n', 'k/r/ae/sh/ey', '', 'ao/r', 'w/aa/z', 'k/r/ae/sh/ey', 'k/aa/r', ''], 'line_ipa': ['w/ɑ/z', 'k/ɑ/ɹ', 'ð/ɛ/n', 'k/ɹ/æ/ʃ/eɪ', '', 'ɔ/ɹ', 'w/ɑ/z', 'k/ɹ/æ/ʃ/eɪ', 'k/ɑ/ɹ', ''], 'line_hayes_features': ['011011001000100000011000001010/010011001000000001001000100010/100100101100000000000001000010', '010000100000100000000000000000/010011001000000001001000100010/100010001010000000001000000010', '100100101110000000000000000010/010010001001000000001000100010/100100100000000000101000000010', '010000100000100000000000000000/100010001010000000001000000010/010010001001000001001000100010/100000101110000000000001000000/010010001001010000001000101010', '', '011011001000000000011000100010/100010001010000000001000000010', '011011001000100000011000001010/010011001000000001001000100010/10010

In [2]:
lines_by_author = {}

for l in vectorized_lines:
    if l['author'] not in lines_by_author:
        lines_by_author[l['author']] = []
    lines_by_author[l['author']].append(l)
    
for author, lines in lines_by_author.items():
    print(author, len(lines))
    
author_names = sorted(list(lines_by_author.keys()))

print()
print(author_names)

Crashaw 8545
Donne 13346
Marvell 8418
Vaughan 8893
Rochester 3875
Herbert 5420
Herrick 9991

['Crashaw', 'Donne', 'Herbert', 'Herrick', 'Marvell', 'Rochester', 'Vaughan']


In [5]:
import numpy as np

def lines_to_features(lines, relative_frequency=False, consonants_only=False):
    
    all_lines_features = []
    
    for l in lines:
        
        l_features = [0 for a in range(0, 30)]
        n_features = 0
        
        hayes = '/'.join(l['line_hayes_features']).split('/')
        vc = '/'.join(l['line_v_c']).split('/')
        
        if len(hayes) == len(vc):
        
            for ha, h in enumerate(hayes):

                i_features = [int(c) for c in h]

                for a, i in enumerate(i_features):

                    if consonants_only == False or vc[ha] == 'C':

                        l_features[a] += i
                        n_features += i

            if relative_frequency == True:
                for a in range(0, len(l_features)):
                    l_features[a] = float(l_features[a]) / float(n_features)

            all_lines_features.append(l_features)

    return all_lines_features

# -----------------------------------------------------------------------------------

features_by_author = {}

for author, lines in lines_by_author.items():
    features_by_author[author] = lines_to_features(lines, 
                                                   relative_frequency=True,
                                                   consonants_only=True)
    
print(len(features_by_author))

print()
print(features_by_author['Donne'][0])


7

[0.11678832116788321, 0.0364963503649635, 0.0364963503649635, 0.10218978102189781, 0.0364963503649635, 0.014598540145985401, 0.1386861313868613, 0.0, 0.10218978102189781, 0.06569343065693431, 0.051094890510948905, 0.0072992700729927005, 0.0364963503649635, 0.0, 0.0072992700729927005, 0.0, 0.0, 0.0, 0.029197080291970802, 0.014598540145985401, 0.06569343065693431, 0.0, 0.0, 0.021897810218978103, 0.0, 0.0, 0.021897810218978103, 0.0, 0.0948905109489051, 0.0]


In [8]:
import random
import numpy as np

from sklearn.naive_bayes import *
from sklearn import tree, svm
from sklearn.linear_model import *
from sklearn.ensemble import *
from sklearn.preprocessing import *
    
# -----------------------------------------------------------------------------------
        
for a in range(0, len(author_names)):
    
    print()
    print('----------------------------------------------')
    print()
    
    for b in range(0, len(author_names)):
        
        if a == b:
            continue
            
        if b != 0:
            continue
        
        print()
        print(author_names[a], 'vs.', author_names[b])
        print()
        
        a_lines = features_by_author[author_names[a]]
        b_lines = features_by_author[author_names[b]]
        
        random.shuffle(a_lines)
        random.shuffle(b_lines)
        
        min_lines = len(a_lines)
        if len(b_lines) < min_lines:
            min_lines = len(b_lines)
        
        training_data = a_lines[:3000] + b_lines[:3000]
        testing_data = a_lines[3000:min_lines] + b_lines[3000:min_lines]
        
        training_labels = [author_names[a] for l in a_lines[:3000]] + \
                            [author_names[b] for l in b_lines[:3000]]
        testing_labels = [author_names[a] for l in a_lines[3000:min_lines]] + \
                            [author_names[b] for l in b_lines[3000:min_lines]]
        
        #print('\t', 'a_lines', len(a_lines), 'b_lines', len(b_lines))
        #print()
        #print('\t', 'min_lines', min_lines)
        #print('\t', 'training_data', len(training_data))
        #print('\t', 'training_labels', len(training_labels))
        #print('\t', 'testing_data', len(testing_data))
        #print('\t', 'testing_labels', len(testing_labels))
        #print()
        
        # ----------------------------------------------------------------------------

        clf = BernoulliNB(fit_prior=False).fit(training_data, training_labels)

        score = clf.score(testing_data, testing_labels)

        print('\t', 'BernoulliNB False', score)
        
        # ----------------------------------------------------------------------------

        clf = MultinomialNB(fit_prior=False).fit(training_data, training_labels)

        score = clf.score(testing_data, testing_labels)

        print('\t', 'MultinomialNB False', score)
        
        # ----------------------------------------------------------------------------

        clf = tree.DecisionTreeClassifier()
        clf = clf.fit(training_data, training_labels)

        score = clf.score(testing_data, testing_labels)

        print('\t', 'DecisionTreeClassifier', score)
        
        # ----------------------------------------------------------------------------

        clf = svm.SVC(kernel='linear')
        clf = clf.fit(training_data, training_labels)

        score = clf.score(testing_data, testing_labels)
        
        print('\t', 'SVC', score)
        
        # ----------------------------------------------------------------------------

        clf = SGDClassifier()
        clf = clf.fit(training_data, training_labels)

        score = clf.score(testing_data, testing_labels)
        
        print('\t', 'SGDClassifier', score)
        
        # ----------------------------------------------------------------------------

        clf = RandomForestClassifier().fit(training_data, training_labels)
        
        score = clf.score(testing_data, testing_labels)
        
        print('\t', 'RandomForestClassifier', score)
        
        # ----------------------------------------------------------------------------

        estimators = [
            ('rfc', RandomForestClassifier()),
            ('dtc', tree.DecisionTreeClassifier()),
            ('svc', svm.SVC(kernel='linear', probability=True))]

        clf = VotingClassifier(estimators=estimators, voting='soft')
        clf.fit(training_data, training_labels)
        
        score = clf.score(testing_data, testing_labels)
        
        print('\t', 'VotingClassifier (1)', score)
        
        # ----------------------------------------------------------------------------

        estimators = [
            ('rfc', RandomForestClassifier()),
            ('svc', svm.SVC(kernel='linear', probability=True))]

        clf = VotingClassifier(estimators=estimators, voting='soft')
        clf.fit(training_data, training_labels)
        
        score = clf.score(testing_data, testing_labels)
        
        print('\t', 'VotingClassifier (2)', score)
        
        # ----------------------------------------------------------------------------
        
print()
print('ok!')


----------------------------------------------


----------------------------------------------


Donne vs. Crashaw

	 BernoulliNB False 0.5651036970243463
	 MultinomialNB False 0.5559963931469792
	 DecisionTreeClassifier 0.6010820559062218
	 SVC 0.5674481514878269
	 SGDClassifier 0.5407574391343553
	 RandomForestClassifier 0.6593327321911632
	 VotingClassifier (1) 0.6051397655545536
	 VotingClassifier (2) 0.6533814247069432

----------------------------------------------


Herbert vs. Crashaw

	 BernoulliNB False 0.5450413223140496
	 MultinomialNB False 0.5533057851239669
	 DecisionTreeClassifier 0.568801652892562
	 SVC 0.5628099173553719
	 SGDClassifier 0.5574380165289257
	 RandomForestClassifier 0.618595041322314
	 VotingClassifier (1) 0.5712809917355371
	 VotingClassifier (2) 0.6262396694214876

----------------------------------------------


Herrick vs. Crashaw

	 BernoulliNB False 0.5517583408476104
	 MultinomialNB False 0.5588818755635708
	 DecisionTreeClassifier 0.58097385031