In [1]:
def text2paragraphs(filename, min_size=1):
    """ A text contained in the file 'filename' will be read 
    and chopped into paragraphs.
    Paragraphs with a string length less than min_size will be ignored.
    A list of paragraph strings will be returned"""
    
    txt = open(filename).read()
    paragraphs = [para for para in txt.split("\n\n") if len(para) > min_size]
    return paragraphs


In [2]:
# the position of lables is very important
# it corresponds to a novel by that author within "files"
# the position of the author is also relevant, as it will correspond to metrics
# i.e. Samuel Butler's metrics are always returned in position 1
labels = ['Virginia Woolf', 'Samuel Butler', 'Herman Melville', 
          'David Herbert Lawrence', 'Daniel Defoe', 'James Joyce']


# names of books we have to train our machine model
files = ['night_and_day_virginia_woolf.txt', 'the_way_of_all_flash_butler.txt',
         'moby_dick_melville.txt', 'sons_and_lovers_lawrence.txt',
         'robinson_crusoe_defoe.txt', 'james_joyce_ulysses.txt']

# location of our books
path = "books/"


In [3]:
data = []
targets = []
counter = 0

# loop across all files we have downloaded
for fname in files:
    paras = text2paragraphs(path + fname, min_size=150) # return a book with paragraphs over 150 chars in a list
    data.extend(paras)
    targets += [counter] * len(paras)
    counter += 1


In [4]:
# cell is useless, because train_test_split will do the shuffling!

import random

data_targets = list(zip(data, targets))
# create random permutation on list:
data_targets = random.sample(data_targets, len(data_targets))

data, targets = list(zip(*data_targets))


In [5]:
from sklearn.model_selection import train_test_split

res = train_test_split(data, targets, 
                       train_size=0.8,
                       test_size=0.2,
                       random_state=42)
train_data, test_data, train_targets, test_targets = res 


In [6]:
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS

from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

vectorizer = CountVectorizer(stop_words=ENGLISH_STOP_WORDS)

vectors = vectorizer.fit_transform(train_data)

# creating a classifier
classifier = MultinomialNB(alpha=.01)
classifier.fit(vectors, train_targets)

vectors_test = vectorizer.transform(test_data)

predictions = classifier.predict(vectors_test)
accuracy_score = metrics.accuracy_score(test_targets, 
                                        predictions)
f1_score = metrics.f1_score(test_targets, 
                            predictions, 
                            average='macro')

print("accuracy score: ", accuracy_score)
print("F1-score: ", f1_score)


accuracy score:  0.8976592270005443
F1-score:  0.8916518690567288


In [7]:
# we want to use paragraphs from this 2nd Virginia Wolf 
paras = text2paragraphs(path + "the_voyage_out_virginia_woolf.txt", min_size=250)

# start on paragraph 100 and go to paragraph 500
first_para, last_para = 100, 500
vectors_test = vectorizer.transform(paras[first_para: last_para]) # pass a list of strings that will be used to make predictions against
#vectors_test = vectorizer.transform(["To be or not to be"])

predictions = classifier.predict(vectors_test) # make our predictions
print(predictions)
targets = [0] * (last_para - first_para)
accuracy_score = metrics.accuracy_score(targets, 
                                        predictions)
precision_score = metrics.precision_score(targets, 
                                          predictions, 
                                          average='macro')

f1_score = metrics.f1_score(targets, 
                            predictions, 
                            average='macro')

print("accuracy score: ", accuracy_score)
print("precision score: ", accuracy_score)
print("F1-score: ", f1_score)


[5 0 5 5 0 0 5 0 2 5 0 0 0 0 0 0 0 0 1 0 1 0 0 5 1 5 0 1 1 0 1 0 5 0 2 5 0
 2 2 5 0 0 0 0 0 3 2 0 0 0 0 4 2 5 2 0 0 0 0 1 0 5 5 0 0 2 0 0 0 0 5 5 5 0
 0 0 0 0 0 2 2 3 0 2 2 0 5 0 0 5 0 0 0 0 0 5 0 0 1 0 0 3 5 1 0 5 5 5 5 0 5
 0 0 0 0 0 0 1 2 0 0 0 5 0 1 2 2 2 0 5 0 3 0 1 3 0 0 5 1 5 1 0 0 0 0 0 0 0
 0 0 3 1 5 1 5 1 1 1 1 0 0 0 0 0 0 5 0 1 0 0 0 5 5 5 5 0 2 0 0 0 0 0 0 0 0
 5 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 5 5 5 0 0 0 5 5 5 2 0 5 0 5 0 0 0 0 5 0
 0 5 5 0 0 0 0 2 3 0 0 0 0 5 0 0 5 3 5 1 2 1 5 0 5 0 5 0 1 0 1 0 0 0 0 1 3
 1 1 0 5 5 5 5 2 0 0 0 0 5 3 2 2 0 1 0 0 0 0 0 0 3 0 4 0 0 0 0 1 5 0 0 0 1
 1 0 0 5 5 0 5 0 0 0 3 0 5 3 0 0 0 5 3 1 3 0 0 3 0 1 0 0 0 0 3 0 5 5 0 0 0
 3 3 5 0 3 0 0 0 1 0 1 0 0 3 3 2 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 2 2 2 0 0 0
 0 1 0 0 0 5 5 0 0 0 0 0 0 0 3 0 0 0 0 0 0 5 0 0 0 0 0 0 0 0]
accuracy score:  0.5925
precision score:  0.5925
F1-score:  0.12401883830455258


In [8]:
# perform a probability test
predictions = classifier.predict_proba(vectors_test)
print(predictions)


[[6.12524202e-007 5.91966700e-006 3.80973259e-007 6.12828199e-005
  2.19812375e-015 9.99931804e-001]
 [9.97766658e-001 7.34052784e-004 3.27275230e-004 5.56794110e-012
  1.60611264e-015 1.17201353e-003]
 [1.33294239e-001 5.89881916e-009 1.58956672e-010 1.91859596e-008
  3.28153683e-014 8.66705736e-001]
 ...
 [9.99999977e-001 2.34238644e-008 7.88163862e-036 3.94541680e-022
  2.89831045e-040 5.27490514e-034]
 [9.99999992e-001 8.85364872e-010 9.08660065e-023 1.21163032e-041
  6.59322815e-061 7.12890787e-009]
 [1.00000000e+000 9.74176365e-064 1.75864272e-061 1.85308034e-086
  1.19368618e-111 5.94285617e-057]]


In [9]:
for i in range(0, 10):
    print(predictions[i], paras[i+first_para])


[6.12524202e-07 5.91966700e-06 3.80973259e-07 6.12828199e-05
 2.19812375e-15 9.99931804e-01] "That's the painful thing about pets," said Mr. Dalloway; "they die. The
first sorrow I can remember was for the death of a dormouse. I regret to
say that I sat upon it. Still, that didn't make one any the less sorry.
Here lies the duck that Samuel Johnson sat on, eh? I was big for my
age."
[9.97766658e-01 7.34052784e-04 3.27275230e-04 5.56794110e-12
 1.60611264e-15 1.17201353e-03] "Please tell me--everything." That was what she wanted to say. He had
drawn apart one little chink and showed astonishing treasures. It seemed
to her incredible that a man like that should be willing to talk to her.
He had sisters and pets, and once lived in the country. She stirred her
tea round and round; the bubbles which swam and clustered in the cup
seemed to her like the union of their minds.
[1.33294239e-01 5.89881916e-09 1.58956672e-10 1.91859596e-08
 3.28153683e-14 8.66705736e-01] The talk meanwhile raced pa