In [3]:
from nltk import tokenize
import numpy as np
import random
import pandas as pd

In [4]:
def split_text(filepath, min_char):
    """Read a UTF-8 text file and split it into a list of sentences.

    New lines are replaced by spaces, sentence-ending punctuation that sits
    just inside a closing quote is moved outside it, and double hyphens,
    spaced ellipses and underscores are replaced or removed before the text
    is handed to ``tokenize.sent_tokenize``.

    Parameters
    ----------
    filepath : str
        Path of the text file to read (opened with ``encoding="utf8"``).
    min_char : int
        Minimum length, in characters, a sentence must have to be kept.

    Returns
    -------
    list of str
        The sentences whose length is at least ``min_char``, in file order.
    """
    # The context manager closes the file even if reading or decoding raises.
    with open(filepath, "r", encoding="utf8") as file:
        text = file.read().replace('\n', ' ')

    # Move terminal punctuation after the closing quote
    # (presumably so the tokenizer splits after the quote -- confirm).
    text = text.replace('.”', '”.').replace('."', '".').replace('?”', '”?').replace('!”', '”!')
    # '--' becomes a space; spaced ellipses and underscores are dropped.
    text = text.replace('--', ' ').replace('. . .', '').replace('_', '')

    # Split text into a list of sentences
    sentences = tokenize.sent_tokenize(text)

    # Keep only sentences that are at least min_char characters long
    return [sent for sent in sentences if len(sent) >= min_char]

In [5]:
# Set parameter values
min_char = 5


def _load_author(*paths):
    """Return the sentences of every file in ``paths``, concatenated in order."""
    sentences = []
    for path in paths:
        sentences += split_text(path, min_char=min_char)
    return sentences


# Create lists (one list of sentences per author)
dickens = _load_author('bis/A_Tale_of_Two_Cities.txt',
                       'bis/Great_Expectations.txt')
montgomery = _load_author('bis/Anne_of_Avonlea.txt',
                          'bis/Anne_of_Green_Gables.txt')
stoker = _load_author('bis/Dracula.txt')
austen = _load_author('bis/Emma.txt',
                      'bis/Pride_and_Prejudice.txt',
                      'bis/Sense_and_Sensibility.txt')
grimm = _load_author('bis/Grimms_Fairy Tales.txt')
swift = _load_author('bis/Gullivers_Travels.txt')
# NOTE(review): "Hard Times" is usually attributed to Charles Dickens --
# confirm that keeping it as a separate 'thatcher' class is intended.
thatcher = _load_author('bis/Hard_Times.txt')
twain = _load_author('bis/Huckleberry_Finn.txt')
malory = _load_author('bis/Le_Morte_DArthur.txt')
alcott = _load_author('bis/Little_Women.txt')
berens = _load_author('bis/Myths_and_Legends_of_Ancient_Greece_and_Rome.txt')
doyle = _load_author('bis/The_Adventures_of_Sherlock_Holmes.txt')
augustine = _load_author('bis/The_Confessions_of_Saint_Augustine.txt')
thoreau = _load_author('bis/Walden.txt')
sinclair = _load_author('bis/The_Jungle.txt')
bronte = _load_author('bis/Wuthering_Heights.txt')



In [6]:
# Map each author label to its list of sentences, then print the list sizes
text_dict = {
    'Dickens': dickens,
    'Montgomery': montgomery,
    'Stoker': stoker,
    'Austen': austen,
    'Grimm': grimm,
    'Swift': swift,
    'Thatcher': thatcher,
    'Twain': twain,
    'Malory': malory,
    'Alcott': alcott,
    'Berens': berens,
    'Doyle': doyle,
    'Augustine': augustine,
    'Thoreau': thoreau,
    'Sinclair': sinclair,
    'Bronte': bronte,
}

for author, sentences in text_dict.items():
    print(author, ':', len(sentences))

Dickens : 17458
Montgomery : 12274
Stoker : 8641
Austen : 19200
Grimm : 2796
Swift : 2840
Thatcher : 4364
Twain : 5839
Malory : 5997
Alcott : 9447
Berens : 4432
Doyle : 6475
Augustine : 3670
Thoreau : 4055
Sinclair : 6636
Bronte : 5262


In [7]:
# Seed NumPy's global RNG so the sampling below is repeatable
np.random.seed(1)

# Number of sentences drawn per author
max_len = 3500

# Per-author sentence lists, in the order the labels cell expects.
# NOTE(review): `thoreau` is loaded and listed in `text_dict` but is not
# sampled here (the labels cell omits 'Thoreau' as well) -- confirm intended.
names = [
    dickens, montgomery, stoker, austen, grimm,
    swift, thatcher, twain, malory, alcott,
    berens, doyle, augustine, sinclair, bronte,
]

# Draw max_len sentences from each author, with replacement, and append
# them author by author so the result stays grouped in `names` order.
combined = []
for author_sentences in names:
    sample = np.random.choice(author_sentences, max_len, replace=True)
    combined.extend(sample)

print('The length of the combined list is:', len(combined))

The length of the combined list is: 52500


In [8]:
# Author labels in the same order as the sampled sentence groups, each
# repeated max_len times.
# NOTE(review): 'Thoreau' is absent here, matching the sampling cell --
# confirm that the omission is intended.
label_order = [
    'Dickens', 'Montgomery', 'Stoker', 'Austen', 'Grimm',
    'Swift', 'Thatcher', 'Twain', 'Malory', 'Alcott',
    'Berens', 'Doyle', 'Augustine', 'Sinclair', 'Bronte',
]
labels = [author for author in label_order for _ in range(max_len)]

print('The length of the labels list is:', len(labels))

The length of the labels list is: 52500


In [9]:
# Seed the stdlib RNG so the shuffle below is repeatable
random.seed(3)

# Pair every sentence with its label so both move together when shuffled
pairs = list(zip(combined, labels))
random.shuffle(pairs)

# Unzip back into two parallel sequences (both become tuples)
combined, labels = zip(*pairs)

In [10]:
# Build the output frame: one row per sampled sentence with its author label
out_data = pd.DataFrame({'text': combined, 'author': labels})

print(out_data.head())

                                                text  author
0  She certainly had not been in the wrong, and h...  Austen
1  Why did you speak to him, Edgar?’  ‘I didn’t,’...  Bronte
2  "Don't praise me, Meg, for I could box his ear...  Alcott
3  And if I may win thee, yet shall thy lady be m...  Malory
4  Mina and I fear to be idle, so we have been ov...  Stoker


In [11]:
# Write the dataset to disk as CSV, leaving out the row index
out_data.to_csv(path_or_buf='author_data.csv', index=False)