In [1]:
from nltk import tokenize
import numpy as np
import random
import pandas as pd

In [2]:
def split_text(filepath, min_char):
    """Read a text file and return its sentences as a list of strings.

    The whole file is read as UTF-8, newlines are turned into spaces, and a
    few punctuation patterns are rewritten before the text is handed to
    ``tokenize.sent_tokenize``.

    Parameters
    ----------
    filepath : str
        Path of the text file to read.
    min_char : int
        Minimum sentence length in characters; shorter sentences are dropped.

    Returns
    -------
    list of str
        Sentences whose length is at least ``min_char``, in document order.
    """
    # The context manager closes the handle even if read() raises
    # (e.g. on a decoding error), which the manual open/close did not.
    with open(filepath, "r", encoding="utf8") as file:
        text = file.read().replace('\n', ' ')

    # Move sentence-ending punctuation outside of closing quotation marks.
    text = text.replace('.”', '”.').replace('."', '".').replace('?”', '”?').replace('!”', '”!')
    # Replace double hyphens with a space; delete spaced ellipses and underscores.
    text = text.replace('--', ' ').replace('. . .', '').replace('_', '')

    # Split text into a list of sentences
    sentences = tokenize.sent_tokenize(text)

    # Remove sentences that are less than min_char long
    return [sent for sent in sentences if len(sent) >= min_char]

In [3]:
# Minimum sentence length (in characters) kept by split_text
min_char = 5

# Load one sentence list per book; the order of the targets on the left
# matches the order of the paths on the right.
# NOTE(review): 'Hard_Times.txt' is bound to `thatcher`, but Hard Times is
# usually attributed to Charles Dickens -- confirm this label is intended.
(dickens, montgomery, stoker, austen, swift,
 thatcher, twain, alcott, doyle, sinclair) = (
    split_text(book_path, min_char = min_char)
    for book_path in (
        'bis/A_Tale_of_Two_Cities.txt',
        'bis/Anne_of_Avonlea.txt',
        'bis/Dracula.txt',
        'bis/Emma.txt',
        'bis/Gullivers_Travels.txt',
        'bis/Hard_Times.txt',
        'bis/Huckleberry_Finn.txt',
        'bis/Little_Women.txt',
        'bis/The_Adventures_of_Sherlock_Holmes.txt',
        'bis/The_Jungle.txt',
    )
)

In [4]:
# Map each author label to its sentence list, then report the list sizes
text_dict = {
    'Dickens': dickens,
    'Montgomery': montgomery,
    'Stoker': stoker,
    'Austen': austen,
    'Swift': swift,
    'Thatcher': thatcher,
    'Twain': twain,
    'Alcott': alcott,
    'Doyle': doyle,
    'Sinclair': sinclair,
}

for author, author_sentences in text_dict.items():
    print(author, ':', len(author_sentences))

Dickens : 7718
Montgomery : 5497
Stoker : 8641
Austen : 8473
Swift : 2840
Thatcher : 4364
Twain : 5839
Alcott : 9447
Doyle : 6475
Sinclair : 6636


In [5]:
# Seed NumPy's global generator so the sampling below is repeatable
np.random.seed(1)

# Number of sentences drawn per author
max_len = 4000

# Draw max_len sentences from every author, in this fixed order.
# Sampling is done with replacement, so a sentence can appear more than once
# (unavoidable for lists shorter than max_len).
names = [dickens, montgomery, stoker, austen, swift, thatcher, twain, alcott, doyle, sinclair]
combined = []

for author_sentences in names:
    sampled = np.random.choice(author_sentences, size = max_len, replace = True)
    combined.extend(sampled)

print('The length of the combined list is:', len(combined))

The length of the combined list is: 40000


In [6]:
# One label per sampled sentence: max_len copies of each author name, laid
# out in the same author order used when the sentences were sampled.
authors_in_order = ['Dickens', 'Montgomery', 'Stoker', 'Austen', 'Swift',
                    'Thatcher', 'Twain', 'Alcott', 'Doyle', 'Sinclair']
labels = [author for author in authors_in_order for _ in range(max_len)]

print('The length of the labels list is:', len(labels))

The length of the labels list is: 40000


In [7]:
# Seed the stdlib generator so the shuffle order is repeatable
random.seed(3)

# Shuffle sentences and labels together so each pair stays aligned,
# then split the pairs back into two parallel tuples.
paired = [(sentence, author) for sentence, author in zip(combined, labels)]
random.shuffle(paired)
combined, labels = zip(*paired)

In [8]:
# Assemble the two-column frame (sentence text, author label) in one step
out_data = pd.DataFrame({'text': combined, 'author': labels})

print(out_data.head())

                                                text      author
0  Some of the "New Women" writers will some day ...      Stoker
1   I mean the fact that you told a falsehood today.  Montgomery
2                                says the bald-head.       Twain
3  "I shall marry whom I please, Aunt March, and ...      Alcott
4   “Perhaps it was nothing very dreadful after all.  Montgomery


In [9]:
# Write the dataset to disk as CSV, without the row index column
out_data.to_csv(path_or_buf='author_data.csv', index=False)