In [1]:
import os
import random
import re
import sys
import string
from collections import Counter, defaultdict
from string import punctuation
from time import sleep
from tqdm.notebook import tqdm

import matplotlib
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

import enchant
import nltk
import spacy
from nltk.tag import StanfordNERTagger
from nltk.tokenize import sent_tokenize, word_tokenize
from spacy.lang.en.stop_words import STOP_WORDS

d = enchant.Dict("en_US")
spacy_nlp = spacy.load("en_core_web_sm")

%matplotlib inline

In [2]:
RAW_DATA_PATH = "/Users/spencerbraun/Documents/Stanford/CS 230 - Deep Learning/Project/CS_230_Project/data/raw/"
SAVE_DATA_PATH = "/Users/spencerbraun/Documents/Stanford/CS 230 - Deep Learning/Project/CS_230_Project/data/processed/"

## Helper Functions

In [4]:
def readCorpus(url):
    content = requests.get(url).content.decode('ascii', 'ignore')
    content_list = sent_tokenize(content.replace('\r\n', ' '))
    
    filtered_list = filterSentences(content_list)
    
    return filtered_list[100:]

In [5]:
def splitData(data, keepnum):
    random.seed(123)
    
    split1 = int(keepnum * 0.05)
    remain = split1 % 128
    split1 += remain

    random.shuffle(data)
    selectedData = data[0:keepnum]
    train = selectedData[split1:]
    test = selectedData[0:split1]
    
    return (train, test)

## ASAP Essays

In [2]:

aes_file = RAW_DATA_PATH + "asap-aes/training_set_rel3.tsv"
aes_list = [] 
with open(aes_file, encoding='utf-8', errors='ignore') as f:
    for line in f:
        aes_list.append(line.strip().split('\t'))


In [3]:
aes_df = pd.DataFrame(aes_list[1:], columns=aes_list[0])

In [4]:
num_cols = [
    'rater1_domain1', 'rater2_domain1', 'rater3_domain1', 
    'domain1_score', 'rater1_domain2', 'rater2_domain2', 'domain2_score'
]

aes_df[num_cols] = aes_df[num_cols].applymap(
    lambda x: np.nan if (x == "") or (x is None) else int(x)
)

In [6]:
aes_df["total_score"] = (aes_df["domain1_score"] + aes_df["domain2_score"].fillna(aes_df["domain1_score"]))/2

In [7]:
cols = ['essay_id', 'essay_set', 'essay', 'domain1_score', 'domain2_score', 'total_score']
aes_df[cols].head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score,domain2_score,total_score
0,1,1,"""Dear local newspaper, I think effects compute...",8,,8.0
1,2,1,"""Dear @CAPS1 @CAPS2, I believe that using comp...",9,,9.0
2,3,1,"""Dear, @CAPS1 @CAPS2 @CAPS3 More and more peop...",7,,7.0
3,4,1,"""Dear Local Newspaper, @CAPS1 I have found tha...",10,,10.0
4,5,1,"""Dear @LOCATION1, I know having computers has ...",8,,8.0
...,...,...,...,...,...,...
12973,21626,8,""" In most stories mothers and daughters are ei...",35,,35.0
12974,21628,8,""" I never understood the meaning laughter is t...",32,,32.0
12975,21629,8,"""When you laugh, is @CAPS5 out of habit, or is...",40,,40.0
12976,21630,8,""" Trippin' on fe...",40,,40.0


In [11]:
aes_essays = aes_df.query("total_score > 1")["essay"].values.tolist()

In [12]:
def cleanAES(dataList):
    newList = []
    for sent in dataList:
        sent = re.sub(r'(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9-_]+)', '', sent)
        sent = sent.replace("\\","")
        sent = sent.replace("\\'","")
        sent = sent.strip().strip("'").strip('"')

        if len(sent) < 40:
            continue
        if '^' in sent:
            continue
        
        sent = ' '.join(sent.split())
        sent = sent.lower()#.decode('utf8', 'ignore')
        newList.append(sent + '\n')
    
    return newList

In [13]:
clean_aes = cleanAES(aes_essays)

In [14]:
d_check = lambda sent: map(lambda x: d.check(x), word_tokenize(sent))
split_aes = []
for essay in tqdm(clean_aes):
    split_up = sent_tokenize(essay)
    for sent in split_up:
        words = word_tokenize(sent)
        if not all(list(d_check(sent))):
            continue
        if len(words) > 30:
            continue
        if len(words) < 4:
            continue
        split_aes.append(sent+'\n')

100%|██████████| 10826/10826 [01:46<00:00, 101.41it/s]


In [154]:
with open(SAVE_DATA_PATH + "aes.txt", 'w') as f:
    f.writelines(split_aes)

In [153]:
len(split_aes)

84373

## Sophisticated Datasets

In [302]:
firstCorpus = [
    "http://www.gutenberg.org/cache/epub/5827/pg5827.txt", #Russell, The Problems of Philosophy
    "http://www.gutenberg.org/cache/epub/15718/pg15718.txt", #Bleyer, How To Write Special Feature Articles
    "https://www.gutenberg.org/files/492/492-0.txt", #Essays in the Art of Writing, by Robert Louis
    "https://www.gutenberg.org/files/37090/37090-0.txt", #Our Knowledge of the External World as a Field for Scientific Method in Philosoph, by Bertrand Russell
    "https://www.gutenberg.org/files/42580/42580-8.txt", #Expository Writing, by Mervin James Curl
    "http://www.gutenberg.org/cache/epub/2529/pg2529.txt", #The Analysis of Mind, by Bertrand Russell
    "https://www.gutenberg.org/files/38280/38280-0.txt", #Modern Essays, by Various
    "https://www.gutenberg.org/files/205/205-0.txt", #Walden, and On The Duty Of Civil Disobedience, by Henry David Thoreau
    "https://www.gutenberg.org/files/1022/1022-0.txt", #Walking, by Henry David Thoreau
    "http://www.gutenberg.org/cache/epub/34901/pg34901.txt",
    "https://www.gutenberg.org/files/98/98-0.txt",
    "http://www.gutenberg.org/cache/epub/32168/pg32168.txt",
    "https://www.gutenberg.org/files/766/766-0.txt",
    "https://www.gutenberg.org/files/1250/1250-0.txt",
    "https://www.gutenberg.org/files/140/140-0.txt",
    "https://www.gutenberg.org/files/1400/1400-0.txt",
    "https://www.gutenberg.org/files/215/215-0.txt", # London, call of the wild.
    "http://www.gutenberg.org/cache/epub/910/pg910.txt", #London White Fang
    "https://www.gutenberg.org/files/786/786-0.txt",
    "http://www.gutenberg.org/cache/epub/815/pg815.txt",
    "http://www.gutenberg.org/cache/epub/10378/pg10378.txt",
    "http://www.gutenberg.org/cache/epub/5123/pg5123.txt",
    "http://www.gutenberg.org/cache/epub/5669/pg5669.txt"
]


In [18]:
secondCorpus = [
    "https://ota.bodleian.ox.ac.uk/repository/xmlui/bitstream/handle/20.500.12024/1906/cardinal-1906.txt?sequence=3&isAllowed=y",
    "https://ota.bodleian.ox.ac.uk/repository/xmlui/bitstream/handle/20.500.12024/1658/WoolfWaves-1658.txt?sequence=4&isAllowed=y",
    "https://ota.bodleian.ox.ac.uk/repository/xmlui/bitstream/handle/20.500.12024/0172/moderns-0172.txt?sequence=4&isAllowed=y",
    "https://ota.bodleian.ox.ac.uk/repository/xmlui/bitstream/handle/20.500.12024/3246/3246.txt?sequence=8&isAllowed=y",
    "https://ota.bodleian.ox.ac.uk/repository/xmlui/bitstream/handle/20.500.12024/2042/joywoman-2042.txt?sequence=4&isAllowed=y",
    "https://ota.bodleian.ox.ac.uk/repository/xmlui/bitstream/handle/20.500.12024/3135/3135.txt?sequence=8&isAllowed=y",
    "https://ota.bodleian.ox.ac.uk/repository/xmlui/bitstream/handle/20.500.12024/1711/wiseman-1711.txt?sequence=4&isAllowed=y",
    "https://ota.bodleian.ox.ac.uk/repository/xmlui/bitstream/handle/20.500.12024/3245/3245.txt?sequence=8&isAllowed=y",
    "http://www.gutenberg.org/cache/epub/5827/pg5827.txt", #Russell, The Problems of Philosophy
    "http://www.gutenberg.org/cache/epub/15718/pg15718.txt", #Bleyer, How To Write Special Feature Articles
    "https://www.gutenberg.org/files/492/492-0.txt", #Essays in the Art of Writing, by Robert Louis
    "https://www.gutenberg.org/files/37090/37090-0.txt", #Our Knowledge of the External World as a Field for Scientific Method in Philosoph, by Bertrand Russell
    "https://www.gutenberg.org/files/42580/42580-8.txt", #Expository Writing, by Mervin James Curl
    "http://www.gutenberg.org/cache/epub/2529/pg2529.txt", #The Analysis of Mind, by Bertrand Russell
    "https://www.gutenberg.org/files/38280/38280-0.txt",
    "https://www.gutenberg.org/files/215/215-0.txt", # London, call of the wild.
    "http://www.gutenberg.org/cache/epub/910/pg910.txt",
    "https://www.gutenberg.org/files/25110/25110-0.txt",
    "http://www.gutenberg.org/cache/epub/32168/pg32168.txt",
    "http://www.gutenberg.org/cache/epub/16712/pg16712.txt",
    "http://www.gutenberg.org/cache/epub/7514/pg7514.txt",
    "http://www.gutenberg.org/cache/epub/18477/pg18477.txt",
    "http://www.gutenberg.org/cache/epub/5669/pg5669.txt",
    "http://www.gutenberg.org/cache/epub/5123/pg5123.txt",
    "http://www.gutenberg.org/cache/epub/10378/pg10378.txt",
    "https://www.gutenberg.org/files/140/140-0.txt",
    "http://www.gutenberg.org/cache/epub/44082/pg44082.txt"
]

In [20]:
def filterSentences(sentList):
    filteredList = []
    for sent in sentList:
        sent = sent.replace("\\","")
        sent = sent.replace("\\'","")


        if len(sent) < 40:
            continue
        if '^' in sent:
            continue
        if bool(re.search(r'\d', sent)):
            continue
        if bool(re.search(r"\b[A-Z][A-Z]+\b", sent)):
            continue
        if bool(re.search(r'\"', sent)):
            continue
        if bool(re.search(r'_', sent)):
            continue

        sent = sent.strip()
        sent = sent.lower()
        sent = ' '.join(sent.split())
        filteredList.append(sent + '\n')

    return filteredList

### First Corpus

In [66]:
allGuten = []
for url in firstCorpus:
    allGuten.append(readCorpus(url))

In [68]:
sum([len(x) for x in allGuten])

69955

In [69]:
allSophs = [y for x in allGuten for y in x]

In [375]:
with open(SAVE_DATA_PATH + "allsophs.txt", 'w') as f:
    f.writelines(allSophs)

In [None]:
with open(SAVE_DATA_PATH + "allsophs.txt", 'r') as f:
    allSophs = f.read_lines

### Second Corpus

In [22]:
allSecondCorpus = []
for url in secondCorpus:
    allSecondCorpus.append(readCorpus(url))

In [23]:
sum([len(x) for x in allSecondCorpus])

51291

In [118]:
allSophs = [y for x in allSecondCorpus for y in x]

In [None]:
with open(SAVE_DATA_PATH + "soph_2.txt", 'w') as f:
    f.writelines(allSophs)

### Second Corpus without Puctuation

In [24]:
punctSoph = [y for x in allSecondCorpus for y in x]

In [66]:
allSophs = list(map(removePunc, punctSoph))

In [68]:
with open(SAVE_DATA_PATH + "KMW_essays.txt", 'r') as f:
    kmw = f.readlines()
with open(SAVE_DATA_PATH + "aes.txt", 'r') as f:
    split_aes = f.readlines()

In [69]:
allnaive = kmw + split_aes[0:50000]

In [70]:
allnaive = list(map(removePunc, allnaive))

## Hewlett ASAP + Sophisticated with Tokens

### Process ASAP tokens

In [6]:

aes_file = RAW_DATA_PATH + "asap-aes/training_set_rel3.tsv"
aes_list = [] 
with open(aes_file, encoding='utf-8', errors='ignore') as f:
    for line in f:
        aes_list.append(line.strip().split('\t'))
        
aes_df = pd.DataFrame(aes_list[1:], columns=aes_list[0])    
num_cols = [
    'rater1_domain1', 'rater2_domain1', 'rater3_domain1', 
    'domain1_score', 'rater1_domain2', 'rater2_domain2','domain2_score'
]

aes_df[num_cols] = aes_df[num_cols].applymap(
    lambda x: np.nan if (x == "") or (x is None) else int(x)
)
aes_df["total_score"] = (1/2)*(
    aes_df["domain1_score"] + 
    aes_df["domain2_score"].fillna(aes_df["domain1_score"])
)

In [7]:
aes_essays = aes_df.query("total_score > 1")["essay"].values.tolist()

In [85]:
token_regex = r'(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[0-9]+)'

In [98]:
token_list = []
new_strings = []
for string in aes_essays:
    matches = re.findall(token_regex, string)
    token_list.extend(list(set(matches)))
    
    string = string.replace('@' , '')
    
    replacement = {x: "<" + re.sub('[0-9]', '', x) + ">" for x in matches}
    for match in matches:
        string = string.replace(match, replacement[match])
    new_strings.append(string)

In [99]:
general_tokens = list(set([re.sub('[0-9]', '', x) for x in token_list]))

In [100]:
general_tokens

['MONTH',
 'PERCENT',
 'STATE',
 'CAPS',
 'NUM',
 'TIME',
 'CITY',
 'LOCATION',
 'PERSON',
 'MONEY',
 'DR',
 'ORGANIZATION',
 'DATE']

In [104]:
def cleanAES(dataList):
    newList = []
    for sent in dataList:
        
        sent = sent.replace("\\'","")
        sent = sent.strip().strip("'").strip('"')
        sent = sent.replace("'", "")
        sent = sent.replace('"', "")
        sent = re.sub('([.,!?()])', r' \1 ', sent)
        sent = re.sub('\s{2,}', ' ', sent)
        sent = sent.replace(">s", ">")
        sent = sent.strip()
        

        if len(sent) < 40:
            continue
        if '^' in sent:
            continue
        
        sent = ' '.join(sent.split())
        sent = sent.lower()
        newList.append(sent + '\n')
    
    return newList

In [105]:
clean_aes = cleanAES(new_strings)

In [108]:
d_check = lambda words: map(lambda x: d.check(x), words)
split_aes = []
for essay in tqdm.notebook.tqdm(clean_aes):
    split_up = sent_tokenize(essay)
    for sent in split_up:
        words_ex_tokens = [x for x in sent.split() if x.upper().strip('<').strip('>') not in general_tokens]
        words = word_tokenize(' '.join(words_ex_tokens))
        if not all(list(d_check(words))):
            continue
        if len(words) > 30:
            continue
        if len(words) < 4:
            continue
        split_aes.append(sent+'\n')

HBox(children=(IntProgress(value=0, max=10826), HTML(value='')))




In [110]:
len(split_aes)

59886

### Gutenberg + Oxford

In [40]:
taggedCorpus = [
    "http://www.gutenberg.org/cache/epub/5827/pg5827.txt", #Russell, The Problems of Philosophy
    "http://www.gutenberg.org/cache/epub/15718/pg15718.txt", #Bleyer, How To Write Special Feature Articles
    "https://www.gutenberg.org/files/492/492-0.txt", #Essays in the Art of Writing, by Robert Louis
    "https://www.gutenberg.org/files/37090/37090-0.txt", #Our Knowledge of the External World as a Field for Scientific Method in Philosoph, by Bertrand Russell
    "https://www.gutenberg.org/files/42580/42580-8.txt", #Expository Writing, by Mervin James Curl
    "http://www.gutenberg.org/cache/epub/2529/pg2529.txt", #The Analysis of Mind, by Bertrand Russell
    "https://www.gutenberg.org/files/38280/38280-0.txt", #Modern Essays, by Various
    "https://www.gutenberg.org/files/205/205-0.txt", #Walden, and On The Duty Of Civil Disobedience, by Henry David Thoreau
    "https://www.gutenberg.org/files/1022/1022-0.txt", #Walking, by Henry David Thoreau
    "http://www.gutenberg.org/cache/epub/34901/pg34901.txt",
    "https://www.gutenberg.org/files/98/98-0.txt",
    "http://www.gutenberg.org/cache/epub/32168/pg32168.txt",
    "https://www.gutenberg.org/files/1250/1250-0.txt",
    "https://www.gutenberg.org/files/140/140-0.txt",
    "https://www.gutenberg.org/files/215/215-0.txt", # London, call of the wild.
    "http://www.gutenberg.org/cache/epub/910/pg910.txt", #London White Fang
    "http://www.gutenberg.org/cache/epub/10378/pg10378.txt",
    "http://www.gutenberg.org/cache/epub/5123/pg5123.txt",
    "http://www.gutenberg.org/cache/epub/5669/pg5669.txt",
    "https://ota.bodleian.ox.ac.uk/repository/xmlui/bitstream/handle/20.500.12024/1906/cardinal-1906.txt?sequence=3&isAllowed=y",
    "https://ota.bodleian.ox.ac.uk/repository/xmlui/bitstream/handle/20.500.12024/1658/WoolfWaves-1658.txt?sequence=4&isAllowed=y",
    "https://ota.bodleian.ox.ac.uk/repository/xmlui/bitstream/handle/20.500.12024/0172/moderns-0172.txt?sequence=4&isAllowed=y",
]


In [41]:
def filterSentences(sentList):
    filteredList = []
    for sent in sentList:
        sent = sent.replace("\\","")
        sent = sent.replace("\\'","")


        if len(sent) < 40:
            continue
        if '^' in sent:
            continue
        if bool(re.search(r"\b[A-Z][A-Z]+\b", sent)):
            continue
        if bool(re.search(r'_', sent)):
            continue

        sent = sent.strip()
        sent = ' '.join(sent.split())
        filteredList.append(sent + '\n')

    return filteredList

In [43]:
taggedTexts = []
for url in taggedCorpus:
    taggedTexts.append(readCorpus(url))

In [44]:
sum([len(x) for x in taggedTexts])

50860

In [45]:
allTaggedSophs = [y for x in taggedTexts for y in x]

#### StanfordNERTagger

In [13]:
jar = "stanford-ner-2018-10-16/stanford-ner-3.9.2.jar"
model = "stanford-ner-2018-10-16/classifiers/english.muc.7class.distsim.crf.ser.gz"


In [22]:
st = StanfordNERTagger(model, jar) 

In [71]:
TAG_HASH = {}

def tagSentence(sent):
    tokenize = word_tokenize(sent)
    tagged = st.tag(tokenize)
    
    tokens = dict([x for x in tagged if x[1] != 'O'])
    tokens = {x: "<" + y  + ">" for x,y in tokens.items() }
    new_sent = [tokens.get(x, x) for x in tokenize]
    for word, repl in tokens.items():
        TAG_HASH[word] = repl
    
    return(' '.join(new_sent) + '\n')

In [106]:
tagged_write_path = SAVE_DATA_PATH + 'tagged_data/'
batches = [x for x in range(len(allTaggedSophs)) if x % 1000 == 0]
batches.append(len(allTaggedSophs))

In [4]:
for i in tqdm.notebook.tqdm(range(len(batches)-1)):
    
    start = batches[i]
    stop = batches[i+1]
    batch = allTaggedSophs[start:stop]

    stanfordTaggedSophs = []
    for sent in tqdm.notebook.tqdm(batch):
        stanfordTaggedSophs.append(tagSentence(sent))
        
    with open(tagged_write_path + f'batch{i}', 'w') as f:
        f.writelines(stanfordTaggedSophs)

In [113]:
TAG_HASH

{'Descartes': '<PERSON>',
 'Detroit': '<LOCATION>',
 'United': '<LOCATION>',
 'States': '<LOCATION>',
 'Lawrence': '<PERSON>',
 'Kansas': '<ORGANIZATION>',
 'Blackmar': '<PERSON>',
 'the': '<ORGANIZATION>',
 'University': '<ORGANIZATION>',
 'of': '<ORGANIZATION>',
 'Leibniz': '<PERSON>',
 'Berkeley': '<LOCATION>',
 'Bishop': '<LOCATION>',
 'China': '<LOCATION>',
 'Bismarck': '<PERSON>',
 'Germany': '<LOCATION>',
 'London': '<LOCATION>',
 'England': '<LOCATION>',
 'Europe': '<LOCATION>',
 'Earth': '<LOCATION>'}

#### spacy NER

In [47]:
#https://spacy.io/api/annotation#named-entities
spacy2stanford = {
    'NORP': 'CAPS',
    'FAC': 'LOCATION',
    'ORG': 'ORGANIZATION',
    'GPE': 'STATE',
    'LOC': 'LOCATION',
    'PRODUCT': 'CAPS',
    'EVENT': 'CAPS',
    'WORK_OF_ART': 'CAPS',
    'LAW': 'CAPS',
    'LANGUAGE': 'CAPS',
    'QUANTITY': 'NUM',
    'ORDINAL': 'NUM',
    'CARDINAL': 'NUM'   
}

In [49]:
spacy_write_path = SAVE_DATA_PATH + 'tagged_data/'

In [51]:
SPACY_TOKENS = {}
def spacyTagger(sent):
    tokenize = word_tokenize(sent)
    document = spacy_nlp(sent)
    token_map = {}
    for element in document.ents:
        label = spacy2stanford.get(str(element.label_), str(element.label_))
        SPACY_TOKENS[str(element)] = label
        token_map[str(element)] = "<" + label + ">"
    
    new_sent = [token_map.get(x, x) for x in tokenize]
    
    return(' '.join(new_sent) + '\n')

In [52]:

spacyTaggedSophs = []
for sent in tqdm.notebook.tqdm(allTaggedSophs):
    spacyTaggedSophs.append(spacyTagger(sent))

with open(spacy_write_path + f'spacyTaggedSophs.txt', 'w') as f:
    f.writelines(spacyTaggedSophs)

HBox(children=(IntProgress(value=0, max=50860), HTML(value='')))




In [53]:
with open(spacy_write_path + 'spacyTaggedSophs.txt') as f:
    tagged = f.readlines()

In [140]:
tagged = list(map(str.lower, tagged))

In [142]:
keepshort = []
for sent in tagged:
    words = len(sent.split(' '))
    if words <= 30:
        keepshort.append(sent)

In [143]:
len(keepshort)

32155

In [146]:

keepnum = 32100
sophstrain, sophtest = splitData(keepshort, keepnum)

print(len(sophtest))
print(len(sophstrain))

with open(SAVE_DATA_PATH + "soph_test_tagged.txt", 'w') as f:
    f.writelines(sophtest)
with open(SAVE_DATA_PATH + "soph_train_tagged.txt", 'w') as f:
    f.writelines(sophstrain)

1674
30426


In [121]:
allnaive = [x for x in split_aes if len(x) > 20]
naiveshort = []
for sent in allnaive:
    words = len(sent.split(' '))
    if words <= 30:
        naiveshort.append(sent)

In [122]:
len(naiveshort)

58784

In [124]:
naivetrain, naivetest = splitData(naiveshort, keepnum)

print(len(naivetest))
print(len(naivetrain))

with open(SAVE_DATA_PATH + "naive_test_tagged.txt", 'w') as f:
    f.writelines(naivetest)
with open(SAVE_DATA_PATH + "naive_train_tagged.txt", 'w') as f:
    f.writelines(naivetrain)

1674
30426
32100


### Tagged without Punctuation

In [136]:
def removePunc(sent):
    punct = string.punctuation.replace('<', '').replace('>', '')
    sent = re.sub('['+punct+']', '', sent)
    sent = ' '.join(sent.split())
    return(sent + '\n')

In [147]:
tagged_nopunct = list(map(removePunc, tagged))

In [149]:
split_aes_nopunct = list(map(removePunc, split_aes))

In [151]:
short_soph_tagged = []
for sent in tagged_nopunct:
    words = len(sent.split(' '))
    if words <= 30:
        short_soph_tagged.append(sent)


sophstrain, sophtest = splitData(short_soph_tagged, keepnum)

print(len(sophtest))
print(len(sophstrain))

with open(SAVE_DATA_PATH + "soph_test_tagged_nopunct.txt", 'w') as f:
    f.writelines(sophtest)
with open(SAVE_DATA_PATH + "soph_train_tagged_nopunct.txt", 'w') as f:
    f.writelines(sophstrain)

1674
30426


In [152]:
allnaive = [x for x in split_aes_nopunct if len(x) > 20]
naiveshort_tag_np = []
for sent in allnaive:
    words = len(sent.split(' '))
    if words <= 30:
        naiveshort_tag_np.append(sent)
        
naivetrain, naivetest = splitData(naiveshort_tag_np, keepnum)

with open(SAVE_DATA_PATH + "naive_test_tagged_nopunct.txt", 'w') as f:
    f.writelines(naivetest)
with open(SAVE_DATA_PATH + "naive_train_tagged_nopunct.txt", 'w') as f:
    f.writelines(naivetrain)

1674
30426
32100


## My Kids Way Essays

In [24]:
data = []
for item in text:
    data.append(item.get_text().split('\n'))

In [99]:
paginated_links = "https://www.mykidsway.com/essays/page/{}/"

all_essays = req = requests.get("https://www.mykidsway.com/essays/")
essay_html = BeautifulSoup(all_essays.content, 'html.parser')
divs = essay_html.find_all("div", class_="hovereffect")


In [106]:
all_links = []
for content in divs:
    all_links.append(content.find("a").get("href"))

In [108]:
for i in range(1,20):
    new_page = paginated_links.format(str(i))
    
    all_essays = req = requests.get(new_page)
    essay_html = BeautifulSoup(all_essays.content, 'html.parser')
    divs = essay_html.find_all("div", class_="hovereffect")
    
    for content in divs:
        all_links.append(content.find("a").get("href"))
    sleep(1)

In [111]:
def getText(link):
    
    req = requests.get(link)
    soup = BeautifulSoup(req.content, 'html.parser')
    text = soup.find_all("span", itemprop="description")
    
    data = []
    for item in text:
        split_text = item.get_text().split('\n')
        total_len = sum([len(x) for x in split_text])
        if total_len > 2000:
            print("skipping ", link)
            continue
            
        for sentence in split_text:
            data.append(sentence)
    
    return data


In [120]:

all_sentences = []
for link in set(all_links):
    print(link)
    data_list = getText(link)
    for sentence in data_list:
        all_sentences.append(sentence)
        
    sleep(1)

In [136]:

def cleanKMW(data_list):
    newList = []
    for sent in data_list:
        if len(sent) < 40:
            continue
        if '^' in sent:
            continue
        if bool(re.search(r'\d', sent)):
            continue
            
        sent = sent.lower()
        newList.append(sent + '\n')
    
    return newList
        

In [137]:
cleanedKMW = cleanKMW(all_sentences)

In [138]:
reordered = []
for sent in cleanedKMW:
    split_sent = sent_tokenize(sent.strip())
    for sentence in split_sent:
        if (len(word_tokenize(sentence)) > 20) or (len(word_tokenize(sentence)) < 4):
            continue
        reordered.append(sentence + '\n')

In [140]:
with open(SAVE_DATA_PATH + "KMW_essays.txt", 'w') as f:
    f.writelines(reordered)

### Data Saving Process

In [76]:
def limitLength(dataList, maxlen):
    keepshort = []
    for sent in dataList:
        words = len(word_tokenize(sent))
        if words <= maxlen:
            keepshort.append(sent)
            
    return keepshort

In [80]:
def writeOut(data, fileroot, maxlen):
    keepshort = limitLength(dataList, maxlen)
    train, test = splitData(keepshort, keepnum)

    with open(SAVE_DATA_PATH + f"test_{fileroot}.txt", 'w') as f:
        f.writelines(test)
    with open(SAVE_DATA_PATH + f"train_{fileroot}.txt", 'w') as f:
        f.writelines(train)

1676
31732


In [46]:
with open(SAVE_DATA_PATH + "KMW_essays.txt", 'r') as f:
    kmw = f.readlines()
with open(SAVE_DATA_PATH + "aes.txt", 'r') as f:
    split_aes = f.readlines()

In [47]:
allnaive = kmw + split_aes[0:50000]

In [75]:
writeOut(allnaive, "naive_3", 35)

1676
31732
33408
