In [2]:
import numpy as np
import pandas as pd
import sys, os
import re
import json
from urllib.request import urlopen
import spacy

Read data (posted file on Github)

In [3]:
url = 'https://raw.githubusercontent.com/ram-senth/datasci-w266-2023-spring-team-story-bot/main/cleaned_merged_fairy_tales_without_eos.txt'
response = urlopen(url)
f_lines = [l.decode() for l in response.readlines()]
f_lines[:10], f_lines[-10:]

(['The Happy Prince.\n',
  'HIGH above the city, on a tall column, stood the statue of the Happy Prince.  He was gilded all over with thin leaves of fine gold, for eyes he had two bright sapphires, and a large red ruby glowed on his sword-hilt.\n',
  'He was very much admired indeed.  “He is as beautiful as a weathercock,” remarked one of the Town Councillors who wished to gain a reputation for having artistic tastes; “only not quite so useful,” he added, fearing lest people should think him unpractical, which he really was not.\n',
  '“Why can’t you be like the Happy Prince?” asked a sensible mother of her little boy who was crying for the moon.  “The Happy Prince never dreams of crying for anything.”\n',
  '“I am glad there is some one in the world who is quite happy,” muttered a disappointed man as he gazed at the wonderful statue.\n',
  '“He looks just like an angel,” said the Charity Children as they came out of the cathedral in their bright scarlet cloaks and their clean white pi

Use Ghiwa's code to parse stories and create a list with each story

In [4]:
# different strategy for parsing out titles
# extract rows that have between 1 and 6 words that are preceeded by an empty line as dictionary key
# extract number of lines in each story as value
def clean_line(l):
    # remove notes which starts with * or (\d) or between []
    l = re.sub(r'^\s*?\*.*|^\(\s*?\d+\s*?\).*', '', l) # first remove footnotes (lines that start with * or (\d))
    l = re.sub(r'\*|\(\s*?\d+\s*?\)|\[.*\]', ' ', l) # now remove special characters referencing a footnote on text
    # if only a space, then set it empty
    l = re.sub(r'^\s+$', '', l)
    # clean multiple spaces
    return re.sub(r'\s+', ' ', l).strip()

# control variables
title_dict = dict()
flag = 0
prev_title = ''
prev_num_words = 0
prev_index = 0
story_lines = []
stories = []
titles = []
i = 0

for row in f_lines:
    i += 1
    num_words = 0
    words = row.split()
    num_words = len(words)
    row_cln = clean_line(row.strip())
    if 0 < num_words < 7:
        if prev_num_words > 0 or '   ' in row:
            story_lines.append(row_cln)
        elif prev_index > 0:
            stories.append(' '.join(l.strip() for l in story_lines).strip())
            story_lines = []
            # put the tile on another list to check where each story falls in
            titles.append(prev_title)
            # Ghiwa's code
            title_dict[prev_title] = i - prev_index
            prev_index = i
            prev_title = row[:-2]
        else:
            # Ghiwa's code
            #title_dict[row[:-2]] = i
            prev_index = i
            prev_title = row[:-2]
    else:
        story_lines.append(row_cln)
    prev_num_words = num_words

# for some reason it doesn't read the last story
print(len(titles), len(stories))
titles[:10], stories[:10]

538 538


(['The Happy Prince',
  'The Nightingale and the Rose',
  'The Selfish Giant',
  'The Devoted Friend',
  'The Remarkable Rocket',
  "THE EMPEROR'S NEW CLOTHES",
  'THE SWINEHERD',
  'THE REAL PRINCESS',
  'THE SHOES OF FORTUNE',
  'I. A Beginning'],
 ['HIGH above the city, on a tall column, stood the statue of the Happy Prince. He was gilded all over with thin leaves of fine gold, for eyes he had two bright sapphires, and a large red ruby glowed on his sword-hilt. He was very much admired indeed. “He is as beautiful as a weathercock,” remarked one of the Town Councillors who wished to gain a reputation for having artistic tastes; “only not quite so useful,” he added, fearing lest people should think him unpractical, which he really was not. “Why can’t you be like the Happy Prince?” asked a sensible mother of her little boy who was crying for the moon. “The Happy Prince never dreams of crying for anything.” “I am glad there is some one in the world who is quite happy,” muttered a disapp

Load spaCy to divide by sentences

In [5]:
nlp = spacy.load('en_core_web_sm')
# spacy can only handle stories with less than 1M chars, but there are some with more than that
for i,story in enumerate(stories):
    if len(story)>1_000_000:
        print(i, titles[i], len(story))
# allow stories with more than 1M chars
nlp.max_length = 1_500_000

27 1 The Frog-King, or Iron Henry 1434962
283 CHAPTER ONE 1000763
285 MOBY-DICK; 1177997
288 I 1341059


In [None]:
# test spacy funcitonality to divide sentences
story0 = stories[0]
story0_sentences = [s.text for s in nlp(story0).sents]
story0_sentences

In [149]:
# apply to whole text corpus: this is quite time and memory intensive (took about 8mins on my laptop)
story_sentences = []
for story in stories:
    story_sentences.append([s.text for s in nlp(story).sents])
len(story_sentences)

538

Create datasets with multiple sentences as context

In [160]:
def crate_sentence_dataset(sentence_list, n_sentences=1):
    """Return records with variable and label fields. n_sentences define the number of sentences to use as context for the variable, the label is always 1 sentence"""
    dataset = []
    skip = n_sentences
    for i in range(len(sentence_list)):
        variable = ' '.join(sentence_list[i: i+skip])
        label = ' '.join(sentence_list[i+skip: i+skip+1])
        if not variable or not label:
            continue
        dataset.append(dict(variable=variable, label=label))
    return dataset

In [161]:
n_sentences = 1
dfs = []
for ss in story_sentences:
    sentence_records = crate_sentence_dataset(ss, n_sentences)
    dfs.append(pd.DataFrame.from_records(sentence_records))
df_all = pd.concat(dfs).reset_index(drop=True)
display(df_all)
df_all.to_csv(f'posptproc_corpus_spacy_s{n_sentences}.csv', index=False)

Unnamed: 0,variable,label
0,"HIGH above the city, on a tall column, stood t...",He was gilded all over with thin leaves of fin...
1,He was gilded all over with thin leaves of fin...,He was very much admired indeed.
2,He was very much admired indeed.,"“He is as beautiful as a weathercock,” remarke..."
3,"“He is as beautiful as a weathercock,” remarke...",“Why can’t you be like the Happy Prince?”
4,“Why can’t you be like the Happy Prince?”,asked a sensible mother of her little boy who ...
...,...,...
206185,I knew how he felt.,"I could ha' ate a wolf myself."""
206186,"I could ha' ate a wolf myself.""",Then they all moved in to the dinner table a s...
206187,Then they all moved in to the dinner table a s...,The shade of anxiety in madam's eye was caused...
206188,The shade of anxiety in madam's eye was caused...,"Eat and be glad, for seldom hath there been su..."


In [162]:
n_sentences = 2
dfs = []
for ss in story_sentences:
    sentence_records = crate_sentence_dataset(ss, n_sentences)
    dfs.append(pd.DataFrame.from_records(sentence_records))
df_all = pd.concat(dfs).reset_index(drop=True)
display(df_all)
df_all.to_csv(f'posptproc_corpus_spacy_s{n_sentences}.csv', index=False)

Unnamed: 0,variable,label
0,"HIGH above the city, on a tall column, stood t...",He was very much admired indeed.
1,He was gilded all over with thin leaves of fin...,"“He is as beautiful as a weathercock,” remarke..."
2,He was very much admired indeed. “He is as bea...,“Why can’t you be like the Happy Prince?”
3,"“He is as beautiful as a weathercock,” remarke...",asked a sensible mother of her little boy who ...
4,“Why can’t you be like the Happy Prince?” aske...,“The Happy Prince never dreams of crying for a...
...,...,...
205700,I tell you I was glad on't. I knew how he felt.,"I could ha' ate a wolf myself."""
205701,I knew how he felt. I could ha' ate a wolf mys...,Then they all moved in to the dinner table a s...
205702,"I could ha' ate a wolf myself."" Then they all ...",The shade of anxiety in madam's eye was caused...
205703,Then they all moved in to the dinner table a s...,"Eat and be glad, for seldom hath there been su..."


In [168]:
n_sentences = 3
dfs = []
for ss in story_sentences:
    sentence_records = crate_sentence_dataset(ss, n_sentences)
    dfs.append(pd.DataFrame.from_records(sentence_records))
df_all = pd.concat(dfs).reset_index(drop=True)
display(df_all)
df_all.to_csv(f'posptproc_corpus_spacy_s{n_sentences}.csv', index=False)

Unnamed: 0,variable,label
0,"HIGH above the city, on a tall column, stood t...","“He is as beautiful as a weathercock,” remarke..."
1,He was gilded all over with thin leaves of fin...,“Why can’t you be like the Happy Prince?”
2,He was very much admired indeed. “He is as bea...,asked a sensible mother of her little boy who ...
3,"“He is as beautiful as a weathercock,” remarke...",“The Happy Prince never dreams of crying for a...
4,“Why can’t you be like the Happy Prince?” aske...,“I am glad there is some one in the world who ...
...,...,...
205218,He shortened up so 'mazin' quick on that praye...,"I could ha' ate a wolf myself."""
205219,I tell you I was glad on't. I knew how he felt...,Then they all moved in to the dinner table a s...
205220,I knew how he felt. I could ha' ate a wolf mys...,The shade of anxiety in madam's eye was caused...
205221,"I could ha' ate a wolf myself."" Then they all ...","Eat and be glad, for seldom hath there been su..."
