In [1]:
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup

In [2]:
# This function slices the unwanted text (introduction, etc.) at the beginning of the txt file 
# It takes a string 'my_str', and delete everything before the specified 'sub' 
def slicer_front(my_str,sub):
  index=my_str.find(sub)
  if index !=-1 :
        return my_str[index:] 
  else :
        raise Exception('Sub string not found!')

In [3]:
# This function slices the unwanted text (introduction, etc.) at the end of the txt file 
def slicer_back(my_str,sub):
  index=my_str.find(sub)
  if index !=-1 :
        return my_str[:index] 
  else :
        raise Exception('Sub string not found!')

### Read Data

In [4]:
def get_soup(target_url):
    r = requests.get(target_url)
    soup = BeautifulSoup(r.text, "html.parser")
    return soup

Read in from local file: poetry collection "Collected Poems by Dylan Thomas" 

In [5]:
corpus = []
with open('DylanThomas.txt',"r") as f:
    f= re.sub(r'\n\n\n\n\n.+\n\n\n','\n', f.read()) ### Remove titles of poems 
    lines = f.split('\n') ### Split into lines 
    for line in lines:
        if line.strip():
            corpus.append(line)  

Read in from website.

In [6]:
CP = get_soup('https://raw.githubusercontent.com/tfavory/pmlg-poem-generator/master/model_training/corpus.txt')

In [7]:
for line in CP.get_text().split('\n'):
    if line.strip():
        corpus.append(line)  

In [8]:
len(corpus)

38251

Read from the website: https://raw.githubusercontent.com/tfavory/pmlg-poem-generator/master/model_training/corpus.txt

Poetry Collection: Songs of Innocence and Songs of Experience

In [9]:
SE = get_soup('http://www.gutenberg.org/files/1934/1934-0.txt')

In [10]:
# Clean preface, conclusion and titles
SE_txt = slicer_back(slicer_front(SE.get_text(),'How sweet is the shepherd'),'***END OF THE PROJECT GUTENBERG') # Delete the preface and conclusion
SE_txt_c = re.sub('\r\n\r\n\r\n\r\n.+\r\n\r\n\r\n','\r\n',SE_txt) # Clean the titles

In [11]:
# append sentances
for line in SE_txt_c.split('\r\n'):
    if line.strip():
        corpus.append(line)  

In [12]:
len(corpus)

39141

read from website: http://www.gutenberg.org/cache/epub/8789/pg8789.txt

Divine Comedy by Dante

In [13]:
DE = get_soup('http://www.gutenberg.org/cache/epub/8789/pg8789.txt')

In [14]:
# Clean preface, conclusion and titles
DE_txt = slicer_back(slicer_front(DE.get_text(),'IN the midway of this our'),'End of Project Gutenberg') # Delete the preface and conclusion
DE_txt_c = re.sub('\r\n\r\n\r\n\r\n,+\r\n\r\n\r\n','\r\n',DE_txt) # Clean the titles

In [15]:
# append sentances
for line in DE_txt_c.split('\r\n'):
    if line.strip():
        corpus.append(line) 

In [16]:
len(corpus)

43823

read from website: http://www.gutenberg.org/cache/epub/21700/pg21700.txt

Don Juan

In [17]:
DJ = get_soup('http://www.gutenberg.org/cache/epub/21700/pg21700.txt')

In [18]:
# Clean preface, conclusion and titles
DJ_txt = slicer_back(slicer_front(DJ.get_text(),'I want a hero: an uncommon want,'),'End of the Project Gutenberg EBook') # Delete the preface and conclusion
DJ_txt_c = re.sub('\r\n\r\n\r\n\r\n\r\n.+\r\n\r\n  ','\r\n',DJ_txt) # Clean the titles

In [19]:
# append sentances
for line in DJ_txt_c.split('\r\n'):
    if line.strip():
        corpus.append(line) 
len(corpus)

59750

Read from website: http://www.gutenberg.org/cache/epub/6524/pg6524.txt

Stray Birds by Rabindranath Tagore

In [20]:
StaryB = get_soup('http://www.gutenberg.org/cache/epub/6524/pg6524.txt')

In [21]:
# Clean preface, conclusion and titles
StaryB_txt = slicer_back(slicer_front(StaryB.get_text(),'Stray birds of summer come to my window to sing and fly away.'),'End of the Project Gutenberg EBook') # Delete the preface and conclusion
StaryB_txt_c1 = re.sub('\r\n\r\n\r\n\r\n\r\n.+\r\n\r\n  ','\r\n',StaryB_txt) # Clean the titles
StaryB_txt_c = re.sub('[1-9]\d*','',StaryB_txt_c1) # Clean the numbers

In [22]:
# append sentances
for line in StaryB_txt_c.split('\r\n'):
    if line.strip():
        corpus.append(line) 
len(corpus)

60372

Read from website: http://www.gutenberg.org/cache/epub/30488/pg30488.txt

The Green Helmet and Other Poems by William Butler Yeats

In [23]:
GH = get_soup('http://www.gutenberg.org/cache/epub/30488/pg30488.txt')

In [24]:
# Clean preface, conclusion and titles
GH_txt = slicer_back(slicer_front(GH.get_text(),'I swayed upon the gaudy stern'),'THE GREEN HELMET') # Delete the preface and conclusion
GH_txt_c = re.sub('\r\n\r\n\r\n\r\n\r\n.+\r\n\r\n  ','\r\n',GH_txt) # Clean the titles

In [25]:
# append sentances
for line in GH_txt_c.split('\r\n'):
    if line.strip():
        corpus.append(line) 
len(corpus)

60718

Read from website: http://www.gutenberg.org/files/38520/38520-0.txt

Poems of James Russell Lowell

In [27]:
JRL = get_soup('http://www.gutenberg.org/files/38520/38520-0.txt')

In [28]:
# Clean preface, conclusion and titles
JRL_txt = slicer_back(slicer_front(JRL.get_text(),'If some small savor creep into my rhyme'),'MEMORIAL VERSES.') # Delete the preface and conclusion
JRL_txt_c1 = re.sub('\r\n\r\n\r\n.+\r\n  ','\r\n',JRL_txt) # Clean the titles
JRL_txt_c2 = re.sub('\r\n\r\n.+\r\n  ','\r\n',JRL_txt_c1) # Clean the titles
JRL_txt_c3 = re.sub('\r\n.+\r\n\r\n  ','\r\n',JRL_txt_c2) # Clean the titles
JRL_txt_c = re.sub('[1-9]\d*\.','',JRL_txt_c3) # Clean the numbers

In [29]:
# append sentances
for line in JRL_txt_c.split('\r\n'):
    if line.strip():
        corpus.append(line) 
len(corpus)

69734

Read from website: http://www.gutenberg.org/cache/epub/19188/pg19188.txt

Christina G. Rossetti

In [30]:
CGR = get_soup('http://www.gutenberg.org/cache/epub/19188/pg19188.txt')

In [31]:
# Clean preface, conclusion and titles
CGR_txt = slicer_back(slicer_front(CGR.get_text(),'Morning and evening'),'We trust to Thee.') # Delete the preface and conclusion
CGR_txt_c1 = re.sub('\r\n\r\n\r\n.+\r\n  ','\r\n',CGR_txt) # Clean the titles
CGR_txt_c2 = re.sub('\r\n\r\n.+\r\n  ','\r\n',CGR_txt_c1) # Clean the titles
CGR_txt_c3 = re.sub('\r\n.+\r\n\r\n  ','\r\n',CGR_txt_c2) # Clean the titles
CGR_txt_c = re.sub('[1-9]\d*\.','',CGR_txt_c3) # Clean the numbers

In [32]:
# append sentances
for line in CGR_txt_c.split('\r\n'):
    if line.strip():
        corpus.append(line) 
len(corpus)

78713

In [33]:
file=open('corpus_CGR.txt','w')  
for line in corpus:
    file.write(line+'\n') 
file.close() 