In [1]:
import pandas as pd
from langdetect import detect_langs
import matplotlib.pyplot as plt
import seaborn as sns
import re
# Directory and file name of the wrangled poems CSV; the two strings are
# concatenated (no separator is added) to build the path that is loaded.
data_dir = '../data/wrangled/'
data_file = 'wrangled_data.csv'

In [2]:
# Load the wrangled poems; the first CSV column is used as the index.
df = pd.read_csv(f"{data_dir}{data_file}", index_col=0)
df.head(), len(df)

(             author                                          title  \
 2     jody_gladding                                     1_800_fear   
 3    joseph_brodsky                                 1_january_1965   
 5      joe_brainard                                  30_one_liners   
 7      alice_notley                                  30th_birthday   
 8  charles_bukowski  a_340_dollar_horse_and_a_hundred_dollar_whore   
 
    poetry_foundation_id                                            content  \
 2                 57135  You'd  like  to  talk  with  you  about  fear ...   
 3                 56736  The Wise Men will unlearn your name.\nAbove yo...   
 5                 58251  WINTER\nMore time is spent at the window.\n\nS...   
 7                 48037  May I never be afraid\nespecially of myself\nb...   
 8                 49569  don’t ever get the idea I am a poet; you can s...   
 
    length_in_words  
 2              113  
 3              150  
 5              356  
 7      

In [3]:
def wspace_schars(review, chars_to_keep=".,'\n" , no_white_space = True, no_newlines= True):
    """
    Replace special characters with spaces and collapse repeated whitespace.

    Every character that is not a regex word character, a newline, or one of
    ``chars_to_keep`` is replaced by a single space. Underscores are always
    replaced by a space, even though the regex word class matches them.

    Parameters
    ----------
    review : str
        Text to clean.
    chars_to_keep : str
        Characters that must survive the substitution. Each character is
        escaped, so regex metacharacters such as ``]``, ``-`` or ``^`` are
        safe to pass.
    no_white_space : bool
        When True, runs of consecutive spaces are collapsed into one space.
    no_newlines : bool
        When True, runs of consecutive newlines are collapsed into one newline.

    Returns
    -------
    str
        The cleaned text.
    """
    # Escape each kept character individually: inside a character class `|`
    # is a literal, so joining with `|` would silently keep pipes as well.
    keep_class = ''.join(re.escape(char) for char in chars_to_keep)
    rep_special_chars = re.compile(r"[^\w\n" + keep_class + r"]|_")

    # Substitute special characters by white space, except chars_to_keep
    text = rep_special_chars.sub(' ', review)
    if no_newlines:
        text = re.sub(r'\n+', '\n', text)  # Remove consecutive line breaks
    if no_white_space:
        text = re.sub(r' +', ' ', text)  # Remove consecutive white space
    return text
    


In [4]:
# Smoke test for wspace_schars: the sample mixes special characters,
# underscores, runs of blank lines and runs of spaces.
test_string = """
To those who want to try a different way of formating:
!@#$%^&*()_+
'''''
'a____a'


a____a's



b   b.....
,..../?
"""
print(wspace_schars(test_string))


To those who want to try a different way of formating 
 
'''''
'a a'
a a's
b b.....
,.... 



In [5]:
# Clean every poem with the default formatter settings.
df['content'] = df['content'].apply(wspace_schars)

In [6]:
# Index labels of the TOP_N poems with the most words, longest first
TOP_N = 12000
longest_poems_ids = df['length_in_words'].sort_values(ascending=False).iloc[:TOP_N]
longest_poems_ids = longest_poems_ids.index.tolist()

In [7]:
# Among the anonymous poems only beowulf is long enough to be considered.
# The book by sir_gawain_and_the_green_knight is not actually in english.
df_anonymous = df.loc[df['author'] == 'anonymous']
df_anonymous

Unnamed: 0,author,title,poetry_foundation_id,content,length_in_words
412,anonymous,alysoun,43516,"An hendy hap ichabbe yhent \nIchot, from heven...",189
1214,anonymous,barbara_allen,50273,"In Scarlet town, where I was born,\nThere was ...",254
1418,anonymous,beowulf_modern_english_translation_,50114,"LO, praise of the prowess of people kings\nof ...",23130
1606,anonymous,blow_northerne_wind,43522,"Blow, northerne wynd,\nSend thou me my suetyng...",310
2145,anonymous,charms_for_love,50931,I beat you with a hazel rod\nCome to me in mad...,300
3521,anonymous,each_day,55425,"Each day as dawn approaches,\nthe King sits in...",102
4019,anonymous,from_the_exeter_book_gnomic_verses,48750,lines 71 99 \nFrost shall freeze\nfire eat wo...,220
4580,anonymous,for_we_are_thy_people,53780,"For we are thy people, and thou art our God \n...",101
4677,anonymous,the_foxs_foray,56589,"A fox jumped out one winter's night,\nAnd begg...",379
4807,anonymous,from_old_english_rune_poem_,56322,i\n feoh \nWealth is a comfort to every man\ny...,220


In [8]:
# Only Beowulf is long enough to keep, so select that single row.

beowulf = df_anonymous.loc[df_anonymous['title'] == 'beowulf_modern_english_translation_']
beowulf

Unnamed: 0,author,title,poetry_foundation_id,content,length_in_words
1418,anonymous,beowulf_modern_english_translation_,50114,"LO, praise of the prowess of people kings\nof ...",23130


In [9]:
# Every poem whose author is known, and how many there are.
df_not_anonimous = df.loc[df['author'] != 'anonymous']
df_not_anonimous.shape[0]

11775

In [10]:
# Total words per author; keep the authors whose total exceeds the threshold
NEED_WORDS_PER_AUTTHOR = 2500
repeated_authors = df_not_anonimous.groupby('author')['length_in_words'].sum()
repeated_authors_list = repeated_authors[repeated_authors > NEED_WORDS_PER_AUTTHOR].index.tolist()

In [11]:
# Keep only the poems written by authors with enough material.
df_not_anonimous = df_not_anonimous.loc[df_not_anonimous['author'].isin(repeated_authors_list)]

In [12]:
# Number of poems left after filtering by author
df_not_anonimous.shape[0]

5175

###  Explore best poems for training
- Measure the length of each poem in words,
- partition long poems into fragments accordingly,
- create a pandas Series for each fragment,
- append each fragment to the final DataFrame.

In [13]:
SPLIT_INTO = 100

def poem_fragments(poem_series, split_into=SPLIT_INTO):
    """
    Break a poem into fragments of at least ``split_into`` words.

    The poem content is split on sentence-ending line breaks ('.\\n').
    Consecutive pieces are glued back together (re-inserting the '.\\n'
    separator) until a fragment reaches ``split_into`` words; the fragment
    is then closed with a trailing '.' if it lacks one. The last fragment
    is kept as is, so it may be shorter than ``split_into`` words.

    Parameters
    ----------
    poem_series : object with ``author``, ``title`` and ``content`` attributes
        One poem, e.g. a row of the poems DataFrame.
    split_into : int
        Minimum number of whitespace-separated words per fragment.

    Returns
    -------
    tuple
        ``(author, title, fragments)`` where ``fragments`` is a list of str.
    """
    fragments = poem_series.content.split('.\n')
    i = 0
    while i + 1 < len(fragments):
        if len(fragments[i].split()) >= split_into:
            # Long enough: restore the full stop removed by the split.
            if not fragments[i].endswith('.'):
                fragments[i] += '.'
            i += 1
        else:
            # Too short: absorb the next piece and re-check the same slot.
            fragments[i] = fragments[i] + '.\n' + fragments[i + 1]
            del fragments[i + 1]
    return (poem_series.author, poem_series.title, fragments)


In [14]:
# Poems from authors with enough material plus the Beowulf row.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_LONGEST = pd.concat([df_not_anonimous, beowulf])
# Empty frame that keeps only the author, title and content columns.
df_final = df_LONGEST[:0].drop(columns=['poetry_foundation_id', 'length_in_words'])

In [15]:
# Collect one record per fragment and build the frame once: growing a
# DataFrame row by row is quadratic, and DataFrame.append was removed in
# pandas 2.0.
fragment_records = []
for i in range(len(df_LONGEST)):
    (author, title, poem_pa) = poem_fragments(df_LONGEST.iloc[i])
    for fragment in poem_pa:
        fragment_records.append({'author': author, 'title': title, 'content': fragment})
df_final = pd.DataFrame(fragment_records, columns=['author', 'title', 'content'])

In [16]:
# Authors ordered by number of fragments (most first) and the matching counts
authors_list = df_final['author'].value_counts().index.tolist()
authors_count_list = df_final['author'].value_counts().tolist()

In [17]:
# Integer label per author: the position of the author in authors_list
author_dict = {name: label for label, name in enumerate(authors_list)}
df_final['author_label'] = df_final['author'].apply(lambda name: author_dict[name])

In [18]:
# Output directory prefix; file names are concatenated to it directly,
# so the trailing slash is required.
process_dir= '../data/processed/'

In [19]:
# Export the fragments of the first 10 authors in authors_list,
# keeping only the content and label columns
MAX_N_AUTHORS = 10
MAX_LIST = authors_list[:MAX_N_AUTHORS]
(
    df_final[df_final['author'].isin(MAX_LIST)]
    .drop(columns=['author', 'title'])
    .to_json(process_dir + 'top_10_authors.json', orient='records', lines=True)
)

In [20]:
# Export the fragments of the first 90 authors in authors_list,
# keeping only the content and label columns
MAX_N_AUTHORS = 90
MAX_LIST = authors_list[:MAX_N_AUTHORS]
(
    df_final[df_final['author'].isin(MAX_LIST)]
    .drop(columns=['author', 'title'])
    .to_json(process_dir + 'top_90_authors.json', orient='records', lines=True)
)

In [21]:
# Export the fragments of the authors at positions 90 to 99 of authors_list
# (the last 10 of the first hundred), keeping only content and label columns
MAX_N_AUTHORS = 100
MIN_N_AUTHORS = 90
MAX_LIST = authors_list[MIN_N_AUTHORS:MAX_N_AUTHORS]
(
    df_final[df_final['author'].isin(MAX_LIST)]
    .drop(columns=['author', 'title'])
    .to_json(process_dir + 'bottom_10_authors.json', orient='records', lines=True)
)

# Getting 10 longest poems

In [22]:
# Leave out every poem by john_milton before ranking poems by length.
df_longest = df.loc[df['author'] != 'john_milton']

In [23]:
# Index labels of the LONGEST_N poems with the most words, longest first
LONGEST_N = 10
longest_list = df_longest['length_in_words'].sort_values(ascending=False).iloc[:LONGEST_N].index.tolist()
longest_list

[1418, 12412, 14512, 11631, 5112, 66, 12259, 6929, 6180, 2967]

In [24]:
# Keep only the rows whose index label is one of the longest poems.
df_longest = df_longest.loc[df_longest.index.isin(longest_list)]

In [25]:
# Empty frame that keeps only the columns left after the drop.
df_top10 = df_longest.iloc[:0].drop(columns=['poetry_foundation_id', 'length_in_words'])

In [26]:
# Collect one record per fragment and build the frame once: growing a
# DataFrame row by row is quadratic, and DataFrame.append was removed in
# pandas 2.0.
top10_records = []
for i in range(len(df_longest)):
    (author, title, poem_pa) = poem_fragments(df_longest.iloc[i])
    for fragment in poem_pa:
        top10_records.append({'author': author, 'title': title, 'content': fragment})
df_top10 = pd.DataFrame(top10_records, columns=['author', 'title', 'content'])

In [27]:
# Integer label per author, numbered in order of first appearance
author_dict = dict(
    (name, label) for label, name in enumerate(df_top10['author'].unique())
)
author_dict

{'john_dryden': 0,
 'anonymous': 1,
 'robert_pinsky': 2,
 'anne_carson': 3,
 'alfred_lord_tennyson': 4,
 'allen_ginsberg': 5,
 'philip_whalen': 6,
 'matthew_arnold': 7,
 'walt_whitman': 8,
 'william_shakespeare': 9}

In [28]:
# Attach the integer label of each fragment's author.
df_top10['author_label'] = df_top10['author'].apply(lambda name: author_dict[name])

In [29]:
# Number of fragments obtained from the longest poems
df_top10.shape[0]

676

In [30]:
# Export the fragments without the author and title columns,
# one JSON record per line.
(
    df_top10
    .drop(columns=['author', 'title'])
    .to_json(process_dir + 'longest_poems.json', orient='records', lines=True)
)