In [1]:
import pandas as pd
from langdetect import detect_langs
import matplotlib.pyplot as plt
import seaborn as sns
import re
# Relative location and file name of the wrangled poems CSV loaded by this notebook.
data_dir = '../data/wrangled/'
data_file = 'wrangled_data.csv'

In [2]:
# Load the wrangled poems; the first CSV column becomes the DataFrame index.
df = pd.read_csv(f'{data_dir}{data_file}', index_col=0)

In [3]:
def wspace_schars(review, chars_to_keep=".,'\n" , no_white_space = True, no_newlines= True):
    """
    Replace special characters in a text with spaces and optionally collapse runs.

    Every character that is not a word character (``\\w``), a newline, or one of
    ``chars_to_keep`` is replaced by a single space. Underscores are always
    replaced, even though ``\\w`` matches them.

    Parameters
    ----------
    review : str
        Text to clean.
    chars_to_keep : str
        Characters that must survive the substitution. Each character is taken
        literally, so regex metacharacters such as ``]``, ``^`` or ``-`` are safe.
    no_white_space : bool
        If True, collapse each run of consecutive spaces into a single space.
    no_newlines : bool
        If True, collapse each run of consecutive newlines into a single newline.

    Returns
    -------
    str
        The cleaned text.
    """
    # Inside a character class '|' is a literal, not an alternation operator, so
    # the kept characters are simply concatenated (escaped) instead of '|'-joined;
    # joining with '|' would silently preserve pipe characters in the text.
    kept = "".join(re.escape(char) for char in chars_to_keep)
    rep_special_chars = re.compile(r"[^\w\n" + kept + r"]|_")

    # Substitute special characters by white space, except chars_to_keep.
    text = rep_special_chars.sub(' ', review)
    if no_newlines:
        text = re.sub('\n+', '\n', text)  # Collapse consecutive line breaks
    if no_white_space:
        text = re.sub(' +', ' ', text)  # Collapse consecutive spaces
    return text

In [4]:
# Smoke test: run wspace_schars with its default arguments on a string that mixes
# punctuation, underscores, repeated blank lines and repeated spaces.
test_string = """
To those who want to try a different way of formating:
!@#$%^&*()_+
'''''
'a____a'


a____a's



b   b.....
,..../?
"""
print(wspace_schars(test_string))


To those who want to try a different way of formating 
 
'''''
'a a'
a a's
b b.....
,.... 



In [5]:
# Clean every poem: keep sentence punctuation, and leave runs of spaces and
# newlines untouched (both collapsing flags are switched off).
df['content'] = df.content.apply(
    wspace_schars,
    chars_to_keep=".,'\n?!",
    no_white_space=False,
    no_newlines=False,
)

In [6]:
# Index labels of the TOP_N longest poems, longest first.
TOP_N = 12000
longest_poems_ids = (
    df.length_in_words
    .sort_values(ascending=False)
    .head(TOP_N)
    .index
    .tolist()
)

In [7]:
# Keep only the authors whose poems add up to more than NEED_WORDS_PER_AUTHOR words.
NEED_WORDS_PER_AUTHOR = 2500
repeated_authors = df.groupby('author')['length_in_words'].sum()
repeated_authors_list = (
    repeated_authors[repeated_authors > NEED_WORDS_PER_AUTHOR].index.tolist()
)

df = df[df.author.isin(repeated_authors_list)]

###  Explore best poems for training
- Measure each poem's length in words,
- partition it into fragments accordingly,
- create a pandas Series for each fragment,
- append each fragment to the final DataFrame.

In [8]:
SPLIT_INTO = 100

def poem_fragments(poem_series, split_into=SPLIT_INTO):
    """
    Break a poem into consecutive fragments of at least ``split_into`` words.

    The poem text is cut wherever a period is immediately followed by a
    newline. The pieces are then merged back together, in order, until the
    running fragment holds at least ``split_into`` whitespace-separated words;
    at that point the fragment is closed (a trailing '.' is restored if it is
    missing) and a new one is started.

    The last fragment is returned as it stands: it may be shorter than
    ``split_into`` words and gets no trailing '.' added.

    Parameters
    ----------
    poem_series : object with ``author``, ``title`` and ``content`` attributes
        One poem, e.g. a row of the poems DataFrame; ``content`` must be a str.
    split_into : int
        Minimum number of words a fragment needs before it is closed.

    Returns
    -------
    tuple
        ``(author, title, fragments)`` where ``fragments`` is a list of str.
    """
    pieces = poem_series.content.split('.\n')

    fragments = []
    current = pieces[0]  # str.split always yields at least one piece
    for following in pieces[1:]:
        if len(current.split()) >= split_into:
            # Long enough: close the fragment, restoring the period that the
            # split removed, and start the next fragment from the next piece.
            if not current.endswith('.'):
                current += '.'
            fragments.append(current)
            current = following
        else:
            # Too short: glue the next piece back on with the original separator.
            current = current + '.\n' + following
    fragments.append(current)

    return (poem_series.author, poem_series.title, fragments)


In [9]:
# Empty frame with the poem columns minus the two that fragments do not carry.
df_final = df.iloc[:0].drop(columns=['poetry_foundation_id', 'length_in_words'])

In [10]:
# Split every poem into fragments and collect one record per fragment.
# The records are gathered in a plain list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every step (quadratic time). Building from scratch also makes
# this cell safe to re-run without duplicating rows.
fragment_records = []
for _, poem in df.iterrows():
    author, title, poem_pa = poem_fragments(poem)
    fragment_records.extend(
        {'author': author, 'title': title, 'content': fragment}
        for fragment in poem_pa
    )

# Same column layout as the source frame minus the per-poem bookkeeping columns.
df_final = pd.DataFrame(
    fragment_records,
    columns=df.columns.drop(['poetry_foundation_id', 'length_in_words']),
)

In [27]:
# Authors ordered by number of fragments (most first), with the matching counts.
author_fragment_counts = df_final.author.value_counts()
authors_list = author_fragment_counts.index.tolist()
authors_count_list = author_fragment_counts.tolist()

In [28]:
# Integer label per author: position of the author in authors_list.
author_dict = {author: label for label, author in enumerate(authors_list)}
df_final['author_label'] = df_final.author.map(lambda name: author_dict[name])

In [29]:
# Display the author -> integer label mapping.
author_dict

{'alfred_lord_tennyson': 0,
 'algernon_charles_swinburne': 1,
 'robert_browning': 2,
 'alexander_pope': 3,
 'walt_whitman': 4,
 'william_shakespeare': 5,
 'william_wordsworth': 6,
 'edmund_spenser': 7,
 'matthew_arnold': 8,
 'percy_sshe_shelley': 9,
 'anonymous': 10,
 'john_dryden': 11,
 'john_ashbery': 12,
 'henry_wadsworth_longfellow': 13,
 'john_donne': 14,
 'robert_pinsky': 15,
 'john_koethe': 16,
 'lord_ron_george_gordon_': 17,
 'anne_bradstreet': 18,
 'derek_walcott': 19,
 'christian_wiman': 20,
 'samuel_taylor_coleridge': 21,
 'frank_bidart': 22,
 'henry_timrod': 23,
 'ezra_pound': 24,
 'carolyn_kizer': 25,
 'tom_sleigh': 26,
 'gertrude_stein': 27,
 'david_ferry': 28,
 'john_keats': 29,
 'mark_rudman': 30,
 'james_mcmichael': 31,
 'andrew_marvell': 32,
 'charles_reznikoff': 33,
 'alice_notley': 34,
 'anne_carson': 35,
 'elizabeth_barrett_browning': 36,
 'john_greenleaf_whittier': 37,
 'dean_young': 38,
 'william_butler_yeats': 39,
 'george_seferis': 40,
 'muriel_rukeyser': 41,
 

In [13]:
# Relative directory used as the prefix for the JSON files written by this notebook.
process_dir= '../data/processed/'

In [14]:
# Export the fragments of the first MAX_N_AUTHORS authors in authors_list
# as JSON lines, keeping only the text and its label.
MAX_N_AUTHORS = 10
MAX_LIST = authors_list[:MAX_N_AUTHORS]
(
    df_final.loc[df_final.author.isin(MAX_LIST)]
    .drop(columns=['author', 'title'])
    .to_json(process_dir + 'top_10_authors.json', orient='records', lines=True)
)

In [15]:
# Export the fragments of the first MAX_N_AUTHORS authors in authors_list
# as JSON lines, keeping only the text and its label.
MAX_N_AUTHORS = 90
MAX_LIST = authors_list[:MAX_N_AUTHORS]
(
    df_final.loc[df_final.author.isin(MAX_LIST)]
    .drop(columns=['author', 'title'])
    .to_json(process_dir + 'top_90_authors.json', orient='records', lines=True)
)

In [16]:
# Export the fragments of the authors at positions MIN_N_AUTHORS..MAX_N_AUTHORS-1
# of authors_list as JSON lines, keeping only the text and its label.
# NOTE(review): the file name says "bottom 10", which only holds if authors_list
# has exactly MAX_N_AUTHORS entries -- confirm.
MAX_N_AUTHORS = 100
MIN_N_AUTHORS = 90
MAX_LIST = authors_list[MIN_N_AUTHORS:MAX_N_AUTHORS]
(
    df_final.loc[df_final.author.isin(MAX_LIST)]
    .drop(columns=['author', 'title'])
    .to_json(process_dir + 'bottom_10_authors.json', orient='records', lines=True)
)

# Getting 10 longest poems

In [17]:
# Index labels of the LONGEST_N longest poems, longest first.
LONGEST_N = 10
poems_by_length = df.length_in_words.sort_values(ascending=False)
longest_list = poems_by_length.index.tolist()[:LONGEST_N]
longest_list

[1418, 12412, 14512, 11631, 5112, 66, 12259, 6929, 6180, 2967]

In [18]:
# Word counts of those same LONGEST_N poems, in the same order.
df.length_in_words.sort_values(ascending=False).tolist()[:LONGEST_N]

[23130, 15765, 9713, 9450, 7857, 7779, 7444, 7011, 6103, 6048]

In [19]:
# Rows of the longest poems (kept in the frame's own row order, not by length).
_df = df.loc[df.index.isin(longest_list)]

In [20]:
# Empty frame with the poem columns minus the two that fragments do not carry.
df_longest10 = _df.iloc[:0].drop(columns=['poetry_foundation_id', 'length_in_words'])

In [21]:
# Split each of the longest poems into fragments, one record per fragment.
# The records are gathered in a plain list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every step (quadratic time). Building from scratch also makes
# this cell safe to re-run without duplicating rows.
longest_fragment_records = []
for _, poem in _df.iterrows():
    author, title, poem_pa = poem_fragments(poem)
    longest_fragment_records.extend(
        {'author': author, 'title': title, 'content': fragment}
        for fragment in poem_pa
    )

# Same column layout as the source frame minus the per-poem bookkeeping columns.
df_longest10 = pd.DataFrame(
    longest_fragment_records,
    columns=_df.columns.drop(['poetry_foundation_id', 'length_in_words']),
)

In [22]:
# Integer label per author, numbered in order of first appearance in df_longest10.
# Note: this rebinds author_dict, replacing the mapping built for df_final.
longest_authors = df_longest10.author.unique()
author_dict = dict(zip(longest_authors, range(len(longest_authors))))
author_dict

{'john_dryden': 0,
 'robert_pinsky': 1,
 'anne_carson': 2,
 'alfred_lord_tennyson': 3,
 'allen_ginsberg': 4,
 'philip_whalen': 5,
 'matthew_arnold': 6,
 'walt_whitman': 7,
 'william_shakespeare': 8,
 'anonymous': 9}

In [23]:
# Attach the integer label of each fragment's author.
df_longest10['author_label'] = df_longest10.author.map(lambda name: author_dict[name])

In [24]:
# Number of fragments obtained from the longest poems.
len(df_longest10)

676

In [25]:
# Export the fragments of the longest poems as JSON lines (text and label only).
(
    df_longest10
    .drop(columns=['author', 'title'])
    .to_json(process_dir + 'longest_poems.json', orient='records', lines=True)
)