In [26]:
import math
import os
import pandas as pd
import shutil

poem_md = os.path.join("disco", "poem_metadata.tsv")
author_md = os.path.join("disco", "author_metadata.tsv")
selected_sonnets = "per-sonnet-selected-random-2"

In [27]:
poem_ids = []
for fn in os.listdir(selected_sonnets):
    poem_id = fn.replace("disco", "").replace(".txt", "")
    poem_ids.append(poem_id)

In [28]:
df_poems = pd.read_csv(poem_md, sep='\t')
df_authors = pd.read_csv(author_md, sep='\t')

In [29]:
df_poems.head()

Unnamed: 0,poem_id,author_id,author,incipit,title
0,001g_0001,001g,"Aragonés, Joseph","Valencia insigne, patria venturosa,",Con dicciones valencianas y castellanas
1,001n_0001,001n,"Aguilar, Juan de",Donde jamás el sol sus rayos tira,A un avaro
2,001n_0002,001n,"Aguilar, Juan de","Perfecto Libro, que a la Estampa ha dado",Soneto
3,001n_0003,001n,"Aguilar, Juan de","Al verte pobre ya, de amor inmundo,",Soneto
4,001n_0004,001n,"Aguilar, Juan de","Raro Fénix de Amor, que en vivas llamas,",Soneto


In [30]:
# filter df to poems we need to keep
matched_poems = df_poems[df_poems['poem_id'].isin(poem_ids)]

In [31]:
matched_poems = matched_poems[['poem_id', 'title', 'author']]

In [32]:
matched_poems

Unnamed: 0,poem_id,title,author
67,003t_0040,AL ATARDECER,"Bernabé, Manuel"
73,003t_0046,A LAONG - LAAN,"Bernabé, Manuel"
375,021g_0040,A Raimundo de Peñafort,"Andrés, Juan Francisco"
492,033g_0058,Soneto,"Sigüenza, Francisco de"
787,056g_0110,A San Isidro,"Prado, Juan Francisco de"
817,060e_0351,Al general Lamar,"Olmedo, José Joaquín"
931,080n_0263,Regalando una botlla de vino añejo,"Ros de Olano, Antonio"
1484,173n_0578,La casada,"Sansores, Rosario («Crisantema»)"
1508,178g_0353,- X -,Marqués de Santillana
1865,236g_0495,Soñé que una piedra me arrojaba,"Terrazas, Francisco de"


In [33]:
matched_poems['author_concat'] = matched_poems.author.apply(lambda x: x.split(",")[1].strip() + " " + x.split(",")[0].strip() if "," in x else x)
matched_poems['author_id'] = matched_poems.poem_id.apply(lambda x: x.split("_")[0])

In [34]:
df_authors.head()

Unnamed: 0,author,aid,normdate,birth,death,raw,hasbirth,hasdeath,hasdates,estimate,...,vf_de,vf_bi_esti,vf_de_esti,birth_diff,death_diff,vf_cand_n,auname_viaf,redo,match_type,vf_validation
0,"López de Ayala, Pedro",008g,14.5,1332,1407,Vitoria. 1332 - Calahorra. 1407 Poeta y cronis...,1,1,1,0,...,1407.0,False,False,0.0,0.0,1,"Pedro Lopez Ayala, 1332 1407",1.0,exact,high
1,Marqués de Santillana,178g,14.5,1398,1458,Carrión de los Condes (Palencia). 1398 - Guada...,1,1,1,0,...,1458.0,False,False,0.0,0.0,5,"Marques Santillana, 1398 1458",,exact,high
2,"Dueñas, Juan de",133g,15.0,1400,1460,Dueñas. Palencia. 1400 - 1460 Poeta.,1,1,1,0,...,1460.0,False,False,,0.0,10,Juan Duenas 1460,,exact,high
3,"Mena, Juan de",357g,15.0,1411,1456,Córdoba. 1411 - Torrelaguna (Santander). 1456 ...,1,1,1,0,...,1456.0,False,False,0.0,0.0,2,"Juan Mena, 1411 1456",,exact,high
4,"Vadillo, Juan",213g,15.0,XV,XV,Español. Siglo XV,0,0,0,0,...,,False,False,570.0,,2,"Juan Vadillo, 1970",,exact,medium


In [35]:
matched_poems

Unnamed: 0,poem_id,title,author,author_concat,author_id
67,003t_0040,AL ATARDECER,"Bernabé, Manuel",Manuel Bernabé,003t
73,003t_0046,A LAONG - LAAN,"Bernabé, Manuel",Manuel Bernabé,003t
375,021g_0040,A Raimundo de Peñafort,"Andrés, Juan Francisco",Juan Francisco Andrés,021g
492,033g_0058,Soneto,"Sigüenza, Francisco de",Francisco de Sigüenza,033g
787,056g_0110,A San Isidro,"Prado, Juan Francisco de",Juan Francisco de Prado,056g
817,060e_0351,Al general Lamar,"Olmedo, José Joaquín",José Joaquín Olmedo,060e
931,080n_0263,Regalando una botlla de vino añejo,"Ros de Olano, Antonio",Antonio Ros de Olano,080n
1484,173n_0578,La casada,"Sansores, Rosario («Crisantema»)",Rosario («Crisantema») Sansores,173n
1508,178g_0353,- X -,Marqués de Santillana,Marqués de Santillana,178g
1865,236g_0495,Soñé que una piedra me arrojaba,"Terrazas, Francisco de",Francisco de Terrazas,236g


In [36]:
matched_poems.columns

Index(['poem_id', 'title', 'author', 'author_concat', 'author_id'], dtype='object')

In [37]:
merged_df = pd.merge(matched_poems, df_authors, how='left', left_on='author_id', right_on='aid')

In [38]:
merged_df_sel = merged_df[['poem_id', 'title', 'author_x', 'author_concat', 'normdate']]

In [39]:
# reorder columns
merged_df_sel.columns = ['poem_id', 'title', 'author', 'author_concat', 'normdate']

In [40]:
merged_df_sel

Unnamed: 0,poem_id,title,author,author_concat,normdate
0,003t_0040,AL ATARDECER,"Bernabé, Manuel",Manuel Bernabé,19.5
1,003t_0046,A LAONG - LAAN,"Bernabé, Manuel",Manuel Bernabé,19.5
2,021g_0040,A Raimundo de Peñafort,"Andrés, Juan Francisco",Juan Francisco Andrés,17.0
3,033g_0058,Soneto,"Sigüenza, Francisco de",Francisco de Sigüenza,17.0
4,056g_0110,A San Isidro,"Prado, Juan Francisco de",Juan Francisco de Prado,17.0
5,060e_0351,Al general Lamar,"Olmedo, José Joaquín",José Joaquín Olmedo,18.5
6,080n_0263,Regalando una botlla de vino añejo,"Ros de Olano, Antonio",Antonio Ros de Olano,19.0
7,173n_0578,La casada,"Sansores, Rosario («Crisantema»)",Rosario («Crisantema») Sansores,19.5
8,178g_0353,- X -,Marqués de Santillana,Marqués de Santillana,14.5
9,236g_0495,Soñé que una piedra me arrojaba,"Terrazas, Francisco de",Francisco de Terrazas,16.5


In [41]:
merged_df_sel['poem_order'] = [x for x in range(77,101)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df_sel['poem_order'] = [x for x in range(77,101)]


In [42]:
merged_df_sel

Unnamed: 0,poem_id,title,author,author_concat,normdate,poem_order
0,003t_0040,AL ATARDECER,"Bernabé, Manuel",Manuel Bernabé,19.5,77
1,003t_0046,A LAONG - LAAN,"Bernabé, Manuel",Manuel Bernabé,19.5,78
2,021g_0040,A Raimundo de Peñafort,"Andrés, Juan Francisco",Juan Francisco Andrés,17.0,79
3,033g_0058,Soneto,"Sigüenza, Francisco de",Francisco de Sigüenza,17.0,80
4,056g_0110,A San Isidro,"Prado, Juan Francisco de",Juan Francisco de Prado,17.0,81
5,060e_0351,Al general Lamar,"Olmedo, José Joaquín",José Joaquín Olmedo,18.5,82
6,080n_0263,Regalando una botlla de vino añejo,"Ros de Olano, Antonio",Antonio Ros de Olano,19.0,83
7,173n_0578,La casada,"Sansores, Rosario («Crisantema»)",Rosario («Crisantema») Sansores,19.5,84
8,178g_0353,- X -,Marqués de Santillana,Marqués de Santillana,14.5,85
9,236g_0495,Soñé que una piedra me arrojaba,"Terrazas, Francisco de",Francisco de Terrazas,16.5,86


In [49]:
# need to print this to copy-paste it into the corpus metadata file

for idx, row in merged_df_sel.iterrows():
    print(str(idx+77) + "\t" + row['title'] + "\t" + row['author_concat'] + "\t" + str(math.floor(row['normdate'])) + "\t0\t0\t\tdisco")

77	AL ATARDECER	Manuel Bernabé	19	0	0		disco
78	A LAONG - LAAN	Manuel Bernabé	19	0	0		disco
79	A Raimundo de Peñafort	Juan Francisco Andrés	17	0	0		disco
80	Soneto	Francisco de Sigüenza	17	0	0		disco
81	A San Isidro	Juan Francisco de Prado	17	0	0		disco
82	Al general Lamar	José Joaquín Olmedo	18	0	0		disco
83	Regalando una botlla de vino añejo	Antonio Ros de Olano	19	0	0		disco
84	La casada	Rosario («Crisantema») Sansores	19	0	0		disco
85	- X -	Marqués de Santillana	14	0	0		disco
86	Soñé que una piedra me arrojaba	Francisco de Terrazas	16	0	0		disco
87	Sonetos – - III -	Pedro de Quirós	16	0	0		disco
88	Sonetos – IV	Melchor Palau y Català	19	0	0		disco
89	Soneto	Luis Martín de la Plaza	16	0	0		disco
90	La mariposa y el árbol	Manuel Gutiérrez Nájera	19	0	0		disco
91	Soneto	Jerónimo de Lomas Cantoral	16	0	0		disco
92	Soneto	Juan de San Martín	17	0	0		disco
93	- II -	Luis Barahona de Soto	16	0	0		disco
94	La perla	Manuel Reina	19	0	0		disco
95	Yo he leído	Concepción Robles	19	0	0		disco
96

In [44]:
matched_poems.head(100)

Unnamed: 0,poem_id,title,author,author_concat,author_id
67,003t_0040,AL ATARDECER,"Bernabé, Manuel",Manuel Bernabé,003t
73,003t_0046,A LAONG - LAAN,"Bernabé, Manuel",Manuel Bernabé,003t
375,021g_0040,A Raimundo de Peñafort,"Andrés, Juan Francisco",Juan Francisco Andrés,021g
492,033g_0058,Soneto,"Sigüenza, Francisco de",Francisco de Sigüenza,033g
787,056g_0110,A San Isidro,"Prado, Juan Francisco de",Juan Francisco de Prado,056g
817,060e_0351,Al general Lamar,"Olmedo, José Joaquín",José Joaquín Olmedo,060e
931,080n_0263,Regalando una botlla de vino añejo,"Ros de Olano, Antonio",Antonio Ros de Olano,080n
1484,173n_0578,La casada,"Sansores, Rosario («Crisantema»)",Rosario («Crisantema») Sansores,173n
1508,178g_0353,- X -,Marqués de Santillana,Marqués de Santillana,178g
1865,236g_0495,Soñé que una piedra me arrojaba,"Terrazas, Francisco de",Francisco de Terrazas,236g


In [45]:
# copy create

corpus_dir = "../corpus"

for idx, row in merged_df_sel.iterrows():
    infn = f"disco{row['poem_id']}.txt"
    outfn = f"{str.zfill(str(row['poem_order']), 4)}.txt"
    shutil.copy(os.path.join(selected_sonnets, infn), os.path.join(corpus_dir, outfn))
    

In [46]:
# add title to sonnets without title

# collect the title + text first
poem_order_to_text = {}

for idx, row in merged_df_sel.iterrows():
    infn = f"{str.zfill(str(row['poem_order']), 4)}.txt"
    with open(os.path.join(corpus_dir, infn), "r") as f:
        poem_text = f.read()
        poem_title = row['title']
        poem_order_to_text[infn] = poem_title + "\n\n" + poem_text

In [47]:
poem_order_to_text

{'0077.txt': 'AL ATARDECER\n\nEn las turbias tristezas de los atardeceres,\nbebe el bardo cristiano raudales de armonía,\nque nada eleva al ánima como el morir del día,\nsímbolo de la eterna mudanza de los seres.\n\nDe las ferias del mundo cansado y sus placeres,\nmi ánima, como un pájaro, a Dios volar ansia;\npues de Tí, oh Dios, derívase la ingénita poesía,\na tu merced me entrego, porque en mi lar imperes.\n\nAbriré día y noche mi alma como un sagrario,\npresta, cuando Tú llames, a brindarte aposento,\ndonde darnos al goce de amar, siempre presente.\n\nJuntos abarcaremos el cruento itinerario,\ny verás cómo, cuando muera, no me arrepiento\nde haber vivido a solas contigo eternamente.\n\n',
 '0078.txt': 'A LAONG - LAAN\n\n¡LAONG-LAAN! Tu gloria, —la gloria de la diestra—\ninfla de orgullo el alma serena de los bravos.\nLos ídolos de barro, que en medio la palestra\nse imaginaron reyes, apenas son esclavos.\n\n¡SAN NICOLÁS! ¡SAMPALOK! Palta a mi loa pauta\npara arrojar mis rosas sobre

In [48]:
print(poem_order_to_text['0026.txt'])

KeyError: '0026.txt'

In [None]:
# then create new files with title + text

for ke, va in poem_order_to_text.items():
    with open(os.path.join(corpus_dir, ke), "w") as f:
        f.write(va)