# Combine the Greek and Latin Data

In the first step, I made two CSV files: one for Latin authors; one for Greek authors. I need to combine them now.

In [7]:
import pandas as pd

In [8]:
greek = pd.read_csv('data/deduped_greek_with_authorized_and_variants.csv')

In [9]:
latin = pd.read_csv('data/dllid_variant_authorized.csv')

In [10]:
greek.head()

Unnamed: 0,URL,Name,Variants,Latin
0,http://viaf.org/viaf/27455561,Lycus Rheginus 3. Jh. v. Chr,"Lycus Rheginus 3. Jh. v. Chr, Licos de Rhègion...",0
1,https://viaf.org/viaf/100905484,"Nicephorus Saint, Patriarch of Constantinople","Nicéphore Ier, 0758?-0828, patriarche de Const...",0
2,https://viaf.org/viaf/10236001,"Clitophon Rhodius, 1./2. Jh. v. Chr.","Clitofó, Clitophon Rhodius v1./2. Jh.",0
3,https://viaf.org/viaf/312800491,"Demetrius, of Phaleron, b. ca. 350 B.C.","Démétrios de Phalère 0350?-0283? av. J.-C., Δη...",0
4,https://viaf.org/viaf/34843722,"Echembrotus Lyricus, 6. Jh. v. Chr","Equembrot, Echembrotus Lyricus 6. Jh. v. Chr, ...",0


In [11]:
latin.head()

Unnamed: 0,DLL ID,Authorized Name,Variant Names,Latin
0,A5349,"Celsus, Aulus Cornelius",A. Cornelius Celsus,1
1,A4246,"Augustinus, de Dacia, -1285",Aage von Dänemark,1
2,A4448,"Svend Aagesen, approximately 1130-","Aagesen, Svend, n. c. 1130",1
3,A6040,"Almeloveen, Theodoor Jansson ab, 1657-1712","Ab Almeloveen, Theodoor Jansson, 1657-1712",1
4,A5015,"Abelard, Peter","Abaelard, Peter 1079-1142",1


In [12]:
grouped_latin = latin.groupby(["DLL ID", "Authorized Name", "Latin"])["Variant Names"].apply(" | ".join).reset_index()

In [13]:
# The Greek authors don't have a DLL ID, so I'll create one with "G" and the numbers from the VIAF URL.
greek['DLL ID'] = 'G' + greek['URL'].str.strip('/').str[-5:]

In [14]:
greek.head()

Unnamed: 0,URL,Name,Variants,Latin,DLL ID
0,http://viaf.org/viaf/27455561,Lycus Rheginus 3. Jh. v. Chr,"Lycus Rheginus 3. Jh. v. Chr, Licos de Rhègion...",0,G55561
1,https://viaf.org/viaf/100905484,"Nicephorus Saint, Patriarch of Constantinople","Nicéphore Ier, 0758?-0828, patriarche de Const...",0,G05484
2,https://viaf.org/viaf/10236001,"Clitophon Rhodius, 1./2. Jh. v. Chr.","Clitofó, Clitophon Rhodius v1./2. Jh.",0,G36001
3,https://viaf.org/viaf/312800491,"Demetrius, of Phaleron, b. ca. 350 B.C.","Démétrios de Phalère 0350?-0283? av. J.-C., Δη...",0,G00491
4,https://viaf.org/viaf/34843722,"Echembrotus Lyricus, 6. Jh. v. Chr","Equembrot, Echembrotus Lyricus 6. Jh. v. Chr, ...",0,G43722


In [15]:
# Rearrance the columns
greek = greek[['DLL ID','Name','Variants','Latin']]

In [16]:
# Rename the columns
greek = greek.rename(columns={'Name':'Authorized Name','Variants':'Variant Names'})

In [17]:
greek.head()

Unnamed: 0,DLL ID,Authorized Name,Variant Names,Latin
0,G55561,Lycus Rheginus 3. Jh. v. Chr,"Lycus Rheginus 3. Jh. v. Chr, Licos de Rhègion...",0
1,G05484,"Nicephorus Saint, Patriarch of Constantinople","Nicéphore Ier, 0758?-0828, patriarche de Const...",0
2,G36001,"Clitophon Rhodius, 1./2. Jh. v. Chr.","Clitofó, Clitophon Rhodius v1./2. Jh.",0
3,G00491,"Demetrius, of Phaleron, b. ca. 350 B.C.","Démétrios de Phalère 0350?-0283? av. J.-C., Δη...",0
4,G43722,"Echembrotus Lyricus, 6. Jh. v. Chr","Equembrot, Echembrotus Lyricus 6. Jh. v. Chr, ...",0


In [18]:
# Turn the Variant Names column into a string
greek['Variant Names'] = greek['Variant Names'].str.split(',')
# Join the items with the pipe character
greek['Variant Names'] = greek['Variant Names'].str.join(' | ')

In [19]:
greek = greek[['DLL ID','Authorized Name','Variant Names','Latin']]

In [20]:
greek

Unnamed: 0,DLL ID,Authorized Name,Variant Names,Latin
0,G55561,Lycus Rheginus 3. Jh. v. Chr,Lycus Rheginus 3. Jh. v. Chr | Licos de Rhègi...,0
1,G05484,"Nicephorus Saint, Patriarch of Constantinople",Nicéphore Ier | 0758?-0828 | patriarche de C...,0
2,G36001,"Clitophon Rhodius, 1./2. Jh. v. Chr.",Clitofó | Clitophon Rhodius v1./2. Jh.,0
3,G00491,"Demetrius, of Phaleron, b. ca. 350 B.C.",Démétrios de Phalère 0350?-0283? av. J.-C. | ...,0
4,G43722,"Echembrotus Lyricus, 6. Jh. v. Chr",Equembrot | Echembrotus Lyricus 6. Jh. v. Chr...,0
...,...,...,...,...
2373,G73664,"Leo, VI, Emperor of the East 866-912",Léon VI | 0866-0912 | empereur d'Orient | L...,0
2374,G54624,"Xenocrates, of Chalcedon, approximately 396 B....",Xenocrates | of Chalcedon | approximately 39...,0
2375,G28402,"Thaletas, Musicus, um. 665 v. chr.",Taletes | Thaletas Musicus v665,0
2376,G29158,"Pachymérès, George, 1242-approximately 1310",Pachymère | Georges 1242-1310? | Pachymeres ...,0


In [21]:
# Get the length of both dataframes
print(len(greek))
print(len(grouped_latin))

2378
3137


In [23]:
# Concatenate the dataframes
frames = [grouped_latin,greek]
df = pd.concat(frames,ignore_index=True)
display(df)

Unnamed: 0,DLL ID,Authorized Name,Latin,Variant Names
0,A1868,"Herryson, Joannes",1,"Herryson, Joannes floruit=15th Century A.D. | ..."
1,A1870,"Stratford, John, -1348",1,Johannes Stratford | John Stratford | John Str...
2,A2181,"Nicomachus, of Gerasa",1,"Nicòmac, de Gerasa, actiu segle I | Nicomachus..."
3,A2491,"Anaritius, -approximately 922",1,Abū al-ʿAbbās al-Faḍl ibn Ḥātim Nayrīzī...
4,A2492,"Strecker, Karl, 1861-1945",1,"Karl Strecker | Strecker, Karl | Strecker, Kar..."
...,...,...,...,...
5510,G73664,"Leo, VI, Emperor of the East 866-912",0,Léon VI | 0866-0912 | empereur d'Orient | L...
5511,G54624,"Xenocrates, of Chalcedon, approximately 396 B....",0,Xenocrates | of Chalcedon | approximately 39...
5512,G28402,"Thaletas, Musicus, um. 665 v. chr.",0,Taletes | Thaletas Musicus v665
5513,G29158,"Pachymérès, George, 1242-approximately 1310",0,Pachymère | Georges 1242-1310? | Pachymeres ...


In [24]:
# Rearrange the columns
df = df[['DLL ID', 'Authorized Name', 'Variant Names','Latin']]

In [25]:
# Write to csv for use in the next step.
df.to_csv('data/latin_greek.csv',index=False)