# Prepare HathiTrust data for processing

In [2]:
import csv

import pandas as pd

In [3]:
# Load the tab-separated HathiTrust export; read_table uses a tab delimiter by default.
df = pd.read_table(
    '../data/1908698974-1722799169.txt',
    encoding='utf-8',
)

In [4]:
# List the column names available in the loaded export.
df.columns

Index(['htid', 'access', 'rights', 'ht_bib_key', 'description', 'source',
       'source_bib_num', 'oclc_num', 'isbn', 'issn', 'lccn', 'title',
       'imprint', 'rights_reason_code', 'rights_timestamp', 'us_gov_doc_flag',
       'rights_date_used', 'pub_place', 'lang', 'bib_fmt', 'collection_code',
       'content_provider_code', 'responsible_entity_code',
       'digitization_agent_code', 'access_profile_code', 'author',
       'catalog_url', 'handle_url'],
      dtype='object')

In [5]:
# Summarize the dtype and non-null count of each column to see which fields are sparsely populated.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24799 entries, 0 to 24798
Data columns (total 28 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   htid                     24799 non-null  object 
 1   access                   24799 non-null  int64  
 2   rights                   24799 non-null  object 
 3   ht_bib_key               24799 non-null  int64  
 4   description              10074 non-null  object 
 5   source                   24799 non-null  object 
 6   source_bib_num           24729 non-null  object 
 7   oclc_num                 17400 non-null  object 
 8   isbn                     164 non-null    object 
 9   issn                     0 non-null      float64
 10  lccn                     3208 non-null   object 
 11  title                    24799 non-null  object 
 12  imprint                  24788 non-null  object 
 13  rights_reason_code       24799 non-null  object 
 14  rights_timestamp      

## Check for duplicates

See if there are any identifiers that can be used to eliminate duplicate records before starting to process authors and titles.

In [6]:
# Rows whose (author, title) pair occurs more than once.
# keep=False flags every member of a duplicate group, not only the repeats.
duplicated_titles = df.loc[
    df.duplicated(subset=['author', 'title'], keep=False)
]
print(duplicated_titles.shape[0])

13851


In [12]:
# Rows whose source_bib_num occurs more than once (all members of each group).
duplicated_source_bib_num = df.loc[
    df.duplicated(subset=['source_bib_num'], keep=False)
]
print(duplicated_source_bib_num.shape[0])

11703


In [14]:
# Rows whose oclc_num occurs more than once (all members of each group).
# Note: duplicated() treats missing values as equal to each other, so rows
# without an oclc_num are included in this count.
duplicated_oclc_num = df.loc[
    df.duplicated(subset=['oclc_num'], keep=False)
]
print(duplicated_oclc_num.shape[0])

17870


In [15]:
# Rows whose catalog_url occurs more than once (all members of each group).
duplicated_catalog_url = df.loc[
    df.duplicated(subset=['catalog_url'], keep=False)
]
print(duplicated_catalog_url.shape[0])

13443


In [16]:
# Rows whose handle_url occurs more than once; a count of 0 means the column is unique per row.
duplicated_handle_url = df.loc[
    df.duplicated(subset=['handle_url'], keep=False)
]
print(duplicated_handle_url.shape[0])

0


In [17]:
# Rows whose ht_bib_key occurs more than once (all members of each group).
duplicated_ht_bib_key = df.loc[
    df.duplicated(subset=['ht_bib_key'], keep=False)
]
print(duplicated_ht_bib_key.shape[0])

13443


In [19]:
# Rows whose htid occurs more than once; a count of 0 means the column is unique per row.
duplicated_htid = df.loc[
    df.duplicated(subset=['htid'], keep=False)
]
print(duplicated_htid.shape[0])

0


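There is some disagreement about what has been duplicated. Both `catalog_url` and `ht_bib_key` return 13,443 duplicated rows. That is similar to the number of rows duplicated by `author` and `title` (13,851). The `source_bib_num` yields 11,703 duplicated rows. Curiously, there are 17,870 duplicated rows based on `oclc_num`, but there are only 17,400 rows with values in that column. The likely explanation is that `duplicated()` treats missing values as equal to one another, so the 7,399 rows with no `oclc_num` (24,799 − 17,400) all count as duplicates of each other; that would leave 10,471 duplicated rows among those that actually have an OCLC number. The same effect applies, on a smaller scale, to any other column with missing values (for example, the 70 rows without a `source_bib_num`). I need to look more closely at the discrepancies.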

In [20]:
# Rows that agree with at least one other row on author, title, AND ht_bib_key.
# keep=False flags every member of a duplicate group, not only the repeats.
duplicated_author_title_htbibkey = df.loc[
    df.duplicated(subset=['author', 'title', 'ht_bib_key'], keep=False)
]
print(duplicated_author_title_htbibkey.shape[0])

13435


In [27]:
# Display the rows that share author, title, and ht_bib_key with at least one other row.
duplicated_author_title_htbibkey

Unnamed: 0,htid,access,rights,ht_bib_key,description,source,source_bib_num,oclc_num,isbn,issn,...,lang,bib_fmt,collection_code,content_provider_code,responsible_entity_code,digitization_agent_code,access_profile_code,author,catalog_url,handle_url
5,bc.ark:/13960/s21cdmp944s,1,pd,102991209,t.24,MCHB,99138037598801021,1129016314,,,...,lat,BK,IBC,bc,bc,bc,open,"Drexel, Jeremias, 1581-1638,",https://catalog.hathitrust.org/Record/102991209,https://hdl.handle.net/2027/bc.ark:/13960/s21c...
7,bc.ark:/13960/s21zqjtnqq1,1,pd,102991209,t.16,MCHB,99138037598801021,1129016314,,,...,lat,BK,IBC,bc,bc,bc,open,"Drexel, Jeremias, 1581-1638,",https://catalog.hathitrust.org/Record/102991209,https://hdl.handle.net/2027/bc.ark:/13960/s21z...
8,bc.ark:/13960/s22tp1wf2p0,1,pd,102991209,t.14,MCHB,99138037598801021,1129016314,,,...,lat,BK,IBC,bc,bc,bc,open,"Drexel, Jeremias, 1581-1638,",https://catalog.hathitrust.org/Record/102991209,https://hdl.handle.net/2027/bc.ark:/13960/s22t...
9,bc.ark:/13960/s22vwz03q26,1,pd,102991209,t.21,MCHB,99138037598801021,1129016314,,,...,lat,BK,IBC,bc,bc,bc,open,"Drexel, Jeremias, 1581-1638,",https://catalog.hathitrust.org/Record/102991209,https://hdl.handle.net/2027/bc.ark:/13960/s22v...
10,bc.ark:/13960/s233d6vr52m,1,pd,102991209,t.20,MCHB,99138037598801021,1129016314,,,...,lat,BK,IBC,bc,bc,bc,open,"Drexel, Jeremias, 1581-1638,",https://catalog.hathitrust.org/Record/102991209,https://hdl.handle.net/2027/bc.ark:/13960/s233...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24790,yale.39002051605815,1,pd,8394308,,YALE,1696518,2167761,,,...,lat,BK,YALE,yale,yale,yale,open,"Thomas, Aquinas, Saint, 1225?-1274.",https://catalog.hathitrust.org/Record/8394308,https://hdl.handle.net/2027/yale.39002051605815
24791,yale.39002051605823,1,pd,8394308,,YALE,1696518,2167761,,,...,lat,BK,YALE,yale,yale,yale,open,"Thomas, Aquinas, Saint, 1225?-1274.",https://catalog.hathitrust.org/Record/8394308,https://hdl.handle.net/2027/yale.39002051605823
24792,yale.39002051605831,1,pd,8394308,,YALE,1696518,2167761,,,...,lat,BK,YALE,yale,yale,yale,open,"Thomas, Aquinas, Saint, 1225?-1274.",https://catalog.hathitrust.org/Record/8394308,https://hdl.handle.net/2027/yale.39002051605831
24793,yale.39002051605849,1,pd,8394308,,YALE,1696518,2167761,,,...,lat,BK,YALE,yale,yale,yale,open,"Thomas, Aquinas, Saint, 1225?-1274.",https://catalog.hathitrust.org/Record/8394308,https://hdl.handle.net/2027/yale.39002051605849


In [28]:
# Collapse each group of rows sharing author, title, and ht_bib_key down to
# its first occurrence; the original df is left untouched.
deduplicated = df.drop_duplicates(
    subset=['author', 'title', 'ht_bib_key'],
    keep='first',
)

In [None]:
# Keep only the columns needed downstream and discard the rest.
reduced = deduplicated.loc[
    :,
    ['author', 'title', 'imprint', 'pub_place', 'rights_date_used', 'handle_url'],
]

In [30]:
# Give the retained HathiTrust columns shorter names; author and title keep theirs.
renamed = reduced.rename(
    {
        'imprint': 'publisher',
        'pub_place': 'place',
        'rights_date_used': 'year',
        'handle_url': 'url',
    },
    axis='columns',
)

In [31]:
# Display the reduced, renamed table for a visual check.
renamed

Unnamed: 0,author,title,publisher,place,year,url
0,"Du Creux, François, 1596?-1666.","Historiæ canadensis, seu Novæ-Franciæ libri de...",Apud Sebastianum Cramoisy et Sebast. Mabre-Cra...,fr,1664,https://hdl.handle.net/2027/aeu.ark:/13960/t25...
1,"Meyer, Ernst H. F. 1791-1858.",Ernesti Meyer de plantis labradoricis libri tres.,"Sumtibus Leopoldi Vossii, 1830.",gw,1830,https://hdl.handle.net/2027/aeu.ark:/13960/t5q...
2,"Laet, Joannes de, 1593-1649.","Novus orbis, seu Descriptionis Indiae Occident...","Apud Elzevirios, 1633.",ne,1633,https://hdl.handle.net/2027/aeu.ark:/13960/t61...
3,"Caesar, Julius",C. Julii Cæsaris commentariorum De Bello Galli...,"Armour and Ramsay, 1849.",quc,1849,https://hdl.handle.net/2027/aeu.ark:/13960/t6t...
4,,Collectanea latina seu ecclesiasticæ antiquita...,"[s.n.], 1853.",onc,1853,https://hdl.handle.net/2027/aeu.ark:/13960/t77...
...,...,...,...,...,...,...
24774,"Thomas, à Kempis, 1380-1471.","Opera omnia, voluminibus septem edidit additoq...","Herder, 1902-18.",xx,1918,https://hdl.handle.net/2027/yale.39002051605351
24795,,Cvlex carmen Vergilio ascriptvm; recensvit et ...,"Weidmann, 1891.",gw,1891,https://hdl.handle.net/2027/yul.12176050_000_00
24796,Persius.,A. Persii flacci Satirarum liber Ex recensione...,"sumptibus et typis B.G. Teubneri, 1881.",gw,1881,https://hdl.handle.net/2027/yul.12248268_000_00
24797,"Jaʻfarī, Ṣāliḥ ibn al-Ḥusayn, d. 1269 or ...",Liber decem quaestionum contra Christianos auc...,"Typis Caroli Drobnig, 1897.",gw,1897,https://hdl.handle.net/2027/yul.12324064_000_00


In [32]:
# Write the cleaned table to CSV without the index column. QUOTE_ALL wraps
# every field in quotes so that commas and quotation marks inside titles and
# imprints cannot be mistaken for delimiters.
renamed.to_csv('../data/hathi2.csv', index=False, quoting=csv.QUOTE_ALL)

In [36]:
# Print every distinct author string in sorted order to spot variant spellings.
# Casting to str first lets missing authors (NaN) sort alongside the names;
# they appear in the listing as 'nan'.
for author in sorted(set(renamed['author'].astype('str'))):
    print(author)

Abad, Diego José, 1727-1779
Abadía de Santillana del Mar.
Abati, Baldo Angelo
Abaunza, Pedro de 1599-1649.
Abbatius, Baldus Angelus, 16th cent.
Abbeloos, J. B. 1836-1896.
Abbeloos, Jean Baptiste, 1836-1906.
Abbo, Monk of St. Germain, approximately 850-approximately 923.
Abdias, Obispo de Babilonia.
Abelard, Peter, 1079-1142.
Abicht, Rudolf, 1850-1921.
Abrahams, Nicolai Christian Levin, 1798-1870.
Abril, Pedro Simón, ca. 1530- ca. 1595.
Abu al-Faraj al-Isbahani, 897 or 8-967.
Abū Miḥjan al-Thaqafī, active 629-637
Abū Miḥjan al-Thaqafī, fl. 629-637.
Abū Tammām Ḥabīb ibn Aws al-Ṭāʼī, active 808-842
Abū Tammām Ḥabīb ibn Aws al-Ṭāʾī, fl. 808-842,
Abū al-Rabīʻ Sulaymān ibn ʻAbd Allāh al-Muwaḥḥid.
Abū ʻUbayd al-Qāsim ibn Sallām, approximately 773-approximately 837
Abū al-Faraj al-Iṣbahānī, 897 or 898-967.
Academia Molshemensis (Francia)
Accademia degli Occulti (Brescia)
Acevedo, Alfonso de, 1518-1598
Achilles Tatius
Achilles Tatius.
Achillini, Alessandro
Achillin