In [1]:
# This code finds a unique set of volumes that are printed in England between 1500-1900 and are 
# written in English only. Some volumes are part of a serial set and are de-duplicated by choosing 
# the largest set of volumes from a given university, then adding any missing volumes from other
# universities. The largest serial set of volumes from a given university is chosen after standardizing 
# the 'description' field which describe the volume number of a serial set, and de-duplicating based on 
# this standardized 'description' field. The final list of volumes is then exported into a text file
# that we can use as an input to the 'Extracted Features Download Helper' algorithm in HTRC Analytics
# website. This algorithm creates a rsync script that we can use to download our volumes locally. 

In [1]:
import pandas as pd
from collections import Counter

# Load HTRC workset provided by HTRC Librarian of all volumes printed in England between 1500-1900
workset = pd.read_csv('./htrc_workset_final.csv', index_col=0)


In [2]:
# workset = workset.drop_duplicates(subset = ['record_id'], keep = 'first')

# Subset the workset that does not have a year, is not printed between 1500-1900, and not printed 
# in English (some volumes have multiple languages, and we keep volumes that are only in English)
workset =workset[~workset['year'].isna()]
workset = workset[(workset['year'] <= 1900) & (workset['year'] >= 1500)]
workset = workset[workset['language'] == 'eng']
workset

Unnamed: 0,title,year,language,authors,oclc,pub_place,record_id
mdp.39015029914945,Calendar to the sessions records. New Series.,1615.0,eng,Great Britain. Court of Quarter Sessions of th...,64221416,London,67108
hvd.32044098622814,"Remains, historical and literary, connected wi...",1858.0,eng,Chetham Society.,1554078,Preston England etc,45245
mdp.39015030098696,The Eclectic review.,1828.0,eng,,1645017,London,45926
uiug.30112051945001,Hansard's parliamentary debates.,1846.0,eng,"Great Britain. Parliament; Hansard, T. C. (Tho...",7655885,Sl,46502
nyp.33433081644647,The new monthly magazine.,1864.0,eng,,1590672,London,60891
...,...,...,...,...,...,...,...
uc1.31210013906464,Through Bosnia and the Herzegóvina on foot dur...,1876.0,eng,"Evans, Arthur, Sir, 1851-1941.",201699387,London,102596290
uc1.31210011845169,Memoirs of Doctor Burney : arranged from his o...,1832.0,eng,"Burney, Fanny, 1752-1840.",2772156,London,1457792
gri.ark:/13960/t0kt4cz3d,Painting popularly explained : including fresc...,1876.0,eng,"Gullick, Thomas John; Timbs, John, 1801-1875",21684331,London,102595851
gri.ark:/13960/t03z5zh7z,Catalogue of the valuable collection of water-...,1894.0,eng,"Christie, Manson & Woods.",171494672,London,102595845


In [6]:
# Find unique volumes based on Record_id
unique = workset[workset.duplicated(subset=['record_id'], keep=False) == False]

# Find duplicated volumes based on Record_id
duplicated = workset[workset.duplicated(subset=['record_id'], keep=False) == True]
htids_to_keep = set(duplicated.index.tolist())

In [1]:
# Get metadata of duplicated volumes, so that we can de-duplicate based on the 'description' and 
# 'rights_date_used' field of the metadata.

import pandas as pd
import gzip

# Function to handle lines with too many fields
def parse_line(line):
    fields = line.strip().split('\t')
    if len(fields) > 26:
        print(fields)
        # Combine extra fields into the last expected field
        fields = fields[:25] + ['\t'.join(fields[25:])]

    return fields[:26]  # Ensure we always return 26 fields

# Read the gzipped file line by line
data = []
with gzip.open(r'.\hathi_full_20241001.txt.gz', 'rt', encoding='utf-8') as file:
    for line in file:
        data.append(parse_line(line))

# Create DataFrame
df = pd.DataFrame(data, columns=[
    'htid', 'access', 'rights', 'ht_bib_key', 'description',
    'source', 'source_bib_num', 'oclc_num', 'isbn', 'issn',
    'lccn', 'title', 'imprint', 'rights_reason_code',
    'rights_timestamp', 'us_gov_doc_flag', 'rights_date_used',
    'pub_place', 'lang', 'bib_fmt', 'collection_code',
    'content_provider_code', 'responsible_entity_code',
    'digitization_agent_code', 'access_profile_code',
    'author'
])

del data

KeyboardInterrupt: 

In [None]:
import gc
df.set_index('htid', inplace=True)
df = df.loc[df.index.isin(htids_to_keep)]
df.reset_index(inplace=True)
gc.collect()

In [None]:
# df = df[['description','htid', 'ht_bib_key', 'source', 'rights_date_used']]
# d.drop_duplicates(subset = ['record_id'], keep = 'first')
df

In [10]:
# If a duplicated volume does not have an entry in the 'description' field, then it is not a serial volume
# In this case, we can choose any volume as they are all the same

no_serial = df[df['description'] =='']
no_serial = no_serial.drop_duplicates(subset = ['ht_bib_key'], keep = 'first')
no_serial

Unnamed: 0,description,htid,ht_bib_key,source,rights_date_used
0,,gri.ark:/13960/t0zp7r05j,000002718,CMALG,1871
2,,uc2.ark:/13960/t8bg2ks3g,000005398,UC,1883
7,,hvd.hnzsql,000006500,HVD,1889
12,,hvd.32044019058221,000007392,HVD,1875
15,,uc1.b3315237,000008815,UC,1896
...,...,...,...,...,...
273082,,hvd.32044036461671,102984816,HVD,1896
273084,,hvd.32044094973856,102984826,HVD,1886
273093,,hvd.hn1y4j,102985415,HVD,1899
273109,,hvd.32044106312044,102991213,HVD,1794


In [11]:
from collections import Counter

# If a volume has an entry in the 'description' field, then it is likey a serial volume
# We create a list of the 'description' field so that we can standarize them

serial = df[df['description'] !='']
counter = Counter(serial['description'])
# serial.set_index(['ht_bib_key', 'description'])
v = pd.DataFrame.from_dict(counter, orient='index', columns=['count']).reset_index()
v = v.sort_values('count', ascending=False)
v

Unnamed: 0,index,count
1,v.2,21016
0,v.1,20829
9,v.3,10173
5,v. 2,4997
4,v. 1,4905
...,...,...
25405,v.23 July-Dec 1760,1
25406,v.24 Jan-June 1761,1
25407,v.25 July-Dec 1761,1
25408,v.26 Jan-June 1762,1


In [12]:
import re
def vol_v(s):
    if 'vol' in s.lower():
        return s.lower().replace('vol','v')
    if len(s) < 2 and s.isnumeric():
        return 'v.' + s
    return s

def capital_v(s):
    pattern = r'V([0-9\W]*)$'
    if re.search(pattern, s):
        return re.sub(pattern, r'v\1', s)
    return s

def process_v_number(s):
    # This regex looks for a pattern of optional characters before 'v', then 'v' followed by numbers
    pattern = r'(?:^|\s)([^\s]*\s)?v\s*(\d+)'
    
    match = re.search(pattern, s, re.IGNORECASE)
    if match:
        # If there's a match, return 'v.' followed by the number
        return f'v.{match.group(2)}'
    
    # If no match is found, return the original string
    return s

def process_v_dot(s):
    # This regex looks for 'v.' optionally followed by a space, then numbers
    pattern = r'\bv\.\s*((?:\d+-?)+\d*)'
    
    match = re.search(pattern, s)
    if match:
        # If there's a match, return 'v.' followed by the number
        return f'v.{match.group(1)}'
    
    # If no match is found, return the original string
    return s

def remove_leading_zeros(s):
    # This regex looks for 'v.' followed by any number of digits
    pattern = r'(v\.)0*(\d+)'
    
    # Replace function to remove leading zeros
    def replace_zeros(match):
        prefix = match.group(1)  # 'v.'
        number = match.group(2)  # The number without leading zeros
        return f"{prefix}{number}"
    
    # Apply the regex substitution
    return re.sub(pattern, replace_zeros, s)

# Here we standarize the 'description' field
# Convert 'vol' to 'v' (eg. vol1 -> v1)
v['vol-v'] = v['index'].apply(vol_v)
# Convert capital 'V' into lowercase 'v' (eg. V1 -> v1)
v['cap-c'] = v['vol-v'].apply(capital_v)
# Convert 'v' followed by numbers into 'v.' followed by numbers (eg. v1 -> v.1)
v['v-num'] = v['cap-c'].apply(process_v_number)
# Remove any whitespace between 'v.' and numbers (eg. v. 1 -> v.1)
v['v-dot'] = v['v-num'].apply(process_v_dot)
# Remove any leading zeros between 'v.' and numbers (eg. v.001 -> v.1)
v['v-zeros'] = v['v-dot'].apply(remove_leading_zeros)
v_final = v[['index','v-zeros']]
v_final

Unnamed: 0,index,v-zeros
1,v.2,v.2
0,v.1,v.1
9,v.3,v.3
5,v. 2,v.2
4,v. 1,v.1
...,...,...
25405,v.23 July-Dec 1760,v.23
25406,v.24 Jan-June 1761,v.24
25407,v.25 July-Dec 1761,v.25
25408,v.26 Jan-June 1762,v.26


In [13]:
# def extract_before_dot(s):
#     # Find the position of the first dot
#     dot_position = s.find('.')
    
#     # If a dot is found, return the substring from start to that position
#     # If no dot is found, return the entire string
#     return s[:dot_position] if dot_position != -1 else s

# Merge the list of serial volumes with the standardized 'description' field
final = pd.merge(serial, v_final, left_on='description', right_on='index')
# final['univ'] = final['htid'].apply(extract_before_dot)

# Drop duplicates based on 'record_id', standardized 'description' (v-zeros column), and university source
final = final.drop_duplicates(subset = ['ht_bib_key', 'v-zeros','source'], keep = 'first') #consider replaceing v-zeros with description
record_ids = set(final['ht_bib_key'])
final

Unnamed: 0,description,htid,ht_bib_key,source,rights_date_used,index,v-zeros
0,v.1,uc1.31822043036524,000015029,UC,1662,v.1,v.1
2,v.1,hvd.hx66p3,000015522,HVD,1806,v.1,v.1
3,v.1,mdp.39015062330330,000015522,MIU,1806,v.1,v.1
4,v.1,uc2.ark:/13960/t37080b1s,000027101,UC,1888,v.1,v.1
5,v.1,mdp.39015004291343,000027101,MIU,1888,v.1,v.1
...,...,...,...,...,...,...,...
192174,"extra series,v.2",njp.32101017659515,103100966,NJP,1895,"extra series,v.2",v.2
192175,"v.33,36",njp.32101071987075,103100966,NJP,1895,"v.33,36",v.33
192177,vol.3-4 1884-1886,njp.32101043104908,103101143,NJP,1886,vol.3-4 1884-1886,v.3-4
192178,vol.5 1886-1887,njp.32101043104916,103101143,NJP,1887,vol.5 1886-1887,v.5


In [14]:
# Here we count the number of volumes in each university for a given 'record_id'
final_source = final[['ht_bib_key','source','v-zeros']]
final_source[['count']] = 1
final_source = final_source.groupby(['ht_bib_key','source']).sum()
final_source#.reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = value


Unnamed: 0_level_0,Unnamed: 1_level_0,count
ht_bib_key,source,Unnamed: 2_level_1
000015029,UC,2
000015522,HVD,4
000015522,MIU,5
000015522,NYP,5
000015522,UC,4
...,...,...
103101185,NJP,2
103101345,NJP,3
103101355,NJP,7
103101411,NJP,3


In [15]:
# Here we find the university with the longest serial set
top_serials = []
for rec_id in record_ids:
    subset = final_source.loc[rec_id,:]
    top_serials.append((rec_id, subset['count'].idxmax()))

In [16]:
# Create a DataFrame from the top_serials list
top_serials_df = pd.DataFrame(top_serials, columns=['ht_bib_key', 'source'])

# Merge the original DataFrame with the top_serials DataFrame
merged_df = final.merge(top_serials_df, on=['ht_bib_key', 'source'], how='inner')

# Select only the columns we need
d_serials = merged_df[['htid', 'ht_bib_key', 'source', 'description','v-zeros', 'rights_date_used']]

# If you want to reset the index of the resulting DataFrame
d_serials = d_serials.reset_index(drop=True)

d_serials

Unnamed: 0,htid,ht_bib_key,source,description,v-zeros,rights_date_used
0,uc1.31822043036524,000015029,UC,v.1,v.1,1662
1,uc1.31822043036573,000015029,UC,v.2,v.2,1662
2,mdp.39015062330330,000015522,MIU,v.1,v.1,1806
3,mdp.39015062330215,000015522,MIU,v.2,v.2,1806
4,mdp.39015062330348,000015522,MIU,v.4,v.4,1806
...,...,...,...,...,...,...
117370,hvd.32044014686919,102984957,HVD,v.4 (1837-43),v.4,1843
117371,hvd.32044107266025,102991213,HVD,interleaved with annotations,interleaved with annotations,1794
117372,njp.32101043104908,103101143,NJP,vol.3-4 1884-1886,v.3-4,1886
117373,njp.32101043104916,103101143,NJP,vol.5 1886-1887,v.5,1887


In [17]:
# def strip_to_numbers(list_item):
#     return set(sorted([re.sub(r'[^0-9-]', '', str(item)) for item in list_item]))

# def get_min_max(number_list):
#     numbers = [int(num) for num in number_list if num]
#     if numbers:
#         return f"{min(numbers)}-{max(numbers)}"
#     else:
#         return "N/A"

# def find_missing_numbers(row):
#     if row['min_max'] == 'N/A':
#         return []
    
#     min_val, max_val = map(int, row['min_max'].split('-'))
#     full_range = set(range(min_val, max_val + 1))
#     existing_numbers = set(int(num) for num in row['vol_nums'] if num)
#     return sorted(list(full_range - existing_numbers))

# Here we find the longest serial based on the 'rights_date_used' entry
v_f = final#[final['v-zeros'].str.contains(r'\bv\.\s*\d', regex=True)]
v_f = v_f[['ht_bib_key', 'rights_date_used']].groupby('ht_bib_key').agg(set)
# v_f['vol_nums'] = v_f['v-zeros'].apply(strip_to_numbers)
# v_f['min_max'] = v_f['rights_date_used'].apply(get_min_max)
v_f

Unnamed: 0_level_0,rights_date_used
ht_bib_key,Unnamed: 1_level_1
000015029,{1662}
000015522,{1806}
000027101,{1888}
000030447,{1840}
000045245,"{1883, 1874, 1844, 1872, 1867, 1884, 1877, 184..."
...,...
103101185,{1850}
103101345,{1840}
103101355,{1865}
103101411,{1857}


In [18]:
# Here we find the missing serial volumes from the university with the largest serial based on 'rights_date_used'

def set_difference(row):
    return list(set(row['rights_date_used_f']) - set(row['rights_date_used_ds']))
    
v_s = d_serials#[d_serials['v-zeros'].str.contains(r'\bv\.\s*\d', regex=True)]
v_s = v_s[['ht_bib_key', 'rights_date_used']].groupby('ht_bib_key').agg(set)
# # v_s['vol_nums'] = v_s['v-zeros'].apply(strip_to_numbers)
# # v_s['min_max'] = v_s['rights_date_used'].apply(get_min_max)
# # v_s['missing_numbers'] = v_s.apply(find_missing_numbers, axis=1)
v_s = v_s.merge(v_f, on=['ht_bib_key'], how='inner', suffixes = ('_ds', '_f'))
v_s['set_diff'] = v_s.apply(set_difference, axis=1)
v_s = v_s[v_s['set_diff'].apply(lambda x: len(x) != 0)]
v_s = v_s[['set_diff']].explode('set_diff').reset_index()
v_s = v_s.rename(columns={'set_diff':'rights_date_used'})
v_s

Unnamed: 0,ht_bib_key,rights_date_used
0,000045245,1863
1,000045245,1850
2,000045245,1893
3,000045245,1881
4,000045245,1854
...,...,...
4082,100884589,1590
4083,100884589,1623
4084,100884657,1658
4085,100971137,1873


In [19]:
# Here we add the the missing volumes to the largest serial set from a given university for a given 'record_id'

to_add = v_s.merge(final, on=['ht_bib_key', 'rights_date_used'], how='inner')
to_add = to_add[['htid','ht_bib_key','source', 'description', 'v-zeros','rights_date_used']].drop_duplicates(subset=['ht_bib_key','rights_date_used'])
d_serials=pd.concat([d_serials, to_add])

In [20]:
# Here we merge the unique 'record_ids' (unique) with the duplicated volumes but are not part of a 
# serial (no_serial) with the duplicated volumes that are part of a serial (d_serials)
uni = set(unique.index.to_list())
ns = set(no_serial.htid.to_list())
ds = set(d_serials.htid.to_list())
tot = uni.union(ns).union(ds)
tot = pd.DataFrame(tot, columns=['htid'])

# Here we fix the htid so that it can enter into the 'Extracted_Features_Download_Helper' algorithm
# that is in the HTRC Analytics webpage. This algorithm creates a rsync script that we can use to
# download the volumes onto our local computer.
tot['clean_htid'] = tot['htid'].apply(lambda x: x.replace(":", "+").replace("/", "=").replace("$","\$"))
tot[['htid']].rename(columns={'htid':'volume'}).to_csv('./final_workset_for_rsync_deduplicated.txt', index=False, header=True)

In [21]:
# The next step is to use the 'final_workset_for_rsync_deduplicated.txt' file to create a workset in 
# HTRC Analytics website. This workset is then used to create a rsync script using the 
# 'Extracted_Features_Download_Helper' algorithm. Finally, we use the rsync script to download the
# files locally.

Unnamed: 0,htid,clean_htid
0,umn.319510009035055,umn.319510009035055
1,mdp.39015056066601,mdp.39015056066601
2,hvd.hn3hg2,hvd.hn3hg2
3,mdp.39015064468013,mdp.39015064468013
4,njp.32101076529195,njp.32101076529195
...,...,...
264820,uiug.30112037506612,uiug.30112037506612
264821,mcg.ark:/13960/t2b86x19f,mcg.ark+=13960=t2b86x19f
264822,hvd.hnp8bv,hvd.hnp8bv
264823,uiug.30112073440080,uiug.30112073440080
