In [21]:
from pandas.core.interchange.dataframe_protocol import DataFrame

"""
Read in the metadata and merge it with the novelty data for the literary (fiction) corpus.
"""

# Directory that is scanned below for the .tsv data files.
data_dir = '../precocity'
# Collects one merged DataFrame per data file; the list is concatenated after the loop.
all_dfs = []

import pandas as pd
import os
import re
# Load metadata
df_meta = pd.read_csv('../metadata/19cfictionmeta.tsv', sep='\t')

# Load all paper data files
print("Reading data files...")
# os.listdir() returns names in arbitrary, platform-dependent order. Sorting
# makes the row order of the combined frame (and the index that is later
# written to CSV) reproducible from run to run.
for file in sorted(os.listdir(data_dir)):
    if not file.endswith('.tsv'):
        continue
    # The first run of four digits in the file name labels every row that
    # comes from this file; files without one are labelled 'unknown'.
    match = re.search(r'(\d{4})', file)
    year = match.group(1) if match else 'unknown'
    path = os.path.join(data_dir, file)
    df = pd.read_csv(path, sep='\t')
    df['paperId'] = df['docid']
    # Stored as a string (the 4-digit match or 'unknown'), not as a number.
    df['decade'] = year
    # how='left' keeps every metadata row for every file; metadata rows whose
    # docid is absent from this file come through with NaN in the file's columns.
    # NOTE(review): those rows look to be discarded by the chunks_used filter
    # later in this cell, assuming chunks_used comes from the data files - confirm.
    merged = df_meta.merge(df, on='docid', how='left')
    all_dfs.append(merged)


# gender_meta = pd.read_csv('lit_data/author_genders_lit.csv')
#read in the gender data we have already

# gender_meta = gender_meta[gender_meta['gender'].isin(['male', 'female'])]

# Stack the per-file merge results into a single frame with a fresh 0..n-1 index.
df_all = pd.concat(all_dfs, ignore_index=True)

# Deduplicate the data by keeping only the single set of novelty parameters
# that Ted recommended. No gender filtering happens in this step.
is_recommended_setting = (
    # chunks_used: 0.25 (top 25% most novel chunks)
    (df_all['chunks_used'] == 0.25)
    # time_radius: 20
    & (df_all['time_radius'] == 20)
    # filtered: the 'trainauthquote' variant
    & (df_all['filtered'] == 'trainauthquote')
    # fraction_compared: 1.0 = compared against all articles rather than only
    # the 10 percent most similar
    & (df_all['fraction_compared'] == 1.0)
)
df_all = df_all.loc[is_recommended_setting]





Reading data files...


In [22]:
# Save the combined, filtered frame. With the default arguments the DataFrame
# index is written as the first (unnamed) CSV column.
df_all.to_csv('df_all_fiction.csv')

In [23]:
# Preview the first rows of the metadata table.
df_meta.head()

Unnamed: 0,docid,allcopiesofwork,author,copiesin25yrs,earlyedition,imprint,inferreddate,lastname,latestcomp,nationality,...,mostdiscussedcontrast,usnorton,usnortoncontrast,nonusnorton,nonusnortoncontrast,preregistered,preregisteredcontrast,reviewed1965_1990,reviewed1965_1990contrast,toremove
0,uc2.ark+=13960=t5n877988,2.0,"Aytoun, William Edmondstoune",2.0,True,Edinburgh;London;William Blackwood and Sons;1861.,1861,Aytoun,1861,uk,...,0,0,0,0,0,0,0,0,0,False
1,uc2.ark+=13960=t1pg1hz3p,1.0,"Winfield, Arthur M",1.0,True,New York;Grosset & Dunlap;c1916.,1916,Winfield,1916,uk,...,0,0,0,0,0,0,0,0,0,False
2,uc1.b4369662,1.0,,1.0,True,Boston|Houghton Mifflin|1980.,1980,anonymous0,1980,guess: us,...,0,0,0,0,0,0,0,0,0,False
3,mdp.39015037418947,1.0,"Doig, Ivan",1.0,True,New York|Simon & Schuster|c1996.,1996,Doig,1996,us,...,0,0,0,0,0,0,0,0,0,False
4,uc1.32106010927223,2.0,"Tan, Amy",2.0,True,New York|Putnam|c1989.,1989,Tan,1989,us,...,0,1,0,0,0,0,0,1,0,False


In [24]:
def clean_author_name(name):
    """Tidy an author string: drop square brackets and quote marks, trim whitespace.

    Any value that is not a ``str`` (for example a missing value) is handed
    back untouched.
    """
    if not isinstance(name, str):
        return name
    # A single translate pass deletes every [, ], " and ' character.
    unwanted = str.maketrans('', '', '[]"\'')
    return name.translate(unwanted).strip()

In [25]:
# Normalise the author strings. .assign() builds a new frame instead of setting
# a column on the .loc-filtered frame held in df_all, which avoids pandas'
# SettingWithCopyWarning and keeps this cell safe to re-run.
df_all = df_all.assign(author=df_all['author'].apply(clean_author_name))


In [26]:
# Distinct author values in order of first appearance (a missing author, if
# present, appears once as NaN).
unique_authors = df_all['author'].unique()

In [27]:
# Number of distinct author values.
len(unique_authors)

491

In [28]:
# Lookup table with one row per distinct author. The 'gender' column is created
# empty here and is filled in by the Wikidata lookup further down.
df_gender = pd.DataFrame(columns=['author','gender'])

df_gender['author'] = unique_authors


In [29]:
# df_gender['author'] = df_gender['author'].str.split(',').apply(lambda x: x[1].strip() + ' ' + x[0].strip())


In [30]:
def flip_name(n):
    """Turn a "Last, First" author string into "First Last".

    The string is split at the first comma only, so any further commas stay in
    the "first" part (e.g. "Benson, Robert Hugh, (Spirit)" becomes
    "Robert Hugh, (Spirit) Benson").

    Values that are not strings (such as NaN for a missing author) and strings
    without a comma are returned unchanged, matching how clean_author_name
    treats non-string input.
    """
    if not isinstance(n, str) or ',' not in n:
        return n  # leave unchanged
    last, first = n.split(',', 1)
    # Skip an empty side so "Smith," yields "Smith" rather than " Smith".
    return ' '.join(part for part in (first.strip(), last.strip()) if part)

# Rewrite the author column as "First Last" ahead of the name lookup below.
df_gender['author'] = df_gender['author'].apply(flip_name)


In [31]:
#add gender metadata

import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
import time

# Example DataFrame
# df = pd.DataFrame({
#     'author': ['Mary Shelley', 'H.G. Wells', 'J.K. Rowling'],
#     'title': ['Frankenstein', 'The Time Machine', 'Harry Potter']
# })

# Wikidata SPARQL endpoint
# Module-level client that get_gender_wikidata reuses for every query.
# NOTE(review): Wikimedia asks API clients to send a descriptive User-Agent;
# consider passing agent="..." to SPARQLWrapper - TODO confirm.
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

def _escape_sparql_literal(text):
    """Backslash-escape characters that would end or corrupt a double-quoted SPARQL string."""
    return (
        text.replace('\\', '\\\\')
            .replace('"', '\\"')
            .replace('\n', '\\n')
            .replace('\r', '\\r')
    )


def get_gender_wikidata(name):
    """
    Query Wikidata for a person's gender given their name.

    Looks for an item that is an instance of human (Q5) whose English label is
    exactly `name`, and returns the English label of its sex-or-gender (P21)
    value.

    Parameters
    ----------
    name : str
        Name to match against the English label. It is matched exactly as given.

    Returns
    -------
    str or None
        The gender label Wikidata holds (commonly 'male' or 'female', but any
        P21 label can come back). None when `name` is not a non-blank string,
        when nothing matches, or when the query fails (the error is printed).
    """
    # Missing values (NaN/None) and blank strings cannot match a label; skip the
    # network round trip instead of querying for e.g. the literal text "nan".
    if not isinstance(name, str) or not name.strip():
        return None

    # Escape the name so a quote or backslash in it cannot terminate the string
    # literal and change the query.
    label = _escape_sparql_literal(name)
    # LIMIT 1: if several people share the label, whichever row the endpoint
    # returns first is used.
    query = f"""
    SELECT ?genderLabel WHERE {{
      ?person wdt:P31 wd:Q5;        # instance of human
              rdfs:label "{label}"@en;
              wdt:P21 ?gender.     # property for gender/sex
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    LIMIT 1
    """
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    try:
        results = sparql.query().convert()
        bindings = results['results']['bindings']
        if bindings:
            return bindings[0]['genderLabel']['value']
        else:
            return None
    except Exception as e:
        # Best effort by design: one failed lookup must not abort the whole run.
        print(f"Error querying {name}: {e}")
        return None

# Fill in the gender column: one Wikidata lookup per author, in row order.
def _lookup_with_pause(author_name):
    """Look up one author's gender, then wait a second before returning."""
    found = get_gender_wikidata(author_name)
    time.sleep(1)  # be nice to the server
    return found

genders = [_lookup_with_pause(author_name) for author_name in df_gender['author']]

df_gender['gender'] = genders

# print(df)

In [32]:
# Preview the first rows of the author/gender table.
df_gender.head()

Unnamed: 0,author,gender
0,"Robert Hugh, (Spirit) Benson",
1,Mary Catharine Rowsell,
2,I. A. R. (Ida Alexa Ross) Wylie,
3,Alice Milligan,female
4,"Nina (Wilcox), Mrs Putnam",


In [33]:
# Save the author/gender table. With the default arguments the DataFrame index
# is written as the first (unnamed) CSV column.
df_gender.to_csv('df_gender.csv')