In [77]:
"""
Read in the metadata and merge with the novelty data for lit data
"""

import os
import re

import pandas as pd

data_dir = 'lit_data/precocity'
all_dfs = []

# Load metadata
df_meta = pd.read_csv('lit_data/lit_metadata/LitMetadataWithS2.tsv', sep='\t')

# Load all paper data files
print("Reading data files...")
# os.listdir returns entries in arbitrary order, so sort them: the row order of
# the concatenated frame (and of any file saved from it) is then reproducible.
for file in sorted(os.listdir(data_dir)):
    if not file.endswith('.tsv'):
        continue
    # The first four-digit run in the file name becomes the 'decade' label.
    match = re.search(r'(\d{4})', os.path.basename(file))
    year = match.group(1) if match else 'unknown'
    path = os.path.join(data_dir, file)
    df = pd.read_csv(path, sep='\t')
    df['paperId'] = df['docid']
    df['decade'] = year
    # Left join keeps every metadata row; rows with no match in this file get
    # NaN in the novelty columns and are removed by the equality filters below.
    merged = df_meta.merge(df, on='paperId', how='left')
    all_dfs.append(merged)

if not all_dfs:
    # Without this guard pd.concat fails with "No objects to concatenate".
    raise ValueError(f"No .tsv files found in {data_dir!r}")

# Read in the gender data we have already, keeping only male/female labels
gender_meta = pd.read_csv('lit_data/author_genders_lit.csv')
gender_meta = gender_meta[gender_meta['gender'].isin(['male', 'female'])]

df_all = pd.concat(all_dfs, ignore_index=True)

# Deduplicate the data by selecting the one set of parameters Ted recommended,
# so that only a single set of novelty results (time chunks etc.) remains.
selected_params = {
    'chunks_used': 0.25,           # top 25% most novel chunks
    'time_radius': 20,
    'filtered': 'trainauthquote',
    'fraction_compared': 1.0,      # all articles, not just the 10 percent most similar
}
for column, value in selected_params.items():
    df_all = df_all.loc[df_all[column] == value]





Reading data files...


In [78]:
""" explode out author list"""

"""
Split up author names in lists with semicolons or commas.
Clean/lowercase all author names.
"""

def split_authors(authors_str):
    """Split a delimited author string into a list of trimmed names.

    Both semicolons and commas act as separators. Pieces that are empty
    after trimming whitespace are discarded. Any input that is not a
    ``str`` produces an empty list.
    """
    if isinstance(authors_str, str):
        pieces = (piece.strip() for piece in re.split(r'[;,]', authors_str))
        return [piece for piece in pieces if piece]
    return []


def clean_author_name2(name):
    """Normalise an author name string; pass non-strings through untouched.

    For a ``str``: trim surrounding whitespace, lowercase, delete square
    brackets and single/double quote characters, then trim again.
    """
    if not isinstance(name, str):
        return name
    cleaned = name.strip().lower()
    for unwanted in ('[', ']', '"', "'"):
        cleaned = cleaned.replace(unwanted, '')
    return cleaned.strip()


# Turn each raw author string into a list of names, then emit one row per name.
df_all = df_all.assign(authors=df_all['authors'].apply(split_authors)).explode('authors')






In [79]:
import ast

def flatten_author(author):
    """Collapse a list (or a string holding a list literal) into one string.

    * A list with exactly one element returns that element; any other list
      returns its items joined with ", ".
    * A string is parsed with ``ast.literal_eval``; if the result is a list
      it is collapsed the same way. If parsing or joining raises, or the
      parsed value is not a list, the original string is returned.
    * Every other value is returned unchanged.
    """
    if isinstance(author, list):
        if len(author) == 1:
            return author[0]
        return ", ".join(author)

    if not isinstance(author, str):
        return author

    try:
        candidate = ast.literal_eval(author)
        if isinstance(candidate, list):
            if len(candidate) == 1:
                return candidate[0]
            return ", ".join(candidate)
    except Exception:
        # Not a usable literal: fall through and keep the original string.
        pass
    return author

# Run flatten_author over every value of the exploded 'authors' column.
df_all['authors'] = df_all['authors'].map(flatten_author)


In [80]:
# num_chunks: not filtered (the original note doubts that it needs to be).

# Build an author -> gender lookup from gender_meta.
gender_dict = gender_meta.set_index('author')['gender'].to_dict()

# Label each author row by exact-string lookup; names without an entry
# in the lookup are marked 'unknown'.
df_all['gender'] = df_all['authors'].map(gender_dict).fillna('unknown')

# Keep the finished literature table under its own name.
df_all_lit = df_all

In [81]:
# Save the processed literature table to the working directory.
# to_csv writes the row index as the first column by default.
df_all_lit.to_csv('df_all_lit.csv')

Now we pre-process the econ data, repeating the same steps used for the lit data above.

In [82]:
"""
Read in the metadata and merge with the novelty data for econ data
"""

data_dir = 'econ_data/precocity'
all_dfs = []

# Load metadata
df_meta = pd.read_csv('econ_data/econ_metadata/all-econ-S2metaWyearcol.tsv', sep='\t')

# Load all paper data files
print("Reading data files...")
# os.listdir returns entries in arbitrary order, so sort them: the row order of
# the concatenated frame (and of any file saved from it) is then reproducible.
for file in sorted(os.listdir(data_dir)):
    if not file.endswith('.tsv'):
        continue
    # The first four-digit run in the file name becomes the 'decade' label.
    match = re.search(r'(\d{4})', os.path.basename(file))
    year = match.group(1) if match else 'unknown'
    path = os.path.join(data_dir, file)
    df = pd.read_csv(path, sep='\t')
    df['paperId'] = df['docid']
    df['decade'] = year
    # Left join keeps every metadata row; rows with no match in this file get
    # NaN in the novelty columns and are removed by the equality filters below.
    merged = df_meta.merge(df, on='paperId', how='left')
    all_dfs.append(merged)

if not all_dfs:
    # Without this guard pd.concat fails with "No objects to concatenate".
    raise ValueError(f"No .tsv files found in {data_dir!r}")

# Read in the gender data we have already, keeping only male/female labels
gender_meta = pd.read_csv('econ_data/author_gender_econ.csv')
gender_meta = gender_meta[gender_meta['gender'].isin(['male', 'female'])]

df_all = pd.concat(all_dfs, ignore_index=True)

# Deduplicate the data by selecting the one set of parameters Ted recommended,
# so that only a single set of novelty results (time chunks etc.) remains.
selected_params = {
    'chunks_used': 0.25,           # top 25% most novel chunks
    'time_radius': 20,
    'filtered': 'trainauthquote',
    'fraction_compared': 1.0,      # all articles, not just the 10 percent most similar
}
for column, value in selected_params.items():
    df_all = df_all.loc[df_all[column] == value]



Reading data files...


In [83]:
""" explode out author list"""

"""
Split up author names in lists with semicolons or commas.
Clean/lowercase all author names.
"""

def split_authors(authors_str):
    """Split a delimited author string into a list of trimmed names.

    Both semicolons and commas act as separators. Pieces that are empty
    after trimming whitespace are discarded. Any input that is not a
    ``str`` produces an empty list.
    """
    if isinstance(authors_str, str):
        pieces = (piece.strip() for piece in re.split(r'[;,]', authors_str))
        return [piece for piece in pieces if piece]
    return []


def clean_author_name2(name):
    """Normalise an author name string; pass non-strings through untouched.

    For a ``str``: trim surrounding whitespace, lowercase, delete square
    brackets and single/double quote characters, then trim again.
    """
    if not isinstance(name, str):
        return name
    cleaned = name.strip().lower()
    for unwanted in ('[', ']', '"', "'"):
        cleaned = cleaned.replace(unwanted, '')
    return cleaned.strip()


# Turn each raw creator string into a list of names, then emit one row per name.
df_all = df_all.assign(creator=df_all['creator'].apply(split_authors)).explode('creator')






In [84]:
import ast

def flatten_author(author):
    """Collapse a list (or a string holding a list literal) into one string.

    * A list with exactly one element returns that element; any other list
      returns its items joined with ", ".
    * A string is parsed with ``ast.literal_eval``; if the result is a list
      it is collapsed the same way. If parsing or joining raises, or the
      parsed value is not a list, the original string is returned.
    * Every other value is returned unchanged.
    """
    if isinstance(author, list):
        if len(author) == 1:
            return author[0]
        return ", ".join(author)

    if not isinstance(author, str):
        return author

    try:
        candidate = ast.literal_eval(author)
        if isinstance(candidate, list):
            if len(candidate) == 1:
                return candidate[0]
            return ", ".join(candidate)
    except Exception:
        # Not a usable literal: fall through and keep the original string.
        pass
    return author

# Run flatten_author over every value of the exploded 'creator' column.
df_all['creator'] = df_all['creator'].map(flatten_author)


In [85]:
# num_chunks: not filtered (the original note doubts that it needs to be).

# Build an author -> gender lookup from gender_meta.
gender_dict = gender_meta.set_index('author')['gender'].to_dict()

# Label each creator row by exact-string lookup; names without an entry
# in the lookup are marked 'unknown'.
df_all['gender'] = df_all['creator'].map(gender_dict).fillna('unknown')

# Keep the finished econ table under its own name.
df_all_econ = df_all

In [86]:
df_all_lit.head()

Unnamed: 0,doi,journal,year,authors,title,language,wordcount,doctype,citation_counts,S2titles,...,num_chunks,fraction_compared,filtered,time_radius,chunks_used,precocity,novelty,transience,decade,gender
17,10.2307/461288,PMLA,1967,Jean-Jacques Demorest,Pascal et le déséquilibre,['fre'],4999,article | research-article,0,Pascal et le déséquilibre,...,7.0,1.0,trainauthquote,20.0,0.25,-0.01339,8.627299,8.640689,1960,male
68,10.2307/3723455,The Modern Language Review,1969,Patricia Thomson,Review Article,['eng'],1243,article | book-review,8,The English Petrarchans : a critical bibliogra...,...,2.0,1.0,trainauthquote,20.0,0.25,0.284837,5.996719,5.711883,1960,female
174,10.2307/3721313,The Modern Language Review,1963,Margaret McHaffie,Review Article,['eng'],600,article | book-review,0,Jeremias Gotthelf : eine Einführung in seine W...,...,1.0,1.0,trainauthquote,20.0,0.25,0.235353,6.185017,5.949665,1960,unknown
339,10.2307/460722,PMLA,1963,Lillian Herlands Hornstein,King Robert of Sicily: A New Manuscript,['eng'],3991,article | research-article,0,King Robert of Sicily: A New Manuscript,...,7.0,1.0,trainauthquote,20.0,0.25,0.112459,6.578631,6.466172,1960,female
404,10.2307/512220,The Review of English Studies,1967,William H. Matchett,Donne's 'Peece of Chronicle',['eng'],920,article | research-article,0,DONNE'S ’PEECE OF CHRONICLE‘,...,1.0,1.0,trainauthquote,20.0,0.25,0.280793,5.364418,5.083625,1960,unknown


In [87]:
df_all_econ.head()

Unnamed: 0,index,id,title,isPartOf,publicationYear,doi,docType,docSubType,provider,collection,...,num_chunks,fraction_compared,filtered,time_radius,chunks_used,precocity,novelty,transience,decade,gender
16,3,http://www.jstor.org/stable/1808956,"Tariffs, Intermediate Goods, and Domestic Prot...",The American Economic Review,1969,10.2307/1808956,article,research-article,jstor,,...,15.0,1.0,trainauthquote,20.0,0.25,0.038633,0.005717,-0.032916,1960,unknown
74,32,http://www.jstor.org/stable/1884001,Revealed Preference and the Demand Theorem in ...,The Quarterly Journal of Economics,1969,10.2307/1884001,article,research-article,jstor,,...,4.0,1.0,trainauthquote,20.0,0.25,-0.00811,-0.004197,0.003913,1960,unknown
136,78,http://www.jstor.org/stable/1811027,The Differential Effects of Tight Money: Reply,The American Economic Review,1963,10.2307/1811027,article,research-article,jstor,,...,3.0,1.0,trainauthquote,20.0,0.25,0.0269,-0.015106,-0.042006,1960,unknown
136,78,http://www.jstor.org/stable/1811027,The Differential Effects of Tight Money: Reply,The American Economic Review,1963,10.2307/1811027,article,research-article,jstor,,...,3.0,1.0,trainauthquote,20.0,0.25,0.0269,-0.015106,-0.042006,1960,unknown
184,78,http://www.jstor.org/stable/1811027,The Differential Effects of Tight Money: Reply,The American Economic Review,1963,10.2307/1811027,article,research-article,jstor,,...,3.0,1.0,trainauthquote,20.0,0.25,0.018749,-0.020036,-0.038785,1960,unknown


In [88]:
print(df_all_econ.columns)

Index(['index', 'id', 'title', 'isPartOf', 'publicationYear', 'doi', 'docType',
       'docSubType', 'provider', 'collection', 'datePublished', 'issueNumber',
       'volumeNumber', 'url', 'creator', 'publisher', 'language', 'pageStart',
       'pageEnd', 'placeOfPublication', 'keyphrase', 'wordCount', 'pageCount',
       'file', 'paperId', 'citationCount', 'foundTitle', 'foundYear',
       'foundAuthors', 'citations', 'paperSource', 'doiMatch', 'searchMatch',
       'year', 'docid', 'date', 'num_chunks', 'fraction_compared', 'filtered',
       'time_radius', 'chunks_used', 'precocity', 'novelty', 'transience',
       'decade', 'gender'],
      dtype='object')


In [89]:
df_all_econ['creator'].head()

16      Roy J. Ruffin
74     Tapas Majumdar
136        G. L. Bach
136    C. J. Huizenga
184        G. L. Bach
Name: creator, dtype: object

In [90]:
# Rename 'creator' to 'authors', the column name used in the literature frame.
# NOTE: this mutates the frame in place. df_all_econ was bound via
# `df_all_econ = df_all`, so df_all sees the rename too unless it has been
# rebound since; re-running the earlier 'creator' cells afterwards would fail.
df_all_econ.rename(columns={'creator': 'authors'}, inplace=True)


In [91]:
# Rename 'isPartOf' to 'journal', the column name used in the literature frame (in place).
df_all_econ.rename(columns={'isPartOf': 'journal'}, inplace=True)


In [92]:
# Rename 'citationCount' to 'citation_counts', the column name used in the literature frame (in place).
df_all_econ.rename(columns={'citationCount': 'citation_counts'}, inplace=True)


In [93]:
# df_all_econ.rename(columns={'publicationYear': 'year'}, inplace=True)


In [94]:
# Independent copy of the econ frame restricted to the shared analysis columns.
df_all_econ_filtered = df_all_econ.loc[
    :,
    ['authors', 'journal', 'citation_counts', 'precocity', 'novelty', 'gender', 'year'],
].copy()

In [95]:
# Independent copy of the lit frame restricted to the shared analysis columns.
df_all_lit_filtered = df_all_lit.loc[
    :,
    ['authors', 'journal', 'citation_counts', 'precocity', 'novelty', 'gender', 'year'],
].copy()




In [96]:
# Tag every econ row with its domain.
df_all_econ_filtered['domain'] = 'econ'

In [97]:
# Tag every lit row with its domain.
df_all_lit_filtered['domain'] = 'lit'

In [98]:
df_all_econ_filtered.index.is_unique   # wanted True, but evaluates to False here: the exploded author rows of one paper share an index label (index is reset in a later cell)


False

In [99]:
df_all_lit_filtered.index.is_unique    # wanted True, but evaluates to False here; the index is reset in a later cell


False

In [100]:
df_all_econ_filtered.columns[df_all_econ_filtered.columns.duplicated()]


Index([], dtype='object')

In [101]:
# Replace the non-unique index labels on both frames, in place, with a fresh 0..n-1 range.
for frame in (df_all_econ_filtered, df_all_lit_filtered):
    frame.reset_index(drop=True, inplace=True)

# Stack the econ rows first, then the lit rows, under one new running index.
frames_to_stack = [df_all_econ_filtered, df_all_lit_filtered]
df_concat_litecon = pd.concat(frames_to_stack, ignore_index=True)


In [102]:
df_all_econ_filtered.index[df_all_econ_filtered.index.duplicated()]


Index([], dtype='int64')

In [103]:
# Save the combined econ + lit table to the working directory.
# to_csv writes the row index as the first column by default.
df_concat_litecon.to_csv('df_litecon.csv')

In [104]:
df_concat_litecon.head()

Unnamed: 0,authors,journal,citation_counts,precocity,novelty,gender,year,domain
0,Roy J. Ruffin,The American Economic Review,44,0.038633,0.005717,unknown,1969,econ
1,Tapas Majumdar,The Quarterly Journal of Economics,6,-0.00811,-0.004197,unknown,1969,econ
2,G. L. Bach,The American Economic Review,1,0.0269,-0.015106,unknown,1963,econ
3,C. J. Huizenga,The American Economic Review,1,0.0269,-0.015106,unknown,1963,econ
4,G. L. Bach,The American Economic Review,1,0.018749,-0.020036,unknown,1963,econ


In [105]:
df_concat_litecon.tail()

Unnamed: 0,authors,journal,citation_counts,precocity,novelty,gender,year,domain
68544,Carlos Fuentes,The Modern Language Review,0,0.180122,5.312028,male,1989,lit
68545,Hugh White,The Review of English Studies,0,0.112379,5.490325,male,1987,lit
68546,John T. Matthews,ELH,2,0.39613,5.409131,male,1980,lit
68547,Hinrich Siefken,The Modern Language Review,0,0.165402,5.226582,unknown,1982,lit
68548,Barrie Ruth Straus,ELH,18,0.404571,5.552968,unknown,1988,lit
