In [50]:
#read in the data
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
import math



gender_meta = pd.read_csv('df_gender.csv')
df_meta = pd.read_csv('../metadata/19cfictionmeta.tsv', sep='\t')
meta = pd.read_csv('../metadata/enriched_metadata.tsv', sep='\t')


In [51]:
df_all = pd.read_csv('df_all_fiction.csv')

In [60]:
gender_meta.head()

Unnamed: 0.1,Unnamed: 0,author,gender
0,0,"Robert Hugh, (Spirit) Benson",
1,1,Mary Catharine Rowsell,
2,2,I. A. R. (Ida Alexa Ross) Wylie,
3,3,Alice Milligan,female
4,4,"Nina (Wilcox), Mrs Putnam",


In [61]:
def flip_name(n):
    if ',' in n:
        last, first = n.split(',', 1)
        return f"{first.strip()} {last.strip()}"
    return n  # leave unchanged

gender_meta['author'] = gender_meta['author'].apply(flip_name)

In [62]:
gender_meta.head()


Unnamed: 0.1,Unnamed: 0,author,gender
0,0,(Spirit) Benson Robert Hugh,
1,1,Mary Catharine Rowsell,
2,2,I. A. R. (Ida Alexa Ross) Wylie,
3,3,Alice Milligan,female
4,4,Mrs Putnam Nina (Wilcox),


In [63]:
#apply gender dict
gender_dict = pd.Series(gender_meta['gender'].values, index=gender_meta['author']).to_dict()
df_all['gender'] = df_all['author'].map(gender_dict).fillna('unknown')
#
# gender_dict = pd.Series(gender_meta['hand_gender'].values, index=gender_meta['author']).to_dict()
# df_all['hand_gender'] = df_all['creator'].map(gender_dict).fillna('unknown')


In [64]:
df_all.head()

Unnamed: 0.1,Unnamed: 0,BOOK_ID,FILENAME,LIBRARIES,TITLE,AUTH_LAST,AUTH_FIRST,AUTH_ID,WRITTEN_AS,PUBL_CITY,...,fraction_compared,filtered,time_radius,chunks_used,precocity,novelty,transience,decade,author,gender
0,32,nyp.33433076050347,nyp.33433076050347.txt,,The wall between,Paine,"['Ralph', 'Delahaye']",,,,...,1.0,trainauthquote,20.0,0.25,-0.089578,4.664888,4.754466,1910,"Ralph, DelahayePaine",unknown
1,93,3262,00003262.txt,86.0,The angel of Lonesome Hill,Landis,Frederick,A_01980,,New York,...,1.0,trainauthquote,20.0,0.25,0.030063,5.273641,5.243578,1910,FrederickLandis,unknown
2,149,4849,00004849.txt,300.0,Sons and lovers,Lawrence,D. H.,A_02006,,New York,...,1.0,trainauthquote,20.0,0.25,0.129034,4.988555,4.859521,1910,D. H.Lawrence,unknown
3,260,3499,00003499.txt,82.0,Philip Dru: administrator,House,Edward Mandell,A_01705,,New York,...,1.0,trainauthquote,20.0,0.25,-0.067784,4.983762,5.051545,1910,Edward MandellHouse,unknown
4,335,uc2.ark+=13960=t6639kt8d,uc2.ark+=13960=t6639kt8d.txt,,Sylvia,Sinclair,['Upton'],,,,...,1.0,trainauthquote,20.0,0.25,-0.098932,4.660319,4.759251,1910,UptonSinclair,unknown


In [54]:

# We also create a list of the docids that have been discussed
# in critical articles, and a contrast set that have the 
# same distribution over time but are not mentioned in our
# literary studies corpus.

discussed = meta.loc[meta.is_discussed == True].index.tolist()
discussed_contrast = meta.loc[meta.discussed_contrast == True].index.tolist()

print('We have ', len(discussed), ' discussed docs and ', len(discussed_contrast), ' contrast docs.')

We have  463  discussed docs and  926  contrast docs.


In [55]:

def print_effect_size(t, p, df):
    ''' Calculates Cohen's d and r2
    for t-test statistics.'''

    d = 2*t / math.sqrt(df)
    r2 = t**2 / (t**2 + df)

    print(f'Cohens d is {d} and r2 is {r2}.')

In [56]:
dis_kld_precoc = []
notdis_kld_precoc = []

for decade in range(10, 80, 5):
	
	data = pd.read_csv('../precocity/precocity_tuned_19' + str(decade) + 's_docs.tsv', sep = '\t')
	data['docid'] = data['docid'].astype(str)
	data.set_index('docid', inplace=True)
    

	selected = pd.DataFrame(data.loc[(data.time_radius == 20) & (data.filtered == 'trainauth') & (data.chunks_used == .25) & (data.fraction_compared == 1.0), :])
	dis_kld_precoc.extend(selected.loc[selected.index.isin(discussed), 'precocity'])
	notdis_kld_precoc.extend(selected.loc[selected.index.isin(discussed_contrast), 'precocity'])

t, p = ttest_ind(dis_kld_precoc, notdis_kld_precoc)
df = len(dis_kld_precoc) + len(notdis_kld_precoc) - 2
print(t, p, df)
print_effect_size(t, p, df)

nan nan -2


  return f(*args, **kwargs)


ValueError: math domain error

In [58]:
import statsmodels.formula.api as smf

model = smf.logit("is_discussed ~ C(gender) + precocity + C(gender):precocity + decade", data=df_all).fit()
print(model.summary())



Optimization terminated successfully.
         Current function value: 0.258223
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:           is_discussed   No. Observations:                 5878
Model:                          Logit   Df Residuals:                     5875
Method:                           MLE   Df Model:                            2
Date:                Fri, 05 Dec 2025   Pseudo R-squ.:                 0.05494
Time:                        16:02:18   Log-Likelihood:                -1517.8
converged:                       True   LL-Null:                       -1606.1
Covariance Type:            nonrobust   LLR p-value:                 4.798e-39
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -7.2661      5.102     -1.424      0.154     -17.265       2.733
precocity      5.5600      0.

In [59]:
df_all['gender'].value_counts(dropna=False)


gender
unknown    5878
Name: count, dtype: int64