In this notebook, we attempt to model 'award-worthiness' using a logistic model and a mean-field approach to the cohort effects.

In [1]:
%load_ext autoreload
%autoreload 2
# standard imports
import pandas as pd
import seaborn as sns
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import sys
sys.path.append("..")

In [6]:
# Training data: read from a path relative to the current working directory.
train_novels_csv = "../data/train_novels.csv"
shortlists = pd.read_csv(train_novels_csv)
shortlists.head()

Unnamed: 0,author_qid,pubDate,goodreads_id,title,authorLabel,work_qid,year,n_nom,n_win,awards_as_of_year,age,birth_country,gender,topicality,max,count,median
0,Q1006918,1965-01-01T00:00:00Z,,The Ship That Sailed the Time Stream,G. C. Edmondson,Q122452512,1966,1,0,0,44.0,United States,male,0.219992,,,
1,Q1029497,2006-11-28T00:00:00Z,,Carnival,Elizabeth Bear,Q25217435,2007,2,0,1,36.0,United States,female,0.169624,,,
2,Q1029497,2007-08-01T00:00:00Z,,Undertow,Elizabeth Bear,Q131381664,2008,1,0,4,37.0,United States,female,0.245872,,,
3,Q1029497,2010-02-20T00:00:00Z,,Chill,Elizabeth Bear,Q131382134,2011,1,0,13,40.0,United States,female,-0.195356,,,
4,Q1029497,2011-02-22T00:00:00Z,,Grail,Elizabeth Bear,Q131382162,2012,1,0,15,41.0,United States,female,0.182915,,,


As it stands, the data in `shortlists` has one row per (novel, author) pair.
We first want to flatten this so that there is exactly one entry for each book on the shortlists.

In [3]:
# List the columns currently present in the raw frame.
shortlists.columns

Index(['author_qid', 'pubDate', 'goodreads_id', 'title', 'authorLabel',
       'work_qid', 'year', 'n_nom', 'n_win', 'awards_as_of_year', 'age',
       'birth_country', 'gender', 'topicality', 'max', 'count', 'median'],
      dtype='object')

In [7]:
# Identifier / label columns that are removed before the frame is passed on.
safe_to_drop = ["author_qid", "goodreads_id", "title", "authorLabel"]

# Rebind rather than mutate in place, and tolerate columns that are already gone:
# with `inplace=True` a second execution of this cell raised a KeyError because the
# columns had been removed by the first run. The trade-off of `errors="ignore"` is
# that a misspelled name in `safe_to_drop` is skipped silently.
shortlists = shortlists.drop(columns=safe_to_drop, errors="ignore")

shortlists.head()

Unnamed: 0,pubDate,work_qid,year,n_nom,n_win,awards_as_of_year,age,birth_country,gender,topicality,max,count,median
0,1965-01-01T00:00:00Z,Q122452512,1966,1,0,0,44.0,United States,male,0.219992,,,
1,2006-11-28T00:00:00Z,Q25217435,2007,2,0,1,36.0,United States,female,0.169624,,,
2,2007-08-01T00:00:00Z,Q131381664,2008,1,0,4,37.0,United States,female,0.245872,,,
3,2010-02-20T00:00:00Z,Q131382134,2011,1,0,13,40.0,United States,female,-0.195356,,,
4,2011-02-22T00:00:00Z,Q131382162,2012,1,0,15,41.0,United States,female,0.182915,,,


In [10]:
from models.prepare import prepare_df
# Preview the frame returned by prepare_df; the result is only displayed here, not stored.
prepare_df(shortlists)


Unnamed: 0,work_qid,year,n_win,topicality,max_bestseller_rank,months_on_bestseller,median_bestseller_rank,mean_age,gender,birth_country,awards_as_of_year,n_nom_all,month,tot_cohort_nom,tot_cohort_awards
0,Q1213656,1959,0,0.221080,0.0,0.0,0.0,52.0,[male],[United States],0,1,8.0,5,1
1,Q18208798,1959,0,0.190188,0.0,0.0,0.0,33.0,[male],[United States],0,1,8.0,5,1
2,Q586831,1959,1,0.228423,0.0,0.0,0.0,38.0,[male],[United States],1,1,1.0,5,1
3,Q519935,1959,0,0.267346,0.0,0.0,0.0,31.0,[male],[United States],1,1,1.0,5,1
4,Q3414362,1959,0,0.202211,0.0,0.0,0.0,28.0,[male],[Prussia],2,1,1.0,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1452,Q55653446,2018,0,-0.119301,0.0,0.0,0.0,38.0,[female],[United States],8,2,2.0,56,5
1453,Q131414152,2018,0,0.111775,0.0,0.0,0.0,76.0,[male],[United States],21,1,10.0,56,5
1454,Q100324066,2018,0,0.101054,0.0,0.0,0.0,50.0,[female],[Hungary],15,2,6.0,56,5
1455,Q131382750,2018,0,-0.096929,0.0,0.0,0.0,52.0,[female],[Russia],2,1,7.0,56,5


In [69]:
from sklearn.metrics import f1_score
from models.prepare import prepare_df
from models.naive import naive_win_counts

# Build the modelling frame and split off the target (number of wins per work).
X_train = prepare_df(shortlists)
y_train = X_train.pop("n_win")

# Binary target used for scoring: did the work win at least once?
won_any = y_train > 0

# Score the naive baseline repeatedly and average the F1.
# NOTE(review): the repetition suggests naive_win_counts is stochastic; no seed is
# set in this cell, so the mean may differ slightly between runs -- confirm.
N_baseline_runs = 1000
baseline_f1s = np.array(
    [
        f1_score(won_any, naive_win_counts(X_train) > 0)
        for _ in range(N_baseline_runs)
    ],
    dtype=float,
)

baseline_f1s.mean()



np.float64(0.22783038104745326)

In [67]:

from sklearn.pipeline import Pipeline, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_val_score, GroupKFold
from models.transformers import RowCountEncoder

def impute_topicality(df):
    """Fill missing ``topicality`` values with the mean topicality of the same ``year``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns ``year`` and ``topicality``.

    Returns
    -------
    pandas.DataFrame
        A new frame (built with ``assign``) whose ``topicality`` column has its
        missing entries replaced by the per-year mean. The input is not modified.

    Notes
    -----
    A year in which every ``topicality`` value is missing has no mean to fall back
    on; those rows are left as NaN instead of raising. The previous implementation
    ran a ``SimpleImputer`` per group, which discards a column without any observed
    value and therefore returned an array of the wrong length for such a year.
    """
    # Per-row mean of the row's own year; NaN values are skipped when averaging.
    year_mean = df.groupby("year")["topicality"].transform("mean")
    return df.assign(topicality=df["topicality"].fillna(year_mean))


# Step 1: fill gaps in `topicality` with the stateless helper defined in this cell.
topicality_step = ("impute_topicality", FunctionTransformer(impute_topicality))

# Step 2: project encoder applied to the author columns, keyed on `work_qid`.
# NOTE(review): exact output columns depend on RowCountEncoder -- see models.transformers.
author_step = (
    "encode_author_data",
    RowCountEncoder(["gender", "birth_country"], "work_qid"),
)

# Step 3: remove the identifier and month columns, keep everything else untouched.
column_drop_step = (
    "drop",
    make_column_transformer(
        ("drop", ["work_qid", "month"]),
        remainder="passthrough",
    ),
)

# Step 4: median-impute whatever is still missing.
median_step = ("imputer", SimpleImputer(strategy="median"))

# Step 5: class-balanced logistic regression without an intercept term.
logistic_step = (
    "logistic",
    LogisticRegressionCV(
        max_iter=10000,
        fit_intercept=False,
        class_weight="balanced",
    ),
)

pipeline = Pipeline(
    [topicality_step, author_step, column_drop_step, median_step, logistic_step]
)

# Keep DataFrames flowing between the steps instead of bare arrays.
pipeline.set_output(transform="pandas")

# Year-grouped 5-fold CV so that no year is split between train and test;
# each sample is weighted by its `n_nom_all` value when fitting the classifier.
fold_f1_scores = cross_val_score(
    pipeline,
    X_train,
    y_train > 0,
    cv=GroupKFold(n_splits=5),
    groups=X_train["year"],
    scoring="f1",
    params={"logistic__sample_weight": X_train["n_nom_all"]},
)
fold_f1_scores.mean()

np.float64(0.4250004682607491)

In [68]:
from sklearn.metrics import confusion_matrix

# Repeat the year-grouped 5-fold split by hand so that a confusion matrix pooled over
# all folds can be accumulated next to the per-fold F1 scores.
kfold = GroupKFold(n_splits=5)
confusion = np.zeros((2, 2))
# Size the score array from the splitter so the fold count is defined in one place.
f1_logistics = np.zeros(kfold.get_n_splits())
for i, (train_index, test_index) in enumerate(kfold.split(X_train, y_train, groups=X_train["year"])):
    X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]
    # Refit on this fold only; samples are weighted by their `n_nom_all` value.
    pipeline.fit(X_train_fold, y_train_fold > 0, logistic__sample_weight=X_train_fold["n_nom_all"])
    y_pred = pipeline.predict(X_test_fold)
    # Pin the label order so the result is always 2x2. Without `labels`, a fold with a
    # single class in both truth and prediction yields a 1x1 matrix, which `+=` would
    # silently broadcast into all four cells of the running total.
    confusion += confusion_matrix(y_test_fold > 0, y_pred, labels=[False, True])
    f1_logistics[i] = f1_score(y_test_fold > 0, y_pred)

# Rows are the true class, columns the predicted class, each ordered (False, True).
print(confusion)
print(f1_logistics.mean())

[[1014.  262.]
 [  61.  120.]]
0.4250004682607491
