In this notebook, we attempt to model 'award-worthiness' using a logistic model and a mean-field approach to the cohort effects.

In [24]:
# standard imports
import pandas as pd
import seaborn as sns
import matplotlib
import numpy as np
import matplotlib.pyplot as plt

In [3]:
shortlists = pd.read_csv("../data/train_novels.csv")
shortlists.head()

Unnamed: 0,author_qid,pubDate,goodreads_id,title,authorLabel,work_qid,year,n_nom,n_win,awards_as_of_year,age,birth_country,gender,topicality,max,count,median
0,Q1006918,1965-01-01T00:00:00Z,,The Ship That Sailed the Time Stream,G. C. Edmondson,Q122452512,1966,1,0,0,44.0,United States,male,,,,
1,Q1029497,2006-11-28T00:00:00Z,,Carnival,Elizabeth Bear,Q25217435,2007,2,0,1,36.0,United States,female,0.23294,,,
2,Q1029497,2007-08-01T00:00:00Z,,Undertow,Elizabeth Bear,Q131381664,2008,1,0,4,37.0,United States,female,0.258934,,,
3,Q1029497,2010-02-20T00:00:00Z,,Chill,Elizabeth Bear,Q131382134,2011,1,0,13,40.0,United States,female,-0.182816,,,
4,Q1029497,2011-02-22T00:00:00Z,,Grail,Elizabeth Bear,Q131382162,2012,1,0,15,41.0,United States,female,0.169822,,,


As it stands, the data in `shortlists` has a row per novel author pair.
We want to first flatten this so that we have one entry for each book on the short lists.

In [4]:
shortlists.columns

Index(['author_qid', 'pubDate', 'goodreads_id', 'title', 'authorLabel',
       'work_qid', 'year', 'n_nom', 'n_win', 'awards_as_of_year', 'age',
       'birth_country', 'gender', 'topicality', 'max', 'count', 'median'],
      dtype='object')

In [5]:
safe_to_drop = ["author_qid", "goodreads_id", "title", "authorLabel"]
shortlists.drop(columns=safe_to_drop, inplace=True)

shortlists.head()

Unnamed: 0,pubDate,work_qid,year,n_nom,n_win,awards_as_of_year,age,birth_country,gender,topicality,max,count,median
0,1965-01-01T00:00:00Z,Q122452512,1966,1,0,0,44.0,United States,male,,,,
1,2006-11-28T00:00:00Z,Q25217435,2007,2,0,1,36.0,United States,female,0.23294,,,
2,2007-08-01T00:00:00Z,Q131381664,2008,1,0,4,37.0,United States,female,0.258934,,,
3,2010-02-20T00:00:00Z,Q131382134,2011,1,0,13,40.0,United States,female,-0.182816,,,
4,2011-02-22T00:00:00Z,Q131382162,2012,1,0,15,41.0,United States,female,0.169822,,,


In [9]:
AUTHOR_COLUMNS = ["age", "gender", "awards_as_of_year", "birth_country"]


def reduce_authors(df: pd.DataFrame) -> pd.DataFrame:
    author_data = (
        df.groupby("work_qid")[AUTHOR_COLUMNS]
        .agg(
            mean_age=("age", "mean"),
            gender=("gender", list),
            birth_country=("birth_country", list),
            awards_as_of_year=("awards_as_of_year", "sum"),
        )
        .reset_index()
    )

    non_author_data = df.drop(columns=AUTHOR_COLUMNS).drop_duplicates(
        subset=["work_qid"], keep="first"
    )

    return pd.merge(non_author_data, author_data, on="work_qid", how="left")


def prepare_bestseller_stats(df: pd.DataFrame) -> pd.DataFrame:
    return df.fillna({"max": 0, "count": 0, "median": 0}).rename(
        columns={
            "max": "max_bestseller_rank",
            "count": "months_on_bestseller",
            "median": "median_bestseller_rank",
        }
    )


def compute_cohort_stats(df: pd.DataFrame) -> pd.DataFrame:
    return df.merge(df.groupby("year").agg(
        tot_cohort_nom=("n_nom", "sum"),
        tot_cohort_awards=("n_win", "sum"),
    ).reset_index(), on='year', how='outer')


def estimate_target(df: pd.DataFrame) -> pd.DataFrame:
    return df.assign(

    ).drop(columns=["n_win"])


def parse_dates(df: pd.DataFrame) -> pd.DataFrame:
    return df.assign(month=pd.to_datetime(df.pubDate).dt.month).drop(columns=["pubDate"])


def prepare_df(df: pd.DataFrame) -> pd.DataFrame:
    return (
        df.pipe(reduce_authors)
        .pipe(prepare_bestseller_stats)
        .pipe(parse_dates)
        .pipe(compute_cohort_stats)
    )


prepare_df(shortlists)


Unnamed: 0,work_qid,year,n_nom,n_win,topicality,max_bestseller_rank,months_on_bestseller,median_bestseller_rank,mean_age,gender,birth_country,awards_as_of_year,month,tot_cohort_nom,tot_cohort_awards
0,Q1213656,1959,1,0,0.345319,0.0,0.0,0.0,52.0,[male],[United States],0,8.0,4,1
1,Q18208798,1959,1,0,0.269948,0.0,0.0,0.0,33.0,[male],[United States],0,8.0,4,1
2,Q586831,1959,0,1,0.363310,0.0,0.0,0.0,38.0,[male],[United States],1,1.0,4,1
3,Q519935,1959,1,0,0.459207,0.0,0.0,0.0,31.0,[male],[United States],1,1.0,4,1
4,Q3414362,1959,1,0,0.299219,0.0,0.0,0.0,28.0,[male],[Prussia],2,1.0,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1452,Q55653446,2018,2,0,-0.088432,0.0,0.0,0.0,38.0,[female],[United States],8,2.0,52,5
1453,Q131414152,2018,1,0,0.089377,0.0,0.0,0.0,76.0,[male],[United States],21,10.0,52,5
1454,Q100324066,2018,2,0,0.090678,0.0,0.0,0.0,50.0,[female],[Hungary],15,6.0,52,5
1455,Q131382750,2018,1,0,-0.091164,0.0,0.0,0.0,52.0,[female],[Russia],2,7.0,52,5


In [None]:
def row_count_encode(df, columns):
    return (
        pd.get_dummies(df[["work_qid"] + columns].explode(columns), columns=columns)
        .groupby("work_qid")
        .sum()
    )


row_count_encode(prepare_df(shortlists), ["gender", "birth_country"])

prepare_df(shortlists).explode(["gender"])

Unnamed: 0,pubDate,work_qid,year,n_nom,n_win,topicality,max_bestseller_rank,months_on_bestseller,median_bestseller_rank,mean_age,gender,birth_country,awards_as_of_year
0,1965-01-01,Q122452512,1966,1,0,,0.0,0.0,0.0,44.0,male,[United States],0
1,2006-11-28,Q25217435,2007,2,0,0.232940,0.0,0.0,0.0,36.0,female,[United States],1
2,2007-08-01,Q131381664,2008,1,0,0.258934,0.0,0.0,0.0,37.0,female,[United States],4
3,2010-02-20,Q131382134,2011,1,0,-0.182816,0.0,0.0,0.0,40.0,female,[United States],13
4,2011-02-22,Q131382162,2012,1,0,0.169822,0.0,0.0,0.0,41.0,female,[United States],15
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1452,1982-09-01,Q97357253,1983,1,0,-0.398277,0.0,0.0,0.0,69.0,male,[United States],19
1453,2014-10-06,Q19868032,2015,1,0,0.148777,0.0,0.0,0.0,55.0,male,[Netherlands],0
1454,2001-10-01,Q44286620,2002,1,0,-0.840815,0.0,0.0,0.0,44.0,male,[Canada],1
1455,2006-10-03,Q2511421,2007,2,0,0.237130,0.0,0.0,0.0,49.0,male,[Canada],2


In [108]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder


class RowCountEncoder(TransformerMixin,BaseEstimator):
    def __init__(self, columns, key_column):
        self.key_column = key_column
        self.columns = columns
        self.encoder = OneHotEncoder(sparse_output=False, 
                                     handle_unknown='infrequent_if_exist',
                                     min_frequency=0.01)
        self.encoder.set_output(transform="pandas")

    def _explode(self, X: pd.DataFrame) -> pd.DataFrame:
        return X[[self.key_column] + self.columns].explode(self.columns)

    def fit(self, X, y=None):
        self.n_features_in_ = len(X.columns)
        self.other_columns = [c for c in X.columns if c not in self.columns]
        self.encoder.fit(self._explode(X).drop(columns=[self.key_column]))
        return self

    def get_feature_names_out(self, input_features=None):
        return self.other_columns + self.encoder.get_feature_names_out(input_features)

    def transform(self, X: pd.DataFrame, y=None):
        exploded = self._explode(X)

        return pd.merge(
            X.drop(columns=self.columns),
            pd.concat(
                [
                    exploded[self.key_column],
                    self.encoder.transform(exploded.drop(columns=[self.key_column])),
                ]
            )
            .groupby(self.key_column)
            .sum(),
            on=self.key_column
        )


In [123]:
from sklearn.base import RegressorMixin
from sklearn.metrics import log_loss, confusion_matrix

X_train = prepare_df(shortlists)
y_train = X_train["n_win"]/np.maximum(X_train["n_win"]+X_train["n_nom"], 1)
X_train = X_train.drop(columns=["n_win"])

class NaiveFractionModel(RegressorMixin, BaseEstimator):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        pass

    def predict(self, X, y=None):
        return X["tot_cohort_awards"]*X["n_nom"]/X["tot_cohort_nom"]

baseline_model = NaiveFractionModel()
baseline_model.fit(X_train)
y_baseline = baseline_model.predict(X_train)

-log_loss(y_train>0, y_baseline, sample_weight=X_train["n_nom"])

# -np.sum(y_train*np.log(y_baseline+1e-10))
# sns.residplot(y=(y_train, x=y_baseline)


-0.5484845493101105

In [132]:

from sklearn.pipeline import Pipeline, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GroupKFold

def impute_topicality(df):
    return df.assign(
        topicality=df.groupby("year")["topicality"].transform(
            lambda x: SimpleImputer().fit_transform(x.values.reshape(-1, 1)).reshape(-1)
        )
    )


pipeline = Pipeline(
    [
        ("impute_topicality", FunctionTransformer(impute_topicality)),
        (
            "encode_author_data",
            RowCountEncoder(["gender", "birth_country"], "work_qid"),
        ),
        ('drop', make_column_transformer(
            ('drop', ['work_qid', 'month']),
            remainder='passthrough'
        )),
        ('imputer', SimpleImputer(strategy='median')),
        ('logistic', LogisticRegression(max_iter=1000, penalty=None, fit_intercept=False))
    ]
)

pipeline.set_output(transform="pandas")

cross_val_score(pipeline, X_train, y_train > 0, cv=GroupKFold(n_splits=5), groups=X_train["year"], scoring="neg_log_loss" ).mean()   

np.float64(-0.3644983879568703)

In [131]:
from sklearn.metrics import confusion_matrix

kfold = GroupKFold(n_splits=5 )
confusion = np.zeros((2,2))
for train_index, test_index in kfold.split(X_train, y_train, groups=X_train["year"]):
    X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]
    pipeline.fit(X_train_fold, y_train_fold>0)
    confusion += confusion_matrix(y_test_fold>0, pipeline.predict(X_test_fold))

confusion

array([[1272.,    0.],
       [ 183.,    2.]])