In [2]:
# Default grouping keys: crude_mortality groups rows by these columns when it
# computes its per-group rate.
common_cols = ['state_code','municipality_code','year']
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler

In [3]:
def crude_mortality(df, group_cols=None):
    """Add overall and per-group crude mortality columns to ``df``.

    Two columns are written onto ``df`` itself (the frame is modified in
    place and also returned):

    * ``cmr_pop``  -- total ``ndeaths_y`` divided by total ``population`` over
      the whole frame, multiplied by 1000 (the same scalar on every row).
    * ``cmr_mine`` -- the same ratio computed within each group of
      ``group_cols`` and broadcast back to the rows of that group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``ndeaths_y``, ``population`` and the grouping columns.
    group_cols : list of str, optional
        Columns to group by for ``cmr_mine``. When omitted, the module-level
        ``common_cols`` list is used, which reproduces the original behaviour.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the two columns added.
    """
    if group_cols is None:
        # Resolved at call time so existing callers keep the notebook-wide keys.
        group_cols = common_cols

    df['cmr_pop'] = (df['ndeaths_y'].sum() / df['population'].sum())*1000

    # One groupby pass yields both per-group sums, aligned to df's index.
    group_sums = df.groupby(group_cols)[['ndeaths_y', 'population']].transform('sum')
    df['cmr_mine'] = (group_sums['ndeaths_y'] / group_sums['population'])*1000
    return df

In [5]:
def standard_mortality(df, aggcols, confcols):
    """Attach a stratum-specific mortality rate column to ``df``.

    Rows are grouped by ``aggcols + confcols``; within each group the summed
    ``ndeaths_y`` is divided by the summed ``population`` and multiplied by
    1000. The result is merged back onto the rows of ``df`` under a column
    named ``"<col1>_<col2>_..._rate"`` (just ``"rate"`` when both lists are
    empty).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``ndeaths_y``, ``population`` and every grouping column.
    aggcols, confcols : list of str
        Grouping columns; they are concatenated in this order.

    Returns
    -------
    pandas.DataFrame
        A new frame (``df`` is not modified) with the rate column appended.
        The merge is an inner join on the grouping columns, so rows whose keys
        do not appear in the grouped result are not returned.

    Notes
    -----
    When both column lists are empty the function groups by a column literally
    named ``count``; ``df`` must provide it in that case.
    NOTE(review): presumably ``count`` is a constant helper column -- confirm.
    """
    allcols = aggcols + confcols
    coln = '_'.join(allcols + ['rate'])
    if not allcols:
        allcols = ['count']

    stratum_totals = (
        df.groupby(allcols)
        .agg({'ndeaths_y': 'sum', 'population': 'sum'})
        .reset_index()
    )
    stratum_totals[coln] = (stratum_totals['ndeaths_y'] / stratum_totals['population'])*1000

    # If df already carries this rate column (e.g. the cell was re-run on its
    # own output), merging would produce "<coln>_x"/"<coln>_y" instead of
    # refreshing it, so the stale copy is dropped first.
    base = df.drop(columns=[coln], errors='ignore')
    return pd.merge(base, stratum_totals[allcols + [coln]], on=allcols)

In [None]:
def get_confound(df, cols, cola, keepcols=[],colname=0,mmode=0):
    """Aggregate observed vs. rate-weighted deaths per group and derive an SMR column.

    The rate column is looked up as ``"<c1>_<c2>_..._rate"`` built from
    ``cols``. ``df`` gets an ``ed`` column written in place
    (``population * rate``); the frame is then grouped by ``cola`` with
    ``ndeaths_y``, ``ed`` and any available ``keepcols`` summed.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``population``, ``ndeaths_y``, the rate column and ``cola``.
    cols : list of str
        Column names that make up the rate column name and the output suffix.
    cola : list of str
        Grouping columns; they lead the returned frame.
    keepcols : list of str
        Extra columns to sum and return; names missing from ``df`` are skipped.
    colname : int
        ``0`` names the result ``"smr_<c1>_<c2>..."``; anything else names it
        plain ``"smr"``.
    mmode : int
        ``0`` -> ``ndeaths_y * 1000 / ed`` (ratio);
        otherwise -> ``ndeaths_y - ed / 1000`` (difference).

    Returns
    -------
    pandas.DataFrame
        Columns ``cola + [result column] + kept columns``.
    """
    rate_col = ''.join(name + '_' for name in cols) + 'rate'
    suffixed = 'smr' + ''.join('_' + name for name in cols)
    result_col = suffixed if colname == 0 else 'smr'

    # Written onto the caller's frame, exactly as before.
    # NOTE(review): 'ed' looks like expected deaths scaled by 1000 -- confirm.
    df['ed'] = df['population'] * df[rate_col]

    how = {'ndeaths_y': 'sum', 'ed': 'sum'}
    for extra in keepcols:
        if extra in df.columns:
            how[extra] = 'sum'
    dfn = df.groupby(cola).agg(how).reset_index()

    observed = dfn['ndeaths_y']
    if mmode == 0:
        dfn[result_col] = observed * 1000 / dfn['ed']
    else:
        dfn[result_col] = observed - dfn['ed']/1000

    kept = [extra for extra in keepcols if extra in dfn.columns]
    return dfn[cola + [result_col] + kept]

In [None]:
def compute_smr(df, cols, cola, cancer_cols, keepcols=True, colname=0,mmode=0):
    """Group ``df`` by ``cola`` and compute an SMR column alongside averaged covariates.

    The rate column is looked up as ``"<c1>_<c2>_..._rate"`` built from
    ``cols``. ``df`` gets an ``ed`` column written in place
    (``population * rate``). Per group, ``ndeaths_y``, ``ed`` and
    ``population`` are summed and every available ``cancer_cols`` column is
    averaged.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``population``, ``ndeaths_y``, the rate column and ``cola``.
    cols : list of str
        Column names that make up the rate column name and the output suffix.
    cola : list of str
        Grouping columns; they lead the returned frame.
    cancer_cols : list of str
        Columns aggregated with ``mean``; names missing from ``df`` are skipped.
    keepcols : bool
        When truthy, ``ndeaths_y``, ``ed`` and ``population`` are also returned.
    colname : int
        ``0`` names the result ``"smr_<c1>_<c2>..."``; anything else names it
        plain ``"smr"``.
    mmode : int
        ``0`` -> ``ndeaths_y * 1000 / ed`` (ratio);
        otherwise -> ``ndeaths_y - ed / 1000`` (difference).

    Returns
    -------
    pandas.DataFrame
        Columns ``cola + [result column] + averaged columns (+ totals)``.
    """
    rate_col = '_'.join([*cols, 'rate'])
    result_col = '_'.join(['smr', *cols]) if colname == 0 else 'smr'

    # Written onto the caller's frame, exactly as before.
    df['ed'] = df['population'] * df[rate_col]

    how = {'ndeaths_y': 'sum', 'ed': 'sum', 'population': 'sum'}
    how.update((name, 'mean') for name in cancer_cols if name in df.columns)
    dfn = df.groupby(cola).agg(how).reset_index()

    if mmode == 0:
        dfn[result_col] = dfn['ndeaths_y'] * 1000 / dfn['ed']
    else:
        dfn[result_col] = dfn['ndeaths_y'] - dfn['ed']/1000

    # Candidate trailing columns, filtered down to what the grouped frame holds.
    wanted = list(cancer_cols)
    if keepcols:
        wanted += ['ndeaths_y', 'ed', 'population']
    trailing = [name for name in wanted if name in dfn.columns]
    return dfn[cola + [result_col] + trailing]

In [None]:
def t_test(dfy, smr_columns):
    """Run a paired t-test on every unordered pair of the given columns.

    Parameters
    ----------
    dfy : pandas.DataFrame
        Frame holding the columns to compare.
    smr_columns : list of str
        Column names; each pair is tested once, earlier name first.

    Returns
    -------
    pandas.DataFrame
        One row per pair with ``col1``, ``col2``, ``t_statistic``, ``p_value``
        (both from ``stats.ttest_rel``) and ``effect_size`` (mean of the
        row-wise difference divided by its standard deviation).
    """
    names = list(smr_columns)
    rows = []
    for position, first in enumerate(names):
        # Only names after `first` are paired with it: no self-pairs, no mirrors.
        for second in names[position + 1:]:
            t_statistic, p_value = stats.ttest_rel(dfy[first], dfy[second])
            paired_diff = dfy[first] - dfy[second]
            rows.append({
                'col1': first,
                'col2': second,
                't_statistic': t_statistic,
                'p_value': p_value,
                'effect_size': paired_diff.mean() / paired_diff.std(),
            })
    return pd.DataFrame(rows)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
def calculate_vif(X):
    """Return a two-column table of variance inflation factors for ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Design matrix; one VIF is computed per column by passing ``X.values``
        and the column position to ``variance_inflation_factor``.

    Returns
    -------
    pandas.DataFrame
        ``Features`` (the column labels of ``X``) and ``VIF`` (the matching
        factor), in column order.
    """
    design = X.values
    factors = [
        variance_inflation_factor(design, position)
        for position in range(X.shape[1])
    ]
    return pd.DataFrame({"Features": X.columns, "VIF": factors})

In [None]:
def run_fitting(df, cols, pred):
    """Fit a cross-validated Lasso and an OLS model of ``pred`` on ``cols``; print diagnostics.

    Steps, in order:

    1. 80/20 train/test split with ``random_state=42``.
    2. Standardise features: the scaler is fitted on the training split and
       then applied to the test split.
    3. Fit ``LassoCV`` over 100 log-spaced alphas in ``[1e-10, 1e10]`` with
       10-fold CV and print the chosen alpha, the mean 10-fold
       ``cross_val_score`` on the training data, the test score, the test
       R^2 and the non-zero coefficients.
    4. Fit a statsmodels OLS with an intercept on the scaled training data
       and print its summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    cols : list of str
        Feature columns.
    pred : str
        Target column.

    Returns
    -------
    None
        Everything is reported through ``print``.
    """
    X = df[cols]
    y = df[pred]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scaling statistics come from the training split only.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    lasso = LassoCV(alphas=np.logspace(-10, 10, 100), cv=10, random_state=42, max_iter=1000000)
    lasso.fit(X_train_scaled, y_train)
    print("Best alpha:", lasso.alpha_)
    cv_scores = cross_val_score(lasso, X_train_scaled, y_train, cv=10)
    print("Cross-validated training score:", np.mean(cv_scores))
    test_score = lasso.score(X_test_scaled, y_test)
    print("Test score:", test_score)
    coefficients = pd.Series(lasso.coef_, index=X.columns)
    y_pred_lasso = lasso.predict(X_test_scaled)
    lasso_r2 = r2_score(y_test, y_pred_lasso)
    print("Lasso r2 score:")
    print(lasso_r2)
    print("Coefficients:")
    print(coefficients[coefficients != 0])

    # Wrap the scaled matrix in a labelled frame (same values, same row index
    # as y_train) so the OLS summary lists real feature names rather than
    # positional x1..xk, and keep it under its own name instead of rebinding X.
    ols_design = sm.add_constant(
        pd.DataFrame(X_train_scaled, columns=X.columns, index=X_train.index)
    )
    model = sm.OLS(y_train, ols_design).fit()
    print(model.summary())