In [1]:
import collections

import numpy as np
import pandas as pd
from scipy import linalg
from scipy import stats

from stat570.linear_model import linear_regression

np.set_printoptions(suppress=True)

In [2]:
mortality_data = pd.read_csv('mortality.txt')
mortality_data.head()

Unnamed: 0,log.s1,log.s3
1,-0.027825,-0.042655
2,-0.019877,-0.031257
3,-0.06782,-0.09198
4,-0.119681,-0.159399
5,-0.028417,-0.037638


In [3]:
estimates = linear_regression.LinearRegression.from_data_frame(mortality_data,['log.s3'], 'log.s1').coefficients_
estimates.index = ['$\\hat{\\beta}_0$', '$\\hat{\\beta}_1$']
with open('p2_estimates.tex', 'w') as f:
    f.write(estimates.to_latex(escape=False).replace('std_error', 'standard error'))
estimates

Unnamed: 0,estimate,std_error,t-statistic,p-value
$\hat{\beta}_0$,-0.002178,0.002584,-0.842783,0.4038069
$\hat{\beta}_1$,0.68665,0.03034,22.63199,3.179129e-26


In [4]:
covariance = np.cov(mortality_data, rowvar=False, ddof=1)
residual_variance = (covariance[0,0]*covariance[1,1] - covariance[0,1]**2)/covariance[1,1]
residual_variance

7.627211333009732e-05

In [5]:
def compute_prediction_variance(x):    
    X = np.column_stack((np.ones_like(mortality_data['log.s3']), mortality_data['log.s3']))
    gram_matrix = X.T.dot(X)
    inverted_gram_matrix = linalg.cho_solve(
        linalg.cho_factor(gram_matrix), np.eye(len(gram_matrix)))
    return residual_variance*(1. + 1./len(mortality_data) +
                              inverted_gram_matrix[1,1]*(x - np.mean(mortality_data['log.s3']))**2)

In [6]:
x0 = np.log(0.95)
y0_hat = estimates['estimate'][0] + x0*estimates['estimate'][1]
y0_hat_ci_lower_bound = y0_hat + np.sqrt(compute_prediction_variance(x0))*stats.norm.ppf(0.025)
y0_hat_ci_upper_bound = y0_hat + np.sqrt(compute_prediction_variance(x0))*stats.norm.ppf(0.975)

In [7]:
paramters_of_interest = pd.DataFrame(collections.OrderedDict([
    ('Estimate', [0, 0, 0, 0]),
    ('95% CI lower bound', [0, 0, 0, 0]),
    ('95% CI upper bound', [0, 0, 0, 0])
]), index=[
    'Surviving the first year',
    'Death within the first year',
    'Death between ages 1 and 5, given survival until age 1',
    'Death between ages 1 and 5',
])

paramters_of_interest.loc['Surviving the first year'] = np.exp([
    y0_hat, y0_hat_ci_lower_bound, y0_hat_ci_upper_bound,
])

paramters_of_interest.loc['Death within the first year'] = 1 - np.exp([
    y0_hat, y0_hat_ci_upper_bound, y0_hat_ci_lower_bound,
])

paramters_of_interest.loc['Death between ages 1 and 5, given survival until age 1'] = np.exp(x0)/np.exp([
    y0_hat, y0_hat_ci_upper_bound, y0_hat_ci_lower_bound,
])

paramters_of_interest.loc['Death between ages 1 and 5'] = np.exp([
    y0_hat, y0_hat_ci_lower_bound, y0_hat_ci_upper_bound,
]) - np.exp(x0)

with open('p2_parameters_of_interest.tex', 'w') as f:
    f.write(paramters_of_interest.to_latex())
    
paramters_of_interest

Unnamed: 0,Estimate,95% CI lower bound,95% CI upper bound
Surviving the first year,0.963292,0.946724,0.98015
Death within the first year,0.036708,0.01985,0.053276
"Death between ages 1 and 5, given survival until age 1",0.986201,0.969239,1.00346
Death between ages 1 and 5,0.013292,-0.003276,0.03015
