In [1]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
### Question 25

In [3]:
data = pd.read_csv('xy.csv', index_col=0)

In [4]:
X = data[data['variable']=='X']['value']
Y = data[data['variable']=='Y']['value']
mu_0 = 1

def bootstrap_ci(X, Y, m, alpha):
    n1 = len(X)
    n2 = len(Y)
    X_resample = np.random.choice(X, size=(m,n1))
    Y_resample = np.random.choice(Y, size=(m,n2))
    mu_x = np.mean(X_resample, axis=1)
    mu_y = np.mean(Y_resample, axis=1)
    
    ratio = mu_x/mu_y
    
    lower = np.quantile(ratio, alpha/2)
    upper = np.quantile(ratio, 1 - alpha/2)
    return lower, upper

lower, upper = bootstrap_ci(X, Y, 1000, .05)

lower, upper
if (lower > mu_0) | (upper < mu_0):
    print('reject H_0')
else:
    print('fail to reject')
    
print(lower, upper)

reject H_0
0.2776331674105821 0.7749608342315764


In [5]:
### Question 26

In [6]:
df = pd.read_csv('machine_learning.csv', index_col=0)
y = df['y'] == 'chinchilla'

X = df.drop(columns=['y'])
X_train, X_test, y_train, y_test = train_test_split(X, y)

cat_cols = ['color', 'birth_country']
cont_cols = ['weight', 'waist_measurement' ,'height']

cat_pipe = Pipeline([('OneHot', OneHotEncoder(handle_unknown='ignore'))])
cont_pipe = Pipeline([('Imputer', SimpleImputer(strategy='mean')),
                      ('StandardScaler', StandardScaler())])

cf = ColumnTransformer([('cat_pipe', cat_pipe, cat_cols),
                        ('cont_pipe', cont_pipe, cont_cols)])

pipe = Pipeline([('cf', cf),
                 ('lr', LogisticRegression())])

param_grid = {'lr__C': np.logspace(-1,2,50)}

lr = GridSearchCV(pipe, param_grid=param_grid, scoring='neg_brier_score')

In [7]:
lr.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('cf',
                                        ColumnTransformer(transformers=[('cat_pipe',
                                                                         Pipeline(steps=[('OneHot',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                                         ['color',
                                                                          'birth_country']),
                                                                        ('cont_pipe',
                                                                         Pipeline(steps=[('Imputer',
                                                                                          SimpleImputer()),
                                                                                         ('StandardScaler',
                                                         

In [8]:
lr.best_score_

-0.06787852157488146

In [9]:
lr.score(X_test, y_test)

-0.07183996097756502

In [10]:
lr.best_params_

{'lr__C': 24.420530945486497}