# ISLP - Chapter 5 - Exercise 9
### Author: pzuehlke

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

In [2]:
# Load the Boston housing data. The CSV's first column has no header (pandas
# reads it as "Unnamed: 0"); rename it to "property" and use it as the index.
housing = (
    pd.read_csv("Boston.csv")
    .rename(columns={"Unnamed: 0": "property"})
    .set_index("property")
)

# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would only add a stray "None" line to the output.
housing.info()
housing.head()

<class 'pandas.core.frame.DataFrame'>
Index: 506 entries, 1 to 506
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     506 non-null    float64
 1   zn       506 non-null    float64
 2   indus    506 non-null    float64
 3   chas     506 non-null    int64  
 4   nox      506 non-null    float64
 5   rm       506 non-null    float64
 6   age      506 non-null    float64
 7   dis      506 non-null    float64
 8   rad      506 non-null    int64  
 9   tax      506 non-null    int64  
 10  ptratio  506 non-null    float64
 11  lstat    506 non-null    float64
 12  medv     506 non-null    float64
dtypes: float64(10), int64(3)
memory usage: 55.3 KB
None


Unnamed: 0_level_0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,lstat,medv
property,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,4.98,24.0
2,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14,21.6
3,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,4.03,34.7
4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94,33.4
5,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,5.33,36.2


__9 (a):__ Below we obtain the estimate $ \hat{\mu} = 22.5328 $.

In [None]:
# Point estimate of the population mean of `medv`: the sample mean.
medv = housing["medv"]
mu_hat = medv.mean()
print(round(mu_hat, 4))

22.5328


__9 (b):__ Following the formula in the hint:

In [4]:
n = len(housing)  # number of observations

# Sample standard deviation of `medv`, used as the estimate of the population
# standard deviation sigma.
s = housing["medv"].std()

# Standard error of the sample mean: estimate of sigma / sqrt(n).
std_err = s / np.sqrt(n)

print(f"Sample standard deviation: {s:.4f}")
print(f"Standard error of the mean: {std_err:.4f}")


Sample standard deviation: 9.1971
Standard error of the mean: 0.4089


__9 (c):__ The estimate for the standard error computed using the bootstrap
($ \approx 0.4153 $) is quite close to the estimate computed in (b) using the
sample standard deviation ($ \approx 0.4089 $).

In [23]:
B = 1000                        # number of bootstrap replicates
rng = np.random.default_rng(1)  # fixed seed so the result is reproducible
medv = housing["medv"]

# Each replicate draws n row positions with replacement and records the mean
# of `medv` over the resampled rows.
bootstrap_means = np.array([
    medv.iloc[rng.choice(n, size=n, replace=True)].mean()
    for _ in range(B)
])

# The bootstrap SE is the standard deviation of the replicate means.
# Note: ndarray.std() defaults to ddof=0 (divides by B, not B - 1).
bootstrap_se = bootstrap_means.std()
print(bootstrap_se)

0.4153379484767965


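__9 (d):__ Again, the two methods provide results that are very close: the bootstrap percentile interval is approximately $ [21.71, 23.36] $, while the two-standard-error rule gives approximately $ [21.72, 23.35] $.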

In [26]:
# 95% bootstrap percentile interval: the 2.5th and 97.5th percentiles of the
# bootstrap distribution of the mean.
lower_pct, upper_pct = 2.5, 97.5
confidence_int_boot = np.percentile(bootstrap_means, [lower_pct, upper_pct])
print(confidence_int_boot)

# Approximate 95% interval from the two-standard-error rule: mu_hat +/- 2 * SE.
half_width = 2 * std_err
confidence_int_2se = np.array([mu_hat - half_width, mu_hat + half_width])
print(confidence_int_2se)

[21.70865119 23.35789032]
[21.71508403 23.35052862]


__9 (e):__ An estimate for the population median is the sample median, which equals $ 21.2 $.

In [31]:
# Point estimate of the population median of `medv`: the sample median.
medv = housing["medv"]
median_hat = medv.median()
print(median_hat)

21.2


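__9 (f):__ There is no simple formula for the standard error of the median, so we estimate it with the bootstrap. The resulting estimate is approximately $ 0.385 $.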

In [39]:
B = 1000                        # number of bootstrap replicates
rng = np.random.default_rng(1)  # fixed seed so the result is reproducible
medv = housing["medv"]

# Each replicate draws n row positions with replacement and records the
# median of `medv` over the resampled rows.
bootstrap_medians = np.array([
    medv.iloc[rng.choice(n, size=n, replace=True)].median()
    for _ in range(B)
])

# Bootstrap SE of the median: standard deviation of the replicate medians
# (ndarray.std() defaults to ddof=0).
bootstrap_median_se = bootstrap_medians.std()
print(bootstrap_median_se)

0.38542053655714786


__9 (g):__ An estimate for the tenth percentile of `medv` based on the dataset is $ 12.75 $.

In [43]:
# Sample tenth percentile of `medv`, the point estimate of the population
# tenth percentile.
medv_p10 = np.percentile(housing["medv"], 10)
print(medv_p10)

12.75


__9 (h):__ Based on bootstrapping, an estimate for the standard error of $ \hat{\mu}_{0.1} $ is
$ 0.523 $.

In [47]:
B = 1000                        # number of bootstrap replicates
rng = np.random.default_rng(1)  # fixed seed so the result is reproducible
medv = housing["medv"]

# Each replicate draws n row positions with replacement and records the
# tenth percentile of `medv` over the resampled rows.
bootstrap_percentiles = np.array([
    np.percentile(medv.iloc[rng.choice(n, size=n, replace=True)], 10)
    for _ in range(B)
])

# Bootstrap SE of the tenth percentile: standard deviation of the replicate
# percentiles (ndarray.std() defaults to ddof=0).
bootstrap_percentile_se = bootstrap_percentiles.std()
print(bootstrap_percentile_se)

0.5226133465574717
