# Lecture 1

In [1]:
import pandas as pd
import scipy.stats as sp
from statsmodels.formula.api import ols
import math as m

In Problems 5–10, use the results of Problems 7–12, respectively,
from Section 4.2 to answer the following questions:
(a) What are the estimates of b0 and b1?
(b) Compute the standard error, the point estimate for s.
(c) Determine sb1
.
(d) Assuming the residuals are normally distributed, test
H0: b1 = 0 versus H1: b1 ≠ 0 at the a = 0.05 level of
significance.

In [2]:
for i, (ex_num, data) in enumerate({
    5: pd.DataFrame({
        "x": [3, 4, 5, 7,  8],
        "y": [4, 5, 8, 12, 14]
    }),
    6: pd.DataFrame({
        "x": [3, 5, 7, 9, 11],
        "y": [0, 2, 3, 6, 9]
    }),
    7: pd.DataFrame({
        "x": [-2, -1, 0, 1, 2],
        "y": [-4, 0,  1, 4, 5]
    }),
    8: pd.DataFrame({
        "x": [-2, -1, 0, 1,  2],
        "y": [ 7,  6, 3, 2,  0]
    }),
    11: pd.DataFrame({
        "x": [5, 15, 25, 35, 50, 72, 105],
        "y": [69.2, 68.3, 67.5, 67.1, 66.4, 66.1, 63.9]
    })
}.items()):

    if (i != 0):
        print("\n")

    print(f"Ex. {ex_num}")


    print("\n(a)")
    model = ols("y ~ x", data = data).fit()
    B0 = model.params["Intercept"]
    B1 = model.params["x"]
    print(f"Fit linear regression model.\nB0: {B0:.4f}\nB1: {B1:.4f}")


    print("\n(b)")
    y_pred = B0 + B1 * data["x"]
    n = data.shape[0]
    SE = (sum((data['y'] - y_pred)**2)/(n-2))**(1/2)
    print(f"Standard Error: {SE:.4f}")


    print("\n(c)")
    x_mean = data["x"].mean()
    SB1 = SE / ((sum((data["x"] - data["x"].mean())**2))) ** (1/2)
    print(f"The sample standard deviation of B1: {SB1:.4f}")


    print("\n(d)")
    a = 0.05
    t0 = B1 / SB1
    p = 2 * sp.t.sf(abs(t0), n - 2)
    print(f"p: {p:4f}")
    if (p < a):
        print("H0 rejected. There is a linear relation between the explanatory and response variables.")
    else:
        print("H0 not rejected. No conclusion can be drawn.")


    if (ex_num == 11):
        # (e) Construct a 95% confidence interval about the slope of the true least-squares regression line.
        t_half_alpha = sp.t.ppf(a/2, n - 2)

        # (1 - alpha) % C.I. for slope beta_1
        Bound = t_half_alpha * SE / m.sqrt(sum((data.iloc[:, 0] - x_mean)**2))
        LB_beta_1 = B1 - Bound
        UB_beta_1 = B1 + Bound
        print(f"\n(e): 95% confidence interval: [{LB_beta_1:4f} <-> {UB_beta_1:4f}]")

Ex. 5

(a)
Fit linear regression model.
B0: -2.6395
B1: 2.0814

(b)
Standard Error: 0.4782

(c)
The sample standard deviation of B1: 0.1153

(d)
p: 0.000371
H0 rejected. There is a linear relation between the explanatory and response variables.


Ex. 6

(a)
Fit linear regression model.
B0: -3.7000
B1: 1.1000

(b)
Standard Error: 0.7303

(c)
The sample standard deviation of B1: 0.1155

(d)
p: 0.002453
H0 rejected. There is a linear relation between the explanatory and response variables.


Ex. 7

(a)
Fit linear regression model.
B0: 1.2000
B1: 2.2000

(b)
Standard Error: 0.8944

(c)
The sample standard deviation of B1: 0.2828

(d)
p: 0.004422
H0 rejected. There is a linear relation between the explanatory and response variables.


Ex. 8

(a)
Fit linear regression model.
B0: 3.6000
B1: -1.8000

(b)
Standard Error: 0.5164

(c)
The sample standard deviation of B1: 0.1633

(d)
p: 0.001599
H0 rejected. There is a linear relation between the explanatory and response variables.


Ex. 11

(a)
F

15. Concrete As concrete cures, it gains strength. The following
data represent the 7-day and 28-day strength (in pounds per
square inch) of a certain type of concrete:

In [38]:
# 15
data = pd.DataFrame({
    'x': [2300, 3390, 2430, 2890, 3330, 2480, 3380, 2660, 2620, 3340], 
    'y': [4070, 5220, 4640, 4620, 4850, 4120, 5020, 4890, 4190, 4630],
})

# (a) Treating the 7-day strength as the explanatory variable, x, determine the estimates of b0 and b1.
print("15 (a)")
model = ols("y ~ x", data).fit()
B0 = model.params["Intercept"]
B1 = model.params['x']
print(f"B0: {B0:.4f}")
print(f"B1: {B1:.4f}")

# (b) Compute the standard error of the estimate.
print("\n15 (b)")
n = data.shape[0]
pred_y = B0 + data['x'] * B1
SSR = sum((pred_y - data['y'])**2)
SE = (SSR / (n-2)) ** (1/2)
print(f"SE: {SE:.4f}")

# (c) Determine sb1.
print("\n15 (c)")
x_mean = data['x'].mean()
deviation_x = sum((data['x'] - x_mean)**2) ** (1/2)
SB1 = SE / deviation_x
print(f"SB1: {SB1:.4f}")

# (d) Assuming the residuals are normally distributed, test whether a linear relation exists between 7-day strength and 28-day strength at the a = 0.05 level of significance.
print("\n15 (d)")
a = 0.05
r = data.corr().iloc[0][1]              # Correlation coeficient for x and y
t0 = r * (n-2)**(1/2) / (1-r**2)**(1/2) # t for H0
cv = sp.t.ppf(a/2, n - 2)               # Critical Value for a (t-half-alpha)
p = 2 * sp.t.sf(abs(t0), n - 2)         # p-value for t0
# print(f"t0: {t0:.4f}")
# print(f"cv: {cv:.4f}")
print(f"p: {p:.4f}")
print(f"H0 rejected. (p < a)")

# (e) Assuming the residuals are normally distributed, construct a 95% confidence interval for the slope of the true leastsquares regression line.
print("\n15 (e)")
LB = B1 - cv * SB1
UB = B1 + cv * SB1
print(f"LB: {LB:.4f}")
print(f"UB: {UB:.4f}")

# (f) What is the estimated mean 28-day strength of this concrete if the 7-day strength is 3000 psi?
print("\n15 (f)")
y_pred_3000 = B0 + B1 * 3000
print(f"28-day strength: {y_pred_3000:.4f}")

15 (a)
B0: 2675.5619
B1: 0.6764

15 (b)
SE: 271.0423

15 (c)
SB1: 0.2055

15 (d)
p: 0.0110
H0 rejected. (p < a)

15 (e)
LB: 1.1504
UB: 0.2025

15 (f)
28-day strength: 4704.8174


18. American Black Bears In 1969, Dr. Michael R. Pelton of
the University of Tennessee initiated a long-term study of the
American black bear (Ursus americanus) population in Great
Smoky Mountains National Park. One aspect of the study was
to develop a model that could be used to predict a bear’s weight
(since it is not practical to weigh bears in the field). One variable
that is thought to be related to weight is the length of the bear.
The following data represent the lengths and weights of 12
American black bears.

In [53]:
# TABLE
x = [139.0, 138.0, 139.0, 120.5, 149.0, 141.0, 141.0, 150.0, 166.0, 151.5, 129.5, 150.0]
y = [110,   60,    90,    60,    85,    100,   95,    85,    155,   140,   105,   110  ]
data = pd.DataFrame({
    'x': x,
    'y': y
})


# VARIABLES
model = ols("y ~ x", data).fit()

n = data.shape[0]
r = data.corr().iloc[0][1]

a = 0.05
cv = sp.t.ppf(a/2, n-2) # cv (a/2)


# (a) Treating total length as the explanatory variable, x, determine
# the estimates of b0 and b1.
print("\n18 (a)")
B0 = model.params["Intercept"]
B1 = model.params['x']
print(f"B0: {B0:.4f}")
print(f"B1: {B1:.4f}")

# (b) Assuming the residuals are normally distributed, test
# whether a linear relation exists between total length and
# weight at the a = 0.05 level of significance.
print("\n18 (b)")
t0 = r * m.sqrt(n-2) / m.sqrt(1-r**2)
p = 2 * sp.t.sf(abs(t0), n-2)
print(f"p: {p:.4f}")
print(f"H0 rejected. (p < a)")

# (c) Assuming the residuals are normally distributed, construct
# a 95% confidence interval for the slope of the true leastsquares regression line.
print("\n18 (c)")
pred_y = B0 + B1 * data['x']
SSE = sum((data['y'] - pred_y) ** 2)
SE = m.sqrt(SSE / (n-2))
x_bar = data['x'].mean()
deviation_x = m.sqrt(sum((x_bar - data['x'])**2))
LB = B1 - cv * SE / deviation_x
UB = B1 + cv * SE / deviation_x
print(f"LB: {LB:.4f}")
print(f"UB: {UB:.4f}")

# (d) What is the mean weight of American black bears of length
# 146.0 cm?
print("\n18 (d)")
print(f"{(B0 + 146 * B1):.4f}")


18 (a)
B0: -142.4709
B1: 1.6942

18 (b)
p: 0.0106
H0 rejected. (p < a)

18 (c)
LB: 2.8987
UB: 0.4896

18 (d)
104.8776


### 19.
CEO Performance (Refer to Problem 31 in Section 4.1) The
following data represent the total compensation for 12 randomly
selected chief executive officers (CEOs) and the company’s
stock performance in 2013.

In [73]:
x = [14.53, 4.09, 7.11, 1.05, 1.97, 3.76, 12.06, 7.62, 8.47, 4.04, 20.87, 6.63]
y = [75.43, 64.01, 142.07, 32.72, 10.64, 30.66, 0.77, 69.39, 58.69, 55.93, 24.28, 32.21]
data = pd.DataFrame({
    'x': x,
    'y': y
})


# (a) Treating compensation as the explanatory variable, x, determine
# the estimates of b0 and b1.
print("\n19 (a)")
model = ols("y ~ x", data).fit()
(B0, B1) = (model.params["Intercept"], model.params["x"])
print(f"B0: {B0:.4f}")
print(f"B1: {B1:.4f}")


# (b) Assuming the residuals are normally distributed, test whether
# a linear relation exists between compensation and stock
# return at the a = 0.05 level of significance
print("\n19 (b)")
a = 0.05
r = data.corr().iloc[0][1]
t0 = r * m.sqrt(n-2) / m.sqrt(1 - r**2)
p = 2 * sp.t.sf(abs(t0), n-2)
print(f"p: {p:.4f}")
print(f"H0 not rejected. (p > a)")


# (c) Assuming the residuals are normally distributed, construct
# a 95% confidence interval for the slope of the true leastsquares regression line.
print("\n19 (c)")
cv = sp.t.ppf(a/2, n-2)

y_pred = B0 + B1 * data.x
SSE = sum((data.y - y_pred)**2)
SE = m.sqrt(SSE / (n-2))

deviation_x = m.sqrt(sum((data.x - data.x.mean())**2))

LB = B1 - cv * SE / deviation_x
UB = B1 + cv * SE / deviation_x

print(f"LB: {LB:.4f}")
print(f"UB: {UB:.4f}")


# (d) Based on your results to parts (b) and (c), would you
# recommend using the least-squares regression line to
# predict the stock return of a company based on the CEO’s
# compensation? Why? What would be a good estimate of the
# stock return based on the data in the table?
print("\n19 (d)")
print(f"Since H0 was not rejected, no significatnt linear relation was established. The best estimate so far the therefore the average of y: {data.y.mean():.4f}.")


19 (a)
B0: 51.1310
B1: -0.1819

19 (b)
p: 0.9317
H0 not rejected. (p > a)

19 (c)
LB: 4.4278
UB: -4.7916

19 (d)
Since H0 was not rejected, no significatnt linear relation was established. The best estimate so far the therefore the average of y: 49.7333.


### 24. 
The output shown was obtained from Minitab.

In [6]:
# Trivial

### Questions

In [None]:
# Why formula for SB1 is different
# 19c LB > UB