# Chapter 2 - Linear Models: Least Squares Theory

In [1]:
import warnings

import pandas as pd
import proplot as plot
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from patsy import dmatrices
from scipy import stats

warnings.filterwarnings("ignore")
%pylab inline


plt.rcParams["axes.labelweight"] = "bold"
plt.rcParams["font.weight"] = "bold"

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Read the tab-separated hill-race records, then preview the first rows.
scots_races_df = pd.read_csv(
    filepath_or_buffer="../data/ScotsRaces.tsv",
    delimiter="\t",
)
scots_races_df.head()

FileNotFoundError: [Errno 2] File ../data/ScotsRaces.tsv does not exist: '../data/ScotsRaces.tsv'

The dataset lists hill races held in Scotland in a given year; the response is the record `time` for each race (in minutes). Explanatory variables:
- distance of the race (in miles)
- the cumulative climb (in thousands of feet)

In [None]:
# Column-wise mean and standard deviation of time, climb and distance.
race_measurements = scots_races_df[["time", "climb", "distance"]]
print(race_measurements.mean(axis=0))
print(race_measurements.std(axis=0))

In [None]:
# Pairwise relationships between record time, climb and distance.
sns.pairplot(
    data=scots_races_df[["time", "climb", "distance"]],
)

In [None]:
# Pairwise correlation matrix of climb, distance and time.
scots_races_df.loc[:, ["climb", "distance", "time"]].corr()

In [None]:
# Ordinary least squares regression of time on climb and distance.
# The model is specified first and fitted in a separate step; `fit_cd` holds
# the fitted results used by the diagnostic cells that follow.
ols_cd_model = smf.ols(
    formula="time ~ climb + distance",
    data=scots_races_df[["climb", "distance", "time"]],
)
fit_cd = ols_cd_model.fit()
print(fit_cd.summary())

Thus, adjusted for climb, the predicted record time increases by 6.34 minutes for every additional mile of distance.

In [None]:
# Histogram of the raw residuals of `fit_cd` (kernel density curve disabled).
sns.distplot(
    fit_cd.resid,
    color="slateblue",
    kde=False,
)

In [None]:
# Minimum, quartiles and maximum of the Pearson residuals.
five_number_levels = [0, 0.25, 0.5, 0.75, 1]
pd.Series(fit_cd.resid_pearson).quantile(q=five_number_levels)

In [None]:
# Pearson correlation (with its p-value) between the fitted values and the
# raw residuals of `fit_cd`.
# `pearsonr` is not imported as a bare name in this notebook, so it is called
# through the already-imported `scipy.stats` module to avoid a NameError.
print(stats.pearsonr(fit_cd.fittedvalues, fit_cd.resid))

In [None]:
# Mean and standard deviation of the Pearson residuals, printed on one line.
pearson_resid = fit_cd.resid_pearson
print(pearson_resid.mean(), pearson_resid.std())

In [None]:
# Pearson residuals plotted against race distance.
# The axes are labelled so the figure can be told apart from the
# residual-vs-fitted plot; labels are set before the scatter call so the
# cell's displayed value is still the scatter artist.
fig, ax = plt.subplots(figsize=(5, 5))
ax.set_xlabel("Distance (miles)")
ax.set_ylabel("Pearson residual")
ax.scatter(scots_races_df["distance"], fit_cd.resid_pearson)

In [None]:
# Pearson residuals plotted against the fitted values of `fit_cd`.
# The axes are labelled so the figure can be told apart from the
# residual-vs-distance plot; labels are set before the scatter call so the
# cell's displayed value is still the scatter artist.
fig, ax = plt.subplots(figsize=(5, 5))
ax.set_xlabel("Fitted time (minutes)")
ax.set_ylabel("Pearson residual")
ax.scatter(fit_cd.fittedvalues, fit_cd.resid_pearson)

In [None]:
# Cook's distance for every observation, taken from the influence
# diagnostics of `fit_cd`.
influence = fit_cd.get_influence()
summary_frame = influence.summary_frame()
cooks_distance = summary_frame["cooks_d"]

fig, ax = plt.subplots()
ax.plot(cooks_distance)
ax.set_xlabel("Observation")
ax.set_ylabel("Cook's distance")

# ANOVA

In [None]:
# ANOVA table (typ=2) for the additive model with climb entered first.
climb_first_model = smf.ols(
    formula="time ~ climb + distance",
    data=scots_races_df[["climb", "distance", "time"]],
)
lm = climb_first_model.fit()
table = sm.stats.anova_lm(lm, typ=2)
print(table)

In [None]:
# Same additive model with the terms in the opposite order (distance first).
# NOTE(review): with typ=2 each term is adjusted for the other, so the sums of
# squares should not depend on term order; a sequential table (typ=1) would.
# Confirm which comparison is intended.
distance_first_model = smf.ols(
    formula="time ~ distance + climb",
    data=scots_races_df[["climb", "distance", "time"]],
)
lm = distance_first_model.fit()
table = sm.stats.anova_lm(lm, typ=2)
print(table)

In [None]:
# Additive model extended with the climb-by-distance interaction term.
interaction_model = smf.ols(
    formula="time ~ climb + distance + climb:distance ",
    data=scots_races_df[["climb", "distance", "time"]],
)
lm = interaction_model.fit()
# NOTE(review): the ANOVA table is recomputed here but never displayed in this
# cell; only the regression summary is printed. Confirm whether that is intended.
table = sm.stats.anova_lm(lm, typ=2)
print(lm.summary())