In [None]:
import warnings
# Ignore every warning (all categories, all modules) from this point on.
# NOTE(review): this also hides deprecation warnings from the plotting/stats
# libraries used below — confirm that blanket suppression is intended.
warnings.filterwarnings('ignore')

In [None]:
%matplotlib inline

In [None]:
import numpy as np
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import random
import patsy
import sklearn.linear_model as linear

# Apply seaborn's "whitegrid" style to the figures drawn in this notebook.
sns.set(style="whitegrid")

## The Mean as a Function

In this section, I want to build intuition for the math that shows us that linear models are linear models of the *mean*. All of the real data used here can be found in the *Fundamentals* zip under resources.

Let's start by looking at some synthetic data:

In [None]:
# Seed NumPy's global random state. The stats.norm.rvs calls below pass no
# explicit random_state — presumably this seed is what makes the synthetic
# sample repeat from run to run.
np.random.seed(8472385)

In [None]:
# Synthetic sample of 100 points: x ~ N(10, 1), noise e ~ N(0, 1), and
# y = 2.5 * x + e. x is drawn before e so the random stream is consumed in
# the same order as before; the columns are x, e, y.
data = pd.DataFrame(
    {
        "x": stats.norm.rvs(loc=10.0, scale=1, size=100),
        "e": stats.norm.rvs(loc=0, scale=1, size=100),
    }
)
data["y"] = 2.5 * data["x"] + data["e"]

In [None]:
# Overall mean of y, ignoring x entirely; drawn as a horizontal line in the
# plot that follows.
beta_0 = data["y"].mean()
beta_0

In [None]:
# Scatter of the synthetic sample with the overall mean of y (beta_0)
# overlaid as a horizontal line.
figure, axes = plt.subplots(figsize=(4, 3))

axes.scatter(data["x"], data["y"], color="dimgray", alpha=0.8)
axes.axhline(y=beta_0, color="firebrick")

axes.set(
    xlabel=r"$x$",
    ylabel=r"$y$",
    title=r"Plot of $y$ v. $x$",
)

plt.show()
plt.close()

In [None]:
min = data.x.min()
max = data.x.max()

In [None]:
buckets = 1

data["groups"] = data.x.apply(lambda x: np.round(((x - min) / (max - min)) / (1/buckets)))

grouped_data = data.groupby( "groups")
grouped_means = grouped_data[ "y"].mean()

xs = np.linspace(min, max, len(grouped_means))
figure = plt.figure(figsize=(10,6))
axes = figure.add_subplot(1, 1, 1)
axes.scatter( data.x, data.y, color="dimgray", alpha=0.5)
axes.scatter( xs, grouped_means, color="firebrick")

axes.set_ylabel(r"$y$")
axes.set_xlabel(r"$x$")
axes.set_title(r"Plot of $y$ v. $x$")

plt.show()
plt.close()


In [None]:
# Load the wells data. Despite the .tsv extension, the file is parsed with a
# single space as the field delimiter.
wells = pd.read_csv("arsenic.wells.tsv", delimiter=" ")

In [None]:
# Mean of the `switch` column over all rows.
wells["switch"].mean()

In [None]:
# Bin the distance column by floor-dividing by 10.
wells["dist10"] = wells["dist"].floordiv(10)

In [None]:
# Mean of `switch` within each dist10 bin. The plot labels this the relative
# frequency of switching — presumably `switch` is a 0/1 indicator; confirm
# against the data file.
grouped_wells = wells.groupby("dist10")
relfreq_switching = grouped_wells["switch"].mean()

figure = plt.figure(figsize=(10,6))

axes = figure.add_subplot(1, 1, 1)

# Plot each mean at its actual dist10 bin value. Using sequential positions
# 0..n-1 instead would shift every bin that follows an empty bin to the left,
# misplacing it on an axis labelled as distance.
xs = relfreq_switching.index.tolist()
ys = relfreq_switching
axes.scatter(xs, ys, color="steelblue", alpha=0.75)
axes.set_title("Relative Frequency of Switching by Distance (10s of meters)")
axes.set_xlabel("distance (10s of meters)")
axes.set_ylabel("Relative Frequency of Switching")

plt.show()
plt.close()