In [None]:
%capture
pip install celluloid

# Goals

1. Implement dedicated practice.
2. Finish book sections.
3. Discuss probabilistic perspective.
4. Use regression to predict housing prices.
5. Use Naive Bayes to classify the fashion MNIST dataset.

In [None]:
%pylab inline

import numpy as np
import pandas as pd

#import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB

# What is machine learning?

## Categories of ML
- Supervised.
$$P(y|X)$$
    - Regression.
    - Classification.
- Unsupervised.
$$P(X)$$
    - Clustering.
    - Dimensionality reduction.
- Semi-supervised.
- Reinforcement learning.

# Scikit-learn

## Data representation
 

In [None]:
iris = sns.load_dataset('iris')
iris.head() 

In [None]:
n_samples, n_features = iris.shape  

In [None]:
# Target vector: the species label of every flower.
y_iris = iris['species']
# Features matrix: every remaining column of the table.
X_iris = iris.drop(columns='species')

In [None]:
sns.pairplot(iris, hue='species', height=1.5);

# The Scikit-learn API

## The basics
1. Choose a class of model by importing the appropriate class from Scikit-learn.
2. Choose hyperparameters by instantiating this class with desired values.
3. Arrange data into features matrix and target vector.
4. Fit the model to your data calling the `fit` method of the model instance.
5. Apply the model to new data.
    - Supervised: `predict`.
    - Unsupervised: `transform` or `predict`.
    
## Supervised learning


In [None]:
# Seeded generator so the synthetic data set is identical on every run.
rng = np.random.RandomState(42)
# 50 predictor values drawn uniformly from [0, 10).
x = 10 * rng.rand(50)
# Targets follow the line y = 2x - 1 with standard-normal noise added.
y = 2 * x - 1 + rng.randn(50)
plt.scatter(x, y);

1. Choose model class.

[Linear regression](https://scikit-learn.org/stable/modules/linear_model.html)


In [None]:
# Design matrix: a leading column of ones (intercept term) next to x.
X = np.c_[np.ones(len(x)), x]
print(X.shape)
# X^T y as a column vector -- the right-hand side of the normal equations
# (X^T X) w = X^T y.
np.dot(X.T, y.reshape(-1, 1))

In [None]:
def myreg(x, y, n_iter=30, alpha=0.01, seed=None):
    '''Animate batch gradient descent fitting a straight line to (x, y).

    The predictor is standardised, a column of ones is prepended for the
    intercept, and the two weights are updated ``n_iter`` times. One frame is
    recorded per iteration showing the data, the current line and the weights
    that produced that line.

    Parameters
    ----------
    x : numpy.ndarray
        1-D predictor values (must provide ``.mean()`` and ``.std()``).
    y : numpy.ndarray
        Targets, one per entry of ``x``.
    n_iter : int, default 30
        Number of gradient steps, and therefore animation frames.
    alpha : float, default 0.01
        Step size applied to the gradient averaged over the samples.
    seed : int or None, default None
        Seed for the random initial weights. ``None`` draws from NumPy's
        global random state.

    Returns
    -------
    IPython.display.HTML
        The animation rendered as an HTML5 video.
    '''
    from IPython.display import HTML
    from celluloid import Camera

    fig = plt.figure()
    camera = Camera(fig)

    # Standardise the predictor so both columns of the design matrix are on a
    # comparable scale.
    x = (x - x.mean()) / x.std()
    X = np.c_[np.ones(len(x)), x]
    # Average the gradient over the samples (rows), not the features (columns).
    n_samples = X.shape[0]

    rand = np.random if seed is None else np.random.RandomState(seed)
    w = rand.rand(X.shape[1])

    for i in range(n_iter):
        y_pred = X @ w
        plt.scatter(x=x, y=y)
        line = plt.plot(x, y_pred)
        # Label the frame with the weights that produced the drawn line.
        plt.legend(line, [f'iteration {i} w = {w}'])
        camera.snap()
        # Gradient step taken after the snapshot, so frame i shows weights i.
        w = w - (alpha / n_samples) * (y_pred - y) @ X

    # Render the video first, then close the figure so the inline backend does
    # not also display a stray static plot.
    video = camera.animate().to_html5_video()
    plt.close(fig)
    return HTML(video)


# With the gradient averaged over the 50 samples, a step size of 0.25 makes
# the line converge visibly within the 30 frames.
myreg(x, y, alpha=0.25, seed=0)

In [None]:
from sklearn.linear_model import LinearRegression



In [None]:
?LinearRegression

2. Choose model hyperparameters.

In [None]:
model = LinearRegression(fit_intercept=True)
model

3. Arrange features and target.

In [None]:
X = x[:, np.newaxis]
X.shape

4. Fit.

In [None]:
model.fit(X, y)

In [None]:
model.coef_

In [None]:
model.intercept_

In [None]:
plt.scatter(x, y)
plt.plot(x, model.predict(X))

5. Predict labels for unknown data.


In [None]:
xfit = np.linspace(-1, 11)
Xfit = xfit[:, np.newaxis]
yfit = model.predict(Xfit)

plt.scatter(x, y)
plt.plot(xfit, yfit)

### Why learn regression?

Regression summarizes how predictions of an _outcome_ vary across individuals by a set of _predictors_.

There are four main uses:
1. Prediction.
2. Exploring associations.
3. Extrapolation.
4. Causal inference.

<figure>
    <figcaption class="text-center small">Causal inference will become important for simulation studies.</figcaption> 
    <img src="https://www.basicbooks.com/wp-content/uploads/2017/12/9780465097609.jpg?fit=436%2C675" alt="The Book of Why" width=200>    
</figure>

In [None]:
# enable internet

f_hibbs = 'http://www.stat.columbia.edu/~gelman/arm/examples/ElectionsEconomy/hibbs.dat'
df_elec = pd.read_table(f_hibbs, sep=' ')
df_elec.head()

In [None]:
def format_ax(ax):
    '''Hide the top and right spines and apply the shared axis labels.'''
    for side in ('right', 'top'):
        ax.spines[side].set_visible(False)
    ax.set_xlabel('Average recent % growth in personal income')
    ax.set_ylabel('Incumbent party\'s vote share')


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6), sharex=True, sharey=True)

# Left panel: raw scatter with every point annotated by its election year.
df_elec.plot(kind='scatter', x='growth', y='inc.party.vote', ax=ax1, alpha=0.5)
for _, election in df_elec.iterrows():
    ax1.text(x=election['growth'], y=election['inc.party.vote'],
             s=election['year'])

# Right panel: the same data with seaborn's fitted regression line.
sns.regplot(data=df_elec, x='growth', y='inc.party.vote', ax=ax2, scatter=True)

panels = (
    (ax1, 'Forecasting the election from the economy'),
    (ax2, 'Data and linear fit'),
)
for axis, title in panels:
    format_ax(axis)
    axis.set_title(title)

plt.tight_layout();

_The `statsmodels` package_

In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Two-column copy of the election data under short names; presumably chosen
# because the dotted original column names are awkward inside a formula.
dat = df_elec[['growth', 'inc.party.vote']].rename(
    columns={'growth': 'x', 'inc.party.vote': 'y'}
)
# Ordinary least squares of y on x (the intercept is added by the formula).
res = smf.ols(formula='y ~ x', data=dat).fit()
res.summary()

__$R^2$__

__Standard error__

__p-value__

[StatQuest](https://www.youtube.com/watch?v=2AQKmw14mHM&ab_channel=StatQuestwithJoshStarmer)

## Iris classification

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_iris, y_iris, random_state=1
)

In [None]:
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

## Classification on digits

In [None]:
from sklearn.datasets import load_digits
digits = load_digits()
digits.images.shape

In [None]:
X = digits.data
y = digits.target
X.shape, y.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0
)

In [None]:
clf = GaussianNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred)

In [None]:
from sklearn.metrics import confusion_matrix

mat = confusion_matrix(y_test, y_pred)

sns.heatmap(mat, square=True, annot=True, cbar=False)
plt.xlabel('predicted value')
plt.ylabel('true value');

# Hyperparameters and model validation

In [None]:
from sklearn.model_selection import cross_val_score
# X and y hold the digits data here, so cross-validate the classifier used in
# the digits section. A fresh estimator is passed because cross_val_score
# clones and refits it on every fold.
cross_val_score(GaussianNB(), X, y, cv=5)

## Bias vs variance

In [None]:
plt.scatter(dat['x'], dat['y'])

In [None]:
dat['y']

In [None]:
reg = LinearRegression()
reg.fit(dat['x'].to_frame(), dat['y'])
plt.scatter(dat['x'], dat['y'])
plt.plot(dat['x'], reg.predict(dat['x'].to_frame()));

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Deliberately over-flexible model: a degree-10 polynomial in x, fitted by
# ordinary least squares on the expanded features.
poly = PolynomialFeatures(degree=10)
X_poly = poly.fit_transform(dat['x'].to_frame())
reg = LinearRegression()
reg.fit(X_poly, dat['y'])

# NOTE: x and y are rebound here to the predictor column and the fitted
# values at the training points.
x = dat['x']
y = reg.predict(X_poly)

# Evaluate the polynomial on a dense grid so its behaviour *between* the
# observations is visible. The grid is wrapped in a frame with the same column
# name that `poly` was fitted on.
x_grid = np.linspace(x.min(), x.max(), 200)
y_grid = reg.predict(poly.transform(pd.DataFrame({'x': x_grid})))

# Observed data as points, fitted curve as a line.
plt.scatter(x, dat['y'], label='observed')
plt.plot(x_grid, y_grid, color='C1', label='degree-10 polynomial fit')

# Clamp the y-axis around the observed range so large swings of the
# polynomial do not squash the data points.
pad = 0.25 * (dat['y'].max() - dat['y'].min())
plt.ylim(dat['y'].min() - pad, dat['y'].max() + pad)
plt.legend();