## S/4HANA ML Class
### Linear Models / Bias and Variance

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from sklearn.datasets import make_circles

# Seed NumPy's global RNG before generating data: make_circles is called
# without an explicit random_state, so reproducibility depends on this seed.
np.random.seed(0)

# Two-class "circles" toy dataset: X holds the 2-D points, y the 0/1 labels.
X, y = make_circles(
    n_samples=1000,
    noise=0.2,
    factor=0.3,
)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set.
# random_state is pinned so that this cell produces the same split every time
# it is executed, instead of depending on how much of NumPy's global RNG
# stream earlier cells happen to have consumed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

In [None]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

# Fit a logistic regression classifier on the training split
# (fit() returns the estimator itself, so the call can be chained).
logReg = linear_model.LogisticRegression(solver='lbfgs').fit(X_train, y_train)

# Predict labels for the held-out samples and score them.
# With 0/1 labels every squared error is either 0 or 1, so the MSE equals
# the fraction of misclassified test points.
yhat = logReg.predict(X_test)
mean_squared_error(y_test, yhat)

In [None]:
from sklearn import svm

# Fit a support vector classifier on the same training split
# (fit() returns the estimator itself, so the call can be chained).
svc = svm.SVC(gamma='auto').fit(X_train, y_train)

# Score it the same way as the logistic regression model: MSE of the
# predicted 0/1 labels on the held-out samples.
yhat = svc.predict(X_test)
mean_squared_error(y_test, yhat)


In [None]:
# Scatter plot of the raw samples, coloured by class label.

# Boolean masks selecting the samples of each class.
blueSet = y == 0
orangeSet = y == 1

fig, ax = plt.subplots()
ax.set_title("Raw Data")

ax.scatter(X[blueSet, 0], X[blueSet, 1],
           c="blue", s=20, edgecolor='k')
ax.scatter(X[orangeSet, 0], X[orangeSet, 1],
           c="orange", s=20, edgecolor='k')

ax.set_xlabel("$x_1$")
ax.set_ylabel("$x_2$")
plt.show()

### Decision Boundary

In [None]:
# Logistic Regression Decision Boundary
#
# The fitted model separates the classes along the line
#     w[0] * x1 + w[1] * x2 + b = 0
# Solving for x2 gives
#     x2 = -(w[0] * x1 + b) / w[1]
# A straight line is fully determined by two points, so it is evaluated at
# the smallest and largest x1 values found in the data.

w = logReg.coef_[0]        # weight vector (w[0] for x1, w[1] for x2)
b = logReg.intercept_[0]   # intercept_ is a length-1 array; take the scalar

# x2 coordinates of the boundary at x1 = min and x1 = max (plain scalars).
y_boundary = [-(w[0] * x1 + b) / w[1]
              for x1 in (X[:, 0].min(), X[:, 0].max())]


In [None]:
# Raw samples coloured by class, with the logistic regression decision
# boundary drawn on top.

# Boolean masks selecting the samples of each class.
blueSet = y == 0
orangeSet = y == 1

fig, ax = plt.subplots()
ax.set_title("Raw Data")

ax.scatter(X[blueSet, 0], X[blueSet, 1],
           c="blue", s=20, edgecolor='k')
ax.scatter(X[orangeSet, 0], X[orangeSet, 1],
           c="orange", s=20, edgecolor='k')

ax.set_xlabel("$x_1$")
ax.set_ylabel("$x_2$")

# Fix the y-range to the data before drawing the boundary, so the line
# cannot stretch the axis.
ax.set_ylim(X[:, 1].min(), X[:, 1].max())

# Decision boundary: red line from the smallest to the largest x1 value.
ax.plot([X[:, 0].min(), X[:, 0].max()], y_boundary, 'r-')

plt.show()

In [None]:
# Class 0 is plotted in blue.
# Class 1 is plotted in orange.

In [None]:
# Some predictions...
# Predicted class label for the single point (x1, x2) = (0, -1).
# predict() expects a 2-D array of samples, hence the nested list; [0] picks
# the label of the one sample passed in.
logReg.predict(np.array([[0, -1]]))[0]

In [None]:
# Predicted class label for the single point (x1, x2) = (1, 10).
logReg.predict(np.array([[1, 10]]))[0]

#### Generic Solution for Plotting Decision Boundaries

In [None]:
# Build a regular grid (step 0.1) spanning the range of both features, then
# flatten each coordinate array so that (gridXs[i], gridYs[i]) is one grid point.
gridXs, gridYs = (
    coords.ravel()
    for coords in np.mgrid[
        X[:, 0].min():X[:, 0].max():.1,
        X[:, 1].min():X[:, 1].max():.1,
    ]
)

In [None]:
def lr_predict(k):
    """Return the logistic regression class prediction for each point in k.

    k is reshaped to (-1, 2), so both a single (x1, x2) pair and an
    array of such pairs are accepted. The result is whatever
    logReg.predict returns: one label per point.
    """
    return logReg.predict(k.reshape(-1, 2))

# Classify every grid point with a single vectorized call; predict() handles
# a whole batch of samples at once, so there is no need to call it per row.
preds = lr_predict(np.stack([gridXs, gridYs], axis=1)).ravel()

In [None]:
# Sanity check: preds was flattened with ravel(), so this is a 1-D shape
# with one prediction per grid point.
preds.shape

In [None]:
# Plot every grid point as a small dot coloured by its predicted class.
fig, ax = plt.subplots()
ax.set_title("Predictions")

ax.scatter(gridXs, gridYs, s=2, c=preds)
plt.show()

In [None]:
# Raw samples, the logistic regression decision boundary, and the grid of
# predicted classes combined in one figure.

# Boolean masks selecting the samples of each class.
blueSet = y == 0
orangeSet = y == 1

fig, ax = plt.subplots()
ax.set_title("Predictions and Decision Boundary")

ax.scatter(X[blueSet, 0], X[blueSet, 1],
           c="blue", s=20, edgecolor='k')
ax.scatter(X[orangeSet, 0], X[orangeSet, 1],
           c="orange", s=20, edgecolor='k')

ax.set_xlabel("$x_1$")
ax.set_ylabel("$x_2$")

# Fix the y-range to the data before drawing the boundary, so the line
# cannot stretch the axis.
ax.set_ylim(X[:, 1].min(), X[:, 1].max())

# Decision boundary: red line from the smallest to the largest x1 value.
ax.plot([X[:, 0].min(), X[:, 0].max()], y_boundary, 'r-')

# Grid points coloured by predicted class, drawn on top of the samples.
ax.scatter(gridXs, gridYs, s=5, c=preds)

plt.show()

In [None]:
def lr_predict_proba(k):
    """Return the logistic regression probability of the second class for each point in k.

    k is reshaped to (-1, 2), so both a single (x1, x2) pair and an
    array of such pairs are accepted. Only column 1 of predict_proba is
    returned; with the 0/1 labels used here that is the probability of label 1.
    """
    return logReg.predict_proba(k.reshape(-1, 2))[:, 1]

# Evaluate the probability at every grid point with a single vectorized call
# instead of invoking predict_proba once per row.
predProbs = lr_predict_proba(np.stack([gridXs, gridYs], axis=1)).ravel()

In [None]:
# Colour map of the predicted probability over the grid, with the logistic
# regression decision boundary drawn first.
fig, ax = plt.subplots()
ax.set_title("Color Map of Predicted Probability")

ax.set_xlabel("$x_1$")
ax.set_ylabel("$x_2$")

# Fix the y-range to the data before drawing the boundary, so the line
# cannot stretch the axis.
ax.set_ylim(X[:, 1].min(), X[:, 1].max())

# Decision boundary: red line from the smallest to the largest x1 value.
ax.plot([X[:, 0].min(), X[:, 0].max()], y_boundary, 'r-')

# Large markers (s=200) so that neighbouring grid points overlap and the
# probabilities read as a continuous colour field.
ax.scatter(gridXs, gridYs, s=200, c=predProbs)
plt.show()

In [None]:

def svc_predict(k):
    """Return the SVC class prediction for each point in k.

    k is reshaped to (-1, 2), so both a single (x1, x2) pair and an
    array of such pairs are accepted. The result is whatever
    svc.predict returns: one label per point.
    """
    return svc.predict(k.reshape(-1, 2))

# Classify every grid point with a single vectorized call instead of
# invoking the classifier once per row.
svc_preds = svc_predict(np.stack([gridXs, gridYs], axis=1)).ravel()


# Grid of SVM predictions with the (linear) logistic regression decision
# boundary overlaid for comparison.

# Class masks for the optional raw-data overlay below (currently disabled).
blueSet = y == 0
orangeSet = y == 1

fig, ax = plt.subplots()
ax.set_title("SVM-based Predictions, LR Decision Boundary")

# Optional: uncomment to overlay the raw samples.
# ax.scatter(X[blueSet, 0], X[blueSet, 1],
#            c="blue", s=20, edgecolor='k')
# ax.scatter(X[orangeSet, 0], X[orangeSet, 1],
#            c="orange", s=20, edgecolor='k')

ax.set_xlabel("$x_1$")
ax.set_ylabel("$x_2$")

# Fix the y-range to the data before drawing the boundary, so the line
# cannot stretch the axis.
ax.set_ylim(X[:, 1].min(), X[:, 1].max())

# Logistic regression boundary: red line from the smallest to the largest x1.
ax.plot([X[:, 0].min(), X[:, 0].max()], y_boundary, 'r-')

# Grid points coloured by the SVM's predicted class.
ax.scatter(gridXs, gridYs, s=5, c=svc_preds)

plt.show()

## Exercises:

- What is the MSE (mean squared error) for KNN?
- Is KNN a linear classifier?
- What influence does the parameter $k$ have on bias and variance? 