# Solutions for Supervised Machine Learning

In [None]:
%matplotlib inline

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_breast_cancer, load_boston, load_diabetes
from sklearn.model_selection import train_test_split, GridSearchCV

## Feature Engineering

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Load the Boston housing data and wrap the feature matrix in a DataFrame
# whose columns carry the dataset's feature names.
# NOTE: load_boston was removed in scikit-learn 1.2, so this cell needs an
# older scikit-learn release.
boston = load_boston()
df = pd.DataFrame(data=boston["data"], columns=boston["feature_names"])

In [None]:
# Expand the features into every polynomial term up to degree 2; the constant
# (bias) column is left out.
poly = PolynomialFeatures(include_bias=False, degree=2)
polynomials = poly.fit(df).transform(df)
print(f"There are {polynomials.shape[1]} polynomials")

In [None]:
# Label the expanded feature matrix, append the regression target as "y" and
# persist everything for the Regularization section.
# NOTE: get_feature_names is deprecated since scikit-learn 1.0 in favour of
# get_feature_names_out.
out = pd.DataFrame(polynomials, columns=poly.get_feature_names(df.columns))
out["y"] = boston["target"]
out.to_csv("../output/polynomials.csv", index=False)

## Regularization

In [None]:
from sklearn.linear_model import Lasso, LinearRegression, Ridge

In [None]:
# Reload the polynomial features (plus the "y" target column) that the
# Feature Engineering section wrote to disk.
polynomials_csv = Path("../output/polynomials.csv")
df = pd.read_csv(polynomials_csv)
print(df.columns)

In [None]:
# Separate the target column from the features, then hold out a test set.
# The fixed random_state keeps the split reproducible.
X = df.drop(columns="y")
y = df["y"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Fit an unregularized, an L2-regularized (Ridge) and an L1-regularized
# (Lasso) linear model on the same training data ...
linear = LinearRegression()
ridge = Ridge(alpha=0.3)
lasso = Lasso(alpha=0.3)
for model in (linear, ridge, lasso):
    model.fit(X_train, y_train)

# ... and report each model's score on the held-out test set, in that order.
for model in (linear, ridge, lasso):
    print(model.score(X_test, y_test))

In [None]:
# Collect the fitted coefficients of all three models in one table, one row
# per feature (every column of df except the trailing "y").
fitted_models = {"linear": linear, "ridge": ridge, "lasso": lasso}
coefs = pd.DataFrame(
    {name: estimator.coef_ for name, estimator in fitted_models.items()},
    index=df.columns[:-1],
)
# Features that Lasso set exactly to zero while Ridge kept a non-zero weight.
mask = coefs["ridge"].ne(0) & coefs["lasso"].eq(0)
print(f"There are {mask.sum()} coefficients zero for in the Lasso but non-zero with Ridge")

In [None]:
# Horizontal bar chart of the coefficients of all three models; the tall
# canvas leaves room for one group of bars per feature.
fig = plt.figure(figsize=(10, 30))
ax = fig.add_subplot(1, 1, 1)
coefs.plot.barh(ax=ax)
# NOTE(review): the CSV cells use "../output/" while this writes to
# "./output/" - confirm which directory is intended.
fig.savefig(Path("./output/polynomials.pdf"))

## Neural Network Regression

In [None]:
from numpy import array
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the diabetes regression data and show its bundled description.
diabetes = load_diabetes()
print(diabetes['DESCR'])

# Feature matrix and target, followed by a reproducible train/test split.
X, y = diabetes['data'], diabetes['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Standardize the inputs, then fit a tanh MLP regressor with the L-BFGS solver.
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("nn", MLPRegressor(activation="tanh", solver="lbfgs", random_state=42)),
])
# Candidate architectures and L2 penalties; the "nn__" prefix routes each
# setting to the pipeline step named "nn".
param_grid = {
    "nn__hidden_layer_sizes": [(10, 10), (10, 10, 10), (20, 20)],
    "nn__alpha": [0.01, 0.02, 0.03],
}
# 3-fold cross-validated search over all combinations.
grid = GridSearchCV(pipe, param_grid=param_grid, cv=3, return_train_score=True)
grid.fit(X_train, y_train)

In [None]:
# Winning parameter combination and its mean cross-validated score.
print(grid.best_params_, grid.best_score_, sep="\n")

In [None]:
# Show the mean cross-validated test score of every grid point as a heatmap.
#
# GridSearchCV enumerates candidates with the parameter names sorted
# alphabetically and the last name varying fastest, so "nn__alpha" is the slow
# axis (rows) and "nn__hidden_layer_sizes" the fast axis (columns).
# The shape is derived from param_grid instead of hard-coding 3 x 3, so the
# plot stays correct if the grid is resized.
alphas = param_grid["nn__alpha"]
layer_sizes = param_grid["nn__hidden_layer_sizes"]
scores = array(grid.cv_results_["mean_test_score"]).reshape(len(alphas), len(layer_sizes))
sns.heatmap(scores, annot=True,
            xticklabels=layer_sizes,
            yticklabels=alphas)

In [None]:
# Visualize the first-layer weights of the best network found by the search.
best = grid.best_estimator_
# Reach the fitted MLP through the pipeline's public step lookup instead of
# the private `_final_estimator` attribute.
coef_matrices = best.named_steps["nn"].coefs_
# The first weight matrix connects the inputs to the first hidden layer, so
# its rows are labelled with the diabetes feature names.
df = pd.DataFrame(coef_matrices[0], index=diabetes["feature_names"])
# Draw on a dedicated figure so that savefig writes this heatmap; without it,
# `fig` would still refer to the bar chart created in the Regularization
# section and that chart would be saved instead.
fig, ax = plt.subplots()
sns.heatmap(df, ax=ax)
# NOTE(review): the CSV cells use "../output/" while this writes to
# "./output/" - confirm which directory is intended.
fig.savefig(Path("./output/nn_diabetes_importances.pdf"))

## Neural Network Classification

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

In [None]:
# Load the breast-cancer classification data and make a reproducible
# train/test split.
cancer = load_breast_cancer()
X, y = cancer['data'], cancer['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Rescale the inputs with MinMaxScaler, then fit a tanh MLP classifier with
# the L-BFGS solver.
pipe = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("nn", MLPClassifier(activation="tanh", solver="lbfgs", max_iter=1_000,
                         random_state=3)),
])
# Candidate architectures and L2 penalties for the step named "nn".
param_grid = {
    "nn__hidden_layer_sizes": [(20, 10), (20, 20)],
    "nn__alpha": [0.01, 0.001],
}
# 5-fold cross-validated search, ranking candidates by ROC AUC.
grid = GridSearchCV(pipe, param_grid=param_grid, scoring="roc_auc", cv=5,
                    return_train_score=True)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_, sep="\n")

In [None]:
# Predict the held-out samples with the refitted best model and plot the
# confusion matrix (rows: true class, columns: predicted class).
preds = grid.predict(X_test)
confusion_m = pd.DataFrame(confusion_matrix(y_true=y_test, y_pred=preds))
sns.heatmap(confusion_m, annot=True)