# Lab 8 - Neural Networks
- **Author:** Emily Aiken ([emilyaiken@berkeley.edu](mailto:emilyaiken@berkeley.edu))
- **Date:** March 16, 2022
- **Course:** INFO 251: Applied machine learning

## Topics:
1. Neural networks (regression)
2. Neural networks (classification)
3. Neural networks (multiclass classification)

## Learning Goals:
At the end of this lab, you will...
- Know how to code up feed forward neural networks in Keras for regression, classification, and multiclass classification problems
- Know the main hyperparameters for neural networks: number of hidden layers, number of hidden nodes, activation functions
- Know the main optimization parameters for neural networks: optimizer, learning rate, batch size, epochs

## Resources:
- [Keras activation functions](https://keras.io/api/layers/activations/)
- [Keras optimizers](https://keras.io/api/optimizers/)
- [Keras loss functions](https://keras.io/api/losses/)
- [Keras performance metrics](https://keras.io/api/metrics/)

In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score, roc_auc_score, accuracy_score

from keras.models import Sequential
from keras.layers import Dense
import tensorflow as tf

### I. Regression Data: Loading and Baseline Model

In [2]:
# Data
data = datasets.load_boston()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [3]:
# Standardize the data
for col in df.columns:
    if col != 'target':
        mean, std = df[col].mean(), df[col].std()
        df[col] = (df[col] - mean)/std

In [4]:
# Split data into training and test
train, test = train_test_split(df, shuffle=True, test_size=0.25, random_state=0)
x_train, y_train = train.drop('target', axis=1), train['target']
x_test, y_test = test.drop('target', axis=1), test['target']

In [5]:
# Let's fit a basic random forest model -- just as a baseline
model = RandomForestRegressor(max_depth=8, n_estimators=50, random_state=1)
model.fit(x_train, y_train)
yhat_train = model.predict(x_train)
yhat_test = model.predict(x_test)
print('RF r2 on training set: %.2f' % r2_score(y_train, yhat_train))
print('RF r2 on test set: %.2f' % r2_score(y_test, yhat_test))

RF r2 on training set: 0.98
RF r2 on test set: 0.78


### II. Neural Network (Regression)

#### A. Scikit-learn implementation

In [6]:
model = MLPRegressor(hidden_layer_sizes=[5, 3], activation='relu', solver='adam', max_iter=500,
                    shuffle=True, random_state=1)
model.fit(x_train, y_train)
yhat_train = model.predict(x_train)
yhat_test = model.predict(x_test)

# Get metrics
print('r2 on training set: %.2f' % r2_score(y_train, yhat_train))
print('r2 on test set: %.2f' % r2_score(y_test, yhat_test))

r2 on training set: 0.77
r2 on test set: 0.58


#### B. Keras Implementation

In [8]:
# Random seeds
np.random.seed(1)
tf.random.set_seed(1)

# Define NN
model = Sequential()
model.add(Dense(5, input_dim=len(x_train.columns), activation='relu')) # First layer defines input_dim
model.add(Dense(1, activation='linear')) # For regression/classification, last layer of size 1
model.compile(loss='mse', optimizer='adam', metrics=['mse']) # No r2 metric available in keras

# Fit and predict with NN
model.fit(x_train, y_train, epochs=50, batch_size=10, verbose=0)
yhat_train = model.predict(x_train)
yhat_test = model.predict(x_test)

# Get metrics
print('r2 on training set: %.2f' % r2_score(y_train, yhat_train))
print('r2 on test set: %.2f' % r2_score(y_test, yhat_test))

r2 on training set: 0.65
r2 on test set: 0.42


In [None]:
# TODO: Tune the hyperparameters until the r2 score on the test set exceeds that of the random forest

### III. Classification Data: Loading and Baseline Model

In [10]:
# Load data
data = datasets.load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target
for col in df.columns:
    if col != 'target':
        df[col] = df[col] + np.random.normal(0, 4*df[col].std(), len(df))
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,-6.215214,29.609666,237.235447,957.253337,0.075135,0.159193,0.479545,-0.071647,0.285759,0.025896,...,55.990891,93.878385,-922.594251,0.222788,1.144124,0.275914,0.259383,0.401851,0.093737,0
1,30.85278,10.468181,233.091104,2997.133173,0.202155,0.337712,-0.272807,0.232042,0.032971,0.073385,...,1.941621,-21.039659,1830.481937,0.071908,0.145836,1.144626,0.297835,0.229907,0.201592,0
2,14.038096,-0.372287,97.239263,2599.881859,0.10395,0.47192,0.848899,0.111572,0.334732,0.084134,...,28.195078,114.552176,1137.989482,0.203666,-0.184891,1.374075,0.680334,0.400192,0.022262,0
3,-9.344638,28.71572,-84.247511,1149.990156,0.134138,0.155378,-0.060954,-0.076007,0.202641,0.102352,...,23.208891,52.822893,2918.450449,0.17588,-0.059224,1.13868,0.296743,0.747734,0.177588,0
4,26.649647,31.003179,3.117436,3042.472837,-0.014671,0.287769,0.630249,-0.005492,0.034629,0.06699,...,37.821356,116.864101,4639.176265,0.350309,1.45949,0.520597,-0.288537,0.392953,0.115381,0


In [11]:
# Standardize the data
for col in df.columns:
    if col != 'target':
        mean, std = df[col].mean(), df[col].std()
        df[col] = (df[col] - mean)/std

In [12]:
# Split data into training and test
train, test = train_test_split(df, shuffle=True, test_size=0.25, random_state=0)
x_train, y_train = train.drop('target', axis=1), train['target']
x_test, y_test = test.drop('target', axis=1), test['target']

In [13]:
# Let's fit a basic random forest model -- just as a baseline
model = RandomForestClassifier(max_depth=4, n_estimators=50, random_state=1)
model.fit(x_train, y_train)
yhat_train = model.predict_proba(x_train)[:, 1]
yhat_test = model.predict_proba(x_test)[:, 1]
print('RF AUC on training set: %.2f' % roc_auc_score(y_train, yhat_train))
print('RF AUC on test set: %.2f' % roc_auc_score(y_test, yhat_test))

RF AUC on training set: 0.98
RF AUC on test set: 0.75


### IV. Neural Network (Classification)

In [14]:
# TODO: Train a neural network to predict malignance. Tune hyperparameters until the AUC score exceeds that
# of the random forest above.

### V. Multiclass Classification Data Loading and Baseline Model

In [None]:
# Load data
data = datasets.load_wine()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target
for col in df.columns:
    if col != 'target':
        df[col] = df[col] + np.random.normal(0, 4*df[col].std(), len(df))
df.head()

In [None]:
# Standardize the data
for col in df.columns:
    if col != 'target':
        mean, std = df[col].mean(), df[col].std()
        df[col] = (df[col] - mean)/std

In [None]:
# Split data into training and test
train, test = train_test_split(df, shuffle=True, test_size=0.25, random_state=0)
x_train, y_train = train.drop('target', axis=1), train['target']
x_test, y_test = test.drop('target', axis=1), test['target']

In [None]:
# Let's fit a basic random forest model -- just as a baseline
model = RandomForestClassifier(max_depth=6, n_estimators=50, random_state=1)
model.fit(x_train, y_train)
yhat_train = model.predict(x_train)
yhat_test = model.predict(x_test)
print('RF accuracy on training set: %.2f' % accuracy_score(y_train, yhat_train))
print('RF accuracy on test set: %.2f' % accuracy_score(y_test, yhat_test))

### VI. Neural Network (Multiclass Classification)

In [None]:
# Random seeds
np.random.seed(1)
tf.random.set_seed(1)

# One hot encode the y variable
y_train_dummies = pd.get_dummies(y_train)
y_test_dummies = pd.get_dummies(y_test)

# Define NN
model = Sequential()
model.add(Dense(5, input_dim=len(x_train.columns), activation='relu')) # First layer defines input_dim
model.add(Dense(len(y_train_dummies.columns), activation='softmax')) 
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) 

# Fit and predict with NN
model.fit(x_train, y_train_dummies, epochs=50, batch_size=10, verbose=0)
yhat_train = model.predict(x_train)
yhat_test = model.predict(x_test)

# Convert probabilities to categorical predictions
yhat_train = np.argmax(yhat_train, axis=1)
yhat_test = np.argmax(yhat_test, axis=1)

# Get metrics
print('Accuracy on training set: %.2f' % accuracy_score(y_train, yhat_train))
print('Accuracy on test set: %.2f' % accuracy_score(y_test, yhat_test))

In [None]:
# TODO: Tune the hyperparameters until the overall accuracy score exceeds that of the random forest
