# Keras Feedforward 1 - lab

### Instructions:

Read and execute the notebook cell by cell in Colab.  Add code where you see # YOUR CODE HERE

In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras import models, layers, Input, Model
from tensorflow.keras import backend as K
from tensorflow.keras.utils import plot_model
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

In [None]:
def plot_loss(history):
    """Plot the per-epoch training loss and, when present, the validation loss.

    Parameters
    ----------
    history : object
        Any object exposing a ``history`` dict that maps metric names to
        per-epoch lists (e.g. the value returned by Keras ``model.fit()``).
        The ``'loss'`` entry is required.  ``'val_loss'`` is optional: it is
        drawn (labelled 'test') only if the key exists.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    metrics = history.history

    plt.plot(metrics['loss'], label='train')
    # 'val_loss' is absent when training ran without validation data; skip the
    # second curve in that case instead of raising KeyError.
    if 'val_loss' in metrics:
        plt.plot(metrics['val_loss'], label='test')
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend()
    plt.show()

### Read and preprocess the data.

In [None]:
# Load the census summary CSV; the relative path means the file must sit in
# the notebook's current working directory.
df = pd.read_csv('1994-census-summary.csv')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   usid            32561 non-null  int64 
 1   age             32561 non-null  int64 
 2   workclass       30725 non-null  object
 3   fnlwgt          32561 non-null  int64 
 4   education       32561 non-null  object
 5   education_num   32561 non-null  int64 
 6   marital_status  32561 non-null  object
 7   occupation      30718 non-null  object
 8   relationship    32561 non-null  object
 9   race            32561 non-null  object
 10  sex             32561 non-null  object
 11  capital_gain    32561 non-null  int64 
 12  capital_loss    32561 non-null  int64 
 13  hours_per_week  32561 non-null  int64 
 14  native_country  31978 non-null  object
 15  label           32561 non-null  object
dtypes: int64(7), object(9)
memory usage: 4.0+ MB


In [None]:
df['relationship'].value_counts()

Husband           13193
Not_in_family      8305
Own_child          5068
Unmarried          3446
Wife               1568
Other_relative      981
Name: relationship, dtype: int64

In [None]:
df['label'].value_counts()

<=50K    24720
>50K      7841
Name: label, dtype: int64

Let's try to predict whether income is > 50K using age, education_num, sex, race, and relationship.

We'll need to do a little preprocessing.

In [None]:
# Predictor columns (two numeric, three categorical) and the label column.
predictors = ['age', 'education_num', 'sex', 'race', 'relationship']
target = 'label'

# One-hot encode the categorical predictors; drop_first=True omits the first
# level of each category so the indicator columns are not redundant.
# dtype=int pins the indicators to 0/1 integers: pandas >= 2.0 otherwise emits
# booleans, and a mixed int/bool frame becomes an object array through .values.
dfp = pd.get_dummies(df[predictors], drop_first=True, dtype=int)
column_names = dfp.columns

Sanity check.

In [None]:
dfp.head()

Unnamed: 0,age,education_num,sex_Male,race_Asian_Pac_Islander,race_Black,race_Other,race_White,relationship_Not_in_family,relationship_Other_relative,relationship_Own_child,relationship_Unmarried,relationship_Wife
0,39,13,1,0,0,0,1,1,0,0,0,0
1,50,13,1,0,0,0,1,0,0,0,0,0
2,38,9,1,0,0,0,1,1,0,0,0,0
3,53,7,1,0,1,0,0,0,0,0,0,0
4,28,13,0,0,1,0,0,0,0,0,0,1


Train/test split, then scale the predictors.

In [None]:
# Feature matrix, and a binary target that is 1 where the label is '>50K'.
X = dfp.to_numpy()
y = df[target].eq('>50K').astype(int)

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Fit the scaler on the training split only, then apply that same transform to
# both splits so the test data never influences the scaling statistics.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

Sanity check.

In [None]:
# Print the training feature and target shapes, one per line.
print(X_train.shape, y_train.shape, sep='\n')

(24420, 12)
(24420,)


### Logistic regression with a neural net

In [None]:
# delete any old models
K.clear_session()

Build the model.

In [None]:
# YOUR CODE HERE
# assign your neural logistic regression model to 'model'.

In [None]:
model.summary()

Compile the model.  Use stochastic gradient descent for the optimizer, as in lecture.

In [None]:
# YOUR CODE HERE

Train the model.  

Use 10 epochs and a batch size of 16.

If using Colab, you might like to try the options under Runtime > Change runtime type to see which is fastest.  Using a GPU or TPU is probably not helpful for small examples like this.

In [None]:
# YOUR CODE HERE

In [None]:
plot_loss(history)

Compute test accuracy.

In [None]:
# Threshold the model's first output column at 0.5 to get hard 0/1 labels.
y_score = model.predict(X_test)[:, 0]
y_pred = (y_score > 0.5).astype(int)

# Accuracy = fraction of test rows whose predicted label equals the true label.
test_acc = (y_pred == y_test).mean()
print(f'test accuracy with neural net: {test_acc:.3f}')

### Logistic regression with Scikit-Learn.

In [None]:
# Baseline: scikit-learn logistic regression fitted on the same scaled features.
clf = LogisticRegression().fit(X_train, y_train)
sklearn_acc = clf.score(X_test, y_test)
print('test accuracy with sklearn: {:.3f}'.format(sklearn_acc))

### Play with number of epochs in neural net.

In [None]:
# NOTE: compile() only (re)configures the optimizer, loss and metrics -- it does
# NOT re-initialize the layer weights.  To restart training from scratch,
# rebuild the model (as in the "Build the model" cell) before compiling;
# otherwise fit() continues from the weights learned so far.
model.compile()     # replace this line with your .compile() call

# YOUR .fit() CODE HERE

In [None]:
plot_loss(history)

### A classifier with a single hidden layer of 10 neurons.  Use the ReLU activation function in the hidden layer.

In [None]:
# YOUR CODE HERE; assign to variable 'model'

# YOUR CODE HERE to compile the model.

In [None]:
model.summary()

Observe what happens during training.  Does loss keep decreasing?  Does validation loss keep decreasing?  

In [None]:
# YOUR .fit() code here.

Notice how the plot is a little different from before.

In [None]:
plot_loss(history)

In [None]:
# Threshold the model's first output column at 0.5 to get hard 0/1 labels,
# then report the fraction of test rows classified correctly.
y_score = model.predict(X_test)[:, 0]
y_pred = (y_score > 0.5).astype(int)
hidden10_acc = (y_pred == y_test).mean()
print('test accuracy with neural net: {:.3f}'.format(hidden10_acc))

### Play with batch size

In [None]:
# YOUR CODE HERE

# LINE 1 is to recompile the model
# LINE 2 is to fit the model.

In [None]:
# Threshold the model's first output column at 0.5 to get hard 0/1 labels,
# then report the fraction of test rows classified correctly.
y_score = model.predict(X_test)[:, 0]
y_pred = (y_score > 0.5).astype(int)
batch_acc = (y_pred == y_test).mean()
print('test accuracy with neural net: {:.3f}'.format(batch_acc))

### A hidden layer of 50 neurons.

In [None]:
# YOUR CODE HERE

# first build the model, then compile the model

In [None]:
# YOUR CODE HERE
# train the model using .fit()

In [None]:
# Threshold the model's first output column at 0.5 to get hard 0/1 labels,
# then report the fraction of test rows classified correctly.
y_score = model.predict(X_test)[:, 0]
y_pred = (y_score > 0.5).astype(int)
hidden50_acc = (y_pred == y_test).mean()
print('test accuracy with neural net: {:.3f}'.format(hidden50_acc))

test accuracy with neural net: 0.828


### Two hidden layers of 20 neurons each

In [None]:
# YOUR CODE HERE

# first build the model, then compile the model

In [None]:
# YOUR CODE HERE
# train the model using .fit()

In [None]:
# Threshold the model's first output column at 0.5 to get hard 0/1 labels,
# then report the fraction of test rows classified correctly.
y_score = model.predict(X_test)[:, 0]
y_pred = (y_score > 0.5).astype(int)
two_layer_acc = (y_pred == y_test).mean()
print('test accuracy with neural net: {:.3f}'.format(two_layer_acc))

If you still have time, create a function 'plot_accuracy()' that is like the plot_loss() function at the top of this file -- but that plots the accuracy.  Try it with some of your code above.