In [None]:
#######################################################
#######################################################
############    COPYRIGHT - DATA SOCIETY   ############
#######################################################
#######################################################

## INTROTONEURALNETWORKS/2 BUILDINGNEURALNETWORKS/INTROTONEURALNETWORKS BUILDINGNEURALNETWORKS 1 ##

## NOTE: To run individual pieces of code, select the line of code and
##       press ctrl + enter for PCs or command + enter for Macs

In [4]:
# =================================================-
#### Slide 5: Directory settings  ####

# Set 'main_dir' to location of the project folder
from pathlib import Path

home_dir = Path(".").resolve()
main_dir = home_dir.parent.parent
print(main_dir)
data_dir = str(main_dir) + "/data"
print(data_dir)

/Users/maptv/maptv/IntrotoNeural_Networks
/Users/maptv/maptv/IntrotoNeural_Networks/data


In [4]:
from pathlib import Path
data_dir = Path.cwd().parent.parent / "data"

In [5]:
data_dir.exists()

True

In [6]:
# =================================================-
#### Slide 6: Loading packages  ####

# Helper packages.
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
from textwrap import wrap

# Scikit-learn package for building a perceptron.
from sklearn.neural_network import MLPClassifier

# Scikit-learn package for data preprocessing.
from sklearn.preprocessing import MinMaxScaler

# Model set up, tuning and model metrics packages.
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [7]:
# =================================================-
#### Slide 7: Load the data  ####

from pathlib import Path
data_dir = Path.cwd().parent.parent / "data"
credit_card = pd.read_csv(data_dir / "credit_card_data.csv")
credit_card.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default_payment_next_month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [8]:
# =================================================-
#### Slide 9: Data at first glance  ####

# The data types.
credit_card.dtypes

ID                              int64
LIMIT_BAL                       int64
SEX                             int64
EDUCATION                       int64
MARRIAGE                        int64
AGE                             int64
PAY_0                           int64
PAY_2                           int64
PAY_3                           int64
PAY_4                           int64
PAY_5                           int64
PAY_6                           int64
BILL_AMT1                     float64
BILL_AMT2                       int64
BILL_AMT3                       int64
BILL_AMT4                       int64
BILL_AMT5                       int64
BILL_AMT6                       int64
PAY_AMT1                        int64
PAY_AMT2                        int64
PAY_AMT3                        int64
PAY_AMT4                        int64
PAY_AMT5                        int64
PAY_AMT6                        int64
default_payment_next_month      int64
dtype: object

In [9]:
# =================================================-
#### Slide 10: Check for NAs in the dataset  ####

# Check for NAs.
credit_card.isna().sum()

ID                            0
LIMIT_BAL                     0
SEX                           0
EDUCATION                     0
MARRIAGE                      0
AGE                           0
PAY_0                         0
PAY_2                         0
PAY_3                         0
PAY_4                         0
PAY_5                         0
PAY_6                         0
BILL_AMT1                     1
BILL_AMT2                     0
BILL_AMT3                     0
BILL_AMT4                     0
BILL_AMT5                     0
BILL_AMT6                     0
PAY_AMT1                      0
PAY_AMT2                      0
PAY_AMT3                      0
PAY_AMT4                      0
PAY_AMT5                      0
PAY_AMT6                      0
default_payment_next_month    0
dtype: int64

In [11]:
# =================================================-
#### Slide 11: Using fillna() to handle missing values  ####

# Fill missing values with mean
credit_card = credit_card.fillna(credit_card.mean())
# Check for NAs in 'BILL_AMT1'.

In [12]:
credit_card.isna().sum()["BILL_AMT1"]

0

In [13]:
# Drop an unnecessary identifier column.
credit_card = credit_card.set_index("ID")
credit_card

Unnamed: 0_level_0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default_payment_next_month
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
2,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
3,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
4,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
5,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29996,220000,1,3,1,39,0,0,0,0,0,...,88004,31237,15980,8500,20000,5003,3047,5000,1000,0
29997,150000,1,3,2,43,-1,-1,-1,-1,0,...,8979,5190,0,1837,3526,8998,129,0,0,0
29998,30000,1,2,2,37,4,3,2,-1,0,...,20878,20582,19357,0,0,22000,4200,2000,3100,1
29999,80000,1,3,1,41,1,-1,0,0,0,...,52774,11855,48944,85900,3409,1178,1926,52964,1804,1


In [14]:
# =================================================-
#### Slide 15: Transform and replace categorical variables  ####

# Convert 'sex' into dummy variables.
credit_card = pd.get_dummies(
    credit_card,
    columns=["SEX", "EDUCATION", "MARRIAGE"],
    drop_first=True,
)

In [15]:
credit_card.loc[:,credit_card.columns.str.startswith("SEX")]

Unnamed: 0_level_0,SEX_2
ID,Unnamed: 1_level_1
1,True
2,True
3,True
4,True
5,False
...,...
29996,False
29997,False
29998,False
29999,False


In [17]:
credit_card.loc[:,credit_card.columns.str.startswith("SEX")].value_counts()

SEX_2
True     18112
False    11888
Name: count, dtype: int64

In [18]:

# Separate predictors from data.
X = credit_card.drop(["default_payment_next_month"], axis=1)

In [19]:
# =================================================-
#### Slide 16: Data prep: split  ####


# We do this so we can always have a data set to test against 
# this is the standard and always has to be done on data in Machine Learning 
# Shift TAB - look at documentation in JP
#
# its random and reproducable 
#
#
#


# random_state=1
# 42 - a known random number ?

# Separate target from data.
y = credit_card["default_payment_next_month"]
# Split data into train and test set, use a 70 - 30 split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

Train shape: (21000, 30) Test shape: (9000, 30)


In [20]:
ord("*")

42

In [21]:
!ls *

'ls' is not recognized as an internal or external command,
operable program or batch file.


In [None]:
# =================================================-
#### Slide 17: Data prep: scale with MinMaxScaler  ####

# Transforms each feature to a given range.
# The default is the range between 0 and 1.
min_max_scaler = preprocessing.MinMaxScaler()
X_train_scaled = min_max_scaler.fit_transform(X_train)
X_test_scaled = min_max_scaler.transform(X_test)


#######################################################
####  CONGRATULATIONS ON COMPLETING THIS MODULE!   ####
#######################################################

In [None]:
#######################################################
#######################################################
############    COPYRIGHT - DATA SOCIETY   ############
#######################################################
#######################################################

## INTROTONEURALNETWORKS/2 BUILDINGNEURALNETWORKS/INTROTONEURALNETWORKS BUILDINGNEURALNETWORKS 2 ##

## NOTE: To run individual pieces of code, select the line of code and
##       press ctrl + enter for PCs or command + enter for Macs

In [None]:
# =================================================-
#### Slide 2: Build the model architecture  ####

# Build neural network model
nn = MLPClassifier(
    hidden_layer_sizes=(64), random_state=1  # <- 64 neurons for hidden layer
)  # <- set seed to 1

In [None]:
# =================================================-
#### Slide 3: Fit the model to training data  ####

# Fit the saved model to your training data.
fit_nn = nn.fit(X_train_scaled, y_train)

In [None]:
# =================================================-
#### Slide 4: Inspect accuracy of training model  ####

# Compute accuracy using training data.
acc_train_nn = fit_nn.score(X_train_scaled, y_train)
print("Train Accuracy:", acc_train_nn)

In [None]:
# =================================================-
#### Slide 10: Predict on test data  ####

# Predict on test data.
predicted_values_nn = fit_nn.predict(X_test_scaled)
print(predicted_values_nn)

# Compute test model accuracy score.
test_accuracy_score = metrics.accuracy_score(y_test, predicted_values_nn)
print("Accuracy on test data: ", test_accuracy_score)

In [None]:
# =================================================-
#### Slide 11: Confusion matrix   ####

# Take a look at test data confusion matrix.
conf_matrix_test = metrics.confusion_matrix(y_test, predicted_values_nn)
print(conf_matrix_test)

In [None]:
# =================================================-
#### Slide 19: Classification report  ####

# Create a list of target names to interpret class assignments.
target_names = ["default_payment_0", "default_payment_1"]
# Print an entire classification report.
class_report = metrics.classification_report(
    y_test, predicted_values_nn, target_names=target_names
)
print(class_report)

In [None]:
# =================================================-
#### Slide 22: Getting probabilities instead of class labels  ####

# Get probabilities instead of predicted values.
test_probabilities = fit_nn.predict_proba(X_test_scaled)

# Get probabilities of test predictions only.
test_predictions = test_probabilities[:, 1]

In [None]:
# =================================================-
#### Slide 23: Computing FPR, TPR, and threshold  ####

# Get FPR, TPR, and threshold values.
fpr, tpr, threshold = metrics.roc_curve(
    y_test, test_predictions  # <- test data labels
)  # <- predicted probabilities
print("False positive: ", fpr)
print("True positive: ", tpr)
print("Threshold: ", threshold)

In [None]:
# =================================================-
#### Slide 24: Computing AUC  ####

# Get AUC by providing the FPR and TPR.
auc = metrics.auc(fpr, tpr)
print("Area under the ROC curve: ", auc)

In [None]:
# =================================================-
#### Slide 25: Putting it all together: ROC plot  ####

# Make an ROC curve plot.
plt.title("Receiver Operator Characteristic")
plt.plot(fpr, tpr, "b", label="AUC = %0.2f" % auc)
plt.legend(loc="lower right")
plt.plot([0, 1], [0, 1], "r--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show()

In [None]:
# =================================================-
#### Slide 26: Putting it all together: ROC plot (cont'd)  ####

# Make an ROC curve plot.
plt.title("Receiver Operator Characteristic")
plt.plot(fpr, tpr, "b", label="AUC = %0.2f" % auc)
plt.legend(loc="lower right")
plt.plot([0, 1], [0, 1], "r--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show()


#######################################################
####  CONGRATULATIONS ON COMPLETING THIS MODULE!   ####
#######################################################

In [None]:
#######################################################
#######################################################
############    COPYRIGHT - DATA SOCIETY   ############
#######################################################
#######################################################

## INTROTONEURALNETWORKS/2 BUILDINGNEURALNETWORKS/INTROTONEURALNETWORKS BUILDINGNEURALNETWORKS 3 ##

## NOTE: To run individual pieces of code, select the line of code and
##       press ctrl + enter for PCs or command + enter for Macs

In [None]:
# =================================================-
#### Slide 12: Fitting & visualizing training history: accuracy   ####

N_EPOCHS = 25  # <- number of epochs
N_CLASSES = np.unique(y_train)  # <- number of classes in the target variable

# Build neural network model by creating a classifier:
# add the number of hidden neurons in the 1st hidden layer and set random state.
mlp = MLPClassifier(hidden_layer_sizes=(64), random_state=1)

scores_train = []  # <- we will store scores for training history here
scores_test = []  # <- we will store scores for testing history here

epoch = 0  # <- set epoch count
while epoch < N_EPOCHS:
    mlp_fit = mlp.partial_fit(X_train_scaled, y_train, classes=N_CLASSES)

    # Compute score for train data.
    scores_train.append(mlp.score(X_train_scaled, y_train))

    # Compute score for test data.
    scores_test.append(mlp.score(X_test_scaled, y_test))
    epoch += 1  # <- increment the epoch

In [None]:
# =================================================-
#### Slide 13: Inspect model accuracy and loss  ####

plt.plot(scores_train, color="green", alpha=0.8, label="Train")
plt.plot(scores_test, color="magenta", alpha=0.8, label="Test")
plt.title("Accuracy over epochs", fontsize=14)
plt.xlabel("Epochs")
plt.legend(loc="upper left")
plt.show()
plt.plot(mlp.loss_curve_)
plt.title("Loss over epochs", fontsize=14)
plt.ylabel("Loss")
plt.xlabel("Epochs")
plt.show()

In [None]:
# =================================================-
#### Slide 14: Inspect model accuracy and loss (cont'd)  ####

plt.plot(scores_train, color="green", alpha=0.8, label="Train")
plt.plot(scores_test, color="magenta", alpha=0.8, label="Test")
plt.title("Accuracy over epochs", fontsize=14)
plt.xlabel("Epochs")
plt.legend(loc="upper left")
plt.show()

In [None]:
# =================================================-
#### Slide 17: Class-imbalanced dataset challenges  ####

print(credit_card["default_payment_next_month"].value_counts())

In [None]:
# =================================================-
#### Slide 20: Balancing target with SMOTE (cont'd)  ####

from imblearn.over_sampling import SMOTE

# Let's initialize SMOTE object.
smote = SMOTE()

# We can now fit the sampling method to our train data and labels.
X_train_sm, y_train_sm = smote.fit_resample(X_train_scaled, y_train)
print(y_train_sm.value_counts())

In [None]:
# =================================================-
#### Slide 23: Exercise  ####


#######################################################
####  CONGRATULATIONS ON COMPLETING THIS MODULE!   ####
#######################################################