# Student Loan Risk with Deep Learning

In [1]:
# Imports
import pandas as pd
from pathlib import Path
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

2023-08-29 11:26:18.084666: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


---

## Prepare the data to be used on a neural network model

### Step 1: Read the `student_loans.csv` file into a Pandas DataFrame. Review the DataFrame, looking for columns that could eventually define your features and target variables.   

In [2]:
# Read the csv into a Pandas DataFrame
file_path = "https://static.bc-edx.com/mbc/ai/m6/datasets/student_loans.csv"


# Review the DataFrame
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,payment_history,location_parameter,stem_degree_score,gpa_ranking,alumni_success,study_major_code,time_to_completion,finance_workshop_score,cohort_ranking,total_loan_score,financial_aid_score,credit_ranking
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
# Review the data types associated with the columns
df.dtypes

payment_history           float64
location_parameter        float64
stem_degree_score         float64
gpa_ranking               float64
alumni_success            float64
study_major_code          float64
time_to_completion        float64
finance_workshop_score    float64
cohort_ranking            float64
total_loan_score          float64
financial_aid_score       float64
credit_ranking              int64
dtype: object

### Step 2: Using the preprocessed data, create the features (`X`) and target (`y`) datasets. The target dataset should be defined by the preprocessed DataFrame column “credit_ranking”. The remaining columns should define the features dataset.

In [4]:
# Define the target set y using the credit_ranking column
y = df["credit_ranking"]

# Display a sample of y
y

0       5
1       5
2       5
3       6
4       5
       ..
1594    5
1595    6
1596    6
1597    5
1598    6
Name: credit_ranking, Length: 1599, dtype: int64

In [5]:
# Define features set X by selecting all columns but credit_ranking
X = df.drop(columns="credit_ranking")

# Review the features DataFrame
X.head()

Unnamed: 0,payment_history,location_parameter,stem_degree_score,gpa_ranking,alumni_success,study_major_code,time_to_completion,finance_workshop_score,cohort_ranking,total_loan_score,financial_aid_score
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


### Step 3: Split the features and target sets into training and testing datasets.


In [6]:
# Split the preprocessed data into a training and testing dataset
# Assign the function a random_state equal to 1
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


### Step 4: Use scikit-learn's `StandardScaler` to scale the features data.

In [7]:
# Create a StandardScaler instance
X_scaler = StandardScaler()

# Fit the scaler to the features training dataset
X_scaler.fit(X_train)

# Fit the scaler to the features training dataset
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

---

## Compile and Evaluate a Model Using a Neural Network

### Step 1: Create a deep neural network by assigning the number of input features, the number of layers, and the number of neurons on each layer using Tensorflow’s Keras.

> **Hint** You can start with a two-layer deep neural network model that uses the `relu` activation function for both layers.


In [8]:
# Define the the number of inputs (features) to the model
# two-deep neural network: input layer, hidden layer 1, hidden layer 2, 1 output layer
# there are 11 features and 1 output in the data
# inputs and hidden nodes
number_inputs = 11

# create a sequential neural network
# Review the number of features
X.shape[1]

11

In [9]:
# Define the number of neurons in the output layer
number_outputs = 1

In [10]:
# Define the number of hidden nodes for the first hidden layer
hidden_nodes_layer1 = 6


# Review the number hidden nodes in the first layer


In [11]:
# Define the number of hidden nodes for the second hidden layer
hidden_nodes_layer2 = 3

# Review the number hidden nodes in the second layer


In [12]:
# Create the Sequential model instance
nn = Sequential()

In [13]:
# Add the first hidden layer
nn.add(Dense(units=hidden_nodes_layer1, input_dim=number_inputs, activation="relu"))



In [14]:
# Add the second hidden layer
nn.add(Dense(units=hidden_nodes_layer2, activation="relu"))


In [15]:
# Add the output layer to the model specifying the number of output neurons and activation function
nn.add(Dense(units=number_outputs, activation="linear"))

In [16]:
# Display the Sequential model summary
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 6)                 72        
                                                                 
 dense_1 (Dense)             (None, 3)                 21        
                                                                 
 dense_2 (Dense)             (None, 1)                 4         
                                                                 
Total params: 97 (388.00 Byte)
Trainable params: 97 (388.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


### Step 2: Compile and fit the model using the `mse` loss function, the `adam` optimizer, and the `mse` evaluation metric.


In [17]:
# Compile the Sequential model
nn.compile(loss="mean_squared_error", optimizer="adam", metrics=["mse"])

In [18]:
# Fit the model using 100 epochs and the training data
student_loan_model = nn.fit(X_train_scaled, y_train, epochs=100)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

### Step 3: Evaluate the model using the test data to determine the model’s loss and accuracy.


In [19]:
# Evaluate the model loss and accuracy metrics using the evaluate method and the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)

# Display the model loss and accuracy results
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

13/13 - 0s - loss: 0.3966 - mse: 0.3966 - 133ms/epoch - 10ms/step
Loss: 0.39663153886795044, Accuracy: 0.39663153886795044


### Step 4: Save and export your model to an HDF5 file, and name the file `student_loans.h5`.


In [20]:
# Set the model's file path
file_path = Path("saved_models/student_loans.h5")


# Export your model to a HDF5 file
nn.save(file_path)

  saving_api.save_model(


---
## Predict Loan Repayment Success by Using your Neural Network Model

### Step 1: Reload your saved model.

In [21]:
# Set the model's file path
file_path = Path("./saved_models/student_loans.h5")


# Load the model to a new object
nn_imported = tf.keras.models.load_model(file_path)

### Step 2: Make predictions on the testing data.

In [22]:
# Make predictions on the testing data
predictions = (nn_imported.predict(X_test_scaled)).round().astype("int32")



In [23]:
predictions[:10]

array([[6],
       [5],
       [6],
       [6],
       [6],
       [6],
       [6],
       [5],
       [6],
       [5]], dtype=int32)

In [24]:
predictions.ravel()

array([6, 5, 6, 6, 6, 6, 6, 5, 6, 5, 6, 5, 5, 7, 6, 5, 6, 6, 6, 6, 6, 6,
       5, 7, 5, 6, 6, 5, 5, 6, 6, 6, 5, 6, 5, 6, 5, 5, 6, 5, 6, 5, 5, 6,
       5, 6, 5, 5, 5, 6, 5, 5, 6, 5, 5, 6, 6, 6, 6, 5, 6, 5, 5, 5, 5, 6,
       7, 6, 6, 6, 5, 6, 5, 6, 5, 5, 6, 5, 6, 6, 6, 5, 5, 5, 6, 5, 5, 6,
       5, 6, 5, 6, 5, 6, 6, 5, 6, 5, 4, 6, 6, 5, 5, 5, 5, 5, 6, 6, 6, 6,
       7, 6, 6, 6, 5, 7, 5, 5, 5, 5, 5, 7, 6, 4, 5, 6, 6, 5, 6, 6, 6, 5,
       6, 5, 5, 6, 5, 7, 5, 6, 5, 6, 5, 5, 6, 6, 6, 5, 6, 6, 6, 7, 5, 5,
       6, 6, 7, 6, 6, 6, 6, 6, 5, 5, 7, 6, 6, 5, 5, 6, 5, 6, 5, 5, 7, 6,
       5, 5, 6, 5, 6, 7, 6, 5, 6, 8, 6, 5, 5, 5, 6, 6, 6, 6, 6, 5, 6, 5,
       5, 5, 6, 6, 7, 5, 7, 6, 5, 5, 6, 6, 5, 6, 5, 6, 5, 5, 5, 5, 6, 5,
       6, 5, 5, 5, 6, 6, 5, 6, 7, 6, 5, 5, 6, 6, 5, 5, 6, 5, 6, 5, 6, 6,
       6, 5, 6, 6, 6, 5, 5, 5, 5, 5, 6, 6, 6, 5, 5, 5, 5, 6, 6, 5, 6, 6,
       5, 6, 6, 5, 5, 6, 5, 6, 5, 6, 5, 5, 5, 6, 5, 5, 7, 6, 5, 5, 5, 6,
       6, 5, 5, 6, 5, 6, 6, 6, 6, 6, 5, 6, 5, 6, 6,

### Step 3: Create a DataFrame to compare the predictions with the actual values.

In [25]:
# Create a DataFrame to compare the predictions with the actual values
results = pd.DataFrame({"predictions": predictions.ravel(), "actual": y_test})


### Step 4: Display a sample of the DataFrame you created in step 3.

In [26]:
# Display sample data
results.head(10)

Unnamed: 0,predictions,actual
75,6,5
1283,5,6
408,6,6
1281,6,6
1118,6,6
1143,6,6
1215,6,6
181,5,5
1186,6,5
1252,5,5


In [27]:
# out of curiosity, wondering if an addiational hidden layer would improve the model

number_inputs = 11
hidden_nodes_layer1 = 9
hidden_nodes_layer2 = 6
hidden_nodes_layer3 = 3

nn_2 = Sequential()

nn_2.add(Dense(units=hidden_nodes_layer1, input_dim=number_inputs, activation="relu"))
nn_2.add(Dense(units=hidden_nodes_layer2, activation="relu"))
nn_2.add(Dense(units=hidden_nodes_layer3, activation="relu"))
nn_2.add(Dense(units=1, activation="linear"))



In [28]:
nn_2.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 9)                 108       
                                                                 
 dense_4 (Dense)             (None, 6)                 60        
                                                                 
 dense_5 (Dense)             (None, 3)                 21        
                                                                 
 dense_6 (Dense)             (None, 1)                 4         
                                                                 
Total params: 193 (772.00 Byte)
Trainable params: 193 (772.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [29]:
nn_2.compile(loss="mean_squared_error", optimizer="adam", metrics=["mse"])

student_loan_model_2 = nn_2.fit(X_train_scaled, y_train, epochs=100)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [30]:
# Evaluate the model loss and accuracy metrics using the evaluate method and the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss_1: {model_loss}, Accuracy_1: {model_accuracy}")

model_loss, model_accuracy = nn_2.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss_2: {model_loss}, Accuracy_2: {model_accuracy}")

13/13 - 0s - loss: 0.3966 - mse: 0.3966 - 58ms/epoch - 4ms/step
Loss_1: 0.39663153886795044, Accuracy_1: 0.39663153886795044
13/13 - 0s - loss: 0.4075 - mse: 0.4075 - 141ms/epoch - 11ms/step
Loss_2: 0.40747082233428955, Accuracy_2: 0.40747082233428955


In [31]:
# When calculating the risk level, the prediction is rounded to the nearst whole integer
# assuming the 
predictions_raw = (nn_imported.predict(X_test_scaled))
# predictions = (nn_imported.predict(X_test_scaled)).round().astype("int32")

import numpy as np
mse_predictions = np.mean((predictions-predictions_raw)**2)
mse_predictions

# adding an additional hidden layer resulted in ≈6% reduction in MSE 
# This is less than the MSE introsuced by rounding
# stick to two layers




0.08823927017405025

In [32]:
predictions_raw

array([[5.8782415],
       [5.4037166],
       [6.1060066],
       [5.700898 ],
       [6.375184 ],
       [5.9687266],
       [6.232463 ],
       [4.6749005],
       [5.5602736],
       [5.0213375],
       [6.3546042],
       [5.3184166],
       [5.352652 ],
       [6.666741 ],
       [5.6693087],
       [5.3304296],
       [5.9123874],
       [5.508565 ],
       [5.5105805],
       [5.5044193],
       [5.873447 ],
       [5.7187448],
       [5.4088874],
       [7.182464 ],
       [5.390429 ],
       [6.2532887],
       [6.298467 ],
       [5.3977194],
       [5.41146  ],
       [6.342683 ],
       [5.6609025],
       [6.3738427],
       [5.490583 ],
       [6.2072377],
       [5.1481085],
       [5.574974 ],
       [5.121603 ],
       [5.4579525],
       [6.196579 ],
       [4.609047 ],
       [5.9961243],
       [4.775133 ],
       [5.0998898],
       [6.106289 ],
       [5.054347 ],
       [5.5962744],
       [5.264251 ],
       [5.119737 ],
       [5.437259 ],
       [6.3535247],
