# Student Loan Risk with Deep Learning

In [1]:
# Imports
from pathlib import Path

import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model

---

## Prepare the data to be used on a neural network model

### Step 1: Read the `student_loans.csv` file into a Pandas DataFrame. Review the DataFrame, looking for columns that could eventually define your features and target variables.   

In [2]:
# Read the csv into a Pandas DataFrame
file_path = "https://static.bc-edx.com/mbc/ai/m6/datasets/student_loans.csv"
df = pd.read_csv(file_path)

# Review the DataFrame
# Display the first few rows of the DataFrame
print("First few rows of the DataFrame:")
print(df.head())

# Display information about the DataFrame.
# df.info() writes its report straight to stdout and returns None, so it is
# called on its own; wrapping it in print() would append a stray "None" line.
print("\nDataFrame Information:")
df.info()

# Display statistical summaries of the DataFrame
print("\nDataFrame Statistical Summary:")
print(df.describe())

First few rows of the DataFrame:
   payment_history  location_parameter  stem_degree_score  gpa_ranking  \
0              7.4                0.70               0.00          1.9   
1              7.8                0.88               0.00          2.6   
2              7.8                0.76               0.04          2.3   
3             11.2                0.28               0.56          1.9   
4              7.4                0.70               0.00          1.9   

   alumni_success  study_major_code  time_to_completion  \
0           0.076              11.0                34.0   
1           0.098              25.0                67.0   
2           0.092              15.0                54.0   
3           0.075              17.0                60.0   
4           0.076              11.0                34.0   

   finance_workshop_score  cohort_ranking  total_loan_score  \
0                  0.9978            3.51              0.56   
1                  0.9968            3.20

In [4]:
# Show the dtype pandas assigned to each column
column_dtypes = df.dtypes
print(column_dtypes)


payment_history           float64
location_parameter        float64
stem_degree_score         float64
gpa_ranking               float64
alumni_success            float64
study_major_code          float64
time_to_completion        float64
finance_workshop_score    float64
cohort_ranking            float64
total_loan_score          float64
financial_aid_score       float64
credit_ranking              int64
dtype: object


### Step 2: Using the preprocessed data, create the features (`X`) and target (`y`) datasets. The target dataset should be defined by the preprocessed DataFrame column “credit_ranking”. The remaining columns should define the features dataset.

In [5]:
# Build the target vector y from the credit_ranking column
target_column = 'credit_ranking'
y = df[target_column].values

# Show the first five target values
first_targets = y[:5]
print(first_targets)


[5 5 5 6 5]


In [6]:
# Build the features set X: every column of df except credit_ranking
X = df.drop('credit_ranking', axis=1)

# Preview the first rows of the features DataFrame
features_preview = X.head()
print(features_preview)


   payment_history  location_parameter  stem_degree_score  gpa_ranking  \
0              7.4                0.70               0.00          1.9   
1              7.8                0.88               0.00          2.6   
2              7.8                0.76               0.04          2.3   
3             11.2                0.28               0.56          1.9   
4              7.4                0.70               0.00          1.9   

   alumni_success  study_major_code  time_to_completion  \
0           0.076              11.0                34.0   
1           0.098              25.0                67.0   
2           0.092              15.0                54.0   
3           0.075              17.0                60.0   
4           0.076              11.0                34.0   

   finance_workshop_score  cohort_ranking  total_loan_score  \
0                  0.9978            3.51              0.56   
1                  0.9968            3.20              0.68   
2          

### Step 3: Split the features and target sets into training and testing datasets.


In [7]:
# Split the features and target into training and testing datasets.
# random_state=1 fixes the shuffle so the split is repeatable.
# (train_test_split is already imported in the notebook's first cell.)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Optionally, you can review the shapes of the datasets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (1199, 11)
X_test shape: (400, 11)
y_train shape: (1199,)
y_test shape: (400,)


### Step 4: Use scikit-learn's `StandardScaler` to scale the features data.

In [9]:
# Create a StandardScaler instance
# (StandardScaler is already imported in the notebook's first cell.)
scaler = StandardScaler()

# Fit the scaler on the training features only, so the testing data does not
# influence the learned scaling parameters
scaler.fit(X_train)

# Scale the features training dataset
X_train_scaled = scaler.transform(X_train)

# Scale the features testing dataset with the scaler fitted on the training data
X_test_scaled = scaler.transform(X_test)

# Review the scaled data
print("Scaled X_train sample:\n", X_train_scaled[:5])
print("Scaled X_test sample:\n", X_test_scaled[:5])

Scaled X_train sample:
 [[-0.56794375 -1.18088385 -0.19241296 -0.17192598 -0.45737585  1.86853316
   0.7479316  -0.45120802  0.73249574 -0.22553894 -0.31915735]
 [-0.00940983  0.80065564 -0.04123135 -0.32236513 -0.0699402  -0.45384715
  -0.08558383  0.44779905 -0.03016065 -0.75844691 -1.15783431]
 [ 2.33643265 -0.84597577  1.67216021  0.05373275 -0.13451281  1.48146978
   0.80967349  1.49315611 -1.36480934  0.42579302  0.05358797]
 [ 0.21400374  1.30301776 -0.24280683 -0.17192598 -0.50042426 -0.64737885
  -0.11645477 -0.23168303 -1.36480934 -0.64002292 -0.97146166]
 [-0.00940983  1.74956187 -1.30107808 -0.24714555 -0.37127904 -1.03444223
  -0.98084114 -0.02261162  0.03339405 -0.46238693  0.51951962]]
Scaled X_test sample:
 [[ 0.26985713 -0.67852173  1.87373569 -0.24714555  0.12377763 -0.64737885
  -0.11645477  0.97047758  1.43159743  0.01130905  0.05358797]
 [ 0.21400374  0.54947458  0.05955639  0.12895233  0.18835024  0.12674792
   0.71706066  0.31190264 -0.34793415 -0.16632694 -0.225

---

## Compile and Evaluate a Model Using a Neural Network

### Step 1: Create a deep neural network by assigning the number of input features, the number of layers, and the number of neurons on each layer using TensorFlow’s Keras.

> **Hint** You can start with a two-layer deep neural network model that uses the `relu` activation function for both layers.


In [10]:
# The model takes one input per feature column of the scaled training matrix
# (the last axis of the 2-D array produced by the scaler)
number_input_features = X_train_scaled.shape[-1]

# Show how many features the model will receive
print(number_input_features)


11


In [11]:
# Define the number of neurons in the output layer
# (a single neuron: the model emits one numeric value per input row)
number_output_neurons = 1


In [13]:
# Size the first hidden layer as half of (inputs + outputs), rounded down
total_io_nodes = number_input_features + number_output_neurons
hidden_nodes_layer1 = total_io_nodes // 2

# Review the number of hidden nodes in the first layer
print(hidden_nodes_layer1)


6


In [15]:
# Size the second hidden layer as half of the first hidden layer, rounded down
halving_factor = 2
hidden_nodes_layer2 = hidden_nodes_layer1 // halving_factor

# Review the number of hidden nodes in the second layer
print(hidden_nodes_layer2)


3


In [16]:
# Seed the Python, NumPy and TensorFlow random number generators so that weight
# initialization and batch shuffling repeat on every Restart & Run All
# (same seed value as the random_state used for the train/test split)
tf.keras.utils.set_random_seed(1)

# Create the Sequential model instance
model = Sequential()


In [17]:
# Build the first hidden layer (it also declares the model's input width),
# then append it to the model
first_hidden_layer = Dense(
    units=hidden_nodes_layer1,
    input_dim=number_input_features,
    activation='relu',
)
model.add(first_hidden_layer)


In [18]:
# Build the second hidden layer with relu activation, then append it to the model
second_hidden_layer = Dense(
    units=hidden_nodes_layer2,
    activation='relu',
)
model.add(second_hidden_layer)


In [19]:
# Add the output layer to the model, sized by number_output_neurons (defined
# earlier in the notebook) rather than a hard-coded literal, with a linear
# activation so the raw numeric value is emitted
model.add(Dense(units=number_output_neurons, activation='linear'))


In [20]:
# Display the Sequential model summary (layers, output shapes and parameter counts)
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 6)                 72        
                                                                 
 dense_1 (Dense)             (None, 3)                 21        
                                                                 
 dense_2 (Dense)             (None, 1)                 4         
                                                                 
Total params: 97 (388.00 Byte)
Trainable params: 97 (388.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


### Step 2: Compile and fit the model using the `mse` loss function, the `adam` optimizer, and the `mse` evaluation metric.


In [21]:
# Configure training: adam optimizer, mean squared error as the loss,
# and mean squared error again as the reported metric
model.compile(
    optimizer="adam",
    loss="mse",
    metrics=["mse"],
)


In [22]:
# Train on the scaled training features and training targets for 50 epochs
model.fit(
    x=X_train_scaled,
    y=y_train,
    epochs=50,
)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7e29d8bb7b80>

### Step 3: Evaluate the model using the test data to determine the model’s loss and its `mse` evaluation metric (the model is compiled with `mse` rather than accuracy).


In [23]:
# Evaluate the model on the test data.
# evaluate() returns the loss followed by each compiled metric; the model was
# compiled with loss="mse" and metrics=["mse"], so both values are mean squared
# errors and no accuracy value is available.
model_loss, model_mse = model.evaluate(X_test_scaled, y_test, verbose=2)

# Display the model loss and MSE metric results
print(f"Loss: {model_loss}, MSE: {model_mse}")


13/13 - 0s - loss: 15.3866 - mse: 15.3866 - 166ms/epoch - 13ms/step
Loss: 15.386613845825195, Accuracy: 15.386613845825195


### Step 4: Save and export your model to an HDF5 file, and name the file `student_loans.h5`.


In [24]:
# Destination file for the trained model (the .h5 extension selects HDF5)
file_path = "student_loans.h5"

# Write the model to an HDF5 file at that path
model.save(filepath=file_path)


  saving_api.save_model(


---
## Predict Loan Repayment Success by Using your Neural Network Model

### Step 1: Reload your saved model.

In [26]:
# Set the model's file path
file_path = "student_loans.h5"

# Load the model to a new object
# (load_model is imported with the other dependencies in the notebook's first cell)
nn_imported = load_model(file_path)


### Step 2: Make predictions on the testing data.

In [27]:
# Run the reloaded model over the scaled testing features
predictions = nn_imported.predict(x=X_test_scaled)




### Step 3: Create a DataFrame to compare the predictions with the actual values.

In [28]:
# Pair each actual target with the model's prediction for the same row;
# predictions is flattened to one dimension so it can serve as a column
comparison_columns = {"Actual": y_test, "Predicted": predictions.ravel()}
results_df = pd.DataFrame(comparison_columns)


### Step 4: Display a sample of the DataFrame you created in step 3.

In [29]:
# Show the first five actual/predicted pairs
# NOTE(review): in the recorded run every predicted value in this sample was
# identical while the actual values varied — confirm whether the network
# collapsed to a constant output before relying on these predictions.
results_df.head(n=5)


Unnamed: 0,Actual,Predicted
0,5,1.758364
1,6,1.758364
2,6,1.758364
3,6,1.758364
4,6,1.758364
