### School Dropout - Tensor ML

### Import Relevant Libraries

In [48]:
import numpy as np
import tensorflow as tf
import pandas as pd

### Data

In [23]:
# let's create a temporary variable npz to store the three data
npz = np.load('school_dropout_data_train.npz', allow_pickle=True)

# we extract the inputs using the keyword under which we saved them
# to ensure that they are all floats, let's also take care of that
train_inputs = npz['inputs'].astype(float)
train_targets = npz['targets'].astype(int)

npz = np.load('school_dropout_data_validation.npz', allow_pickle=True)
validation_inputs, validation_targets = npz['inputs'].astype(float), npz['targets'].astype(int)

npz = np.load('school_dropout_data_test.npz', allow_pickle=True)
test_inputs, test_targets = npz['inputs'].astype(float), npz['targets'].astype(int)

### Model

In [26]:
input_size = 30
output_size = 2

hidden_layer_size = 100

#define how the model will look like
model = tf.keras.Sequential([
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'), # 1st hidden layer
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'), # 2nd hidden layer
    tf.keras.layers.Dense(output_size, activation='softmax') # output layer
])

# Choose optimizer and loss function
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

### Training
# Set the batch size
batch_size = 100

#Set a maximum number of training epochs
max_epochs = 100

#Set an early stopping mechanism
early_stopping = tf.keras.callbacks.EarlyStopping(patience=2)

# fit the model
model.fit(
    train_inputs,
    train_targets,
    batch_size=batch_size,
    epochs=max_epochs,
    callbacks=[early_stopping],
    validation_data=(validation_inputs, validation_targets),
    verbose=2
    
)

Epoch 1/100
22/22 - 1s - 23ms/step - accuracy: 0.7677 - loss: 0.5260 - val_accuracy: 0.8526 - val_loss: 0.3813
Epoch 2/100
22/22 - 0s - 2ms/step - accuracy: 0.8421 - loss: 0.3667 - val_accuracy: 0.8691 - val_loss: 0.3307
Epoch 3/100
22/22 - 0s - 2ms/step - accuracy: 0.8609 - loss: 0.3305 - val_accuracy: 0.8788 - val_loss: 0.3179
Epoch 4/100
22/22 - 0s - 2ms/step - accuracy: 0.8659 - loss: 0.3110 - val_accuracy: 0.8581 - val_loss: 0.3260
Epoch 5/100
22/22 - 0s - 2ms/step - accuracy: 0.8719 - loss: 0.2984 - val_accuracy: 0.8747 - val_loss: 0.3121
Epoch 6/100
22/22 - 0s - 2ms/step - accuracy: 0.8779 - loss: 0.2887 - val_accuracy: 0.8802 - val_loss: 0.3130
Epoch 7/100
22/22 - 0s - 2ms/step - accuracy: 0.8926 - loss: 0.2761 - val_accuracy: 0.8719 - val_loss: 0.3170


<keras.src.callbacks.history.History at 0x150c12620>

### Test the model

In [29]:
test_loss, test_accuracy = model.evaluate(test_inputs, test_targets)

[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 846us/step - accuracy: 0.8838 - loss: 0.2984


In [31]:
print('\nTest loss: {0:.2f}. Test accuracy: {1:.2f}%'.format(test_loss, test_accuracy*100.))


Test loss: 0.30. Test accuracy: 88.84%


### Make Predictions

In [34]:
predictions = model.predict(test_inputs)

[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  


In [65]:
# Change the probabilities to classes
predicted_classes = (predictions > 0.5).astype(int)
predicted_labels = predicted_classes.argmax(axis=1)

### Reconstructing the DataFrame

In [57]:
import joblib

# Load the saved scaler
scaler = joblib.load('scaler.pkl')

# Inverse transform to get the original test input values
original_test_inputs = scaler.inverse_transform(test_inputs)

In [59]:
column_names = ['Marital Status', 'Daytime/evening attendance',
       'Previous qualification (grade)', 'Admission grade', 'Displaced',
       'Tuition fees up to date', 'Gender', 'Scholarship holder',
       'Age at enrollment', 'Curricular units 1st sem (grade)',
       'Curricular units 2nd sem (grade)', 'Course_171', 'Course_8014',
       'Course_9003', 'Course_9070', 'Course_9085', 'Course_9119',
       'Course_9130', 'Course_9147', 'Course_9238', 'Course_9254',
       'Course_9500', 'Course_9556', 'Course_9670', 'Course_9773',
       'Course_9853', 'Course_9991', 'Unemployment rate', 'Inflation rate',
       'GDP']

test_df = pd.DataFrame(original_test_inputs, columns=column_names)
test_df

Unnamed: 0,Marital Status,Daytime/evening attendance,Previous qualification (grade),Admission grade,Displaced,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,Curricular units 1st sem (grade),...,Course_9254,Course_9500,Course_9556,Course_9670,Course_9773,Course_9853,Course_9991,Unemployment rate,Inflation rate,GDP
0,1.0,1.0,133.1,100.0,0.0,1.0,0.0,0.0,37.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,-6.938894e-18,13.9,-0.3,0.79
1,1.0,1.0,120.0,117.2,0.0,1.0,1.0,0.0,21.0,12.000000,...,1.0,0.0,0.0,0.0,0.0,0.0,-6.938894e-18,15.5,2.8,-4.06
2,4.0,1.0,120.0,110.0,0.0,1.0,0.0,0.0,34.0,11.250000,...,0.0,0.0,0.0,0.0,0.0,0.0,-6.938894e-18,13.9,-0.3,0.79
3,1.0,1.0,133.0,122.2,0.0,1.0,0.0,0.0,20.0,12.783333,...,0.0,1.0,0.0,0.0,0.0,0.0,-6.938894e-18,13.9,-0.3,0.79
4,2.0,0.0,120.0,168.2,0.0,1.0,1.0,0.0,45.0,14.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.000000e+00,12.4,0.5,1.79
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
721,1.0,1.0,124.0,123.3,1.0,1.0,0.0,1.0,20.0,14.901429,...,0.0,1.0,0.0,0.0,0.0,0.0,-6.938894e-18,9.4,-0.8,-3.12
722,1.0,1.0,133.1,131.3,1.0,1.0,0.0,1.0,18.0,12.285714,...,0.0,0.0,0.0,0.0,1.0,0.0,-6.938894e-18,8.9,1.4,3.51
723,2.0,1.0,130.0,117.9,0.0,1.0,0.0,0.0,35.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,-6.938894e-18,10.8,1.4,1.74
724,1.0,1.0,120.0,134.4,0.0,1.0,1.0,0.0,24.0,15.714286,...,0.0,0.0,0.0,0.0,0.0,0.0,-6.938894e-18,15.5,2.8,-4.06


In [67]:
test_df['Predicted'] = predicted_labels

test_df.to_csv('school_dropout_predictions_original_data.csv', index=False)