# data prep

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("data/student.csv")

# delete broken data
df = df.dropna(how='any',axis=0)
df = df.drop("school", axis=1)
df

Unnamed: 0,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,F,18,U,GT3,A,4,4,at_home,teacher,course,...,4,3,4,1,1,3,6,5,6,6
1,F,17,U,GT3,T,1,1,at_home,other,course,...,5,3,3,1,1,3,4,5,5,6
2,F,15,U,LE3,T,1,1,at_home,other,other,...,4,3,2,2,3,3,10,7,8,10
3,F,15,U,GT3,T,4,2,health,services,home,...,3,2,2,1,1,5,2,15,14,15
4,F,16,U,GT3,T,3,3,other,other,home,...,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1039,F,19,R,GT3,T,2,3,services,other,course,...,5,4,2,1,2,5,4,10,11,10
1040,F,18,U,LE3,T,3,1,teacher,services,course,...,4,3,4,1,1,1,4,15,15,16
1041,F,18,U,GT3,T,1,1,other,other,course,...,1,1,1,1,1,5,6,11,12,9
1042,M,17,U,LE3,T,3,1,services,services,course,...,2,4,5,3,4,2,6,10,10,10


In [3]:
for i, _ in enumerate(df.dtypes):
    print(df.columns.values[i], _)

sex object
age int64
address object
famsize object
Pstatus object
Medu int64
Fedu int64
Mjob object
Fjob object
reason object
guardian object
traveltime int64
studytime int64
failures int64
schoolsup object
famsup object
paid object
activities object
nursery object
higher object
internet object
romantic object
famrel int64
freetime int64
goout int64
Dalc int64
Walc int64
health int64
absences int64
G1 int64
G2 int64
G3 int64


In [4]:
df.dtypes[0].name

  df.dtypes[0].name


'object'

In [5]:
# replace categorical data with discrete whole numbers
column_keys = df.columns.values
categorical_column_keys = []
categorical_unique_values = {}

for i, dt in enumerate(df.dtypes):
    if dt.name == "object":
        categorical_column_keys.append(df.columns.values[i])

for i in categorical_column_keys:
    unique = df[i].unique()
    categorical_unique_values[i] = unique
    df[i].replace(unique, np.arange(0, len(df[i].unique()), 1, dtype=int), inplace=True)

categorical_unique_values

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[i].replace(unique, np.arange(0, len(df[i].unique()), 1, dtype=int), inplace=True)
  df[i].replace(unique, np.arange(0, len(df[i].unique()), 1, dtype=int), inplace=True)


{'sex': array(['F', 'M'], dtype=object),
 'address': array(['U', 'R'], dtype=object),
 'famsize': array(['GT3', 'LE3'], dtype=object),
 'Pstatus': array(['A', 'T'], dtype=object),
 'Mjob': array(['at_home', 'health', 'other', 'services', 'teacher'], dtype=object),
 'Fjob': array(['teacher', 'other', 'services', 'health', 'at_home'], dtype=object),
 'reason': array(['course', 'other', 'home', 'reputation'], dtype=object),
 'guardian': array(['mother', 'father', 'other'], dtype=object),
 'schoolsup': array(['yes', 'no'], dtype=object),
 'famsup': array(['no', 'yes'], dtype=object),
 'paid': array(['no', 'yes'], dtype=object),
 'activities': array(['no', 'yes'], dtype=object),
 'nursery': array(['yes', 'no'], dtype=object),
 'higher': array(['yes', 'no'], dtype=object),
 'internet': array(['no', 'yes'], dtype=object),
 'romantic': array(['no', 'yes'], dtype=object)}

In [6]:
OUTPUT_KEYS = ["G1", "G2", "G3"]

X = df.drop(OUTPUT_KEYS, axis=1).values
y = df[OUTPUT_KEYS].values

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train

array([[ 1, 17,  0, ...,  3,  4,  6],
       [ 1, 18,  1, ...,  4,  3,  0],
       [ 0, 18,  0, ...,  1,  3,  0],
       ...,
       [ 0, 18,  0, ...,  1,  4,  2],
       [ 0, 17,  1, ...,  1,  1,  0],
       [ 1, 16,  0, ...,  3,  2, 10]])

In [8]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().set_output(transform="pandas")
scaled_X_train = scaler.fit_transform(X_train)

# model

In [9]:
import tensorflow as tf

model = tf.keras.Sequential([
    tf.keras.layers.Dense(units=64, activation="relu"),
    tf.keras.layers.Dense(units=32, activation="relu"),
    tf.keras.layers.Dense(units=16, activation="relu"),
    tf.keras.layers.Dense(units=3, activation="relu"),
])

model.compile(loss=tf.keras.losses.MeanSquaredError(),
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))

model.fit(X_test, y_test, epochs=100)

Epoch 1/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - loss: 114.5618
Epoch 2/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 90.1840 
Epoch 3/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 68.7322 
Epoch 4/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 42.7776
Epoch 5/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 24.8875 
Epoch 6/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 18.3596 
Epoch 7/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 15.6479
Epoch 8/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 11.9364
Epoch 9/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - loss: 11.7628
Epoch 10/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 11.3962

<keras.src.callbacks.history.History at 0x1c0cc100850>

In [10]:
model.metrics_names

['loss']

In [11]:
model.evaluate(X_train, y_train)

[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 10.1879


10.078645706176758

In [12]:
model.evaluate(X_test, y_test)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 6.8002 


6.954195499420166

In [13]:
X_test.shape

(209, 29)

In [14]:
X_test[0]

array([ 1, 17,  0,  0,  1,  2,  1,  2,  1,  2,  0,  1,  1,  0,  1,  1,  0,
        0,  0,  0,  1,  0,  5,  4,  5,  1,  2,  5, 22])

In [15]:
scaled_X_train.shape

(835, 29)

In [16]:
y_test_pred = model.predict(X_test)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step


In [24]:
INDEX = 0
print("INPUT")
for i, val in enumerate(X_test[INDEX]):
    key = column_keys[i]
    is_categorical = key in categorical_column_keys
    print(f"{key}: {categorical_unique_values[key][val] if is_categorical else val}")

print("\nOUTPUT")
for i, val in enumerate(y_test_pred):
    print(f"{OUTPUT_KEYS[i]} - pred: {y_test_pred[INDEX][i]} | real: {y_test[INDEX][i]}")

INPUT
sex: M
age: 17
address: U
famsize: GT3
Pstatus: T
Medu: 2
Fedu: 1
Mjob: other
Fjob: other
reason: home
guardian: mother
traveltime: 1
studytime: 1
failures: 0
schoolsup: no
famsup: yes
paid: no
activities: no
nursery: yes
higher: yes
internet: yes
romantic: no
famrel: 5
freetime: 4
goout: 5
Dalc: 1
Walc: 2
health: 5
absences: 22

OUTPUT
G1 - pred: 9.629240989685059 | real: 9
G2 - pred: 9.75959300994873 | real: 7
G3 - pred: 9.732418060302734 | real: 6


IndexError: list index out of range