# data prep

In [52]:
import pandas as pd
import numpy as np

In [53]:
df = pd.read_csv("data/student.csv")

# delete broken data
df = df.dropna(how='any',axis=0)
df = df.drop(["school", "G1", "G2"], axis=1)
df

Unnamed: 0,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G3
0,F,18,U,GT3,A,4,4,at_home,teacher,course,...,no,no,4,3,4,1,1,3,6,6
1,F,17,U,GT3,T,1,1,at_home,other,course,...,yes,no,5,3,3,1,1,3,4,6
2,F,15,U,LE3,T,1,1,at_home,other,other,...,yes,no,4,3,2,2,3,3,10,10
3,F,15,U,GT3,T,4,2,health,services,home,...,yes,yes,3,2,2,1,1,5,2,15
4,F,16,U,GT3,T,3,3,other,other,home,...,no,no,4,3,2,1,2,5,4,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1039,F,19,R,GT3,T,2,3,services,other,course,...,yes,no,5,4,2,1,2,5,4,10
1040,F,18,U,LE3,T,3,1,teacher,services,course,...,yes,no,4,3,4,1,1,1,4,16
1041,F,18,U,GT3,T,1,1,other,other,course,...,no,no,1,1,1,1,1,5,6,9
1042,M,17,U,LE3,T,3,1,services,services,course,...,yes,no,2,4,5,3,4,2,6,10


In [54]:
for i, _ in enumerate(df.dtypes):
    print(df.columns.values[i], _)

sex object
age int64
address object
famsize object
Pstatus object
Medu int64
Fedu int64
Mjob object
Fjob object
reason object
guardian object
traveltime int64
studytime int64
failures int64
schoolsup object
famsup object
paid object
activities object
nursery object
higher object
internet object
romantic object
famrel int64
freetime int64
goout int64
Dalc int64
Walc int64
health int64
absences int64
G3 int64


In [55]:
df.dtypes[0].name

  df.dtypes[0].name


'object'

In [56]:
# replace categorical data with discrete whole numbers
column_keys = df.columns.values
categorical_column_keys = []
categorical_unique_values = {}

for i, dt in enumerate(df.dtypes):
    if dt.name == "object":
        categorical_column_keys.append(df.columns.values[i])

for i in categorical_column_keys:
    unique = df[i].unique()
    categorical_unique_values[i] = unique
    df[i].replace(unique, np.arange(0, len(df[i].unique()), 1, dtype=int), inplace=True)

categorical_unique_values

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[i].replace(unique, np.arange(0, len(df[i].unique()), 1, dtype=int), inplace=True)
  df[i].replace(unique, np.arange(0, len(df[i].unique()), 1, dtype=int), inplace=True)


{'sex': array(['F', 'M'], dtype=object),
 'address': array(['U', 'R'], dtype=object),
 'famsize': array(['GT3', 'LE3'], dtype=object),
 'Pstatus': array(['A', 'T'], dtype=object),
 'Mjob': array(['at_home', 'health', 'other', 'services', 'teacher'], dtype=object),
 'Fjob': array(['teacher', 'other', 'services', 'health', 'at_home'], dtype=object),
 'reason': array(['course', 'other', 'home', 'reputation'], dtype=object),
 'guardian': array(['mother', 'father', 'other'], dtype=object),
 'schoolsup': array(['yes', 'no'], dtype=object),
 'famsup': array(['no', 'yes'], dtype=object),
 'paid': array(['no', 'yes'], dtype=object),
 'activities': array(['no', 'yes'], dtype=object),
 'nursery': array(['yes', 'no'], dtype=object),
 'higher': array(['yes', 'no'], dtype=object),
 'internet': array(['no', 'yes'], dtype=object),
 'romantic': array(['no', 'yes'], dtype=object)}

In [57]:
OUTPUT_KEYS = ["G3"]

X = df.drop(OUTPUT_KEYS, axis=1).values
y = df[OUTPUT_KEYS].values

In [58]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train

array([[ 1, 17,  0, ...,  3,  4,  6],
       [ 1, 18,  1, ...,  4,  3,  0],
       [ 0, 18,  0, ...,  1,  3,  0],
       ...,
       [ 0, 18,  0, ...,  1,  4,  2],
       [ 0, 17,  1, ...,  1,  1,  0],
       [ 1, 16,  0, ...,  3,  2, 10]])

In [59]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().set_output(transform="pandas")
scaled_X_train = scaler.fit_transform(X_train)

# training approach 1: neural networks
- for non-linear relationships between grades and student profile

In [None]:
import tensorflow as tf

model = tf.keras.Sequential([
    tf.keras.layers.Dense(units=64, activation="relu"),
    tf.keras.layers.Dense(units=32, activation="relu"),
    tf.keras.layers.Dense(units=16, activation="relu"),
    tf.keras.layers.Dense(units=len(OUTPUT_KEYS), activation="relu"),
])

model.compile(loss=tf.keras.losses.MeanSquaredError(),
              metrics=[tf.keras.metrics.MeanAbsoluteError()],
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))

model.fit(X_test, y_test, epochs=100)

Epoch 1/100


main metric: mean absolute error, to see how far, on average, is the model away from the actual points

In [None]:
model.metrics_names

['loss', 'compile_metrics']

In [None]:
model.evaluate(X_train, y_train)

[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 144.8920 - mean_absolute_error: 11.3498


[144.32688903808594, 11.34600830078125]

In [None]:
model.evaluate(X_test, y_test)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 139.5021 - mean_absolute_error: 11.2766 


[140.34449768066406, 11.320573806762695]

In [None]:
X_test.shape

(209, 29)

In [None]:
X_test[0]

array([ 1, 17,  0,  0,  1,  2,  1,  2,  1,  2,  0,  1,  1,  0,  1,  1,  0,
        0,  0,  0,  1,  0,  5,  4,  5,  1,  2,  5, 22])

In [None]:
scaled_X_train.shape

(835, 29)

# inspection

In [None]:
y_test_pred = model.predict(X_test)
y_test_pred

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step


array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],

In [None]:
def view_profile(INDEX: int, y_pred) -> None:
    print("INPUT")
    for i, val in enumerate(X_test[INDEX]):
        key = column_keys[i]
        is_categorical = key in categorical_column_keys
        print(f"{key}: {categorical_unique_values[key][val] if is_categorical else val}")

    print("\nOUTPUT")
    for i, val in enumerate(y_pred[INDEX]):
        print(f"{OUTPUT_KEYS[i]} - pred: {y_pred[INDEX][i]} | real: {y_test[INDEX][i]}")

view_profile(5, y_test_pred)

INPUT
sex: M
age: 18
address: U
famsize: GT3
Pstatus: T
Medu: 2
Fedu: 2
Mjob: services
Fjob: other
reason: home
guardian: mother
traveltime: 1
studytime: 2
failures: 0
schoolsup: no
famsup: yes
paid: no
activities: yes
nursery: yes
higher: yes
internet: yes
romantic: no
famrel: 4
freetime: 4
goout: 4
Dalc: 2
Walc: 4
health: 5
absences: 10

OUTPUT
G3 - pred: 0.0 | real: 11


# training approach 1.1: neural networks with regularization

In [None]:
lambda_ = 0.01

model = tf.keras.Sequential([
    tf.keras.layers.Dense(units=64, activation="relu", kernel_regularizer=tf.keras.regularizers.L2(lambda_)),
    tf.keras.layers.Dense(units=32, activation="relu", kernel_regularizer=tf.keras.regularizers.L2(lambda_)),
    tf.keras.layers.Dense(units=16, activation="relu", kernel_regularizer=tf.keras.regularizers.L2(lambda_)),
    tf.keras.layers.Dense(units=len(OUTPUT_KEYS), activation="relu"),
])

model.compile(loss=tf.keras.losses.MeanSquaredError(),
              metrics=[tf.keras.metrics.MeanAbsoluteError()],
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))

model.fit(X_test, y_test, epochs=100)

Epoch 1/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - loss: 136.9935 - mean_absolute_error: 11.0679
Epoch 2/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 127.2485 - mean_absolute_error: 10.7112 
Epoch 3/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 95.8326 - mean_absolute_error: 9.2854  
Epoch 4/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 62.2129 - mean_absolute_error: 7.2668 
Epoch 5/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 34.4819 - mean_absolute_error: 5.0534 
Epoch 6/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 18.0533 - mean_absolute_error: 3.2563 
Epoch 7/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 15.1796 - mean_absolute_error: 2.8466 
Epoch 8/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - l

<keras.src.callbacks.history.History at 0x23a0f8f3f50>

In [None]:
model.evaluate(X_test, y_test)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 8.8363 - mean_absolute_error: 2.1092 


[8.76176929473877, 2.0523521900177]

doesn't improve much

# training approach 2: simple regression model

In [None]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)
y_pred

array([[11.68824674],
       [ 4.31667521],
       [12.90042249],
       [12.33850483],
       [10.73940779],
       [12.11968825],
       [ 9.24529133],
       [13.6685284 ],
       [11.45252795],
       [11.54370723],
       [10.62137355],
       [ 9.35612457],
       [11.45619536],
       [12.79628304],
       [11.95006801],
       [ 9.79223643],
       [13.1952825 ],
       [12.27550305],
       [12.15885436],
       [10.44520886],
       [12.25849358],
       [11.85704277],
       [ 9.41359287],
       [13.36784453],
       [ 9.79557792],
       [ 8.30859093],
       [ 9.99295921],
       [12.59509029],
       [12.55556084],
       [13.68645216],
       [11.07883085],
       [11.02006388],
       [13.20582591],
       [ 9.08828659],
       [ 9.31169301],
       [10.70425571],
       [12.87326483],
       [11.40740762],
       [12.95507886],
       [10.66183945],
       [13.66921017],
       [12.56480796],
       [11.90215289],
       [10.46693057],
       [10.92321734],
       [ 8

In [None]:
# eval
print("avg absolute error:", np.average(np.absolute(y_pred-y_test)))

avg absolute error: 2.3375930090730543


In [None]:
view_profile(69, y_pred)

INPUT
sex: F
age: 16
address: U
famsize: GT3
Pstatus: T
Medu: 3
Fedu: 2
Mjob: other
Fjob: other
reason: reputation
guardian: mother
traveltime: 1
studytime: 2
failures: 0
schoolsup: no
famsup: yes
paid: no
activities: no
nursery: yes
higher: yes
internet: yes
romantic: no
famrel: 1
freetime: 2
goout: 2
Dalc: 1
Walc: 2
health: 1
absences: 8

OUTPUT
G3 - pred: 12.946077548616357 | real: 16
