In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostRegressor

In [2]:
df = pd.read_csv("../data/EffectsOnMathsStudy.csv")

In [3]:
X = df.drop(["G1", "G2", "G3", "Mjob", "Fjob", "reason", "guardian"], axis=1)
y = df["G3"].map(lambda val: val >= 10)

for column, v in [
    ("school", "GP"),
    ("sex", "M"),
    ("address", "U"),
    ("famsize", "GT3"),
    ("Pstatus", "T"),
]:
    X[column] = X[column].map(lambda val: val == v)
for column in [
    "schoolsup",
    "famsup",
    "paid",
    "activities",
    "nursery",
    "higher",
    "internet",
    "romantic",
]:
    X[column] = X[column].map(lambda val: val == "yes")

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.2, random_state=42)

In [4]:
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree_params = {
    "criterion": ["gini", "entropy", "log_loss"],
    "max_features": ["sqrt", "log2", None],
    "max_depth": [5, 6, 7, 8, 9, None],
}

grid_search = GridSearchCV(decision_tree, param_grid=decision_tree_params, cv=5)
grid_search.fit(X_train, y_train)

In [5]:
print(grid_search.best_params_)

y_predict = grid_search.predict(X_test)
print(accuracy_score(y_test, y_predict))
print(f1_score(y_test, y_predict))

{'criterion': 'gini', 'max_depth': 6, 'max_features': 'sqrt'}
0.5981012658227848
0.7093821510297483


In [6]:
random_forest = RandomForestClassifier(random_state=42)
random_forest_params = {
    "criterion": ["gini", "entropy", "log_loss"],
    "max_features": ["sqrt", "log2", None],
    "max_depth": [5, 6, 7, 8, 9, None],
}

grid_search = GridSearchCV(random_forest, param_grid=random_forest_params, cv=5)
grid_search.fit(X_train, y_train)

In [7]:
print(grid_search.best_params_)

y_predict = grid_search.predict(X_test)
print(accuracy_score(y_test, y_predict))
print(f1_score(y_test, y_predict))

{'criterion': 'gini', 'max_depth': 7, 'max_features': 'log2'}
0.7025316455696202
0.810483870967742


In [8]:
y_train_catboost = y_train.map(lambda val: 1 if val else 0)
y_test_catboost = y_test.map(lambda val: 1 if val else 0)

catboost = CatBoostRegressor(random_state=42)
catboost_params = {
    "depth": [6, 8, 10, 12],
}

grid_search = GridSearchCV(catboost, param_grid=catboost_params, cv=5)
grid_search.fit(X_train, y_train_catboost)

Learning rate set to 0.026453
0:	learn: 0.4781117	total: 146ms	remaining: 2m 25s
1:	learn: 0.4752689	total: 147ms	remaining: 1m 13s
2:	learn: 0.4719060	total: 148ms	remaining: 49.3s
3:	learn: 0.4706996	total: 150ms	remaining: 37.3s
4:	learn: 0.4682599	total: 151ms	remaining: 30s
5:	learn: 0.4660178	total: 152ms	remaining: 25.2s
6:	learn: 0.4630739	total: 153ms	remaining: 21.7s
7:	learn: 0.4598734	total: 154ms	remaining: 19.1s
8:	learn: 0.4573875	total: 155ms	remaining: 17.1s
9:	learn: 0.4550472	total: 157ms	remaining: 15.5s
10:	learn: 0.4537365	total: 158ms	remaining: 14.2s
11:	learn: 0.4512945	total: 159ms	remaining: 13.1s
12:	learn: 0.4489054	total: 160ms	remaining: 12.2s
13:	learn: 0.4465654	total: 161ms	remaining: 11.4s
14:	learn: 0.4453720	total: 163ms	remaining: 10.7s
15:	learn: 0.4427631	total: 164ms	remaining: 10.1s
16:	learn: 0.4399974	total: 165ms	remaining: 9.53s
17:	learn: 0.4376391	total: 166ms	remaining: 9.05s
18:	learn: 0.4356657	total: 167ms	remaining: 8.62s
19:	learn: 

In [9]:
print(grid_search.best_params_)

y_predict = pd.DataFrame(grid_search.predict(X_test)).map(lambda val: bool(round(val)))
print(accuracy_score(y_test, y_predict))
print(f1_score(y_test, y_predict))

{'depth': 10}
0.6645569620253164
0.79296875


In [10]:
logreg = LogisticRegression()
logreg_params = {
    "solver": ["lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"],
    "max_iter": [500, 1000, 10000],
}

grid_search = GridSearchCV(logreg, param_grid=logreg_params, cv=5)
grid_search.fit(X_train, y_train)



In [11]:
print(grid_search.best_params_)

y_predict = grid_search.predict(X_test)
print(accuracy_score(y_test, y_predict))
print(f1_score(y_test, y_predict))

{'max_iter': 500, 'solver': 'saga'}
0.680379746835443
0.7780219780219781
