In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostRegressor

In [6]:
wine_quality = pd.read_csv("data/WineQT.csv")
effects_on_maths_study = pd.read_csv("data/EffectsOnMathsStudy.csv")
drinkers_body_signals = pd.read_csv("data/DrinkersBodySignals.csv")

In [3]:
# Wine Quality
x = wine_quality.drop(["Id", "quality"], axis=1)
y = wine_quality["quality"].map(lambda val: val > 5)

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.2, random_state=42)

In [4]:
# Alcohol Effects on Study
x = effects_on_maths_study.drop(["G1", "G2", "G3", "Mjob", "Fjob", "reason", "guardian"], axis=1)
y = effects_on_maths_study["G3"].map(lambda val: val >= 10)

for column, v in [
    ("school", "GP"),
    ("sex", "M"),
    ("address", "U"),
    ("famsize", "GT3"),
    ("Pstatus", "T"),
]:
    x[column] = x[column].map(lambda val: val == v)
for column in [
    "schoolsup",
    "famsup",
    "paid",
    "activities",
    "nursery",
    "higher",
    "internet",
    "romantic",
]:
    x[column] = x[column].map(lambda val: val == "yes")

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.2, random_state=42)

In [7]:
# Drinkers body signals
x = drinkers_body_signals.drop(["DRK_YN", "height", "weight"], axis=1)
y = drinkers_body_signals["DRK_YN"].map(lambda val: val == "Y")

x["bmi"] = (drinkers_body_signals["weight"].to_numpy() * 10000) / np.square(drinkers_body_signals["height"].to_numpy())
x["sex"] = x["sex"].map(lambda val: val == "Male")

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.2, random_state=42)

In [8]:
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree_params = {
    "criterion": ["gini", "entropy", "log_loss"],
    "max_features": ["sqrt", "log2", None],
    "max_depth": [5, 6, 7, 8, 9, None],
}

grid_search = GridSearchCV(decision_tree, param_grid=decision_tree_params, cv=5)
grid_search.fit(x_train, y_train)

In [9]:
print(grid_search.best_params_)

y_predict = grid_search.predict(x_test)
print(accuracy_score(y_test, y_predict))
print(f1_score(y_test, y_predict))

{'criterion': 'gini', 'max_depth': 7, 'max_features': None}
0.7229651093147323
0.7283593751931813


In [120]:
random_forest = RandomForestClassifier(random_state=42)
random_forest_params = {
    "criterion": ["gini", "entropy", "log_loss"],
    "max_features": ["sqrt", "log2", None],
    "max_depth": [5, 6, 7, 8, 9, None],
}

grid_search = GridSearchCV(random_forest, param_grid=random_forest_params, cv=5)
grid_search.fit(x_train, y_train)

In [121]:
print(grid_search.best_params_)

y_predict = grid_search.predict(x_test)
print(accuracy_score(y_test, y_predict))
print(f1_score(y_test, y_predict))

{'criterion': 'entropy', 'max_depth': 6, 'max_features': None}
0.7303370786516854
0.12146422628951746


In [122]:
y_train_catboost = y_train.map(lambda val: 1 if val else 0)
y_test_catboost = y_test.map(lambda val: 1 if val else 0)

catboost = CatBoostRegressor(random_state=42)
catboost_params = {
    "depth": [6, 8, 10, 12],
}

grid_search = GridSearchCV(catboost, param_grid=catboost_params, cv=5)
grid_search.fit(x_train, y_train_catboost)

Learning rate set to 0.039391
0:	learn: 0.4356315	total: 1.24ms	remaining: 1.24s
1:	learn: 0.4342921	total: 2.42ms	remaining: 1.21s
2:	learn: 0.4336987	total: 3.42ms	remaining: 1.14s
3:	learn: 0.4322984	total: 4.35ms	remaining: 1.08s
4:	learn: 0.4311925	total: 5.42ms	remaining: 1.08s
5:	learn: 0.4301925	total: 6.54ms	remaining: 1.08s
6:	learn: 0.4289378	total: 7.94ms	remaining: 1.13s
7:	learn: 0.4277841	total: 9.14ms	remaining: 1.13s
8:	learn: 0.4266544	total: 10.3ms	remaining: 1.14s
9:	learn: 0.4257727	total: 11.2ms	remaining: 1.1s
10:	learn: 0.4251848	total: 12.1ms	remaining: 1.08s
11:	learn: 0.4244985	total: 13.3ms	remaining: 1.09s
12:	learn: 0.4238244	total: 14.4ms	remaining: 1.1s
13:	learn: 0.4226908	total: 15.3ms	remaining: 1.08s
14:	learn: 0.4220119	total: 16.1ms	remaining: 1.06s
15:	learn: 0.4213169	total: 17.4ms	remaining: 1.07s
16:	learn: 0.4208157	total: 18.5ms	remaining: 1.07s
17:	learn: 0.4205994	total: 19.1ms	remaining: 1.04s
18:	learn: 0.4199817	total: 20.2ms	remaining: 

In [123]:
print(grid_search.best_params_)

y_predict = pd.DataFrame(grid_search.predict(x_test)).map(lambda val: bool(round(val)))
print(accuracy_score(y_test, y_predict))
print(f1_score(y_test, y_predict))

{'depth': 6}
0.7157814096016343
0.2066999287241625


In [124]:
logreg = LogisticRegression()
logreg_params = {
    "solver": ["lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"],
    "max_iter": [500, 1000, 10000],
}

grid_search = GridSearchCV(logreg, param_grid=logreg_params, cv=5)
grid_search.fit(x_train, y_train)



In [125]:
print(grid_search.best_params_)

y_predict = grid_search.predict(x_test)
print(accuracy_score(y_test, y_predict))
print(f1_score(y_test, y_predict))

{'max_iter': 500, 'solver': 'lbfgs'}
0.7323799795709908
0.03499079189686925
