In [17]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostRegressor

In [18]:
df = pd.read_csv("../data/DrinkersBodySignals.csv")

In [3]:
X = df.drop(["DRK_YN", "height", "weight"], axis=1)
y = df["DRK_YN"].map(lambda val: val == "Y")

X["bmi"] = (df["weight"].to_numpy() * 10000) / np.square(df["height"].to_numpy())
X["sex"] = X["sex"].map(lambda val: val == "Male")

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.2, random_state=42)

In [4]:
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree_params = {
    "criterion": ["gini", "entropy", "log_loss"],
    "max_features": ["sqrt", "log2", None],
    "max_depth": [5, 6, 7, 8, 9, None],
}

grid_search = GridSearchCV(decision_tree, param_grid=decision_tree_params, cv=5)
grid_search.fit(X_train, y_train)

In [5]:
print(grid_search.best_params_)

y_predict = grid_search.predict(X_test)
print(accuracy_score(y_test, y_predict))
print(f1_score(y_test, y_predict))

{'criterion': 'gini', 'max_depth': 7, 'max_features': 'sqrt'}
0.6291666666666667
0.6636432350718064


In [6]:
random_forest = RandomForestClassifier(random_state=42)
random_forest_params = {
    "criterion": ["gini", "entropy", "log_loss"],
    "max_features": ["sqrt", "log2", None],
    "max_depth": [5, 6, 7, 8, 9, None],
}

grid_search = GridSearchCV(random_forest, param_grid=random_forest_params, cv=5)
grid_search.fit(X_train, y_train)

In [7]:
print(grid_search.best_params_)

y_predict = grid_search.predict(X_test)
print(accuracy_score(y_test, y_predict))
print(f1_score(y_test, y_predict))

{'criterion': 'entropy', 'max_depth': 9, 'max_features': 'sqrt'}
0.6916666666666667
0.7044728434504792


In [8]:
y_train_catboost = y_train.map(lambda val: 1 if val else 0)
y_test_catboost = y_test.map(lambda val: 1 if val else 0)

catboost = CatBoostRegressor(random_state=42)
catboost_params = {
    "depth": [6, 8, 10, 12],
}

grid_search = GridSearchCV(catboost, param_grid=catboost_params, cv=5)
grid_search.fit(X_train, y_train_catboost)

Learning rate set to 0.032678
0:	learn: 0.4954264	total: 145ms	remaining: 2m 24s
1:	learn: 0.4912910	total: 146ms	remaining: 1m 12s
2:	learn: 0.4882638	total: 147ms	remaining: 49s
3:	learn: 0.4851768	total: 149ms	remaining: 37s
4:	learn: 0.4819613	total: 150ms	remaining: 29.8s
5:	learn: 0.4794360	total: 151ms	remaining: 25s
6:	learn: 0.4762694	total: 151ms	remaining: 21.5s
7:	learn: 0.4735635	total: 152ms	remaining: 18.9s
8:	learn: 0.4714923	total: 153ms	remaining: 16.9s
9:	learn: 0.4685381	total: 154ms	remaining: 15.3s
10:	learn: 0.4666235	total: 155ms	remaining: 14s
11:	learn: 0.4639447	total: 156ms	remaining: 12.9s
12:	learn: 0.4610972	total: 157ms	remaining: 11.9s
13:	learn: 0.4599310	total: 158ms	remaining: 11.1s
14:	learn: 0.4575027	total: 159ms	remaining: 10.4s
15:	learn: 0.4552085	total: 160ms	remaining: 9.84s
16:	learn: 0.4525505	total: 161ms	remaining: 9.32s
17:	learn: 0.4506250	total: 162ms	remaining: 8.84s
18:	learn: 0.4485398	total: 163ms	remaining: 8.4s
19:	learn: 0.44672

In [9]:
print(grid_search.best_params_)

y_predict = pd.DataFrame(grid_search.predict(X_test)).map(lambda val: bool(round(val)))
print(accuracy_score(y_test, y_predict))
print(f1_score(y_test, y_predict))

{'depth': 8}
0.6808333333333333
0.6948207171314741


In [15]:
logreg = LogisticRegression()
logreg_params = {
    "solver": ["lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"],
    "max_iter": [100, 200],
}

grid_search = GridSearchCV(logreg, param_grid=logreg_params, cv=5)
grid_search.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [16]:
print(grid_search.best_params_)

y_predict = grid_search.predict(X_test)
print(accuracy_score(y_test, y_predict))
print(f1_score(y_test, y_predict))

{'max_iter': 100, 'solver': 'lbfgs'}
0.7083333333333334
0.7199999999999999
