In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostRegressor

In [2]:
df = pd.read_csv("../data/WineQT.csv")

In [3]:
# Wine Quality
X = df.drop(["Id", "quality"], axis=1)
y = df["quality"].map(lambda val: val > 5)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.2, random_state=42)

In [4]:
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree_params = {
    "criterion": ["gini", "entropy", "log_loss"],
    "max_features": ["sqrt", "log2", None],
    "max_depth": [5, 6, 7, 8, 9, None],
}

grid_search = GridSearchCV(decision_tree, param_grid=decision_tree_params, cv=5)
grid_search.fit(X_train, y_train)

In [5]:
print(grid_search.best_params_)

y_predict = grid_search.predict(X_test)
print(accuracy_score(y_test, y_predict))
print(f1_score(y_test, y_predict))

{'criterion': 'gini', 'max_depth': 6, 'max_features': None}
0.66775956284153
0.6702819956616052


In [6]:
random_forest = RandomForestClassifier(random_state=42)
random_forest_params = {
    "criterion": ["gini", "entropy", "log_loss"],
    "max_features": ["sqrt", "log2", None],
    "max_depth": [5, 6, 7, 8, 9, None],
}

grid_search = GridSearchCV(random_forest, param_grid=random_forest_params, cv=5)
grid_search.fit(X_train, y_train)

In [7]:
print(grid_search.best_params_)

y_predict = grid_search.predict(X_test)
print(accuracy_score(y_test, y_predict))
print(f1_score(y_test, y_predict))

{'criterion': 'entropy', 'max_depth': 9, 'max_features': 'sqrt'}
0.7442622950819672
0.7687747035573123


In [8]:
y_train_catboost = y_train.map(lambda val: 1 if val else 0)
y_test_catboost = y_test.map(lambda val: 1 if val else 0)

catboost = CatBoostRegressor(random_state=42)
catboost_params = {
    "depth": [6, 8, 10, 12],
}

grid_search = GridSearchCV(catboost, param_grid=catboost_params, cv=5)
grid_search.fit(X_train, y_train_catboost)

Learning rate set to 0.03128
0:	learn: 0.4835573	total: 115ms	remaining: 1m 54s
1:	learn: 0.4782885	total: 116ms	remaining: 57.8s
2:	learn: 0.4745495	total: 117ms	remaining: 38.8s
3:	learn: 0.4718490	total: 117ms	remaining: 29.2s
4:	learn: 0.4677725	total: 118ms	remaining: 23.5s
5:	learn: 0.4638706	total: 119ms	remaining: 19.7s
6:	learn: 0.4591985	total: 120ms	remaining: 17s
7:	learn: 0.4559268	total: 121ms	remaining: 15s
8:	learn: 0.4529986	total: 121ms	remaining: 13.4s
9:	learn: 0.4494201	total: 122ms	remaining: 12.1s
10:	learn: 0.4458844	total: 123ms	remaining: 11.1s
11:	learn: 0.4422292	total: 124ms	remaining: 10.2s
12:	learn: 0.4391547	total: 125ms	remaining: 9.47s
13:	learn: 0.4356826	total: 126ms	remaining: 8.84s
14:	learn: 0.4329805	total: 126ms	remaining: 8.3s
15:	learn: 0.4285115	total: 127ms	remaining: 7.82s
16:	learn: 0.4256947	total: 128ms	remaining: 7.39s
17:	learn: 0.4217458	total: 128ms	remaining: 7.01s
18:	learn: 0.4199204	total: 129ms	remaining: 6.65s
19:	learn: 0.417

In [9]:
print(grid_search.best_params_)

y_predict = pd.DataFrame(grid_search.predict(X_test)).map(lambda val: bool(round(val)))
print(accuracy_score(y_test, y_predict))
print(f1_score(y_test, y_predict))

{'depth': 10}
0.7377049180327869
0.7623762376237623


In [10]:
logreg = LogisticRegression()
logreg_params = {
    "solver": ["lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"],
    "max_iter": [500, 1000, 10000],
}

grid_search = GridSearchCV(logreg, param_grid=logreg_params, cv=5)
grid_search.fit(X_train, y_train)



In [11]:
print(grid_search.best_params_)

y_predict = grid_search.predict(X_test)
print(accuracy_score(y_test, y_predict))
print(f1_score(y_test, y_predict))

{'max_iter': 1000, 'solver': 'sag'}
0.7169398907103826
0.7590697674418604
