In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import polars as pl
import numpy as np
import optuna

In [2]:
train_data = pl.read_csv("../data/dataset.csv", has_header = False)

In [3]:
train_data

column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,column_10,column_11,column_12,column_13,column_14,column_15,column_16,column_17,column_18,column_19,column_20,column_21,column_22,column_23,column_24,column_25,column_26,column_27,column_28,column_29,column_30,column_31
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
-1,1,1,1,-1,-1,-1,-1,-1,1,1,-1,1,-1,1,-1,-1,-1,0,1,1,1,1,-1,-1,-1,-1,1,1,-1,-1
1,1,1,1,1,-1,0,1,-1,1,1,-1,1,0,-1,-1,1,1,0,1,1,1,1,-1,-1,0,-1,1,1,1,-1
1,0,1,1,1,-1,-1,-1,-1,1,1,-1,1,0,-1,-1,-1,-1,0,1,1,1,1,1,-1,1,-1,1,0,-1,-1
1,0,1,1,1,-1,-1,-1,1,1,1,-1,-1,0,0,-1,1,1,0,1,1,1,1,-1,-1,1,-1,1,-1,1,-1
1,0,-1,1,1,-1,1,1,-1,1,1,1,1,0,0,-1,1,1,0,-1,1,-1,1,-1,-1,0,-1,1,1,1,1
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
1,-1,1,-1,1,1,1,1,-1,-1,-1,1,1,1,1,-1,-1,1,0,-1,-1,-1,-1,1,1,-1,-1,1,1,1,1
-1,1,1,-1,-1,-1,1,-1,-1,-1,-1,1,1,-1,-1,0,-1,-1,1,-1,1,-1,1,1,1,1,1,1,-1,1,-1
1,-1,1,1,1,-1,1,-1,-1,1,1,1,1,0,-1,-1,1,1,0,1,1,1,1,1,1,1,-1,1,0,1,-1
-1,-1,1,1,1,-1,-1,-1,1,-1,1,1,-1,-1,1,-1,1,1,0,-1,1,-1,1,1,1,1,-1,1,1,1,-1


In [4]:
X = train_data.drop("column_31")
y = train_data["column_31"]

In [5]:
X.head()

column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,column_10,column_11,column_12,column_13,column_14,column_15,column_16,column_17,column_18,column_19,column_20,column_21,column_22,column_23,column_24,column_25,column_26,column_27,column_28,column_29,column_30
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
-1,1,1,1,-1,-1,-1,-1,-1,1,1,-1,1,-1,1,-1,-1,-1,0,1,1,1,1,-1,-1,-1,-1,1,1,-1
1,1,1,1,1,-1,0,1,-1,1,1,-1,1,0,-1,-1,1,1,0,1,1,1,1,-1,-1,0,-1,1,1,1
1,0,1,1,1,-1,-1,-1,-1,1,1,-1,1,0,-1,-1,-1,-1,0,1,1,1,1,1,-1,1,-1,1,0,-1
1,0,1,1,1,-1,-1,-1,1,1,1,-1,-1,0,0,-1,1,1,0,1,1,1,1,-1,-1,1,-1,1,-1,1
1,0,-1,1,1,-1,1,1,-1,1,1,1,1,0,0,-1,1,1,0,-1,1,-1,1,-1,-1,0,-1,1,1,1


In [6]:
y.head()

column_31
i64
-1
-1
-1
-1
1
1
-1
-1
1
-1


In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42,
)

## Logistic Regression

In [8]:
# Logistic Regression
classifier = LogisticRegression(solver="lbfgs")

In [9]:
classifier.fit(X_train, y_train)

In [10]:
predictions = classifier.predict(X_test)

In [11]:
accuracy = 100.0 * accuracy_score(predictions, y_test)
print("The accuracy of logistic regression on test data is {}".format(accuracy))

The accuracy of logistic regression on test data is 92.4468566259611


In [12]:
# Cross Validation
from sklearn.model_selection import KFold

In [13]:
# k-fold cross validation
k = 5

fold = KFold(n_splits=k, shuffle=True, random_state=42)

# Accuracy score
score = 0

for train_cv_idx, test_cv_idx in fold.split(X_train):
    X_train_cv = X_train[train_cv_idx]
    y_train_cv = y_train[train_cv_idx]
    
    classifier.fit(X_train_cv, y_train_cv)
    
    X_test_cv  = X_train[test_cv_idx]
    y_test_cv  = y_train[test_cv_idx]

    pred = classifier.predict(X_test_cv)

    score += accuracy_score(pred, y_test_cv)

score = score / k
score

0.928539720014631

## Hyper parameter tuning

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np
import optuna
from sklearn.model_selection import cross_validate

In [15]:
class Objective:
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __call__(self, trial):
        params = {
            "solver": trial.suggest_categorical("solver", [
                "newton-cg", "lbfgs", "liblinear", "sag", "saga" 
            ]),
            "C": trial.suggest_loguniform("C", 0.0001, 10),
            "max_iter": trial.suggest_int("max_iter", 100, 100000)
        }
        model  = LogisticRegression(**params)
        scores = cross_validate(
            model,
            X=self.X,
            y=self.y,
            scoring='accuracy',
            n_jobs=-1
        )
        return scores["test_score"].mean()

In [16]:
objective = Objective(X_train, y_train)
study     = optuna.create_study(direction="maximize")
study.optimize(objective, timeout=60)


[I 2024-10-04 21:38:17,880] A new study created in memory with name: no-name-4c1dabf4-172d-4eee-bf84-3215b7420387
  "C": trial.suggest_loguniform("C", 0.0001, 10),
[I 2024-10-04 21:38:19,400] Trial 0 finished with value: 0.9208509294051141 and parameters: {'solver': 'newton-cg', 'C': 0.0031462172931186107, 'max_iter': 67301}. Best is trial 0 with value: 0.9208509294051141.
  "C": trial.suggest_loguniform("C", 0.0001, 10),
[I 2024-10-04 21:38:20,227] Trial 1 finished with value: 0.9274092017117322 and parameters: {'solver': 'lbfgs', 'C': 0.019510965868456444, 'max_iter': 18673}. Best is trial 1 with value: 0.9274092017117322.
  "C": trial.suggest_loguniform("C", 0.0001, 10),
[I 2024-10-04 21:38:20,343] Trial 2 finished with value: 0.9257129446551853 and parameters: {'solver': 'liblinear', 'C': 0.012637316852337384, 'max_iter': 25421}. Best is trial 1 with value: 0.9274092017117322.
  "C": trial.suggest_loguniform("C", 0.0001, 10),
[I 2024-10-04 21:38:20,469] Trial 3 finished with value:

In [17]:
print("params: ", study.best_params)

params:  {'solver': 'liblinear', 'C': 0.6648868062396784, 'max_iter': 60215}


In [18]:
from sklearn.metrics import confusion_matrix, accuracy_score

model = LogisticRegression(
    solver=study.best_params["solver"],
    C=study.best_params["C"],
    max_iter=study.best_params["max_iter"]
)

model.fit(X_train, y_train)

pred = model.predict(X_test)

# show results
print("Accucary: {}".format(accuracy_score(y_test, pred)))
print(confusion_matrix(y_test, pred))

Accucary: 0.9240162822252375
[[ 864   92]
 [  76 1179]]


In [19]:
from sklearn.metrics import precision_score, recall_score, f1_score

print("Precision: {}".format(100 * precision_score(y_test, pred)))
print("Recall: {}".format(100 * recall_score(y_test, pred)))
print("F1: {}".format(100 * f1_score(y_test, pred)))

Precision: 92.7616050354052
Recall: 93.94422310756973
F1: 93.34916864608076


## Decision Tree

In [20]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import numpy as np
import optuna
from sklearn.model_selection import cross_validate

In [21]:
class Objective_DTC:
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __call__(self, trial):
        params = {
            "criterion":         trial.suggest_categorical("criterion",    ["gini", "entropy"]),
            "splitter":          trial.suggest_categorical("splitter",     ["best", "random"]),
            "max_features":      trial.suggest_categorical("max_features", ["sqrt", "log2"]),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 64),
            "max_depth":         trial.suggest_int("max_depth",         2, 64)
        }

        model = DecisionTreeClassifier(**params)

        scores = cross_validate(
            model,
            X=self.X,
            y=self.y,
            scoring="accuracy",
            n_jobs=-1,
        )

        return scores["test_score"].mean()

In [22]:
objective = Objective_DTC(X_train, y_train)
study = optuna.create_study(direction="maximize")
study.optimize(objective, timeout=60)

[I 2024-10-04 21:39:18,206] A new study created in memory with name: no-name-b9c53b70-2c1e-4262-964a-dfc7c415eb8c
[I 2024-10-04 21:39:18,297] Trial 0 finished with value: 0.9233372511504058 and parameters: {'criterion': 'entropy', 'splitter': 'random', 'max_features': 'sqrt', 'min_samples_split': 24, 'max_depth': 27}. Best is trial 0 with value: 0.9233372511504058.
[I 2024-10-04 21:39:18,375] Trial 1 finished with value: 0.905473987655679 and parameters: {'criterion': 'entropy', 'splitter': 'random', 'max_features': 'sqrt', 'min_samples_split': 54, 'max_depth': 37}. Best is trial 0 with value: 0.9233372511504058.
[I 2024-10-04 21:39:18,425] Trial 2 finished with value: 0.8974470455225617 and parameters: {'criterion': 'gini', 'splitter': 'random', 'max_features': 'log2', 'min_samples_split': 57, 'max_depth': 29}. Best is trial 0 with value: 0.9233372511504058.
[I 2024-10-04 21:39:18,474] Trial 3 finished with value: 0.7546212549462974 and parameters: {'criterion': 'gini', 'splitter': 'b

In [23]:
print(study.best_params)

{'criterion': 'gini', 'splitter': 'best', 'max_features': 'sqrt', 'min_samples_split': 2, 'max_depth': 45}


In [24]:
# holdout
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

model = DecisionTreeClassifier(
    criterion=study.best_params["criterion"],
    splitter=study.best_params["splitter"],
    max_features=study.best_params["max_features"],
    min_samples_split=study.best_params["min_samples_split"],
    max_depth=study.best_params["max_depth"],
)

model.fit(X_train, y_train)
pred = model.predict(X_test)

print("Accuracy: {}".format(accuracy_score(y_test, pred)))
print("Precision: {}".format(precision_score(y_test, pred)))
print("Recall: {}".format(recall_score(y_test, pred)))
print("F1: {}".format(f1_score(y_test, pred)))

print(confusion_matrix(y_test, pred))

Accuracy: 0.9552238805970149
Precision: 0.9609250398724083
Recall: 0.9601593625498008
F1: 0.9605420486249502
[[ 907   49]
 [  50 1205]]
