In [24]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt


In [2]:
# Load the dataset from the CSV file in the working directory.
df = pd.read_csv("gbm-data.csv")
# Convert the frame to a plain NumPy array for column slicing downstream.
# `to_numpy()` is the accessor pandas recommends over the legacy `.values`.
a = df.to_numpy()

In [9]:
# Column 0 of the array is used as the target; every remaining column is a feature.
features, target = a[:, 1:], a[:, 0]

# Hold out 80% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.8,
    random_state=241,
)

In [None]:
def staged_log_loss(model, X, y):
    """Return the log-loss of ``model`` on ``(X, y)`` after each boosting stage.

    Parameters
    ----------
    model : fitted estimator exposing ``staged_decision_function``.
    X : feature matrix to score.
    y : true labels for ``X``.

    Returns
    -------
    list
        One log-loss value per value yielded by ``staged_decision_function``,
        in iteration order.
    """
    # The staged raw scores are mapped through the logistic sigmoid
    # before being scored with log_loss.
    return [
        log_loss(y, 1 / (1 + np.exp(-raw_scores)))
        for raw_scores in model.staged_decision_function(X)
    ]


# Fit one gradient-boosting model per learning rate and compare how the
# train/test log-loss evolves over the boosting iterations.
for lr in [1, 0.5, 0.3, 0.2, 0.1]:
    gbc = GradientBoostingClassifier(
        learning_rate=lr, n_estimators=250, verbose=False, random_state=241
    )
    gbc.fit(X_train, y_train)

    train_logloss = staged_log_loss(gbc, X_train, y_train)
    test_logloss = staged_log_loss(gbc, X_test, y_test)

    # argmin returns the first position of the minimum, so one scan yields both
    # the best iteration (0-based) and, by indexing, the loss reached there.
    best_iter = int(np.argmin(test_logloss))
    print(f"{lr=} | min test logloss: {test_logloss[best_iter]} {best_iter}")

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(
        np.arange(len(train_logloss)), train_logloss, label="train", color="blue"
    )
    ax.plot(
        np.arange(len(test_logloss)), test_logloss, label="test", color="red"
    )
    ax.set_xlabel("boosting iteration")
    ax.set_ylabel("log-loss")
    ax.set_title(f"lr={lr}")
    ax.legend()
    ax.grid()
    plt.show()
    # Release the figure explicitly; one figure is created per learning rate.
    plt.close(fig)


In [None]:
# Random-forest baseline scored with the same metric on the same test split.
# NOTE(review): n_estimators=36 presumably mirrors a best iteration found in the
# boosting experiment — confirm before changing.
rfc = RandomForestClassifier(n_estimators=36, random_state=241).fit(X_train, y_train)

# predict_proba output is passed straight to log_loss.
y_pred = rfc.predict_proba(X_test)
rf_test_logloss = log_loss(y_test, y_pred)
print(rf_test_logloss)
