In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
sns.set()
%matplotlib inline

train = pd.read_csv('train.csv', sep=',')
train.head()
y_train = train['target']
X_train = train.drop(columns=['target', 'feature6', 'feature2'])

def fill_nulls(X):
    null_cols = ["feature4", "feature3", "feature5", 'feature1']
    median_imputer = SimpleImputer(strategy="median")
    for col in null_cols:
        X[col] = median_imputer.fit_transform(X[col].values.reshape(-1,1))
fill_nulls(X_train)

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier

In [10]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_iter': [50, 100],
    'max_depth': [3, 5, 7],
    'min_samples_leaf': [20, 30, 40],
    'l2_regularization': [0, 1e-3, 1e-2],
}

hgb_clf = HistGradientBoostingClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=hgb_clf,
    param_grid=param_grid,
    cv=10,  # 5-fold cross-validation
    scoring='accuracy',
    n_jobs=-1,  # Use all available cores
    verbose=1
)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Best parameters
print("Best Parameters:\n", grid_search.best_params_)

# Best score
print("Best Cross-Validation Accuracy:", grid_search.best_score_)

Fitting 10 folds for each of 162 candidates, totalling 1620 fits
Best Parameters:
 {'l2_regularization': 0, 'learning_rate': 0.1, 'max_depth': 5, 'max_iter': 100, 'min_samples_leaf': 20}
Best Cross-Validation Accuracy: 0.9229999999999998


In [12]:
from sklearn.model_selection import TunedThresholdClassifierCV
tuned_clf = TunedThresholdClassifierCV(base_classifier=hgb_clf, cv=5, scoring='f1')

# Train the tuned classifier
tuned_clf.fit(X_train, y_train)

# Evaluate the model
print("Optimized Accuracy:", accuracy_score(y_test, y_pred))
print("\nOptimized Classification Report:\n", classification_report(y_test, y_pred))

# Show the best threshold
print(f"Optimized Threshold: {tuned_clf.threshold_}")

TypeError: TunedThresholdClassifierCV.__init__() got an unexpected keyword argument 'base_classifier'

In [4]:
test = pd.read_csv('test.csv', sep=',')
X_test = test.drop(columns=['Id', 'feature6', 'feature2'])
fill_nulls(X_test)

best_hgb_grid = grid_search.best_estimator_
y_train = best_hgb_grid.predict(X_test)

sample = pd.read_csv('sample.csv', sep=',')
sample['target'] = y_train
sample.head(10)
sample.to_csv('HistGradientBoostingClassifier_optimized_benchmark.csv', index=False)

In [19]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

# Define the parameter distributions
param_dist = {
    'learning_rate': uniform(0.01, 0.09),  # 0.01 to 0.1
    'max_iter': randint(100, 500),         # 100 to 600
    'max_depth': randint(3, 10),           # 3 to 10
    'min_samples_leaf': randint(20, 50),   # 20 to 70
    'l2_regularization': uniform(0, 0.01), # 0 to 0.01
}

# Initialize the classifier
hgb_clf = HistGradientBoostingClassifier(random_state=42)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=hgb_clf,
    param_distributions=param_dist,
    n_iter=100,          # Number of parameter settings sampled
    cv=5,               # 5-fold cross-validation
    scoring='accuracy',
    random_state=42,
    n_jobs=-1,
    verbose=1
)

# Fit RandomizedSearchCV
random_search.fit(X_train, y_train)

# Best parameters
print("Best Parameters:\n", random_search.best_params_)

# Best score
print("Best Cross-Validation Accuracy:", random_search.best_score_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Parameters:
 {'l2_regularization': np.float64(0.009404585843529142), 'learning_rate': np.float64(0.09585357193023286), 'max_depth': 7, 'max_iter': 105, 'min_samples_leaf': 22}
Best Cross-Validation Accuracy: 0.916
