In [142]:
import os

# Set the current working directory
current_directory = os.getcwd()
print("Current working directory:", current_directory)

# Define the data directory (assuming your data is in a folder named 'data' or similar inside the current directory)
DATA_DIR = current_directory  # Current directory where the notebook is running
TRAIN_DIR = os.path.join(DATA_DIR, 'train')  # Assuming the train directory is inside the current directory
TEST_DIR = os.path.join(DATA_DIR, 'test')    # Assuming the test directory is inside the current directory
OUTPUT_DIR = os.path.join(DATA_DIR, 'output')  # Assuming you want to store output in the same directory

# Create directories if they do not exist
os.makedirs(TRAIN_DIR, exist_ok=True)
os.makedirs(TEST_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

Current working directory: /Users/alishasahota/Documents/Queens MMA Program/869 - Machine Learning & AI/Team Project


In [143]:
# Print paths to ensure they are set correctly
print("DATA_DIR:", DATA_DIR)
print("TRAIN_DIR:", TRAIN_DIR)
print("TEST_DIR:", TEST_DIR)
print("OUTPUT_DIR:", OUTPUT_DIR)

DATA_DIR: /Users/alishasahota/Documents/Queens MMA Program/869 - Machine Learning & AI/Team Project
TRAIN_DIR: /Users/alishasahota/Documents/Queens MMA Program/869 - Machine Learning & AI/Team Project/train
TEST_DIR: /Users/alishasahota/Documents/Queens MMA Program/869 - Machine Learning & AI/Team Project/test
OUTPUT_DIR: /Users/alishasahota/Documents/Queens MMA Program/869 - Machine Learning & AI/Team Project/output


In [144]:
#only run this cell once, at the start
train_zip = "train.zip"
test_zip = "test.zip"
sample_sub = "SampleSubmission.csv"

# Create directories if they do not exist
os.makedirs('train', exist_ok=True)
os.makedirs('test', exist_ok=True)

!unzip -o "train.zip" -d "train/"
!unzip -o "test.zip" -d "test/"

Archive:  train.zip
  inflating: train/client_train.csv  
  inflating: train/invoice_train.csv  
Archive:  test.zip
  inflating: test/client_test.csv    
  inflating: test/invoice_test.csv   


In [145]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
import time


# Set the current working directory
current_directory = os.getcwd()
print("Current working directory:", current_directory)

Current working directory: /Users/alishasahota/Documents/Queens MMA Program/869 - Machine Learning & AI/Team Project


In [146]:
# Read the CSV files (update these paths as needed)
client_train = pd.read_csv(f'{TRAIN_DIR}/client_train.csv', low_memory=False)
invoice_train = pd.read_csv(f'{TRAIN_DIR}/invoice_train.csv', low_memory=False)
client_test = pd.read_csv(f'{TEST_DIR}/client_test.csv', low_memory=False)
invoice_test = pd.read_csv(f'{TEST_DIR}/invoice_test.csv', low_memory=False)
sample_submission = pd.read_csv(f'{DATA_DIR}/SampleSubmission.csv', low_memory=False)

In [147]:
#compare size of the various datasets
print(client_train.shape, invoice_train.shape, client_test.shape, invoice_test.shape)

(135493, 6) (4476749, 16) (58069, 5) (1939730, 16)


In [148]:
#Getting unique values on the invoice train data
for col in invoice_train.columns:
    print(f"{col} - {invoice_train[col].nunique()}")

client_id - 135493
invoice_date - 8275
tarif_type - 17
counter_number - 201893
counter_statue - 12
counter_code - 42
reading_remarque - 8
counter_coefficient - 16
consommation_level_1 - 8295
consommation_level_2 - 12576
consommation_level_3 - 2253
consommation_level_4 - 12075
old_index - 155648
new_index - 157980
months_number - 1370
counter_type - 2


In [149]:
#Getting unique values on the invoice train data
for col in client_train.columns:
    print(f"{col} - {client_train[col].nunique()}")

disrict - 4
client_id - 135493
client_catg - 3
region - 25
creation_date - 8088
target - 2


In [150]:
#check for missing values
invoice_train.isnull().sum()

client_id               0
invoice_date            0
tarif_type              0
counter_number          0
counter_statue          0
counter_code            0
reading_remarque        0
counter_coefficient     0
consommation_level_1    0
consommation_level_2    0
consommation_level_3    0
consommation_level_4    0
old_index               0
new_index               0
months_number           0
counter_type            0
dtype: int64

In [151]:
#check for missing values
client_train.isnull().sum()

disrict          0
client_id        0
client_catg      0
region           0
creation_date    0
target           0
dtype: int64

In [152]:
##add in plots

In [153]:
#converting data into categorical:

In [154]:
def feature_change(cl, inv):

    cl['client_catg'] = cl['client_catg'].astype('category')
    cl['disrict'] = cl['disrict'].astype('category')
    cl['region'] = cl['region'].astype('category')
    cl['region_group'] = cl['region'].apply(lambda x: 100 if x<100 else 300 if x>300 else 200)
    cl['creation_date'] = pd.to_datetime(cl['creation_date'])
    
    cl['coop_time'] = (2019 - cl['creation_date'].dt.year)*12 - cl['creation_date'].dt.month

    inv['invoice_date'] = pd.to_datetime(inv['invoice_date'], dayfirst=True)
    inv['invoice_month'] = inv['invoice_date'].dt.month
    inv['invoice_year'] = inv['invoice_date'].dt.year
    inv['is_weekday'] = ((pd.DatetimeIndex(inv.invoice_date).dayofweek) // 5 == 1).astype(float)

    return cl, inv

In [155]:
#Fixing creation date format to be consistent with invoice date
client_train['creation_date'] = pd.to_datetime(client_train['creation_date'], dayfirst=True, errors='coerce')
client_test['creation_date'] = pd.to_datetime(client_train['creation_date'], dayfirst=True, errors='coerce')

In [156]:
#apply above feature changes to train and test data: 
client_train1, invoice_train1 = feature_change(client_train, invoice_train)
client_test1, invoice_test1 = feature_change(client_test, invoice_test)

  inv['invoice_date'] = pd.to_datetime(inv['invoice_date'], dayfirst=True)
  inv['invoice_date'] = pd.to_datetime(inv['invoice_date'], dayfirst=True)


In [157]:
# Define the aggregate function with added fields:
def aggregate_by_client_id(invoice_data):
    aggs = {}
    aggs['consommation_level_1'] = ['mean']
    aggs['consommation_level_2'] = ['mean']
    aggs['consommation_level_3'] = ['mean']
    aggs['consommation_level_4'] = ['mean']

    agg_trans = invoice_data.groupby(['client_id']).agg(aggs)
    agg_trans.columns = ['_'.join(col).strip() for col in agg_trans.columns.values]
    agg_trans.reset_index(inplace=True)

    df = (invoice_data.groupby('client_id')
            .size()
            .reset_index(name='{}transactions_count'.format('1')))
    return pd.merge(df, agg_trans, on='client_id', how='left')

#group invoice data by client_id:
agg_train = aggregate_by_client_id(invoice_train)
agg_test = aggregate_by_client_id (invoice_test) 

#Check the shape and head of the aggregated training data: 
print(agg_train.shape)
agg_train.head()

(135493, 6)


Unnamed: 0,client_id,1transactions_count,consommation_level_1_mean,consommation_level_2_mean,consommation_level_3_mean,consommation_level_4_mean
0,train_Client_0,35,352.4,10.571429,0.0,0.0
1,train_Client_1,37,557.540541,0.0,0.0,0.0
2,train_Client_10,18,798.611111,37.888889,0.0,0.0
3,train_Client_100,20,1.2,0.0,0.0,0.0
4,train_Client_1000,14,663.714286,104.857143,117.357143,36.714286


In [158]:
#Combine Train & Test data using Client_id since both client and invoice data have this column:  

train_combined = pd.merge(invoice_train1, client_train1, on='client_id', how='left')

#Ensure "Target" column is present in the training dataset: 
if 'target' not in train_combined.columns: 
    train_combined['target'] = client_train['target']

#Combine test sets:
test_combined = pd.merge(client_test, agg_test, on='client_id', how='left')

train_combined.columns 

Index(['client_id', 'invoice_date', 'tarif_type', 'counter_number',
       'counter_statue', 'counter_code', 'reading_remarque',
       'counter_coefficient', 'consommation_level_1', 'consommation_level_2',
       'consommation_level_3', 'consommation_level_4', 'old_index',
       'new_index', 'months_number', 'counter_type', 'invoice_month',
       'invoice_year', 'is_weekday', 'disrict', 'client_catg', 'region',
       'creation_date', 'target', 'region_group', 'coop_time'],
      dtype='object')

In [159]:
# To Verify merge was successful: 
print(train_combined.head())  # Inspect the first few rows

        client_id invoice_date  tarif_type  counter_number counter_statue  \
0  train_Client_0   2014-03-24          11         1335667              0   
1  train_Client_0   2013-03-29          11         1335667              0   
2  train_Client_0   2015-03-23          11         1335667              0   
3  train_Client_0   2015-07-13          11         1335667              0   
4  train_Client_0   2016-11-17          11         1335667              0   

   counter_code  reading_remarque  counter_coefficient  consommation_level_1  \
0           203                 8                    1                    82   
1           203                 6                    1                  1200   
2           203                 8                    1                   123   
3           207                 8                    1                   102   
4           207                 9                    1                   572   

   consommation_level_2  ...  invoice_month  invoice_yea

In [160]:
# To Verify merge was successful: 
print(test_combined.head())

  disrict         client_id client_catg region creation_date  region_group  \
0      62     test_Client_0          11    307    1994-12-31           300   
1      69     test_Client_1          11    103    2002-05-29           200   
2      62    test_Client_10          11    310    1986-03-13           300   
3      60   test_Client_100          11    101    1996-07-11           200   
4      62  test_Client_1000          11    301    2014-10-14           300   

   coop_time  1transactions_count  consommation_level_1_mean  \
0        288                   37                 488.135135   
1        199                   22                1091.409091   
2        393                   74                 554.040541   
3        269                   40                 244.350000   
4         50                   53                 568.188679   

   consommation_level_2_mean  consommation_level_3_mean  \
0                   3.243243                   0.000000   
1                 843.136364

In [161]:
# Encode categorical variables using one-hot encoding
categorical_cols2 = ['disrict', 'client_catg', 'region']
train_combined = pd.get_dummies(train_combined, columns=categorical_cols2, drop_first=True)
test_combined = pd.get_dummies(test_combined, columns=categorical_cols2, drop_first=True)


In [162]:
print(train_combined.info())  # Check column names and data types

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4476749 entries, 0 to 4476748
Data columns (total 52 columns):
 #   Column                Dtype         
---  ------                -----         
 0   client_id             object        
 1   invoice_date          datetime64[ns]
 2   tarif_type            int64         
 3   counter_number        int64         
 4   counter_statue        object        
 5   counter_code          int64         
 6   reading_remarque      int64         
 7   counter_coefficient   int64         
 8   consommation_level_1  int64         
 9   consommation_level_2  int64         
 10  consommation_level_3  int64         
 11  consommation_level_4  int64         
 12  old_index             int64         
 13  new_index             int64         
 14  months_number         int64         
 15  counter_type          object        
 16  invoice_month         int32         
 17  invoice_year          int32         
 18  is_weekday            float64       
 19  

In [163]:
print(test_combined.info())  # Check column names and data types

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58069 entries, 0 to 58068
Data columns (total 37 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   client_id                  58069 non-null  object        
 1   creation_date              58069 non-null  datetime64[ns]
 2   region_group               58069 non-null  int64         
 3   coop_time                  58069 non-null  int32         
 4   1transactions_count        58069 non-null  int64         
 5   consommation_level_1_mean  58069 non-null  float64       
 6   consommation_level_2_mean  58069 non-null  float64       
 7   consommation_level_3_mean  58069 non-null  float64       
 8   consommation_level_4_mean  58069 non-null  float64       
 9   disrict_62                 58069 non-null  bool          
 10  disrict_63                 58069 non-null  bool          
 11  disrict_69                 58069 non-null  bool          
 12  clie

# Method #2: LGMBoost

In [164]:
from sklearn.model_selection import train_test_split

# Randomly sample 30% of the data given the size of the data is in millions:
sampled_data = train_combined.sample(frac=0.3, random_state=42)  # 30% of the data

# Splitting into training and train-testing datasets
train_data1, test_data1 = train_test_split(sampled_data, test_size=0.2, random_state=42)

print("Training Data:")
print(train_data1)
print("\nTest Data:")
print(test_data1)

Training Data:
                   client_id invoice_date  tarif_type  counter_number  \
799201   train_Client_121643   2015-05-01          40         6707477   
1960046   train_Client_31477   2009-02-13          11            6389   
4209133   train_Client_92728   2017-10-10          40         6818028   
2880550   train_Client_56623   2019-03-27          11          676644   
250653   train_Client_106757   2019-06-14          40          320366   
...                      ...          ...         ...             ...   
1671703   train_Client_23542   2018-02-11          11          641301   
2750926   train_Client_53090   2016-03-16          11            3863   
4327416   train_Client_95933   2012-12-13          40          115073   
708823   train_Client_119109   2013-09-10          40         4237707   
3772369   train_Client_80930   2007-10-25          40           42464   

        counter_statue  counter_code  reading_remarque  counter_coefficient  \
799201               0       

In [165]:
y_train = train_data1['target']
X_train = train_data1.drop('target',axis=1)

feature_name = X_train.columns.tolist()

y_test = test_data1['target']
X_test = test_data1.drop('target',axis=1)

In [166]:
# Drop 'creation_date' from both train and test datasets
X_train = X_train.drop(columns=['creation_date'])
X_test = X_test.drop(columns=['creation_date'])


In [167]:
from sklearn.preprocessing import LabelEncoder

# Drop irrelevant or non-predictive columns
X_train = X_train.drop(['counter_statue', 'counter_type'], axis=1)

# Convert 'client_id' to numerical values (since it's an identifier)
le = LabelEncoder()
X_train['client_id'] = le.fit_transform(X_train['client_id'])

# Use float32 instead of float64 to save memory
for col in X_train.select_dtypes(include=['float64']).columns:
    X_train[col] = X_train[col].astype('float32')

# Use int32 instead of int64 to save memory
for col in X_train.select_dtypes(include=['int64']).columns:
    X_train[col] = X_train[col].astype('int32')

# Drop the original datetime column
X_train = X_train.drop(columns=['invoice_date'])
X_test = X_test.drop(columns=['invoice_date'])

# Align train and test datasets
X_train, X_test = X_train.align(X_test, join='left', axis=1)

# Fill missing columns in test set
X_test = X_test.fillna(0)  # This will fill missing columns in the test set with 0

# Check data after preprocessing
print(X_train.dtypes)

# Check for any missing values
print(X_train.isnull().sum())
print(X_test.isnull().sum())


client_id                 int32
tarif_type                int32
counter_number            int32
counter_code              int32
reading_remarque          int32
counter_coefficient       int32
consommation_level_1      int32
consommation_level_2      int32
consommation_level_3      int32
consommation_level_4      int32
old_index                 int32
new_index                 int32
months_number             int32
invoice_month             int32
invoice_year              int32
is_weekday              float32
region_group              int32
coop_time                 int32
disrict_62                 bool
disrict_63                 bool
disrict_69                 bool
client_catg_12             bool
client_catg_51             bool
region_103                 bool
region_104                 bool
region_105                 bool
region_106                 bool
region_107                 bool
region_199                 bool
region_206                 bool
region_301                 bool
region_3

In [168]:
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit the encoder on 'client_id' and transform it
test_combined['client_id'] = label_encoder.fit_transform(test_combined['client_id'])

# Now you can safely use 'client_id' as part of the feature set

# Check the data after preprocessing
print(test_combined.dtypes)

# Optional: Check for any missing values after encoding
print(test_combined.isnull().sum())

client_id                             int64
creation_date                datetime64[ns]
region_group                          int64
coop_time                             int32
1transactions_count                   int64
consommation_level_1_mean           float64
consommation_level_2_mean           float64
consommation_level_3_mean           float64
consommation_level_4_mean           float64
disrict_62                             bool
disrict_63                             bool
disrict_69                             bool
client_catg_12                         bool
client_catg_51                         bool
region_103                             bool
region_104                             bool
region_105                             bool
region_106                             bool
region_107                             bool
region_206                             bool
region_301                             bool
region_302                             bool
region_303                      

In [169]:
# Align columns to ensure the same structure between train and test
X_train, X_test = X_train.align(X_test, join='left', axis=1)
X_test = X_test.fillna(0)  # Fill missing columns in the test set

In [170]:
# Drop irrelevant or problematic columns: 
X_train = X_train.drop(['Client_8349', 'Client_134328'], axis=1, errors='ignore')
X_test = X_test.drop(['Client_8349', 'Client_134328'], axis=1, errors='ignore')

In [171]:
pip install scikit-learn scipy

Note: you may need to restart the kernel to use updated packages.


In [172]:
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

X_train shape: (1074420, 47)
y_train shape: (1074420,)


In [173]:
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [174]:
from sklearn.preprocessing import OrdinalEncoder

# Initialize OrdinalEncoder
ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

# Fit on training data and transform both datasets
X_train['client_id'] = ordinal_encoder.fit_transform(X_train[['client_id']])
X_test['client_id'] = ordinal_encoder.transform(X_test[['client_id']])


In [175]:
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import numpy as np
import optuna

# Define the objective function for Optuna optimization
def objective(trial):
    # Define the parameter search space
    param = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'n_estimators': 200,
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'num_leaves': trial.suggest_int('num_leaves', 30, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 50),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1e-1)
    }

    # K-fold Cross-Validation setup
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = []

    # Loop over each fold in the cross-validation
    for train_index, val_index in kf.split(X_train, y_train):
        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

        # Train the model with current parameters
        model = LGBMClassifier(**param)

        model.fit(
            X_train_fold, y_train_fold,
            eval_set=[(X_train_fold, y_train_fold), (X_val_fold, y_val_fold)],
            eval_metric='logloss',
            callbacks=[
                early_stopping(stopping_rounds=10),  # Early stopping via callback
                log_evaluation(period=10)  # Log evaluation info
            ]
        )

        # Predict and evaluate on the validation set
        y_pred_prob = model.predict_proba(X_val_fold)[:, 1]  # Predicted probabilities
        auc_score = roc_auc_score(y_val_fold, y_pred_prob)
        fold_scores.append(auc_score)

    # Return the average AUC score over the folds
    return np.mean(fold_scores)

# Create the Optuna study
study = optuna.create_study(direction='maximize')  # Maximize AUC score
study.optimize(objective, n_trials=20)  # Number of trials to run

# Print the best parameters and best score
print(f"Best AUC score: {study.best_value}")
print(f"Best parameters: {study.best_params}")

# Train the final model with the best parameters found by Optuna
best_params = study.best_params
final_model2 = LGBMClassifier(**best_params)

final_model2.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    eval_metric='logloss',
    callbacks=[
        early_stopping(stopping_rounds=10),
        log_evaluation(period=10)
    ]
)

# Evaluate final model on test set using AUC
y_pred_prob_final = final_model2.predict_proba(X_test)[:, 1]  # Predicted probabilities
final_auc_score = roc_auc_score(y_test, y_pred_prob_final)
print(f"Final Test AUC Score: {final_auc_score}")


[I 2024-11-25 11:49:45,693] A new study created in memory with name: no-name-529b256f-cfcd-408a-8d0b-189b0c832fe0
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1e-1)


[LightGBM] [Info] Number of positive: 68005, number of negative: 791531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019370 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2520
[LightGBM] [Info] Number of data points in the train set: 859536, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079118 -> initscore=-2.454388
[LightGBM] [Info] Start training from score -2.454388
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.2705	valid_1's binary_logloss: 0.270778
[20]	training's binary_logloss: 0.26636	valid_1's binary_logloss: 0.266814
[30]	training's binary_logloss: 0.263212	valid_1's binary_logloss: 0.263803
[40]	training's binary_logloss: 0.260574	valid_1's binary_logloss: 0.261264
[50]	training's binary_logloss: 0.258486	valid_1's binary_logloss: 0.259273
[6

[I 2024-11-25 11:50:22,281] Trial 0 finished with value: 0.7399657093192309 and parameters: {'learning_rate': 0.014537215670503211, 'num_leaves': 96, 'max_depth': 10, 'min_child_samples': 43, 'subsample': 0.7990685467871357, 'colsample_bytree': 0.8871493710279088, 'reg_alpha': 0.00011484575275096046, 'reg_lambda': 0.0001234193541293574}. Best is trial 0 with value: 0.7399657093192309.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1e-1)


[LightGBM] [Info] Number of positive: 68005, number of negative: 791531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014342 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2520
[LightGBM] [Info] Number of data points in the train set: 859536, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079118 -> initscore=-2.454388
[LightGBM] [Info] Start training from score -2.454388
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.274933	valid_1's binary_logloss: 0.275026
[20]	training's binary_logloss: 0.273353	valid_1's binary_logloss: 0.273537
[30]	training's binary_logloss: 0.271896	valid_1's binary_logloss: 0.272167
[40]	training's binary_logloss: 0.270736	valid_1's binary_logloss: 0.271076
[50]	training's binary_logloss: 0.269558	valid_1's binary_logloss: 0.269953

[I 2024-11-25 11:50:57,304] Trial 1 finished with value: 0.7136762164884429 and parameters: {'learning_rate': 0.004353813944252691, 'num_leaves': 139, 'max_depth': 8, 'min_child_samples': 41, 'subsample': 0.9195668090610682, 'colsample_bytree': 0.6752752170434059, 'reg_alpha': 0.00021358009076295225, 'reg_lambda': 0.0010011168153372912}. Best is trial 0 with value: 0.7399657093192309.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1e-1)


[LightGBM] [Info] Number of positive: 68005, number of negative: 791531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012927 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2520
[LightGBM] [Info] Number of data points in the train set: 859536, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079118 -> initscore=-2.454388
[LightGBM] [Info] Start training from score -2.454388
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.263926	valid_1's binary_logloss: 0.264525
[20]	training's binary_logloss: 0.257958	valid_1's binary_logloss: 0.258872
[30]	training's binary_logloss: 0.254712	valid_1's binary_logloss: 0.2559
[40]	training's binary_logloss: 0.252972	valid_1's binary_logloss: 0.254279
[50]	training's binary_logloss: 0.251517	valid_1's binary_logloss: 0.252949
[

[I 2024-11-25 11:51:23,984] Trial 2 finished with value: 0.7649229284088161 and parameters: {'learning_rate': 0.089038596988389, 'num_leaves': 69, 'max_depth': 6, 'min_child_samples': 31, 'subsample': 0.6106400313325074, 'colsample_bytree': 0.5088024778132845, 'reg_alpha': 9.430480269984073e-05, 'reg_lambda': 2.0913264804414203e-05}. Best is trial 2 with value: 0.7649229284088161.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1e-1)


[LightGBM] [Info] Number of positive: 68005, number of negative: 791531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017876 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2520
[LightGBM] [Info] Number of data points in the train set: 859536, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079118 -> initscore=-2.454388
[LightGBM] [Info] Start training from score -2.454388
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.268041	valid_1's binary_logloss: 0.268372
[20]	training's binary_logloss: 0.264058	valid_1's binary_logloss: 0.26456
[30]	training's binary_logloss: 0.261769	valid_1's binary_logloss: 0.262367
[40]	training's binary_logloss: 0.259932	valid_1's binary_logloss: 0.260604
[50]	training's binary_logloss: 0.258608	valid_1's binary_logloss: 0.259374


[I 2024-11-25 11:51:47,549] Trial 3 finished with value: 0.7276486577305977 and parameters: {'learning_rate': 0.0512213377047125, 'num_leaves': 109, 'max_depth': 5, 'min_child_samples': 29, 'subsample': 0.68765114748852, 'colsample_bytree': 0.8121934964259412, 'reg_alpha': 1.133199046346157e-05, 'reg_lambda': 0.00010116188144189747}. Best is trial 2 with value: 0.7649229284088161.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1e-1)


[LightGBM] [Info] Number of positive: 68005, number of negative: 791531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018511 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2520
[LightGBM] [Info] Number of data points in the train set: 859536, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079118 -> initscore=-2.454388
[LightGBM] [Info] Start training from score -2.454388
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.264602	valid_1's binary_logloss: 0.265164
[20]	training's binary_logloss: 0.258142	valid_1's binary_logloss: 0.259061
[30]	training's binary_logloss: 0.253842	valid_1's binary_logloss: 0.255026
[40]	training's binary_logloss: 0.250535	valid_1's binary_logloss: 0.252011
[50]	training's binary_logloss: 0.247898	valid_1's binary_logloss: 0.24956


[I 2024-11-25 11:52:23,489] Trial 4 finished with value: 0.7944172593464305 and parameters: {'learning_rate': 0.031022179131086926, 'num_leaves': 119, 'max_depth': 12, 'min_child_samples': 37, 'subsample': 0.9389545208888908, 'colsample_bytree': 0.9695593166484897, 'reg_alpha': 0.04257236267192656, 'reg_lambda': 0.08056164091265049}. Best is trial 4 with value: 0.7944172593464305.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1e-1)


[LightGBM] [Info] Number of positive: 68005, number of negative: 791531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018961 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2520
[LightGBM] [Info] Number of data points in the train set: 859536, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079118 -> initscore=-2.454388
[LightGBM] [Info] Start training from score -2.454388
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.274013	valid_1's binary_logloss: 0.274176
[20]	training's binary_logloss: 0.271944	valid_1's binary_logloss: 0.27223
[30]	training's binary_logloss: 0.270196	valid_1's binary_logloss: 0.270585
[40]	training's binary_logloss: 0.268616	valid_1's binary_logloss: 0.269088
[50]	training's binary_logloss: 0.267271	valid_1's binary_logloss: 0.267797


[I 2024-11-25 11:52:57,117] Trial 5 finished with value: 0.7115851538377413 and parameters: {'learning_rate': 0.0055592117535819, 'num_leaves': 127, 'max_depth': 8, 'min_child_samples': 50, 'subsample': 0.5560701094816248, 'colsample_bytree': 0.9810424547806632, 'reg_alpha': 0.05208187937450416, 'reg_lambda': 0.005269508690005067}. Best is trial 4 with value: 0.7944172593464305.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1e-1)


[LightGBM] [Info] Number of positive: 68005, number of negative: 791531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017115 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2520
[LightGBM] [Info] Number of data points in the train set: 859536, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079118 -> initscore=-2.454388
[LightGBM] [Info] Start training from score -2.454388
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.25219	valid_1's binary_logloss: 0.253667
[20]	training's binary_logloss: 0.242007	valid_1's binary_logloss: 0.24444
[30]	training's binary_logloss: 0.235206	valid_1's binary_logloss: 0.238477
[40]	training's binary_logloss: 0.230601	valid_1's binary_logloss: 0.234745
[50]	training's binary_logloss: 0.226551	valid_1's binary_logloss: 0.231627
[

[I 2024-11-25 11:53:31,470] Trial 6 finished with value: 0.879280970785176 and parameters: {'learning_rate': 0.09689346156074638, 'num_leaves': 172, 'max_depth': 11, 'min_child_samples': 15, 'subsample': 0.7931268398582481, 'colsample_bytree': 0.5834649136402668, 'reg_alpha': 0.0013301664574589375, 'reg_lambda': 0.0003337642989198579}. Best is trial 6 with value: 0.879280970785176.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1e-1)


[LightGBM] [Info] Number of positive: 68005, number of negative: 791531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012343 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2520
[LightGBM] [Info] Number of data points in the train set: 859536, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079118 -> initscore=-2.454388
[LightGBM] [Info] Start training from score -2.454388
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.265788	valid_1's binary_logloss: 0.266334
[20]	training's binary_logloss: 0.259909	valid_1's binary_logloss: 0.260768
[30]	training's binary_logloss: 0.256251	valid_1's binary_logloss: 0.2574
[40]	training's binary_logloss: 0.253976	valid_1's binary_logloss: 0.255358
[50]	training's binary_logloss: 0.252085	valid_1's binary_logloss: 0.253568
[

[I 2024-11-25 11:54:00,248] Trial 7 finished with value: 0.7613879910436779 and parameters: {'learning_rate': 0.04940547644192594, 'num_leaves': 146, 'max_depth': 7, 'min_child_samples': 34, 'subsample': 0.6023185053209106, 'colsample_bytree': 0.5693761122842268, 'reg_alpha': 0.005137046685268728, 'reg_lambda': 0.0006560138178960562}. Best is trial 6 with value: 0.879280970785176.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1e-1)


[LightGBM] [Info] Number of positive: 68005, number of negative: 791531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017490 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2520
[LightGBM] [Info] Number of data points in the train set: 859536, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079118 -> initscore=-2.454388
[LightGBM] [Info] Start training from score -2.454388
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.274649	valid_1's binary_logloss: 0.274709
[20]	training's binary_logloss: 0.273224	valid_1's binary_logloss: 0.273326
[30]	training's binary_logloss: 0.271981	valid_1's binary_logloss: 0.272124
[40]	training's binary_logloss: 0.270992	valid_1's binary_logloss: 0.271169
[50]	training's binary_logloss: 0.270205	valid_1's binary_logloss: 0.270428

[I 2024-11-25 11:54:17,730] Trial 8 finished with value: 0.6728155345512631 and parameters: {'learning_rate': 0.014472863684713283, 'num_leaves': 168, 'max_depth': 3, 'min_child_samples': 5, 'subsample': 0.5399052618094062, 'colsample_bytree': 0.900714295766291, 'reg_alpha': 5.030103363636753e-05, 'reg_lambda': 0.0006799429595851864}. Best is trial 6 with value: 0.879280970785176.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1e-1)


[LightGBM] [Info] Number of positive: 68005, number of negative: 791531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015087 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2520
[LightGBM] [Info] Number of data points in the train set: 859536, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079118 -> initscore=-2.454388
[LightGBM] [Info] Start training from score -2.454388
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.275615	valid_1's binary_logloss: 0.27566
[20]	training's binary_logloss: 0.274616	valid_1's binary_logloss: 0.274715
[30]	training's binary_logloss: 0.273751	valid_1's binary_logloss: 0.273903
[40]	training's binary_logloss: 0.272995	valid_1's binary_logloss: 0.273194
[50]	training's binary_logloss: 0.272268	valid_1's binary_logloss: 0.272503


[I 2024-11-25 11:54:48,905] Trial 9 finished with value: 0.7031943866637798 and parameters: {'learning_rate': 0.003375845978767764, 'num_leaves': 100, 'max_depth': 7, 'min_child_samples': 7, 'subsample': 0.8743085771574042, 'colsample_bytree': 0.5258810434904266, 'reg_alpha': 2.943576753807502e-05, 'reg_lambda': 0.019559328670325733}. Best is trial 6 with value: 0.879280970785176.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1e-1)


[LightGBM] [Info] Number of positive: 68005, number of negative: 791531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013034 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2520
[LightGBM] [Info] Number of data points in the train set: 859536, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079118 -> initscore=-2.454388
[LightGBM] [Info] Start training from score -2.454388
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.275865	valid_1's binary_logloss: 0.275906
[20]	training's binary_logloss: 0.275111	valid_1's binary_logloss: 0.275196
[30]	training's binary_logloss: 0.274361	valid_1's binary_logloss: 0.274489
[40]	training's binary_logloss: 0.273769	valid_1's binary_logloss: 0.273931
[50]	training's binary_logloss: 0.273128	valid_1's binary_logloss: 0.273327

[I 2024-11-25 11:55:32,936] Trial 10 finished with value: 0.7258638643661331 and parameters: {'learning_rate': 0.0013592879950556018, 'num_leaves': 195, 'max_depth': 12, 'min_child_samples': 17, 'subsample': 0.7504712511158311, 'colsample_bytree': 0.6607723458621022, 'reg_alpha': 0.0016491325213455574, 'reg_lambda': 1.1987078190516416e-05}. Best is trial 6 with value: 0.879280970785176.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1e-1)


[LightGBM] [Info] Number of positive: 68005, number of negative: 791531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015778 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2520
[LightGBM] [Info] Number of data points in the train set: 859536, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079118 -> initscore=-2.454388
[LightGBM] [Info] Start training from score -2.454388
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.268848	valid_1's binary_logloss: 0.269119
[20]	training's binary_logloss: 0.2645	valid_1's binary_logloss: 0.264894
[30]	training's binary_logloss: 0.2616	valid_1's binary_logloss: 0.262065
[40]	training's binary_logloss: 0.25948	valid_1's binary_logloss: 0.260027
[50]	training's binary_logloss: 0.257781	valid_1's binary_logloss: 0.258406
[60]

[I 2024-11-25 11:56:00,098] Trial 11 finished with value: 0.7362372022408714 and parameters: {'learning_rate': 0.03390169559749436, 'num_leaves': 31, 'max_depth': 12, 'min_child_samples': 19, 'subsample': 0.9919602178864488, 'colsample_bytree': 0.7320388600163525, 'reg_alpha': 0.06643877001481793, 'reg_lambda': 0.0780303629944185}. Best is trial 6 with value: 0.879280970785176.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1e-1)


[LightGBM] [Info] Number of positive: 68005, number of negative: 791531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016727 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2520
[LightGBM] [Info] Number of data points in the train set: 859536, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079118 -> initscore=-2.454388
[LightGBM] [Info] Start training from score -2.454388
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.252162	valid_1's binary_logloss: 0.253799
[20]	training's binary_logloss: 0.24263	valid_1's binary_logloss: 0.245394
[30]	training's binary_logloss: 0.2363	valid_1's binary_logloss: 0.239892
[40]	training's binary_logloss: 0.23183	valid_1's binary_logloss: 0.236295
[50]	training's binary_logloss: 0.227951	valid_1's binary_logloss: 0.233172
[60

[I 2024-11-25 11:56:36,899] Trial 12 finished with value: 0.8793802672703604 and parameters: {'learning_rate': 0.09344214729189412, 'num_leaves': 199, 'max_depth': 10, 'min_child_samples': 18, 'subsample': 0.8290221790792274, 'colsample_bytree': 0.6212979971938778, 'reg_alpha': 0.010845298775271834, 'reg_lambda': 0.003652053461133849}. Best is trial 12 with value: 0.8793802672703604.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1e-1)


[LightGBM] [Info] Number of positive: 68005, number of negative: 791531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015554 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2520
[LightGBM] [Info] Number of data points in the train set: 859536, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079118 -> initscore=-2.454388
[LightGBM] [Info] Start training from score -2.454388
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.252006	valid_1's binary_logloss: 0.253552
[20]	training's binary_logloss: 0.241951	valid_1's binary_logloss: 0.244455
[30]	training's binary_logloss: 0.235726	valid_1's binary_logloss: 0.239178
[40]	training's binary_logloss: 0.231328	valid_1's binary_logloss: 0.235701
[50]	training's binary_logloss: 0.227599	valid_1's binary_logloss: 0.232689

[I 2024-11-25 11:57:12,836] Trial 13 finished with value: 0.8815039720776598 and parameters: {'learning_rate': 0.09714611115203126, 'num_leaves': 197, 'max_depth': 10, 'min_child_samples': 19, 'subsample': 0.8056146977133829, 'colsample_bytree': 0.6074715093571618, 'reg_alpha': 0.008195027908044309, 'reg_lambda': 0.0034609834746299646}. Best is trial 13 with value: 0.8815039720776598.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1e-1)


[LightGBM] [Info] Number of positive: 68005, number of negative: 791531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014243 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2520
[LightGBM] [Info] Number of data points in the train set: 859536, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079118 -> initscore=-2.454388
[LightGBM] [Info] Start training from score -2.454388
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.266368	valid_1's binary_logloss: 0.266935
[20]	training's binary_logloss: 0.259746	valid_1's binary_logloss: 0.260721
[30]	training's binary_logloss: 0.254667	valid_1's binary_logloss: 0.25596
[40]	training's binary_logloss: 0.251243	valid_1's binary_logloss: 0.252846
[50]	training's binary_logloss: 0.248296	valid_1's binary_logloss: 0.25017
[

[I 2024-11-25 11:57:56,020] Trial 14 finished with value: 0.7891129653537847 and parameters: {'learning_rate': 0.025579263773729703, 'num_leaves': 200, 'max_depth': 10, 'min_child_samples': 22, 'subsample': 0.8256097969157006, 'colsample_bytree': 0.6435240549481749, 'reg_alpha': 0.008973730865778393, 'reg_lambda': 0.004414825671715938}. Best is trial 13 with value: 0.8815039720776598.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1e-1)


[LightGBM] [Info] Number of positive: 68005, number of negative: 791531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013895 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2520
[LightGBM] [Info] Number of data points in the train set: 859536, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079118 -> initscore=-2.454388
[LightGBM] [Info] Start training from score -2.454388
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.257094	valid_1's binary_logloss: 0.25815
[20]	training's binary_logloss: 0.249086	valid_1's binary_logloss: 0.250849
[30]	training's binary_logloss: 0.244481	valid_1's binary_logloss: 0.246751
[40]	training's binary_logloss: 0.241174	valid_1's binary_logloss: 0.243804
[50]	training's binary_logloss: 0.238756	valid_1's binary_logloss: 0.241755


[I 2024-11-25 11:58:32,527] Trial 15 finished with value: 0.8367418035825696 and parameters: {'learning_rate': 0.06784924692042861, 'num_leaves': 174, 'max_depth': 9, 'min_child_samples': 25, 'subsample': 0.7206699214889751, 'colsample_bytree': 0.7374755630286719, 'reg_alpha': 0.00955675830361487, 'reg_lambda': 0.004402095918388465}. Best is trial 13 with value: 0.8815039720776598.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1e-1)


[LightGBM] [Info] Number of positive: 68005, number of negative: 791531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014826 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2520
[LightGBM] [Info] Number of data points in the train set: 859536, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079118 -> initscore=-2.454388
[LightGBM] [Info] Start training from score -2.454388
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.269501	valid_1's binary_logloss: 0.269891
[20]	training's binary_logloss: 0.264528	valid_1's binary_logloss: 0.265209
[30]	training's binary_logloss: 0.260407	valid_1's binary_logloss: 0.261305
[40]	training's binary_logloss: 0.257476	valid_1's binary_logloss: 0.258581
[50]	training's binary_logloss: 0.254767	valid_1's binary_logloss: 0.256049

[I 2024-11-25 11:59:17,932] Trial 16 finished with value: 0.7679143031599105 and parameters: {'learning_rate': 0.018281305230184983, 'num_leaves': 156, 'max_depth': 10, 'min_child_samples': 12, 'subsample': 0.857100059757424, 'colsample_bytree': 0.6068374021816507, 'reg_alpha': 0.018757835815538988, 'reg_lambda': 0.01888759935318131}. Best is trial 13 with value: 0.8815039720776598.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1e-1)


[LightGBM] [Info] Number of positive: 68005, number of negative: 791531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017052 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2520
[LightGBM] [Info] Number of data points in the train set: 859536, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079118 -> initscore=-2.454388
[LightGBM] [Info] Start training from score -2.454388
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.260369	valid_1's binary_logloss: 0.261195
[20]	training's binary_logloss: 0.252267	valid_1's binary_logloss: 0.253772
[30]	training's binary_logloss: 0.247477	valid_1's binary_logloss: 0.249494
[40]	training's binary_logloss: 0.244544	valid_1's binary_logloss: 0.246917
[50]	training's binary_logloss: 0.242042	valid_1's binary_logloss: 0.244771

[I 2024-11-25 11:59:53,620] Trial 17 finished with value: 0.8187057711813954 and parameters: {'learning_rate': 0.04844005169193445, 'num_leaves': 186, 'max_depth': 9, 'min_child_samples': 12, 'subsample': 0.7014816652669483, 'colsample_bytree': 0.7942489317087049, 'reg_alpha': 0.00035824122967203977, 'reg_lambda': 0.0019943869050283616}. Best is trial 13 with value: 0.8815039720776598.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1e-1)


[LightGBM] [Info] Number of positive: 68005, number of negative: 791531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016251 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2520
[LightGBM] [Info] Number of data points in the train set: 859536, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079118 -> initscore=-2.454388
[LightGBM] [Info] Start training from score -2.454388
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.276204	valid_1's binary_logloss: 0.276216
[20]	training's binary_logloss: 0.275793	valid_1's binary_logloss: 0.275823
[30]	training's binary_logloss: 0.275384	valid_1's binary_logloss: 0.275433
[40]	training's binary_logloss: 0.27505	valid_1's binary_logloss: 0.275113
[50]	training's binary_logloss: 0.274691	valid_1's binary_logloss: 0.274769


[I 2024-11-25 12:00:33,871] Trial 18 finished with value: 0.7001289167162472 and parameters: {'learning_rate': 0.0010218580464473551, 'num_leaves': 74, 'max_depth': 11, 'min_child_samples': 24, 'subsample': 0.7684372828941594, 'colsample_bytree': 0.6705147280152481, 'reg_alpha': 0.0035469911058059285, 'reg_lambda': 0.01578572534758881}. Best is trial 13 with value: 0.8815039720776598.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1e-1)


[LightGBM] [Info] Number of positive: 68005, number of negative: 791531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014470 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2520
[LightGBM] [Info] Number of data points in the train set: 859536, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079118 -> initscore=-2.454388
[LightGBM] [Info] Start training from score -2.454388
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.255116	valid_1's binary_logloss: 0.256618
[20]	training's binary_logloss: 0.245983	valid_1's binary_logloss: 0.248375
[30]	training's binary_logloss: 0.240292	valid_1's binary_logloss: 0.243425
[40]	training's binary_logloss: 0.236676	valid_1's binary_logloss: 0.240569
[50]	training's binary_logloss: 0.233542	valid_1's binary_logloss: 0.238172

[I 2024-11-25 12:01:13,469] Trial 19 finished with value: 0.8569714383798873 and parameters: {'learning_rate': 0.09690539814475885, 'num_leaves': 182, 'max_depth': 9, 'min_child_samples': 10, 'subsample': 0.6509399787100139, 'colsample_bytree': 0.5535115302436924, 'reg_alpha': 0.018997338528662786, 'reg_lambda': 0.009500433218146656}. Best is trial 13 with value: 0.8815039720776598.


Best AUC score: 0.8815039720776598
Best parameters: {'learning_rate': 0.09714611115203126, 'num_leaves': 197, 'max_depth': 10, 'min_child_samples': 19, 'subsample': 0.8056146977133829, 'colsample_bytree': 0.6074715093571618, 'reg_alpha': 0.008195027908044309, 'reg_lambda': 0.0034609834746299646}
[LightGBM] [Info] Number of positive: 85006, number of negative: 989414
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018950 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2518
[LightGBM] [Info] Number of data points in the train set: 1074420, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079118 -> initscore=-2.454391
[LightGBM] [Info] Start training from score -2.454391
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.252195	valid_1's binary_logloss: 0.258883
[20]	

### Using LGM Boost with optuna, and max trails of 20, AUC Score is 72.9%. But ranking is still 50% on competition. 

In [176]:
# Get the feature names used during training
train_features = final_model2.feature_name_

# Align test dataset to match training features
missing_cols = set(train_features) - set(test_combined.columns)
extra_cols = set(test_combined.columns) - set(train_features)

# Add missing columns with default values (e.g., 0)
for col in missing_cols:
    test_combined[col] = 0

# Drop extra columns not present during training
test_combined = test_combined.drop(columns=extra_cols)

# Ensure the column order matches the training dataset
test_combined = test_combined[train_features]

# Make predictions
preds = final_model2.predict(test_combined)  # LightGBM handles this directly

# Convert predictions to a DataFrame
preds_df = pd.DataFrame(preds, columns=['target'])

# Display predictions
print(preds_df.head())


   target
0     0.0
1     0.0
2     0.0
3     0.0
4     0.0


In [177]:
submission = pd.DataFrame(
    {
        'client_id': sample_submission['client_id'],
        'target': preds  # Use the predictions directly
    }
)

print(submission.head())


          client_id  target
0     test_Client_0     0.0
1     test_Client_1     0.0
2    test_Client_10     0.0
3   test_Client_100     0.0
4  test_Client_1000     0.0


In [178]:
submission.to_csv(f'{OUTPUT_DIR}/submissionv13-lgmboost.csv', index=False)

In [101]:
###updates params and reduced # of trails to 10.

from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import numpy as np
import optuna

# Define the objective function for Optuna optimization
def objective(trial):
    # Define the parameter search space
    param = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'n_estimators': 200,
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'num_leaves': trial.suggest_int('num_leaves', 2, 200),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 50),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1e-1)
    }

    # K-fold Cross-Validation setup
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = []

    # Loop over each fold in the cross-validation
    for train_index, val_index in kf.split(X_train, y_train):
        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

        # Train the model with current parameters
        model = LGBMClassifier(**param)

        model.fit(
            X_train_fold, y_train_fold,
            eval_set=[(X_train_fold, y_train_fold), (X_val_fold, y_val_fold)],
            eval_metric='logloss',
            callbacks=[
                early_stopping(stopping_rounds=10),  # Early stopping via callback
                log_evaluation(period=10)  # Log evaluation info
            ]
        )

        # Predict and evaluate on the validation set
        y_pred_prob = model.predict_proba(X_val_fold)[:, 1]  # Predicted probabilities
        auc_score = roc_auc_score(y_val_fold, y_pred_prob)
        fold_scores.append(auc_score)

    # Return the average AUC score over the folds
    return np.mean(fold_scores)

# Create the Optuna study
study = optuna.create_study(direction='maximize')  # Maximize AUC score
study.optimize(objective, n_trials=10)  # Number of trials to run

# Print the best parameters and best score
print(f"Best AUC score: {study.best_value}")
print(f"Best parameters: {study.best_params}")

# Train the final model with the best parameters found by Optuna
best_params = study.best_params
final_model = LGBMClassifier(**best_params)

final_model.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    eval_metric='logloss',
    callbacks=[
        early_stopping(stopping_rounds=10),
        log_evaluation(period=10)
    ]
)

# Evaluate final model on test set using AUC
y_pred_prob_final = final_model.predict_proba(X_test)[:, 1]  # Predicted probabilities
final_auc_score = roc_auc_score(y_test, y_pred_prob_final)
print(f"Final Test AUC Score: {final_auc_score}")

[I 2024-11-25 10:40:20,878] A new study created in memory with name: no-name-d10d54c1-579b-4714-9a40-0014869088b5
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1e-1)


[LightGBM] [Info] Number of positive: 68005, number of negative: 791531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015351 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2520
[LightGBM] [Info] Number of data points in the train set: 859536, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079118 -> initscore=-2.454388
[LightGBM] [Info] Start training from score -2.454388
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.275143	valid_1's binary_logloss: 0.275224
[20]	training's binary_logloss: 0.273873	valid_1's binary_logloss: 0.274028
[30]	training's binary_logloss: 0.272723	valid_1's binary_logloss: 0.272937
[40]	training's binary_logloss: 0.271709	valid_1's binary_logloss: 0.271973
[50]	training's binary_logloss: 0.270815	valid_1's binary_logloss: 0.271122

[I 2024-11-25 10:40:52,901] Trial 0 finished with value: 0.6942086553912115 and parameters: {'learning_rate': 0.0034776315805219392, 'num_leaves': 64, 'max_depth': 8, 'min_child_samples': 19, 'subsample': 0.8044383168060025, 'colsample_bytree': 0.9546404277587586, 'reg_alpha': 0.06751882239610238, 'reg_lambda': 0.00021341028719155114}. Best is trial 0 with value: 0.6942086553912115.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1e-1)


[LightGBM] [Info] Number of positive: 68005, number of negative: 791531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012368 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2520
[LightGBM] [Info] Number of data points in the train set: 859536, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079118 -> initscore=-2.454388
[LightGBM] [Info] Start training from score -2.454388
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.276288	valid_1's binary_logloss: 0.276298
[20]	training's binary_logloss: 0.275948	valid_1's binary_logloss: 0.275976
[30]	training's binary_logloss: 0.275615	valid_1's binary_logloss: 0.27566
[40]	training's binary_logloss: 0.275341	valid_1's binary_logloss: 0.275397
[50]	training's binary_logloss: 0.275041	valid_1's binary_logloss: 0.27511
[

[I 2024-11-25 10:41:18,713] Trial 1 finished with value: 0.6799628821772247 and parameters: {'learning_rate': 0.0011814615286072712, 'num_leaves': 43, 'max_depth': 6, 'min_child_samples': 24, 'subsample': 0.5997455350347386, 'colsample_bytree': 0.6484898048045116, 'reg_alpha': 0.0005718055802582659, 'reg_lambda': 0.0001818165388850178}. Best is trial 0 with value: 0.6942086553912115.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1e-1)


[LightGBM] [Info] Number of positive: 68005, number of negative: 791531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013598 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2520
[LightGBM] [Info] Number of data points in the train set: 859536, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079118 -> initscore=-2.454388
[LightGBM] [Info] Start training from score -2.454388
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.271345	valid_1's binary_logloss: 0.271488
[20]	training's binary_logloss: 0.268653	valid_1's binary_logloss: 0.268893
[30]	training's binary_logloss: 0.26695	valid_1's binary_logloss: 0.26723
[40]	training's binary_logloss: 0.265745	valid_1's binary_logloss: 0.266053
[50]	training's binary_logloss: 0.264888	valid_1's binary_logloss: 0.265228
[

[I 2024-11-25 10:41:34,120] Trial 2 finished with value: 0.6851079442803436 and parameters: {'learning_rate': 0.09271571530614423, 'num_leaves': 31, 'max_depth': 2, 'min_child_samples': 10, 'subsample': 0.6349777585646299, 'colsample_bytree': 0.7243507100475126, 'reg_alpha': 0.004572743116816374, 'reg_lambda': 0.09977583565075406}. Best is trial 0 with value: 0.6942086553912115.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1e-1)


[LightGBM] [Info] Number of positive: 68005, number of negative: 791531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017162 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2520
[LightGBM] [Info] Number of data points in the train set: 859536, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079118 -> initscore=-2.454388
[LightGBM] [Info] Start training from score -2.454388
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.275417	valid_1's binary_logloss: 0.275488
[20]	training's binary_logloss: 0.274226	valid_1's binary_logloss: 0.274368
[30]	training's binary_logloss: 0.273146	valid_1's binary_logloss: 0.273352
[40]	training's binary_logloss: 0.272136	valid_1's binary_logloss: 0.272398
[50]	training's binary_logloss: 0.271248	valid_1's binary_logloss: 0.271565

[I 2024-11-25 10:42:13,251] Trial 3 finished with value: 0.713564646712876 and parameters: {'learning_rate': 0.0023909186663532765, 'num_leaves': 168, 'max_depth': 9, 'min_child_samples': 28, 'subsample': 0.5478908571353152, 'colsample_bytree': 0.8061310864483842, 'reg_alpha': 9.62425315816552e-05, 'reg_lambda': 0.014828652611739757}. Best is trial 3 with value: 0.713564646712876.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1e-1)


[LightGBM] [Info] Number of positive: 68005, number of negative: 791531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014354 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2520
[LightGBM] [Info] Number of data points in the train set: 859536, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079118 -> initscore=-2.454388
[LightGBM] [Info] Start training from score -2.454388
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.276356	valid_1's binary_logloss: 0.276362
[20]	training's binary_logloss: 0.276098	valid_1's binary_logloss: 0.276112
[30]	training's binary_logloss: 0.275841	valid_1's binary_logloss: 0.275865
[40]	training's binary_logloss: 0.275632	valid_1's binary_logloss: 0.275663
[50]	training's binary_logloss: 0.275416	valid_1's binary_logloss: 0.275452

[I 2024-11-25 10:42:38,952] Trial 4 finished with value: 0.6739624877885584 and parameters: {'learning_rate': 0.0012612464241653115, 'num_leaves': 14, 'max_depth': 7, 'min_child_samples': 6, 'subsample': 0.9687722277736853, 'colsample_bytree': 0.5924951736997878, 'reg_alpha': 0.00033100706838119694, 'reg_lambda': 2.921334241798492e-05}. Best is trial 3 with value: 0.713564646712876.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1e-1)


[LightGBM] [Info] Number of positive: 68005, number of negative: 791531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020201 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2520
[LightGBM] [Info] Number of data points in the train set: 859536, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079118 -> initscore=-2.454388
[LightGBM] [Info] Start training from score -2.454388
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.266823	valid_1's binary_logloss: 0.267112
[20]	training's binary_logloss: 0.263055	valid_1's binary_logloss: 0.263517
[30]	training's binary_logloss: 0.260966	valid_1's binary_logloss: 0.26147
[40]	training's binary_logloss: 0.259611	valid_1's binary_logloss: 0.26015
[50]	training's binary_logloss: 0.258441	valid_1's binary_logloss: 0.259038
[

[I 2024-11-25 10:42:58,729] Trial 5 finished with value: 0.7265218483348646 and parameters: {'learning_rate': 0.09344182773162826, 'num_leaves': 100, 'max_depth': 4, 'min_child_samples': 28, 'subsample': 0.9207979129270363, 'colsample_bytree': 0.8915951206266115, 'reg_alpha': 0.00019457823380594798, 'reg_lambda': 5.714624589910493e-05}. Best is trial 5 with value: 0.7265218483348646.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1e-1)


[LightGBM] [Info] Number of positive: 68005, number of negative: 791531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017100 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2520
[LightGBM] [Info] Number of data points in the train set: 859536, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079118 -> initscore=-2.454388
[LightGBM] [Info] Start training from score -2.454388
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.274409	valid_1's binary_logloss: 0.274514
[20]	training's binary_logloss: 0.272495	valid_1's binary_logloss: 0.272701
[30]	training's binary_logloss: 0.270943	valid_1's binary_logloss: 0.27122
[40]	training's binary_logloss: 0.269578	valid_1's binary_logloss: 0.269916
[50]	training's binary_logloss: 0.268435	valid_1's binary_logloss: 0.268819


[I 2024-11-25 10:43:25,845] Trial 6 finished with value: 0.691553522606373 and parameters: {'learning_rate': 0.00742522163016228, 'num_leaves': 80, 'max_depth': 6, 'min_child_samples': 31, 'subsample': 0.6875469349171721, 'colsample_bytree': 0.8473347824724207, 'reg_alpha': 0.00014589701575258693, 'reg_lambda': 0.005784634640251468}. Best is trial 5 with value: 0.7265218483348646.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1e-1)


[LightGBM] [Info] Number of positive: 68005, number of negative: 791531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016648 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2520
[LightGBM] [Info] Number of data points in the train set: 859536, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079118 -> initscore=-2.454388
[LightGBM] [Info] Start training from score -2.454388
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.276394	valid_1's binary_logloss: 0.276401
[20]	training's binary_logloss: 0.276165	valid_1's binary_logloss: 0.276183
[30]	training's binary_logloss: 0.275946	valid_1's binary_logloss: 0.275973
[40]	training's binary_logloss: 0.275738	valid_1's binary_logloss: 0.275774
[50]	training's binary_logloss: 0.275551	valid_1's binary_logloss: 0.275594

[I 2024-11-25 10:43:45,662] Trial 7 finished with value: 0.6552651987558594 and parameters: {'learning_rate': 0.001078202948832305, 'num_leaves': 12, 'max_depth': 4, 'min_child_samples': 42, 'subsample': 0.5914729644574888, 'colsample_bytree': 0.8360120262828933, 'reg_alpha': 3.3853408055458166e-05, 'reg_lambda': 1.2569717556404685e-05}. Best is trial 5 with value: 0.7265218483348646.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1e-1)


[LightGBM] [Info] Number of positive: 68005, number of negative: 791531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014895 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2520
[LightGBM] [Info] Number of data points in the train set: 859536, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079118 -> initscore=-2.454388
[LightGBM] [Info] Start training from score -2.454388
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.272402	valid_1's binary_logloss: 0.272516
[20]	training's binary_logloss: 0.270146	valid_1's binary_logloss: 0.270342
[30]	training's binary_logloss: 0.268462	valid_1's binary_logloss: 0.268724
[40]	training's binary_logloss: 0.267222	valid_1's binary_logloss: 0.267518
[50]	training's binary_logloss: 0.26632	valid_1's binary_logloss: 0.266634


[I 2024-11-25 10:44:03,075] Trial 8 finished with value: 0.6801283832683609 and parameters: {'learning_rate': 0.0662673540237513, 'num_leaves': 18, 'max_depth': 2, 'min_child_samples': 27, 'subsample': 0.8104242052003529, 'colsample_bytree': 0.5873539681802973, 'reg_alpha': 0.0006915460241178857, 'reg_lambda': 1.609797961285384e-05}. Best is trial 5 with value: 0.7265218483348646.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 1e-1)


[LightGBM] [Info] Number of positive: 68005, number of negative: 791531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015405 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2520
[LightGBM] [Info] Number of data points in the train set: 859536, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079118 -> initscore=-2.454388
[LightGBM] [Info] Start training from score -2.454388
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.276214	valid_1's binary_logloss: 0.27623
[20]	training's binary_logloss: 0.275814	valid_1's binary_logloss: 0.275848
[30]	training's binary_logloss: 0.275445	valid_1's binary_logloss: 0.275496
[40]	training's binary_logloss: 0.275134	valid_1's binary_logloss: 0.275198
[50]	training's binary_logloss: 0.274812	valid_1's binary_logloss: 0.274888


[I 2024-11-25 10:44:23,461] Trial 9 finished with value: 0.6598448321215591 and parameters: {'learning_rate': 0.002008823813697954, 'num_leaves': 16, 'max_depth': 4, 'min_child_samples': 41, 'subsample': 0.5740596602147194, 'colsample_bytree': 0.695947652254199, 'reg_alpha': 0.004864927790028652, 'reg_lambda': 0.0005610694773178897}. Best is trial 5 with value: 0.7265218483348646.


Best AUC score: 0.7265218483348646
Best parameters: {'learning_rate': 0.09344182773162826, 'num_leaves': 100, 'max_depth': 4, 'min_child_samples': 28, 'subsample': 0.9207979129270363, 'colsample_bytree': 0.8915951206266115, 'reg_alpha': 0.00019457823380594798, 'reg_lambda': 5.714624589910493e-05}
[LightGBM] [Info] Number of positive: 85006, number of negative: 989414
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021503 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2518
[LightGBM] [Info] Number of data points in the train set: 1074420, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079118 -> initscore=-2.454391
[LightGBM] [Info] Start training from score -2.454391
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.266724	valid_1's binary_logloss: 0.265991
[20]

### Updated Pamameters, LMGBoost with optuna, max trails =10 (to increase the speed of processing) 
### AUC Score: 69% (therefore less trails make it less accurate) . 

In [94]:
from lightgbm import early_stopping, log_evaluation
from sklearn.model_selection import StratifiedKFold, ParameterSampler
from sklearn.metrics import roc_auc_score
import numpy as np

# K-fold Cross-Validation setup
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

best_params = None
best_score = 0

# Loop over parameter combinations using ParameterSampler
for params in ParameterSampler(param_dist, n_iter=10, random_state=42):
    print(f"Training with parameters: {params}")
    
    fold_scores = []

    # K-fold Cross-Validation
    for train_index, val_index in kf.split(X_train, y_train):  # Using X_train and y_train directly
        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

        # Train the model with current parameters
        model = LGBMClassifier(
            objective='binary',
            boosting_type='gbdt',
            n_estimators=200,
            **params
        )

        # Early stopping via callbacks
        model.fit(
            X_train_fold, y_train_fold,
            eval_set=[(X_train_fold, y_train_fold), (X_val_fold, y_val_fold)],
            eval_metric='logloss',
            callbacks=[
                early_stopping(stopping_rounds=10),  # Early stopping
                log_evaluation(period=10)  # Controls verbosity
            ]
        )

        # Predict and evaluate on the validation set
        y_pred_prob = model.predict_proba(X_val_fold)[:, 1]  # Predicted probabilities
        auc_score = roc_auc_score(y_val_fold, y_pred_prob)
        fold_scores.append(auc_score)

    avg_auc_score = np.mean(fold_scores)
    print(f"Average AUC Score across folds: {avg_auc_score}")

    if avg_auc_score > best_score:
        best_score = avg_auc_score
        best_params = params

print("\nBest Parameters:", best_params)
print("Best AUC Score:", best_score)

# Train final model on full training set with best parameters
final_model = LGBMClassifier(
    objective='binary',
    boosting_type='gbdt',
    n_estimators=200,
    **best_params
)

final_model.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    eval_metric='logloss',
    callbacks=[
        early_stopping(stopping_rounds=10),
        log_evaluation(period=10)
    ]
)

# Evaluate final model on test set using AUC
y_pred_prob_final = final_model.predict_proba(X_test)[:, 1]  # Predicted probabilities
final_auc_score = roc_auc_score(y_test, y_pred_prob_final)
print("Final Test AUC Score:", final_auc_score)


Training with parameters: {'subsample': 1, 'reg_lambda': 0.2, 'reg_alpha': 0, 'num_leaves': 31, 'min_child_samples': 30, 'max_depth': 10, 'learning_rate': 0.01, 'colsample_bytree': 1}
[LightGBM] [Info] Number of positive: 68005, number of negative: 791531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019105 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2520
[LightGBM] [Info] Number of data points in the train set: 859536, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079118 -> initscore=-2.454388
[LightGBM] [Info] Start training from score -2.454388
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.273486	valid_1's binary_logloss: 0.273586
[20]	training's binary_logloss: 0.271162	valid_1's binary_logloss: 0.271335
[30]	training's binary_logloss: 0.269328	va

### When using LGMBoost with Cross Validation the final test AUC Score is 70%. This resulted in a 50% score on the leaderboard. 

In [95]:
# Get the feature names used during training
train_features = final_model.feature_name_

# Align test dataset to match training features
missing_cols = set(train_features) - set(test_combined.columns)
extra_cols = set(test_combined.columns) - set(train_features)

# Add missing columns with default values (e.g., 0)
for col in missing_cols:
    test_combined[col] = 0

# Drop extra columns not present during training
test_combined = test_combined.drop(columns=extra_cols)

# Ensure the column order matches the training dataset
test_combined = test_combined[train_features]

# Make predictions
preds = final_model.predict(test_combined)  # LightGBM handles this directly

# Convert predictions to a DataFrame
preds_df = pd.DataFrame(preds, columns=['target'])

# Display predictions
print(preds_df.head())


   target
0     0.0
1     0.0
2     0.0
3     0.0
4     0.0


In [96]:
submission = pd.DataFrame(
    {
        'client_id': sample_submission['client_id'],
        'target': preds  # Use the predictions directly
    }
)

print(submission.head())


          client_id  target
0     test_Client_0     0.0
1     test_Client_1     0.0
2    test_Client_10     0.0
3   test_Client_100     0.0
4  test_Client_1000     0.0


In [97]:
submission.to_csv(f'{OUTPUT_DIR}/submissionv11-lgmboost.csv', index=False)

In [484]:
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, train_test_split, ParameterSampler
from sklearn.metrics import roc_auc_score
import numpy as np

# Example param_dist with a wider range of hyperparameters
param_dist = {
    'max_depth': [7,10],
    'min_child_weight': [0.5,1],
    'gamma': [0.3,0.5],
    'subsample': [0.7,1],
    'colsample_bytree': [0.8,1],
    'learning_rate': [0.01],
    'alpha': [0],
    'lambda': [0.2],
}

# Split the data into training and evaluation sets
X_train_split, X_eval, y_train_split, y_eval = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# K-fold Cross-Validation setup
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

best_params = None
best_score = 0

# Loop over parameter combinations using ParameterSampler
for params in ParameterSampler(param_dist, n_iter=10, random_state=42):  # Increased n_iter
    print(f"Training with parameters: {params}")
    
    # Add parameters specific to xgboost.train
    params['objective'] = 'binary:logistic'  # Adjust as per your task
    params['eval_metric'] = 'logloss'

    fold_scores = []

    # K-fold Cross-Validation
    for train_index, val_index in kf.split(X_train_split, y_train_split):
        X_train_fold, X_val_fold = X_train_split.iloc[train_index], X_train_split.iloc[val_index]
        y_train_fold, y_val_fold = y_train_split.iloc[train_index], y_train_split.iloc[val_index]

        # Convert data to DMatrix for compatibility with xgboost.train
        dtrain = xgb.DMatrix(X_train_fold, label=y_train_fold)
        deval = xgb.DMatrix(X_val_fold, label=y_val_fold)

        # Train the model with more boosting rounds
        model = xgb.train(
            params,
            dtrain,
            num_boost_round=200,  # Increased number of boosting rounds
            evals=[(dtrain, 'train'), (deval, 'eval')],
            early_stopping_rounds=10,
            verbose_eval=False,
        )

        # Predict and evaluate on the validation set
        y_pred_prob = model.predict(deval)  # Predicted probabilities
        auc_score = roc_auc_score(y_val_fold, y_pred_prob)
        fold_scores.append(auc_score)

    avg_auc_score = np.mean(fold_scores)
    print(f"Average AUC Score across folds: {avg_auc_score}")

    if avg_auc_score > best_score:
        best_score = avg_auc_score
        best_params = params

print("\nBest Parameters:", best_params)
print("Best AUC Score:", best_score)

# Train final model on full training set with best parameters
dtrain_full = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Train final model with early stopping based on validation set
final_model = xgb.train(
    best_params,
    dtrain_full,
    num_boost_round=200,  # Increased boosting rounds
    evals=[(dtrain_full, 'train'), (dtest, 'test')],
    early_stopping_rounds=10,
    verbose_eval=True,
)

# Evaluate final model on test set using AUC
y_pred_prob_final = final_model.predict(dtest)  # Predicted probabilities
final_auc_score = roc_auc_score(y_test, y_pred_prob_final)
print("Final Test AUC Score:", final_auc_score)


Training with parameters: {'subsample': 1, 'min_child_weight': 0.5, 'max_depth': 10, 'learning_rate': 0.01, 'lambda': 0.2, 'gamma': 0.5, 'colsample_bytree': 1, 'alpha': 0}
Average AUC Score across folds: 0.7468275135911953
Training with parameters: {'subsample': 1, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.01, 'lambda': 0.2, 'gamma': 0.5, 'colsample_bytree': 0.8, 'alpha': 0}
Average AUC Score across folds: 0.754678720831255
Training with parameters: {'subsample': 0.7, 'min_child_weight': 0.5, 'max_depth': 7, 'learning_rate': 0.01, 'lambda': 0.2, 'gamma': 0.5, 'colsample_bytree': 1, 'alpha': 0}
Average AUC Score across folds: 0.7086390435567737
Training with parameters: {'subsample': 1, 'min_child_weight': 0.5, 'max_depth': 7, 'learning_rate': 0.01, 'lambda': 0.2, 'gamma': 0.3, 'colsample_bytree': 1, 'alpha': 0}
Average AUC Score across folds: 0.7057307375974535
Training with parameters: {'subsample': 0.7, 'min_child_weight': 0.5, 'max_depth': 7, 'learning_rate': 0.01, 

### Using LMGBoost with cross validation, the AUC Score was 72.28% with a 55% score. This is the best so far. 

In [485]:
# Get the feature names used during training
train_features = final_model.feature_names

# Align test dataset to match training features
missing_cols = set(train_features) - set(test_combined.columns)
extra_cols = set(test_combined.columns) - set(train_features)

# Add missing columns with default values (e.g., 0)
for col in missing_cols:
    test_combined[col] = 0

# Drop extra columns not present during training
test_combined = test_combined.drop(columns=extra_cols)

# Ensure the column order matches the training dataset
test_combined = test_combined[train_features]

# Create DMatrix for the test dataset
dtest = xgb.DMatrix(test_combined)

# Make predictions
preds = final_model.predict(dtest)

# Convert predictions to a DataFrame
preds_df = pd.DataFrame(preds, columns=['target'])

# Display predictions
print(preds_df.head())

     target
0  0.065487
1  0.314237
2  0.047359
3  0.139631
4  0.148195


In [486]:
submission = pd.DataFrame(
    {
        'client_id': sample_submission['client_id'],
        'target': preds  # Use the predictions directly
    }
)

print(submission.head())


          client_id    target
0     test_Client_0  0.065487
1     test_Client_1  0.314237
2    test_Client_10  0.047359
3   test_Client_100  0.139631
4  test_Client_1000  0.148195


In [487]:
submission.to_csv(f'{OUTPUT_DIR}/submissionv11-lgmboost.csv', index=False)

In [488]:
print(OUTPUT_DIR)
print(submission.head())  # Check the first few rows of the 'submission' DataFrame

/Users/alishasahota/Documents/Queens MMA Program/869 - Machine Learning & AI/Team Project/output
          client_id    target
0     test_Client_0  0.065487
1     test_Client_1  0.314237
2    test_Client_10  0.047359
3   test_Client_100  0.139631
4  test_Client_1000  0.148195
