In [1]:
import dask.dataframe as dd
import numpy as np
from IPython.display import display
from dask.distributed import Client, LocalCluster
import dask

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.svm import OneClassSVM
from sklearn.metrics import (confusion_matrix, precision_recall_curve, auc,
                             roc_curve, recall_score, classification_report, f1_score,
                             precision_recall_fscore_support)

In [3]:
# Set up a Dask Cluster
cluster = LocalCluster(n_workers=6, threads_per_worker=1, memory_limit='18GB')
client = Client(cluster)

print("client link:")
print(client.dashboard_link)  # Clickable link to the dashboard


client link:
http://127.0.0.1:8787/status


In [4]:
# setup Variables for filepaths
DATA_DIR="../../data"

MEMBERS_FILE=f"{DATA_DIR}/members_v3.csv"
TRANSACTION_FILE=f"{DATA_DIR}/transactions.csv"
TRAIN_FILE=f"{DATA_DIR}/train.csv"
USERLOG_FILE=f"{DATA_DIR}/user_logs.csv"
SAMPLE_SUBMISSION_FILE=f"{DATA_DIR}/sample_submission_zero.csv"

TRANSACTION_V2_FILE=f"{DATA_DIR}/transactions_v2.csv"
TRAIN_V2_FILE=f"{DATA_DIR}/train_v2.csv"
USER_LOGS_V2_FILE=f"{DATA_DIR}/user_logs_v2.csv"
SAMPLE_SUBMISSION_V2_FILE=f"{DATA_DIR}/sample_submission_v2.csv"

In [5]:
print(f"DATA_DIR: {DATA_DIR}")
print(f"TRANSACTION_FILE: {TRANSACTION_FILE}")
print(f"USERLOG_FILE: {USERLOG_FILE}")
print(f"TRAIN_FILE: {TRAIN_FILE}")
print(f"SAMPLE_SUBMISSION_FILE: {SAMPLE_SUBMISSION_FILE}")
print(f"MEMBERS_FILE: {MEMBERS_FILE}")
print()
print(f"TRANSACTION_V2_FILE: {TRANSACTION_V2_FILE}")
print(f"USER_LOGS_V2_FILE: {USER_LOGS_V2_FILE}")
print(f"TRAIN_V2_FILE: {TRAIN_V2_FILE}")
print(f"SAMPLE_SUBMISSION_V2_FILE: {SAMPLE_SUBMISSION_V2_FILE}")

DATA_DIR: ../../data
TRANSACTION_FILE: ../../data/transactions.csv
USERLOG_FILE: ../../data/user_logs.csv
TRAIN_FILE: ../../data/train.csv
SAMPLE_SUBMISSION_FILE: ../../data/sample_submission_zero.csv
MEMBERS_FILE: ../../data/members_v3.csv

TRANSACTION_V2_FILE: ../../data/transactions_v2.csv
USER_LOGS_V2_FILE: ../../data/user_logs_v2.csv
TRAIN_V2_FILE: ../../data/train_v2.csv
SAMPLE_SUBMISSION_V2_FILE: ../../data/sample_submission_v2.csv


# data prep

In [6]:
train = dd.read_csv(TRAIN_FILE)
# train = dd.concat((train, dd.read_csv(TRAIN_V2_FILE)), axis=0, ignore_index=True).reset_index(drop=True)
train = dd.concat([train, dd.read_csv(TRAIN_V2_FILE)])

test = dd.read_csv(SAMPLE_SUBMISSION_V2_FILE)

In [7]:
len(train)

1963891

In [8]:
transactions = dd.read_csv(TRANSACTION_FILE, usecols=['msno'])
transactions = dd.concat([transactions, dd.read_csv(TRANSACTION_V2_FILE, usecols=['msno'])])
transactions = dd.DataFrame(transactions['msno'].value_counts().reset_index()).compute()
transactions.columns = ['msno','trans_count']
train = train.merge(transactions, how='left', on='msno')
test = test.merge(transactions, how='left', on='msno')



In [9]:
transactions = dd.read_csv(TRANSACTION_V2_FILE) 
transactions = transactions.sort_values(by=['transaction_date'], ascending=[False]).reset_index(drop=True)
transactions = transactions.drop_duplicates(subset=['msno'], keep='first')

train = dd.merge(train, transactions, how='left', on='msno')
test = dd.merge(test, transactions, how='left', on='msno')
# transactions=[]

In [10]:
user_logs = dd.read_csv(USER_LOGS_V2_FILE, usecols=['msno'])
user_logs = dd.DataFrame(user_logs['msno'].value_counts().reset_index()).compute()
user_logs.columns = ['msno','logs_count']
train = dd.merge(train, user_logs, how='left', on='msno')
test = dd.merge(test, user_logs, how='left', on='msno')
# user_logs = []; 

In [11]:
train

Unnamed: 0_level_0,msno,is_churn,trans_count,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel,logs_count
npartitions=2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
,string,int64,int64[pyarrow],int64,int64,int64,int64,int64,int64,int64,int64,int64[pyarrow]
,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...


In [12]:
def transform_df(df):
    df = dd.DataFrame(df)
    df = df.sort_values(by=['date'], ascending=[False])
    df = df.reset_index(drop=True)
    df = df.drop_duplicates(subset=['msno'], keep='first')
    return df

def transform_df2(df):
    df = df.sort_values(by=['date'], ascending=[False])
    df = df.reset_index(drop=True)
    df = df.drop_duplicates(subset=['msno'], keep='first')
    return df

In [13]:
# last_user_logs = []
# last_user_logs.append(transform_df(dd.read_csv(USER_LOGS_V2_FILE)))
# last_user_logs = dd.concat(last_user_logs, axis=0, ignore_index=True).reset_index(drop=True)
# last_user_logs = transform_df2(last_user_logs)
# print ('merging user logs features...')
# train = dd.merge(train, last_user_logs, how='left', on='msno')
# test = dd.merge(test, last_user_logs, how='left', on='msno')
# # last_user_logs=[]

In [14]:
members = dd.read_csv(MEMBERS_FILE)
train = dd.merge(train, members, how='left', on='msno')
test = dd.merge(test, members, how='left', on='msno')
print('members merge...')

members merge...


In [15]:
gender = {'male':1, 'female':2}
train['gender'] = train['gender'].map(gender)
test['gender'] = test['gender'].map(gender)

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=('gender', 'float64'))

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=('gender', 'float64'))



In [16]:
train = train.fillna(0)
test = test.fillna(0)

In [17]:
train = train.compute()
test = test.compute()

This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


In [18]:
del members
del transactions
del user_logs


In [19]:
import gc
gc.collect()

625

# training

In [20]:
train

Unnamed: 0,msno,is_churn,trans_count,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel,logs_count,city,bd,gender,registered_via,registration_init_time
0,waLDQMmcOu2jLDaV1ddDkgCrB/jl6sD66Xzs0Vqax1Y=,1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,18.0,36.0,2.0,9.0,20050406.0
1,Nb1ZGEmagQeba5E+nQj8VlQoWl+8SFmLZu+Y8ytIamw=,1,23,38.0,30.0,149.0,149.0,0.0,20170307.0,20170406.0,0.0,22,18.0,22.0,2.0,9.0,20060826.0
2,I8dFN2EjFN1mt4Xel8WQX1/g7u6Dg4PBMHLkiDjhUS8=,1,27,38.0,30.0,149.0,149.0,0.0,20170322.0,20170421.0,0.0,30,4.0,43.0,1.0,9.0,20061222.0
3,+THH2QTeGyADYlZvoaYUXCyoS1iLQsHq59ElGxwwGlE=,1,5,38.0,195.0,894.0,894.0,0.0,20170216.0,20170831.0,0.0,26,17.0,28.0,1.0,9.0,20080515.0
4,ngHjqujoWJdkjMy+0t8IATYeN2NAhN/yIYszLXAyfSc=,1,24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,22.0,37.0,2.0,9.0,20090105.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
328792,PsftdQEI+bQFl8FB2+O4sKM4uRZGO/UvBCDS+ZyWmvk=,0,27,40.0,30.0,149.0,149.0,1.0,20170318.0,20170417.0,0.0,31,4.0,28.0,1.0,9.0,20110812.0
328793,MqyOPaDM7Jz3kV3fu/h9ilHP3TxLaMFE9raYkEHg5Jg=,0,28,41.0,30.0,100.0,100.0,1.0,20170318.0,20170419.0,0.0,4,1.0,0.0,0.0,7.0,20110817.0
328794,SjVZDYaiKgEHpFX1PcFDS94b9CFdaHjg78rfumtm/F4=,0,21,39.0,30.0,149.0,149.0,1.0,20170331.0,20170520.0,0.0,21,15.0,19.0,1.0,9.0,20110823.0
328795,eZ3y0lsY2SVZc2h8T3zB454TuBz6oVDMlFsEpEPQclQ=,0,28,41.0,30.0,99.0,99.0,1.0,20170321.0,20170422.0,0.0,31,18.0,0.0,2.0,7.0,20110830.0


In [21]:
cols = [ c for c in train.columns if c not in ['is_churn', 'msno']]

X_train = train[cols]
y_train = train['is_churn']
X_test = test[cols]
# y_train = train['is_churn']

In [27]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss

## Mean baseline

1963891

In [41]:
mean_is_churn = train['is_churn'].mean()

print(f"-- Feed Forward Network -- ")
# Evaluate the model
y_pred_prob = [mean_is_churn]*X_train.shape[0]
y_pred = [1 if x > 0.5 else 0 for x in y_pred_prob]

accuracy = accuracy_score(y_train, y_pred)
print(f"Training Accuracy: {accuracy:.4f}")

logloss = log_loss(y_train, y_pred_prob)
print(f"Training Log Loss: {logloss:.4f}")


-- Feed Forward Network -- 
Training Accuracy: 0.9232
Training Log Loss: 0.2709


## simple feedforward neural netwok

In [31]:
# Standardizing data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Define the MLP model in scikit-learn
model = MLPClassifier(
    hidden_layer_sizes=(128, 64, 32),  # Layer sizes similar to Keras
    activation='relu',
    solver='adam',                 # Using 'adadelta' optimizer
    alpha=0.1,                         # L2 regularization
    max_iter=200,                      # Number of iterations for training
    random_state=42
)

# Fit the model
model.fit(X_train_scaled, y_train)
# history = model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.2)

print(f"-- Feed Forward Network -- ")
# Evaluate the model
y_pred = model.predict(X_train_scaled)
accuracy = accuracy_score(y_train, y_pred)
print(f"Training Accuracy: {accuracy:.4f}")

y_pred_prob = model.predict_proba(X_train_scaled)
logloss = log_loss(y_train, y_pred_prob)
print(f"Training Log Loss: {logloss:.4f}")

-- Feed Forward Network -- 
Training Accuracy: 0.9590
Training Log Loss: 0.1085


## Decision tree

In [29]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [30]:
# Split data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and fit the model
dt_model = DecisionTreeClassifier(max_depth=5, random_state=42)
dt_model.fit(X_train, y_train)

# Make predictions
y_pred = dt_model.predict(X_train)

# Evaluate
accuracy = accuracy_score(y_train, y_pred)
print(f"-- Decision Tree -- ")
print(f"Training Accuracy: {accuracy:.4f}")

y_pred_prob = model.predict_proba(X_train_scaled)
logloss = log_loss(y_train, y_pred_prob)
print(f"Training Log Loss: {logloss:.4f}")

-- Decision Tree -- 
Training Accuracy: 0.9638
Training Log Loss: 0.1110


## Light GBM

In [32]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score

In [38]:
# Prepare LightGBM dataset
# Ensure your dataframe is named appropriately
X_train_lgb = X_train.copy()
X_train_lgb['trans_count'] = X_train_lgb['trans_count'].astype('int64')
X_train_lgb['logs_count'] = X_train_lgb['logs_count'].astype('int64')

train_data = lgb.Dataset(X_train_lgb, label=y_train)
# test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Set parameters
params = {
    'objective': 'binary',          # For binary classification
    'boosting_type': 'gbdt',        # Gradient Boosting Decision Tree
    'metric': 'binary_logloss',     # Loss metric
    'learning_rate': 0.1,
    'num_leaves': 31,
    'max_depth': -1
}

# Train the model
lgb_model = lgb.train(params, train_data, num_boost_round=100)
# lgb_model = lgb.train(params, train_data, valid_sets=[test_data], num_boost_round=100, early_stopping_rounds=10)

# Predict
y_pred = lgb_model.predict(X_train_lgb)
y_pred_binary = [1 if x > 0.5 else 0 for x in y_pred]

print("-- LightGBM --")
# Evaluate
accuracy = accuracy_score(y_train, y_pred_binary)
print(f"Training Accuracy: {accuracy:.4f}")

# Predict probabilities using LightGBM
y_pred_prob = lgb_model.predict(X_train_lgb)  # This gives probabilities directly

# Compute Log Loss
logloss = log_loss(y_train, y_pred_prob)
print(f"Training Log Loss: {logloss:.4f}")



[LightGBM] [Info] Number of positive: 150801, number of negative: 1813090
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.152223 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 968
[LightGBM] [Info] Number of data points in the train set: 1963891, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076787 -> initscore=-2.486827
[LightGBM] [Info] Start training from score -2.486827
-- LightGBM --
Training Accuracy: 0.9664
Training Log Loss: 0.0745


## XGBoost

In [36]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

In [37]:
# Initialize the model
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',  # For binary classification
    max_depth=5,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42
)

# Fit the model
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred = xgb_model.predict(X_train)

print(f"-- XGBoost -- ")
# Evaluate
accuracy = accuracy_score(y_train, y_pred)
print(f"Training Accuracy: {accuracy:.4f}")

# Predict probabilities using XGBoost
y_pred_prob = xgb_model.predict_proba(X_train)[:, 1]  # Get probabilities for the positive class

# Compute Log Loss
logloss = log_loss(y_train, y_pred_prob)
print(f"Training Log Loss: {logloss:.4f}")


-- XGBoost -- 
Training Accuracy: 0.9656
Training Log Loss: 0.0794


## hyper parameter tuning

In [43]:
# from sklearn.model_selection import GridSearchCV
# import lightgbm as lgb

# # Prepare the LightGBM model
# lgb_estimator = lgb.LGBMClassifier(boosting_type='gbdt', objective='binary', random_state=42)

# # Define the parameter grid
# param_grid = {
#     'num_leaves': [31, 50, 100],         # Increasing leaves can improve accuracy
#     'max_depth': [-1, 10, 20],          # -1 means no limit
#     'learning_rate': [0.01, 0.05, 0.1], # Smaller rates need more boosting rounds
#     'n_estimators': [50, 100, 200],     # Number of boosting rounds
#     'min_child_samples': [10, 20, 30],  # Minimum number of data points per leaf
# }

# # Perform Grid Search
# grid_search = GridSearchCV(
#     estimator=lgb_estimator,
#     param_grid=param_grid,
#     scoring='neg_log_loss',  # Minimize log loss
#     cv=3,                    # 3-fold cross-validation
#     verbose=1
# )

# # Fit the model
# grid_search.fit(X_train, y_train)

# # Best parameters and score
# print(f"Best Parameters: {grid_search.best_params_}")
# print(f"Best Log Loss: {-grid_search.best_score_:.4f}")


In [45]:
import optuna
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

In [51]:

# Define the objective function for Optuna
def objective(trial):
    # Define a reduced hyperparameter search space
    param = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range
        'num_leaves': trial.suggest_int('num_leaves', 20, 50),                 # Reduced range
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 30),   # Smaller range
        'n_estimators': trial.suggest_int('n_estimators', 50, 150),            # Moderate boosting rounds
        'max_depth': trial.suggest_int('max_depth', -1, 15),                   # Practical range
    }

    # Split data for validation
    X_train_part, X_valid, y_train_part, y_valid = train_test_split(
        X_train_lgb, y_train, test_size=0.2, random_state=42
    )

    # Train the model
    model = lgb.LGBMClassifier(**param)
    # model = model.train(params, train_data, num_boost_round=100)

    model.fit(X_train_part, y_train_part, eval_set=[(X_valid, y_valid)])
            #   eval_metric='logloss',  verbose=0) # early_stopping_rounds=10,

    # Predict and calculate log loss on the validation set
    y_valid_pred = model.predict_proba(X_valid)[:, 1]
    return log_loss(y_valid, y_valid_pred)

# Create an Optuna study with fewer trials
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)  # Limit to 20 trials

# Best parameters and score
print(f"Best Parameters: {study.best_params}")
print(f"Best Log Loss: {study.best_value:.4f}")

# Train the final model with the best parameters
best_params = study.best_params
final_model = lgb.LGBMClassifier(**best_params)
final_model.fit(X_train_lgb, y_train)

# Evaluate the final model
y_train_pred = final_model.predict_proba(X_train_lgb)[:, 1]
train_logloss = log_loss(y_train, y_train_pred)
print(f"Final Training Log Loss: {train_logloss:.4f}")


[I 2025-01-12 20:31:02,866] A new study created in memory with name: no-name-ef2e0287-315a-429f-ade8-e8c5082b2145
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 120524, number of negative: 1450588
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.053184 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 964
[LightGBM] [Info] Number of data points in the train set: 1571112, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076713 -> initscore=-2.487875
[LightGBM] [Info] Start training from score -2.487875


[I 2025-01-12 20:31:11,369] Trial 0 finished with value: 0.1105862790975042 and parameters: {'learning_rate': 0.011542297519835237, 'num_leaves': 28, 'min_child_samples': 29, 'n_estimators': 100, 'max_depth': 14}. Best is trial 0 with value: 0.1105862790975042.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 120524, number of negative: 1450588
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.056000 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 964
[LightGBM] [Info] Number of data points in the train set: 1571112, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076713 -> initscore=-2.487875
[LightGBM] [Info] Start training from score -2.487875


[I 2025-01-12 20:31:15,819] Trial 1 finished with value: 0.09580206312960858 and parameters: {'learning_rate': 0.07078417077243734, 'num_leaves': 38, 'min_child_samples': 17, 'n_estimators': 79, 'max_depth': 2}. Best is trial 1 with value: 0.09580206312960858.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 120524, number of negative: 1450588
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.053657 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 964
[LightGBM] [Info] Number of data points in the train set: 1571112, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076713 -> initscore=-2.487875
[LightGBM] [Info] Start training from score -2.487875


[I 2025-01-12 20:31:27,020] Trial 2 finished with value: 0.101063503919151 and parameters: {'learning_rate': 0.011838105692582675, 'num_leaves': 31, 'min_child_samples': 20, 'n_estimators': 122, 'max_depth': 12}. Best is trial 1 with value: 0.09580206312960858.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 120524, number of negative: 1450588
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054939 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 964
[LightGBM] [Info] Number of data points in the train set: 1571112, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076713 -> initscore=-2.487875
[LightGBM] [Info] Start training from score -2.487875


[I 2025-01-12 20:31:34,111] Trial 3 finished with value: 0.07990792684407444 and parameters: {'learning_rate': 0.05037437577658824, 'num_leaves': 37, 'min_child_samples': 29, 'n_estimators': 67, 'max_depth': 14}. Best is trial 3 with value: 0.07990792684407444.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 120524, number of negative: 1450588
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.055936 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 964
[LightGBM] [Info] Number of data points in the train set: 1571112, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076713 -> initscore=-2.487875
[LightGBM] [Info] Start training from score -2.487875


[I 2025-01-12 20:31:42,162] Trial 4 finished with value: 0.08714925741432154 and parameters: {'learning_rate': 0.027282687515298262, 'num_leaves': 36, 'min_child_samples': 27, 'n_estimators': 81, 'max_depth': -1}. Best is trial 3 with value: 0.07990792684407444.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 120524, number of negative: 1450588
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.058840 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 964
[LightGBM] [Info] Number of data points in the train set: 1571112, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076713 -> initscore=-2.487875
[LightGBM] [Info] Start training from score -2.487875


[I 2025-01-12 20:31:48,349] Trial 5 finished with value: 0.1302206816268464 and parameters: {'learning_rate': 0.012651340204080729, 'num_leaves': 45, 'min_child_samples': 29, 'n_estimators': 59, 'max_depth': -1}. Best is trial 3 with value: 0.07990792684407444.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 120524, number of negative: 1450588
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.050643 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 964
[LightGBM] [Info] Number of data points in the train set: 1571112, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076713 -> initscore=-2.487875
[LightGBM] [Info] Start training from score -2.487875


[I 2025-01-12 20:31:52,952] Trial 6 finished with value: 0.12674323385763728 and parameters: {'learning_rate': 0.03482486262015551, 'num_leaves': 50, 'min_child_samples': 27, 'n_estimators': 97, 'max_depth': 1}. Best is trial 3 with value: 0.07990792684407444.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 120524, number of negative: 1450588
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.056557 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 964
[LightGBM] [Info] Number of data points in the train set: 1571112, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076713 -> initscore=-2.487875
[LightGBM] [Info] Start training from score -2.487875


[I 2025-01-12 20:32:03,514] Trial 7 finished with value: 0.08690157528214329 and parameters: {'learning_rate': 0.019593585090576026, 'num_leaves': 50, 'min_child_samples': 23, 'n_estimators': 115, 'max_depth': 8}. Best is trial 3 with value: 0.07990792684407444.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 120524, number of negative: 1450588
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.055666 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 964
[LightGBM] [Info] Number of data points in the train set: 1571112, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076713 -> initscore=-2.487875
[LightGBM] [Info] Start training from score -2.487875


[I 2025-01-12 20:32:12,018] Trial 8 finished with value: 0.07668946969166358 and parameters: {'learning_rate': 0.08309885623140148, 'num_leaves': 28, 'min_child_samples': 19, 'n_estimators': 90, 'max_depth': -1}. Best is trial 8 with value: 0.07668946969166358.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 120524, number of negative: 1450588
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.055398 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 964
[LightGBM] [Info] Number of data points in the train set: 1571112, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076713 -> initscore=-2.487875
[LightGBM] [Info] Start training from score -2.487875


[I 2025-01-12 20:32:23,588] Trial 9 finished with value: 0.07567796241755222 and parameters: {'learning_rate': 0.06962442021440195, 'num_leaves': 35, 'min_child_samples': 23, 'n_estimators': 138, 'max_depth': 12}. Best is trial 9 with value: 0.07567796241755222.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 120524, number of negative: 1450588
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.050203 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 964
[LightGBM] [Info] Number of data points in the train set: 1571112, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076713 -> initscore=-2.487875
[LightGBM] [Info] Start training from score -2.487875


[I 2025-01-12 20:32:34,443] Trial 10 finished with value: 0.07618975403474569 and parameters: {'learning_rate': 0.09824522445747652, 'num_leaves': 21, 'min_child_samples': 11, 'n_estimators': 149, 'max_depth': 8}. Best is trial 9 with value: 0.07567796241755222.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 120524, number of negative: 1450588
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054164 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 964
[LightGBM] [Info] Number of data points in the train set: 1571112, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076713 -> initscore=-2.487875
[LightGBM] [Info] Start training from score -2.487875


[I 2025-01-12 20:32:44,972] Trial 11 finished with value: 0.07632902960525458 and parameters: {'learning_rate': 0.09579613353391302, 'num_leaves': 20, 'min_child_samples': 10, 'n_estimators': 150, 'max_depth': 9}. Best is trial 9 with value: 0.07567796241755222.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 120524, number of negative: 1450588
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054797 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 964
[LightGBM] [Info] Number of data points in the train set: 1571112, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076713 -> initscore=-2.487875
[LightGBM] [Info] Start training from score -2.487875


[I 2025-01-12 20:32:55,892] Trial 12 finished with value: 0.07839419882669009 and parameters: {'learning_rate': 0.058512426874154956, 'num_leaves': 22, 'min_child_samples': 10, 'n_estimators': 150, 'max_depth': 5}. Best is trial 9 with value: 0.07567796241755222.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 120524, number of negative: 1450588
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.057200 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 964
[LightGBM] [Info] Number of data points in the train set: 1571112, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076713 -> initscore=-2.487875
[LightGBM] [Info] Start training from score -2.487875


[I 2025-01-12 20:33:07,865] Trial 13 finished with value: 0.07653442756337114 and parameters: {'learning_rate': 0.04350501235111091, 'num_leaves': 42, 'min_child_samples': 14, 'n_estimators': 128, 'max_depth': 11}. Best is trial 9 with value: 0.07567796241755222.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 120524, number of negative: 1450588
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054579 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 964
[LightGBM] [Info] Number of data points in the train set: 1571112, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076713 -> initscore=-2.487875
[LightGBM] [Info] Start training from score -2.487875


[I 2025-01-12 20:33:17,892] Trial 14 finished with value: 0.07782196249564767 and parameters: {'learning_rate': 0.07264846147641425, 'num_leaves': 24, 'min_child_samples': 24, 'n_estimators': 136, 'max_depth': 5}. Best is trial 9 with value: 0.07567796241755222.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 120524, number of negative: 1450588
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.055648 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 964
[LightGBM] [Info] Number of data points in the train set: 1571112, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076713 -> initscore=-2.487875
[LightGBM] [Info] Start training from score -2.487875


[I 2025-01-12 20:33:28,604] Trial 15 finished with value: 0.07531796275663362 and parameters: {'learning_rate': 0.09496788635642363, 'num_leaves': 31, 'min_child_samples': 15, 'n_estimators': 135, 'max_depth': 10}. Best is trial 15 with value: 0.07531796275663362.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 120524, number of negative: 1450588
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.056311 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 964
[LightGBM] [Info] Number of data points in the train set: 1571112, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076713 -> initscore=-2.487875
[LightGBM] [Info] Start training from score -2.487875


[I 2025-01-12 20:33:38,282] Trial 16 finished with value: 0.07670540158597214 and parameters: {'learning_rate': 0.05901126718576082, 'num_leaves': 32, 'min_child_samples': 15, 'n_estimators': 111, 'max_depth': 11}. Best is trial 15 with value: 0.07531796275663362.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 120524, number of negative: 1450588
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.058504 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 964
[LightGBM] [Info] Number of data points in the train set: 1571112, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076713 -> initscore=-2.487875
[LightGBM] [Info] Start training from score -2.487875


[I 2025-01-12 20:33:51,732] Trial 17 finished with value: 0.07683096868587785 and parameters: {'learning_rate': 0.037501415558769784, 'num_leaves': 41, 'min_child_samples': 23, 'n_estimators': 138, 'max_depth': 15}. Best is trial 15 with value: 0.07531796275663362.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 120524, number of negative: 1450588
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.047159 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 964
[LightGBM] [Info] Number of data points in the train set: 1571112, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076713 -> initscore=-2.487875
[LightGBM] [Info] Start training from score -2.487875


[I 2025-01-12 20:34:04,133] Trial 18 finished with value: 0.07963680252545714 and parameters: {'learning_rate': 0.027215872426379448, 'num_leaves': 33, 'min_child_samples': 14, 'n_estimators': 135, 'max_depth': 10}. Best is trial 15 with value: 0.07531796275663362.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 120524, number of negative: 1450588
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.050186 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 964
[LightGBM] [Info] Number of data points in the train set: 1571112, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076713 -> initscore=-2.487875
[LightGBM] [Info] Start training from score -2.487875


[I 2025-01-12 20:34:13,579] Trial 19 finished with value: 0.07770309178062951 and parameters: {'learning_rate': 0.06576323074456261, 'num_leaves': 29, 'min_child_samples': 20, 'n_estimators': 112, 'max_depth': 6}. Best is trial 15 with value: 0.07531796275663362.


Best Parameters: {'learning_rate': 0.09496788635642363, 'num_leaves': 31, 'min_child_samples': 15, 'n_estimators': 135, 'max_depth': 10}
Best Log Loss: 0.0753
[LightGBM] [Info] Number of positive: 150801, number of negative: 1813090
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.064710 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 968
[LightGBM] [Info] Number of data points in the train set: 1963891, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076787 -> initscore=-2.486827
[LightGBM] [Info] Start training from score -2.486827
Final Training Log Loss: 0.0735
