In [1]:
import dask.dataframe as dd
import numpy as np
from IPython.display import display
from dask.distributed import Client, LocalCluster
import dask

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.svm import OneClassSVM
from sklearn.metrics import (confusion_matrix, precision_recall_curve, auc,
                             roc_curve, recall_score, classification_report, f1_score,
                             precision_recall_fscore_support)

In [3]:
# Set up a Dask Cluster
cluster = LocalCluster(n_workers=6, threads_per_worker=1, memory_limit='18GB')
client = Client(cluster)

print("client link:")
print(client.dashboard_link)  # Clickable link to the dashboard


client link:
http://127.0.0.1:8787/status


In [4]:
# setup Variables for filepaths
DATA_DIR="../../data"

MEMBERS_FILE=f"{DATA_DIR}/members_v3.csv"
TRANSACTION_FILE=f"{DATA_DIR}/transactions.csv"
TRAIN_FILE=f"{DATA_DIR}/train.csv"
USERLOG_FILE=f"{DATA_DIR}/user_logs.csv"
SAMPLE_SUBMISSION_FILE=f"{DATA_DIR}/sample_submission_zero.csv"

TRANSACTION_V2_FILE=f"{DATA_DIR}/transactions_v2.csv"
TRAIN_V2_FILE=f"{DATA_DIR}/train_v2.csv"
USER_LOGS_V2_FILE=f"{DATA_DIR}/user_logs_v2.csv"
SAMPLE_SUBMISSION_V2_FILE=f"{DATA_DIR}/sample_submission_v2.csv"

In [5]:
print(f"DATA_DIR: {DATA_DIR}")
print(f"TRANSACTION_FILE: {TRANSACTION_FILE}")
print(f"USERLOG_FILE: {USERLOG_FILE}")
print(f"TRAIN_FILE: {TRAIN_FILE}")
print(f"SAMPLE_SUBMISSION_FILE: {SAMPLE_SUBMISSION_FILE}")
print(f"MEMBERS_FILE: {MEMBERS_FILE}")
print()
print(f"TRANSACTION_V2_FILE: {TRANSACTION_V2_FILE}")
print(f"USER_LOGS_V2_FILE: {USER_LOGS_V2_FILE}")
print(f"TRAIN_V2_FILE: {TRAIN_V2_FILE}")
print(f"SAMPLE_SUBMISSION_V2_FILE: {SAMPLE_SUBMISSION_V2_FILE}")

DATA_DIR: ../../data
TRANSACTION_FILE: ../../data/transactions.csv
USERLOG_FILE: ../../data/user_logs.csv
TRAIN_FILE: ../../data/train.csv
SAMPLE_SUBMISSION_FILE: ../../data/sample_submission_zero.csv
MEMBERS_FILE: ../../data/members_v3.csv

TRANSACTION_V2_FILE: ../../data/transactions_v2.csv
USER_LOGS_V2_FILE: ../../data/user_logs_v2.csv
TRAIN_V2_FILE: ../../data/train_v2.csv
SAMPLE_SUBMISSION_V2_FILE: ../../data/sample_submission_v2.csv


# data prep

In [6]:
train = dd.read_csv(TRAIN_FILE)
# train = dd.concat((train, dd.read_csv(TRAIN_V2_FILE)), axis=0, ignore_index=True).reset_index(drop=True)
# train = dd.concat([train, dd.read_csv(TRAIN_V2_FILE)])



In [7]:
len(train)

992931

In [8]:
transactions = dd.read_csv(TRANSACTION_FILE, usecols=['msno'])
transactions = dd.DataFrame(transactions['msno'].value_counts().reset_index()).compute()
transactions.columns = ['msno','trans_count']

train = train.merge(transactions, how='left', on='msno')



In [9]:
transactions = dd.read_csv(TRANSACTION_FILE) 
transactions = transactions.sort_values(by=['transaction_date'], ascending=[False]).reset_index(drop=True)
transactions = transactions.drop_duplicates(subset=['msno'], keep='first')

train = dd.merge(train, transactions, how='left', on='msno')
# transactions=[]

In [10]:
user_logs = dd.read_csv(USERLOG_FILE, usecols=['msno'])
user_logs = dd.DataFrame(user_logs['msno'].value_counts().reset_index()).compute()
user_logs.columns = ['msno','logs_count']


train = dd.merge(train, user_logs, how='left', on='msno')
# user_logs = []; 

In [11]:
train

Unnamed: 0_level_0,msno,is_churn,trans_count,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel,logs_count
npartitions=27,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
,string,int64,int64[pyarrow],int64,int64,int64,int64,int64,int64,int64,int64,int64[pyarrow]
,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...


In [12]:
# def transform_df(df):
#     df = dd.DataFrame(df)
#     df = df.sort_values(by=['date'], ascending=[False])
#     df = df.reset_index(drop=True)
#     df = df.drop_duplicates(subset=['msno'], keep='first')
#     return df

# def transform_df2(df):
#     df = df.sort_values(by=['date'], ascending=[False])
#     df = df.reset_index(drop=True)
#     df = df.drop_duplicates(subset=['msno'], keep='first')
#     return df

In [13]:
# last_user_logs = []
# last_user_logs.append(transform_df(dd.read_csv(USER_LOGS_V2_FILE)))
# last_user_logs = dd.concat(last_user_logs, axis=0, ignore_index=True).reset_index(drop=True)
# last_user_logs = transform_df2(last_user_logs)
# print ('merging user logs features...')
# train = dd.merge(train, last_user_logs, how='left', on='msno')
# test = dd.merge(test, last_user_logs, how='left', on='msno')
# # last_user_logs=[]

In [14]:
members = dd.read_csv(MEMBERS_FILE)
train = dd.merge(train, members, how='left', on='msno')
print('members merge...')

members merge...


In [15]:
gender = {'male':1, 'female':2}
train['gender'] = train['gender'].map(gender)


You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=('gender', 'float64'))



In [16]:
train = train.fillna(0)


In [17]:
train = train.compute()


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


In [18]:
del members
del transactions
del user_logs


import gc
gc.collect()

500

In [19]:
test = dd.read_csv(TRAIN_V2_FILE)

test_transactions = dd.read_csv(TRANSACTION_FILE, usecols=['msno'])
test_transactions = dd.concat([test_transactions, dd.read_csv(TRANSACTION_V2_FILE, usecols=['msno'])])
test_transactions = dd.DataFrame(test_transactions['msno'].value_counts().reset_index()).compute()
test_transactions.columns = ['msno','trans_count']

test = test.merge(test_transactions, how='left', on='msno')

test_transactions = dd.read_csv(TRANSACTION_V2_FILE) 
test_transactions = test_transactions.sort_values(by=['transaction_date'], ascending=[False]).reset_index(drop=True)
test_transactions = test_transactions.drop_duplicates(subset=['msno'], keep='first')


test = dd.merge(test, test_transactions, how='left', on='msno')


test_user_logs = dd.read_csv(USER_LOGS_V2_FILE, usecols=['msno'])
test_user_logs = dd.DataFrame(test_user_logs['msno'].value_counts().reset_index()).compute()
test_user_logs.columns = ['msno','logs_count']

test = dd.merge(test, test_user_logs, how='left', on='msno')

members = dd.read_csv(MEMBERS_FILE)

test = dd.merge(test, members, how='left', on='msno')

gender = {'male':1, 'female':2}
test['gender'] = test['gender'].map(gender)

test = test.fillna(0)
test = test.compute()

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=('gender', 'float64'))

This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


In [22]:
del members
del test_transactions
del test_user_logs

In [24]:
import gc
gc.collect()

131

## training

In [25]:
cols = [ c for c in train.columns if c not in ['is_churn', 'msno']]

X_train = train[cols]
y_train = train['is_churn']
X_test = test[cols]
y_test = test['is_churn']

In [26]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss

In [27]:
mean_is_churn = train['is_churn'].mean()

print(f"-- Mean Baseline -- ")
# Evaluate the model
y_pred_prob = [mean_is_churn]*X_train.shape[0]
y_pred = [1 if x > 0.5 else 0 for x in y_pred_prob]

accuracy = accuracy_score(y_train, y_pred)
print(f"Training Accuracy: {accuracy:.4f}")

logloss = log_loss(y_train, y_pred_prob)
print(f"Training Log Loss: {logloss:.4f}")


y_pred_prob = [mean_is_churn]*y_test.shape[0]
y_pred = [1 if x > 0.5 else 0 for x in y_pred_prob]

accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

logloss = log_loss(y_test, y_pred_prob)
print(f"Test Log Loss: {logloss:.4f}")


-- Mean Baseline -- 
Training Accuracy: 0.9361
Training Log Loss: 0.2376
Test Accuracy: 0.9101
Test Log Loss: 0.3075


## simple feedforward neural netwok

In [None]:
# Standardizing data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

# Define the MLP model in scikit-learn
model = MLPClassifier(
    hidden_layer_sizes=(128, 64, 32),  # Layer sizes similar to Keras
    activation='relu',
    solver='adam',                 # Using 'adadelta' optimizer
    alpha=0.1,                         # L2 regularization
    max_iter=200,                      # Number of iterations for training
    random_state=42
)

# Fit the model
model.fit(X_train_scaled, y_train)
# history = model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.2)

print(f"-- Feed Forward Network -- ")
# Evaluate the model
y_pred = model.predict(X_train_scaled)
accuracy = accuracy_score(y_train, y_pred)
print(f"Training Accuracy: {accuracy:.4f}")

y_pred_prob = model.predict_proba(X_train_scaled)
logloss = log_loss(y_train, y_pred_prob)
print(f"Training Log Loss: {logloss:.4f}")


y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

y_pred_prob = model.predict_proba(X_test_scaled)
logloss = log_loss(y_test, y_pred_prob)
print(f"Test Log Loss: {logloss:.4f}")

-- Feed Forward Network -- 
Training Accuracy: 0.9668
Training Log Loss: 0.0765
Testing Accuracy: 0.9399
Testing Log Loss: 0.3268


## Decision tree

In [31]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [32]:
# Split data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and fit the model
dt_model = DecisionTreeClassifier(max_depth=5, random_state=42)
dt_model.fit(X_train, y_train)

print(f"-- Decision Tree -- ")
# Make predictions
y_pred = dt_model.predict(X_train_scaled)

# Evaluate
accuracy = accuracy_score(y_train, y_pred)
print(f"Training Accuracy: {accuracy:.4f}")

y_pred_prob = model.predict_proba(X_train_scaled)
logloss = log_loss(y_train, y_pred_prob)
print(f"Training Log Loss: {logloss:.4f}")


y_pred = dt_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

y_pred_prob = model.predict_proba(X_test_scaled)
logloss = log_loss(y_test, y_pred_prob)
print(f"Test Log Loss: {logloss:.4f}")

-- Decision Tree -- 
Training Accuracy: 0.0639




Training Log Loss: 0.0765
Test Accuracy: 0.0899




Test Log Loss: 0.3268


## Light GBM

In [29]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score

In [30]:
# Prepare LightGBM dataset
# Ensure your dataframe is named appropriately
X_train_lgb = X_train.copy()
X_train_lgb['trans_count'] = X_train_lgb['trans_count'].astype('int64')
X_train_lgb['logs_count'] = X_train_lgb['logs_count'].astype('int64')

X_test_lgb = X_test.copy()
X_test_lgb['trans_count'] = X_test_lgb['trans_count'].astype('int64')
X_test_lgb['logs_count'] = X_test_lgb['logs_count'].astype('int64')

train_data = lgb.Dataset(X_train_lgb, label=y_train)
test_data = lgb.Dataset(X_test_lgb, label=y_test, reference=train_data)

# Set parameters
params = {
    'objective': 'binary',          # For binary classification
    'boosting_type': 'gbdt',        # Gradient Boosting Decision Tree
    'metric': 'binary_logloss',     # Loss metric
    'learning_rate': 0.1,
    'num_leaves': 31,
    'max_depth': -1
}

# Train the model
lgb_model = lgb.train(params, train_data, num_boost_round=100)
# lgb_model = lgb.train(params, train_data, valid_sets=[test_data], num_boost_round=100, early_stopping_rounds=10)

print("-- LightGBM --")
# Predict
y_pred = lgb_model.predict(X_train_lgb)
y_pred_binary = [1 if x > 0.5 else 0 for x in y_pred]

# Evaluate
accuracy = accuracy_score(y_train, y_pred_binary)
print(f"Training Accuracy: {accuracy:.4f}")

# Predict probabilities using LightGBM
y_pred_prob = lgb_model.predict(X_train_lgb)  # This gives probabilities directly

# Compute Log Loss
logloss = log_loss(y_train, y_pred_prob)
print(f"Training Log Loss: {logloss:.4f}")



# Predict
y_pred = lgb_model.predict(X_test_lgb)
y_pred_binary = [1 if x > 0.5 else 0 for x in y_pred]

# Evaluate
accuracy = accuracy_score(y_test, y_pred_binary)
print(f"Test Accuracy: {accuracy:.4f}")

# Predict probabilities using LightGBM
y_pred_prob = lgb_model.predict(X_test_lgb)  # This gives probabilities directly

# Compute Log Loss
logloss = log_loss(y_test, y_pred_prob)
print(f"Test Log Loss: {logloss:.4f}")



[LightGBM] [Info] Number of positive: 63471, number of negative: 929460
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.086825 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1004
[LightGBM] [Info] Number of data points in the train set: 992931, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.063923 -> initscore=-2.684021
[LightGBM] [Info] Start training from score -2.684021
-- LightGBM --
Training Accuracy: 0.9698
Training Log Loss: 0.0628
Test Accuracy: 0.9387
Test Log Loss: 0.3480


## XGBoost

In [33]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

In [34]:
# Initialize the model
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',  # For binary classification
    max_depth=5,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42
)

# Fit the model
xgb_model.fit(X_train, y_train)

print(f"-- XGBoost -- ")
# Make predictions
y_pred = xgb_model.predict(X_train)

# Evaluate
accuracy = accuracy_score(y_train, y_pred)
print(f"Training Accuracy: {accuracy:.4f}")

# Predict probabilities using XGBoost
y_pred_prob = xgb_model.predict_proba(X_train)[:, 1]  # Get probabilities for the positive class

# Compute Log Loss
logloss = log_loss(y_train, y_pred_prob)
print(f"Training Log Loss: {logloss:.4f}")


# Make predictions
y_pred = xgb_model.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

# Predict probabilities using XGBoost
y_pred_prob = xgb_model.predict_proba(X_test)[:, 1]  # Get probabilities for the positive class

# Compute Log Loss
logloss = log_loss(y_test, y_pred_prob)
print(f"Test Log Loss: {logloss:.4f}")


-- XGBoost -- 
Training Accuracy: 0.9694
Training Log Loss: 0.0689
Test Accuracy: 0.9396
Test Log Loss: 0.3430


## hyper parameter tuning

In [35]:
import optuna
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

In [37]:

# Define the objective function for Optuna
def objective(trial):
    # Define a reduced hyperparameter search space
    param = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range
        'num_leaves': trial.suggest_int('num_leaves', 20, 50),                 # Reduced range
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 30),   # Smaller range
        'n_estimators': trial.suggest_int('n_estimators', 50, 150),            # Moderate boosting rounds
        'max_depth': trial.suggest_int('max_depth', -1, 15),                   # Practical range
    }

    # Split data for validation
    X_train_part, X_valid, y_train_part, y_valid = train_test_split(
        X_train_lgb, y_train, test_size=0.2, random_state=42
    )

    # Train the model
    model = lgb.LGBMClassifier(**param)
    # model = model.train(params, train_data, num_boost_round=100)

    model.fit(X_train_part, y_train_part, eval_set=[(X_valid, y_valid)])
            #   eval_metric='logloss',  verbose=0) # early_stopping_rounds=10,

    # Predict and calculate log loss on the validation set
    y_valid_pred = model.predict_proba(X_valid)[:, 1]
    return log_loss(y_valid, y_valid_pred)

# Create an Optuna study with fewer trials
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)  # Limit to 20 trials

# Best parameters and score
print(f"Best Parameters: {study.best_params}")
print(f"Best Log Loss: {study.best_value:.4f}")

# Train the final model with the best parameters
best_params = study.best_params
final_model = lgb.LGBMClassifier(**best_params)
final_model.fit(X_train_lgb, y_train)

# Evaluate the final model
y_train_pred = final_model.predict_proba(X_train_lgb)[:, 1]
train_logloss = log_loss(y_train, y_train_pred)
print(f"Final Training Log Loss: {train_logloss:.4f}")

# Evaluate the final model
y_test_pred = final_model.predict_proba(X_test_lgb)[:, 1]
test_logloss = log_loss(y_test, y_test_pred)
print(f"Final Test Log Loss: {test_logloss:.4f}")


[I 2025-01-13 01:21:12,592] A new study created in memory with name: no-name-e1eb99a3-5314-482d-9d15-09123b82c71d
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 50866, number of negative: 743478
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027604 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1012
[LightGBM] [Info] Number of data points in the train set: 794344, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.064035 -> initscore=-2.682144
[LightGBM] [Info] Start training from score -2.682144


[I 2025-01-13 01:21:15,237] Trial 0 finished with value: 0.11144796531703383 and parameters: {'learning_rate': 0.014909017190986735, 'num_leaves': 42, 'min_child_samples': 13, 'n_estimators': 50, 'max_depth': 8}. Best is trial 0 with value: 0.11144796531703383.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 50866, number of negative: 743478
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022670 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1012
[LightGBM] [Info] Number of data points in the train set: 794344, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.064035 -> initscore=-2.682144
[LightGBM] [Info] Start training from score -2.682144


[I 2025-01-13 01:21:19,166] Trial 1 finished with value: 0.0655599425108864 and parameters: {'learning_rate': 0.09239529510397033, 'num_leaves': 27, 'min_child_samples': 13, 'n_estimators': 91, 'max_depth': 10}. Best is trial 1 with value: 0.0655599425108864.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 50866, number of negative: 743478
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021964 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1012
[LightGBM] [Info] Number of data points in the train set: 794344, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.064035 -> initscore=-2.682144
[LightGBM] [Info] Start training from score -2.682144


[I 2025-01-13 01:21:24,511] Trial 2 finished with value: 0.0641786844899428 and parameters: {'learning_rate': 0.0954531206571992, 'num_leaves': 31, 'min_child_samples': 24, 'n_estimators': 125, 'max_depth': 13}. Best is trial 2 with value: 0.0641786844899428.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 50866, number of negative: 743478
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029373 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1012
[LightGBM] [Info] Number of data points in the train set: 794344, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.064035 -> initscore=-2.682144
[LightGBM] [Info] Start training from score -2.682144


[I 2025-01-13 01:21:27,415] Trial 3 finished with value: 0.06767447299488068 and parameters: {'learning_rate': 0.0889721263760913, 'num_leaves': 46, 'min_child_samples': 19, 'n_estimators': 56, 'max_depth': 6}. Best is trial 2 with value: 0.0641786844899428.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 50866, number of negative: 743478
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.059785 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1012
[LightGBM] [Info] Number of data points in the train set: 794344, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.064035 -> initscore=-2.682144
[LightGBM] [Info] Start training from score -2.682144


[I 2025-01-13 01:21:32,832] Trial 4 finished with value: 0.06568702063991158 and parameters: {'learning_rate': 0.05280827159751681, 'num_leaves': 41, 'min_child_samples': 14, 'n_estimators': 95, 'max_depth': 13}. Best is trial 2 with value: 0.0641786844899428.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 50866, number of negative: 743478
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023894 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1012
[LightGBM] [Info] Number of data points in the train set: 794344, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.064035 -> initscore=-2.682144
[LightGBM] [Info] Start training from score -2.682144


[I 2025-01-13 01:21:36,474] Trial 5 finished with value: 0.06972076216132696 and parameters: {'learning_rate': 0.041576737754622485, 'num_leaves': 48, 'min_child_samples': 14, 'n_estimators': 76, 'max_depth': 7}. Best is trial 2 with value: 0.0641786844899428.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 50866, number of negative: 743478
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026934 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1012
[LightGBM] [Info] Number of data points in the train set: 794344, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.064035 -> initscore=-2.682144
[LightGBM] [Info] Start training from score -2.682144


[I 2025-01-13 01:21:39,016] Trial 6 finished with value: 0.1101363301393907 and parameters: {'learning_rate': 0.013030806636634367, 'num_leaves': 28, 'min_child_samples': 15, 'n_estimators': 60, 'max_depth': 15}. Best is trial 2 with value: 0.0641786844899428.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 50866, number of negative: 743478
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.028256 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1012
[LightGBM] [Info] Number of data points in the train set: 794344, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.064035 -> initscore=-2.682144
[LightGBM] [Info] Start training from score -2.682144


[I 2025-01-13 01:21:43,628] Trial 7 finished with value: 0.07090025881171172 and parameters: {'learning_rate': 0.03758256507411383, 'num_leaves': 36, 'min_child_samples': 28, 'n_estimators': 118, 'max_depth': 5}. Best is trial 2 with value: 0.0641786844899428.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 50866, number of negative: 743478
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023820 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1012
[LightGBM] [Info] Number of data points in the train set: 794344, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.064035 -> initscore=-2.682144
[LightGBM] [Info] Start training from score -2.682144


[I 2025-01-13 01:21:49,841] Trial 8 finished with value: 0.06900224888649138 and parameters: {'learning_rate': 0.023567013424302813, 'num_leaves': 41, 'min_child_samples': 23, 'n_estimators': 145, 'max_depth': 8}. Best is trial 2 with value: 0.0641786844899428.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 50866, number of negative: 743478
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026500 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1012
[LightGBM] [Info] Number of data points in the train set: 794344, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.064035 -> initscore=-2.682144
[LightGBM] [Info] Start training from score -2.682144


[I 2025-01-13 01:21:51,982] Trial 9 finished with value: 0.08250259787217804 and parameters: {'learning_rate': 0.05547308617742812, 'num_leaves': 49, 'min_child_samples': 24, 'n_estimators': 77, 'max_depth': 2}. Best is trial 2 with value: 0.0641786844899428.




  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 50866, number of negative: 743478
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.028621 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1012
[LightGBM] [Info] Number of data points in the train set: 794344, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.064035 -> initscore=-2.682144
[LightGBM] [Info] Start training from score -2.682144


[I 2025-01-13 01:21:56,970] Trial 10 finished with value: 0.07396438379179772 and parameters: {'learning_rate': 0.024133906381713404, 'num_leaves': 20, 'min_child_samples': 30, 'n_estimators': 124, 'max_depth': -1}. Best is trial 2 with value: 0.0641786844899428.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 50866, number of negative: 743478
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023649 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1012
[LightGBM] [Info] Number of data points in the train set: 794344, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.064035 -> initscore=-2.682144
[LightGBM] [Info] Start training from score -2.682144


[I 2025-01-13 01:22:01,676] Trial 11 finished with value: 0.06491662519132428 and parameters: {'learning_rate': 0.09738516128402146, 'num_leaves': 28, 'min_child_samples': 10, 'n_estimators': 110, 'max_depth': 11}. Best is trial 2 with value: 0.0641786844899428.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 50866, number of negative: 743478
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027443 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1012
[LightGBM] [Info] Number of data points in the train set: 794344, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.064035 -> initscore=-2.682144
[LightGBM] [Info] Start training from score -2.682144


[I 2025-01-13 01:22:07,044] Trial 12 finished with value: 0.06514120515298778 and parameters: {'learning_rate': 0.06886984350762432, 'num_leaves': 30, 'min_child_samples': 19, 'n_estimators': 119, 'max_depth': 12}. Best is trial 2 with value: 0.0641786844899428.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 50866, number of negative: 743478
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.028835 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1012
[LightGBM] [Info] Number of data points in the train set: 794344, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.064035 -> initscore=-2.682144
[LightGBM] [Info] Start training from score -2.682144


[I 2025-01-13 01:22:12,916] Trial 13 finished with value: 0.06353930947710773 and parameters: {'learning_rate': 0.09767443321673915, 'num_leaves': 33, 'min_child_samples': 24, 'n_estimators': 139, 'max_depth': 15}. Best is trial 13 with value: 0.06353930947710773.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 50866, number of negative: 743478
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026653 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1012
[LightGBM] [Info] Number of data points in the train set: 794344, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.064035 -> initscore=-2.682144
[LightGBM] [Info] Start training from score -2.682144


[I 2025-01-13 01:22:19,460] Trial 14 finished with value: 0.06386573464208979 and parameters: {'learning_rate': 0.06467960621585452, 'num_leaves': 34, 'min_child_samples': 25, 'n_estimators': 146, 'max_depth': 15}. Best is trial 13 with value: 0.06353930947710773.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 50866, number of negative: 743478
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030665 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1012
[LightGBM] [Info] Number of data points in the train set: 794344, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.064035 -> initscore=-2.682144
[LightGBM] [Info] Start training from score -2.682144


[I 2025-01-13 01:22:25,963] Trial 15 finished with value: 0.06375764657325793 and parameters: {'learning_rate': 0.0615710769528543, 'num_leaves': 36, 'min_child_samples': 26, 'n_estimators': 149, 'max_depth': 15}. Best is trial 13 with value: 0.06353930947710773.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 50866, number of negative: 743478
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.028895 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1012
[LightGBM] [Info] Number of data points in the train set: 794344, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.064035 -> initscore=-2.682144
[LightGBM] [Info] Start training from score -2.682144


[I 2025-01-13 01:22:31,921] Trial 16 finished with value: 0.06402176274025845 and parameters: {'learning_rate': 0.06925298960970899, 'num_leaves': 36, 'min_child_samples': 21, 'n_estimators': 135, 'max_depth': 15}. Best is trial 13 with value: 0.06353930947710773.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 50866, number of negative: 743478
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025267 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1012
[LightGBM] [Info] Number of data points in the train set: 794344, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.064035 -> initscore=-2.682144
[LightGBM] [Info] Start training from score -2.682144


[I 2025-01-13 01:22:37,934] Trial 17 finished with value: 0.07012284416810646 and parameters: {'learning_rate': 0.03035966559304507, 'num_leaves': 22, 'min_child_samples': 27, 'n_estimators': 150, 'max_depth': 10}. Best is trial 13 with value: 0.06353930947710773.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 50866, number of negative: 743478
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.028338 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1012
[LightGBM] [Info] Number of data points in the train set: 794344, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.064035 -> initscore=-2.682144
[LightGBM] [Info] Start training from score -2.682144


[I 2025-01-13 01:22:42,165] Trial 18 finished with value: 0.07523542894623725 and parameters: {'learning_rate': 0.047202156679456465, 'num_leaves': 38, 'min_child_samples': 21, 'n_estimators': 136, 'max_depth': 3}. Best is trial 13 with value: 0.06353930947710773.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),  # Focused range


[LightGBM] [Info] Number of positive: 50866, number of negative: 743478
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027063 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1012
[LightGBM] [Info] Number of data points in the train set: 794344, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.064035 -> initscore=-2.682144
[LightGBM] [Info] Start training from score -2.682144


[I 2025-01-13 01:22:47,743] Trial 19 finished with value: 0.0657677506224352 and parameters: {'learning_rate': 0.07213076928707594, 'num_leaves': 24, 'min_child_samples': 27, 'n_estimators': 136, 'max_depth': 13}. Best is trial 13 with value: 0.06353930947710773.


Best Parameters: {'learning_rate': 0.09767443321673915, 'num_leaves': 33, 'min_child_samples': 24, 'n_estimators': 139, 'max_depth': 15}
Best Log Loss: 0.0635
[LightGBM] [Info] Number of positive: 63471, number of negative: 929460
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.035064 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1004
[LightGBM] [Info] Number of data points in the train set: 992931, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.063923 -> initscore=-2.684021
[LightGBM] [Info] Start training from score -2.684021
Final Training Log Loss: 0.0612
Final Test Log Loss: 0.3646


# report
```
-- Mean Baseline -- 
Train Accuracy: 0.9361
Train Log Loss: 0.2376
Test Accuracy: 0.9101
Test Log Loss: 0.3075

-- Feed Forward Network -- 
Train Accuracy: 0.9668
Train Log Loss: 0.0765
Test Accuracy: 0.9399
Test Log Loss: 0.3268

-- Decision Tree -- 
Train Accuracy: 0.0639
Train Log Loss: 0.0765
Test Accuracy: 0.0899
Test Log Loss: 0.3268

-- LightGBM --
Train Accuracy: 0.9698
Train Log Loss: 0.0628
Test Accuracy: 0.9387
Test Log Loss: 0.3480

-- XGBoost -- 
Train Accuracy: 0.9694
Train Log Loss: 0.0689
Test Accuracy: 0.9396
Test Log Loss: 0.3430

-- Tunned LightGBM --
Train Log Loss: 0.0612
Test Log Loss: 0.3646
```


In [38]:
y_pred

array([0, 0, 0, ..., 0, 0, 0], shape=(970960,))

In [40]:
submission = test[['msno', 'is_churn']].copy()
submission

Unnamed: 0,msno,is_churn
0,moRTKhKIDvb+C8ZHOgmaF4dXMLk0jOn65d7a8tQ2Eds=,1
1,t5rqTxCnG7s5VBgEfdkQCezv5KBK7+DMujNibYgylrs=,1
2,1AzXWFlRO6EfMBzfBdk98sBVnjzY7U1G24mVFNdzGNQ=,1
3,qpV8BYuYz/Z7LFqEuo2QEMfwWWxdCIQQT4X/XsPXwhc=,1
4,LZjqFj4TwHsByrOSyjUp9l/B9WOF34HGX0Hx7uiQ5xQ=,1
...,...,...
162517,PsftdQEI+bQFl8FB2+O4sKM4uRZGO/UvBCDS+ZyWmvk=,0
162518,MqyOPaDM7Jz3kV3fu/h9ilHP3TxLaMFE9raYkEHg5Jg=,0
162519,SjVZDYaiKgEHpFX1PcFDS94b9CFdaHjg78rfumtm/F4=,0
162520,eZ3y0lsY2SVZc2h8T3zB454TuBz6oVDMlFsEpEPQclQ=,0


In [42]:
submission['is_churn'] = y_pred


In [45]:
submission

Unnamed: 0,msno,is_churn
0,moRTKhKIDvb+C8ZHOgmaF4dXMLk0jOn65d7a8tQ2Eds=,0
1,t5rqTxCnG7s5VBgEfdkQCezv5KBK7+DMujNibYgylrs=,0
2,1AzXWFlRO6EfMBzfBdk98sBVnjzY7U1G24mVFNdzGNQ=,0
3,qpV8BYuYz/Z7LFqEuo2QEMfwWWxdCIQQT4X/XsPXwhc=,1
4,LZjqFj4TwHsByrOSyjUp9l/B9WOF34HGX0Hx7uiQ5xQ=,0
...,...,...
162517,PsftdQEI+bQFl8FB2+O4sKM4uRZGO/UvBCDS+ZyWmvk=,0
162518,MqyOPaDM7Jz3kV3fu/h9ilHP3TxLaMFE9raYkEHg5Jg=,0
162519,SjVZDYaiKgEHpFX1PcFDS94b9CFdaHjg78rfumtm/F4=,0
162520,eZ3y0lsY2SVZc2h8T3zB454TuBz6oVDMlFsEpEPQclQ=,0
