# LGBMClassifier

Related notebooks:

1. EDA - https://www.kaggle.com/agorinenko/feb-2022-part1-eda
2. CatBoostClassifier - https://www.kaggle.com/agorinenko/feb-2022-part2-cat-boost-classifier

In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_validate

from lightgbm import LGBMClassifier

# Load data from the EDA notebook

In [None]:
# Read the train and test tables (in that order) from the EDA notebook's output folder.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/feb-2022-eda/train.csv',
        '../input/feb-2022-eda/test.csv',
    )
)

In [None]:
# Rescale the `gcd` column of both frames by the same constant (1/10000).
# Gotcha: the column is overwritten in place, so running this cell a second
# time without reloading the data divides the values again.
train_df['gcd'] = train_df['gcd'].div(10000)
test_df['gcd'] = test_df['gcd'].div(10000)

Let's separate the target variable and the features.

In [None]:
# Every column of the training frame is a feature except the row identifier,
# the raw target and its encoded copy. 'target_num' is only added to
# `train_df` further down, but it is excluded here as well so that re-running
# this cell after the encoding step cannot leak the target into the features.
features_columns = [
    col for col in train_df.columns
    if col not in ('row_id', 'target', 'target_num')
]

Encode the target variable.

In [None]:
# Fit a label encoder on the raw `target` column and store the resulting
# integer codes in a separate column; the original labels stay untouched and
# `le` is kept for decoding predictions later.
le = LabelEncoder()
target_col = 'target_num'

train_df[target_col] = le.fit_transform(train_df['target'])

train_df.head()

In [None]:
# Model inputs, all cast to float64: the feature matrices of both frames and
# the encoded target of the training frame.
X_train, X_test = (
    frame[features_columns].astype(np.float64)
    for frame in (train_df, test_df)
)
y_train = train_df[target_col].astype(np.float64)

# Tuning global parameters

In [None]:
!pip install optuna -q

In [None]:
!pip install lightgbm==3.3.2 -q 

In [None]:
import optuna
from sklearn.metrics import accuracy_score
from lightgbm import early_stopping
from lightgbm import log_evaluation

In [None]:
def objective(trial):
    """Optuna objective: fit one LGBMClassifier and return hold-out accuracy.

    The module-level ``X_train`` / ``y_train`` are split 70/30. The 30% part
    is used both as the early-stopping evaluation set and as the data on
    which the returned accuracy is computed.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``max_bin``, ``learning_rate``, ``max_depth``,
        ``num_leaves``, ``n_estimators`` and ``min_data_in_leaf``.

    Returns
    -------
    float
        Accuracy of the fitted model on the hold-out part (to be maximized).
    """
    # Seeded split: every trial is evaluated on the same hold-out rows, so
    # differences between trial values are not caused by a different split
    # and the study can be reproduced.
    train_x, valid_x, train_y, valid_y = train_test_split(
        X_train, y_train, test_size=0.3, random_state=42
    )

    param = {
        # Fixed settings.
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'device': 'cpu',
        'random_state': 42,
        'verbosity': -1,
        # Search space.
        'max_bin': trial.suggest_int('max_bin', 255, 455, step=25),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.551, step=0.05),
        'max_depth': trial.suggest_int("max_depth", 1, 12, step=1),
        'num_leaves': trial.suggest_int('num_leaves', 31, 4031, step=100),
        'n_estimators': trial.suggest_int('n_estimators', 1000, 4000, step=200),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 20, 1920, step=100),
    }

    lgbm = LGBMClassifier(**param)
    # Early stopping watches multi_logloss on the hold-out set (patience of
    # 100 rounds), while the value reported to Optuna below is accuracy.
    lgbm.fit(
        train_x,
        train_y,
        eval_metric='multi_logloss',
        eval_set=[(valid_x, valid_y)],
        callbacks=[early_stopping(100, verbose=0), log_evaluation(0)],
    )

    preds = lgbm.predict(valid_x)
    return accuracy_score(valid_y, preds)

In [None]:
# %%time

# Hyper-parameter search with Optuna (50 trials of `objective`).
# It is disabled by default so that "Run All" does not repeat the search;
# set RUN_TUNING to True to execute it again.
RUN_TUNING = False

if RUN_TUNING:
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=50)

    print(f"Number of finished trials: {len(study.trials)}")

    print("Best trial:")
    trial = study.best_trial

    print(f"  Value: {trial.value}")

    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

Best params found by the Optuna search above (they are reused in the "Validate" section below):
1. max_bin: 405
1. learning_rate: 0.101
1. max_depth: 7
1. num_leaves: 2431
1. n_estimators: 1600
1. min_data_in_leaf: 920

# Validate

In [None]:
# Fixed LightGBM settings (the same ones the tuning objective uses) ...
trial_params = {
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'device': 'cpu',
    'random_state': 42,
    'verbosity': -1,
}
# ... followed by the tuned values listed under "Best params".
trial_params.update({
    'max_bin': 405,
    'learning_rate': 0.101,
    'max_depth': 7,
    'num_leaves': 2431,
    'n_estimators': 1600,
    'min_data_in_leaf': 920,
})

model = LGBMClassifier(**trial_params)

In [None]:
# 5-fold cross-validation of the configured model on the training data.
# A single scorer name is passed, so the result is stored under 'test_score'.
scores = cross_validate(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
)

In [None]:
# Average the per-fold accuracies returned by cross_validate.
mean_accuracy = scores["test_score"].mean()
print(f'Mean validation accuracy score: {mean_accuracy}')

# Train model

In [None]:
%%time

model.fit(X_train, y_train)

# Predict

In [None]:
# Predicted (encoded) class for every row of the test feature matrix.
y_pred = model.predict(X=X_test)

# Submission

In [None]:
def save_submission(y_pred):
    """Decode predicted class codes and write them to ``submission.csv``.

    Relies on two module-level objects: ``le`` (the fitted label encoder,
    used to map codes back to the original labels) and ``test_df`` (source
    of the ``row_id`` column).

    Parameters
    ----------
    y_pred : array-like
        Predicted class codes, one per row of ``test_df``. The input is
        flattened and cast to ``int64`` before decoding.

    Returns
    -------
    pandas.DataFrame
        The two-column frame (``row_id``, ``target``) that was written to disk.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of test rows.
    """
    codes = np.asarray(y_pred).ravel().astype(np.int64)

    # Validate up front: checking the length only after the column has been
    # assigned can never fail, because the assignment itself would raise first.
    if len(codes) != len(test_df):
        raise ValueError(
            f"Expected {len(test_df)} predictions (one per test row), got {len(codes)}"
        )

    submission = test_df[['row_id']].copy()
    submission["target"] = le.inverse_transform(codes)

    submission.to_csv("submission.csv", index=False)
    return submission

In [None]:
# Write the submission file, then preview its first rows.
submission = save_submission(y_pred.flatten())
submission.head()