# Light GBM

# Model & Tahmin

In [2]:
import numpy as np
import pandas as pd 
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns
from sklearn.preprocessing import scale 
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score,roc_curve
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from warnings import filterwarnings
filterwarnings('ignore')

diabetes = pd.read_csv("diabetes.csv")

In [3]:
df = diabetes.copy()
df = df.dropna()
y = df["Outcome"]
X = df.drop(['Outcome'], axis=1)
#X = df["Pregnancies"]
X = pd.DataFrame(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.30, 
                                                    random_state=42)

In [4]:
from lightgbm import LGBMClassifier

In [5]:
lgbm_model = LGBMClassifier().fit(X_train, y_train)

In [6]:
y_pred = lgbm_model.predict(X_test)
accuracy_score(y_test, y_pred)

0.7229437229437229

# Model Tuning

In [11]:
?lgbm_model

[0;31mType:[0m           LGBMClassifier
[0;31mString form:[0m    LGBMClassifier()
[0;31mFile:[0m           /opt/anaconda3/lib/python3.9/site-packages/lightgbm/sklearn.py
[0;31mDocstring:[0m      LightGBM classifier.
[0;31mInit docstring:[0m
Construct a gradient boosting model.

Parameters
----------
boosting_type : str, optional (default='gbdt')
    'gbdt', traditional Gradient Boosting Decision Tree.
    'dart', Dropouts meet Multiple Additive Regression Trees.
    'goss', Gradient-based One-Side Sampling.
    'rf', Random Forest.
num_leaves : int, optional (default=31)
    Maximum tree leaves for base learners.
max_depth : int, optional (default=-1)
    Maximum tree depth for base learners, <=0 means no limit.
learning_rate : float, optional (default=0.1)
    Boosting learning rate.
    You can use ``callbacks`` parameter of ``fit`` method to shrink/adapt learning rate
    in training using ``reset_parameter`` callback.
    Note, that this will ignore the ``learning_rate`` 

In [8]:
lgbm_params = {
        'n_estimators': [100, 500, 1000, 2000],
        'subsample': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5,6],
        'learning_rate': [0.1,0.01,0.02,0.05],
        "min_child_samples": [5,10,20]}

In [9]:
lgbm = LGBMClassifier()

lgbm_cv_model = GridSearchCV(lgbm, lgbm_params, 
                             cv = 10, 
                             n_jobs = -1, 
                             verbose = 2)



In [12]:
lgbm_cv_model.fit(X_train, y_train)

Fitting 10 folds for each of 576 candidates, totalling 5760 fits
[CV] END learning_rate=0.1, max_depth=3, min_child_samples=5, n_estimators=100, subsample=0.6; total time=   0.0s
[CV] END learning_rate=0.1, max_depth=3, min_child_samples=5, n_estimators=100, subsample=1.0; total time=   0.0s
[CV] END learning_rate=0.1, max_depth=3, min_child_samples=5, n_estimators=500, subsample=0.6; total time=   0.1s
[CV] END learning_rate=0.1, max_depth=3, min_child_samples=5, n_estimators=500, subsample=0.8; total time=   0.1s
[CV] END learning_rate=0.1, max_depth=3, min_child_samples=5, n_estimators=500, subsample=1.0; total time=   0.1s
[CV] END learning_rate=0.1, max_depth=3, min_child_samples=5, n_estimators=500, subsample=1.0; total time=   0.1s
[CV] END learning_rate=0.1, max_depth=3, min_child_samples=5, n_estimators=1000, subsample=0.6; total time=   0.1s
[CV] END learning_rate=0.1, max_depth=3, min_child_samples=5, n_estimators=1000, subsample=0.6; total time=   0.1s
[CV] END learning_rat

GridSearchCV(cv=10, estimator=LGBMClassifier(), n_jobs=-1,
             param_grid={'learning_rate': [0.1, 0.01, 0.02, 0.05],
                         'max_depth': [3, 4, 5, 6],
                         'min_child_samples': [5, 10, 20],
                         'n_estimators': [100, 500, 1000, 2000],
                         'subsample': [0.6, 0.8, 1.0]},
             verbose=2)

In [13]:
lgbm_cv_model.best_params_

{'learning_rate': 0.05,
 'max_depth': 3,
 'min_child_samples': 20,
 'n_estimators': 100,
 'subsample': 0.6}

In [14]:
lgbm = LGBMClassifier(learning_rate = 0.05, 
                       max_depth = 3,
                       subsample = 0.6,
                       n_estimators = 100,
                       min_child_samples = 20)

In [15]:
lgbm_tuned = lgbm.fit(X_train,y_train)

In [16]:
y_pred = lgbm_tuned.predict(X_test)
accuracy_score(y_test, y_pred)

0.7489177489177489

[CV] END learning_rate=0.05, max_depth=4, min_child_samples=10, n_estimators=500, subsample=0.6; total time=   0.1s
[CV] END learning_rate=0.05, max_depth=4, min_child_samples=10, n_estimators=500, subsample=0.6; total time=   0.1s
[CV] END learning_rate=0.05, max_depth=4, min_child_samples=10, n_estimators=500, subsample=1.0; total time=   0.1s
[CV] END learning_rate=0.05, max_depth=4, min_child_samples=10, n_estimators=500, subsample=1.0; total time=   0.1s
[CV] END learning_rate=0.05, max_depth=4, min_child_samples=10, n_estimators=1000, subsample=0.6; total time=   0.3s
[CV] END learning_rate=0.05, max_depth=4, min_child_samples=10, n_estimators=1000, subsample=0.6; total time=   0.3s
[CV] END learning_rate=0.05, max_depth=4, min_child_samples=10, n_estimators=1000, subsample=1.0; total time=   0.3s
[CV] END learning_rate=0.05, max_depth=4, min_child_samples=10, n_estimators=1000, subsample=1.0; total time=   0.3s
[CV] END learning_rate=0.05, max_depth=4, min_child_samples=10, n_es