# **LightGBM Model**


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 
plt.xkcd()

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Directory that holds the input CSV files. File names are appended to this
# string directly further down, so it has to end with a path separator.
PATH = "../input/"
# Show which files are available in the input directory.
print(os.listdir(PATH))

# Any results you write to the current directory are saved as output.

## Read in the data while reducing the memory footprint of the variables
The implementation was copied from [this kernel](https://www.kaggle.com/gemartin/load-data-reduce-memory-usage)

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce its memory usage.

    * ``object`` columns are converted to ``category``.
    * Signed-integer columns are cast to the smallest of int8/int16/int32/int64
      whose range contains the column's min and max (bounds are inclusive).
    * Float columns are cast to float16 or float32 when their min and max lie
      strictly inside that type's finite range, otherwise to float64.
      NOTE: this only checks the range; the narrower float types keep fewer
      significant digits, so values may be rounded.
    * Every other dtype (bool, unsigned, datetime, category, pandas extension
      dtypes) is left untouched, which also makes the function safe to call
      again on its own output.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate integer types, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # min()/max() and the range comparisons below are only meaningful for
        # plain NumPy signed-integer and float columns; skip everything else
        # instead of crashing (unordered categories) or silently turning the
        # column into floats (bool, unsigned).
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column whose max is exactly 127 fits int8.
                # For an empty column min/max are NaN, every comparison is
                # False and the column keeps its dtype.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
            df[col] = df[col].astype(np.float16)
        elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
            df[col] = df[col].astype(np.float32)
        else:
            # Also reached for all-NaN columns, where both comparisons are False.
            df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


def import_data(file):
    """Read a CSV file into a DataFrame and shrink its memory usage.

    Parameters
    ----------
    file : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame after ``reduce_mem_usage`` has downcast its columns.
    """
    # `keep_date_col` was dropped from the call: it only matters when
    # `parse_dates` combines several columns into one, which is not the case
    # here, and newer pandas releases deprecate the keyword.
    df = pd.read_csv(file, parse_dates=True)
    df = reduce_mem_usage(df)
    return df

In [None]:
# Load the train and test application tables from the input directory.
application_train = import_data(f"{PATH}application_train.csv")
application_test = import_data(f"{PATH}application_test.csv")

The following 2 cells with cleaning criteria were inherited from [this kernel](https://www.kaggle.com/kingychiu/home-credit-eda-distributions-and-outliers)

In [None]:
# Drop the outlier rows in one pass: the single extreme income value, the
# single extreme quarterly bureau-request count, and social-circle observation
# counts of 300 or more. The `<` comparison is False for NaN, so rows with a
# missing OBS_30_CNT_SOCIAL_CIRCLE are removed as well.
application_train = application_train[
    (application_train['AMT_INCOME_TOTAL'] != 1.170000e+08)
    & (application_train['AMT_REQ_CREDIT_BUREAU_QRT'] != 261)
    & (application_train['OBS_30_CNT_SOCIAL_CIRCLE'] < 300)
]

In [None]:
# The value 365243 in DAYS_EMPLOYED is treated as "missing" and replaced by NaN.
# `Series.mask` does this in one vectorised step instead of a Python-level
# lambda per row. The same cleaning is applied to the test frame so that both
# frames encode this column identically.
application_train['DAYS_EMPLOYED'] = application_train['DAYS_EMPLOYED'].mask(
    application_train['DAYS_EMPLOYED'] == 365243)
application_test['DAYS_EMPLOYED'] = application_test['DAYS_EMPLOYED'].mask(
    application_test['DAYS_EMPLOYED'] == 365243)

## Additional numerical features
The credit length feature idea is due to [@oskird](https://www.kaggle.com/sz8416), who implemented it [in the corresponding kernel](https://www.kaggle.com/sz8416/eda-baseline-model-using-application)

In [None]:
def feat_ext_source(df):
    """Add EXT_SOURCE interaction features and CREDIT_LENGTH to ``df`` in place.

    New columns, in the order they are appended:

    * ``EXT_SOURCE_<a>over<b>_NAminus1_Add0.1`` -- ratio of two external
      scores after missing values are replaced by -1 and 0.1 is added.
    * ``EXT_SOURCE_na<a>_<b>`` -- score ``b`` (missing -> 0) where score ``a``
      is missing, and 0 elsewhere.
    * ``CREDIT_LENGTH`` -- ``AMT_CREDIT`` divided by ``AMT_ANNUITY``.

    Returns the same DataFrame object that was passed in.
    """
    # Filled-and-shifted copy of each score, keyed by its numeric suffix.
    shifted = {
        sid: df['EXT_SOURCE_' + sid].fillna(-1) + 1e-1
        for sid in ('1', '2', '3')
    }

    # Each unordered pair yields both ratios: a/b first, then b/a.
    for a, b in (('1', '2'), ('1', '3'), ('2', '3')):
        df['EXT_SOURCE_{}over{}_NAminus1_Add0.1'.format(a, b)] = shifted[a] / shifted[b]
        df['EXT_SOURCE_{}over{}_NAminus1_Add0.1'.format(b, a)] = shifted[b] / shifted[a]

    # "Score `missing` is absent" indicator multiplied by the other score.
    ordered_pairs = (('1', '2'), ('1', '3'),
                     ('2', '1'), ('2', '3'),
                     ('3', '1'), ('3', '2'))
    for missing, other in ordered_pairs:
        is_missing = df['EXT_SOURCE_' + missing].isnull()
        df['EXT_SOURCE_na{}_{}'.format(missing, other)] = (
            is_missing * df['EXT_SOURCE_' + other].fillna(0))

    df['CREDIT_LENGTH'] = df['AMT_CREDIT'] / df['AMT_ANNUITY']

    return df

In [None]:
# Add the engineered features to both frames (train first, then test).
application_train, application_test = (
    feat_ext_source(application_train),
    feat_ext_source(application_test),
)

## Categorical encoding
The function was taken from [this kernel](https://www.kaggle.com/sz8416/simple-intro-eda-baseline-model-with-gridsearch). It performs one-hot encoding (OHE) and keeps only those columns that are common to the train and test samples. OHE is performed using `pd.get_dummies`, which converts the categorical features while leaving the numerical ones untouched.

In [None]:
# use this if you want to convert categorical features to dummies(default)
def cat_to_dummy(train, test):
    """One-hot encode ``train`` and ``test`` and keep only their shared columns.

    Both frames are encoded independently with ``pd.get_dummies``. Any column
    that then exists in only one of the two results is dropped, except
    ``TARGET``, which is always kept.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames to encode. They are not modified.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The encoded train and test frames.
    """
    train_d = pd.get_dummies(train, drop_first=False)
    test_d = pd.get_dummies(test, drop_first=False)

    # make sure that the number of features in train and test should be same
    train_only = [c for c in train_d.columns
                  if c not in test_d.columns and c != 'TARGET']
    test_only = [c for c in test_d.columns
                 if c not in train_d.columns and c != 'TARGET']
    train_d = train_d.drop(train_only, axis=1)
    # The original second loop dropped the stale loop variable of the first
    # loop (`i`) instead of the test-only column, which either raised a
    # KeyError or removed the wrong column.
    test_d = test_d.drop(test_only, axis=1)

    print('Memory usage of train increases from {:.2f} to {:.2f} MB'.format(train.memory_usage().sum() / 1024**2, 
                                                                            train_d.memory_usage().sum() / 1024**2))
    print('Memory usage of test increases from {:.2f} to {:.2f} MB'.format(test.memory_usage().sum() / 1024**2, 
                                                                            test_d.memory_usage().sum() / 1024**2))
    return train_d, test_d

# One-hot encode both frames, keeping only the dummy columns they share.
# NOTE(review): the two names assigned here are assigned again by the
# cat_to_int call further down, so the later cells do not use this one-hot
# result -- confirm whether this call is still needed.
application_train_ohe, application_test_ohe = cat_to_dummy(application_train, application_test)

In [None]:
# use this if you want to convert categorical features to integer codes
def cat_to_int(train, test):
    """Integer-encode the categorical columns of ``train`` and ``test``.

    A column counts as categorical when its dtype in ``train`` is ``object``
    or ``category``. Codes are learned from ``train`` with ``pd.factorize``
    (first value seen -> 0, missing -> -1) and then looked up for ``test``;
    test values that never occur in ``train`` are encoded as -1.

    The encoding is done on copies, so the frames passed in are left as they
    were and can still be used by other encoders afterwards.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that share the categorical columns found in ``train``.

    Returns
    -------
    (list, pandas.DataFrame, pandas.DataFrame)
        Names of the encoded columns, encoded train copy, encoded test copy.
    """
    mem_orig_train = train.memory_usage().sum() / 1024**2
    mem_orig_test  = test .memory_usage().sum() / 1024**2

    # Work on copies: the original overwrote the caller's columns in place.
    train = train.copy()
    test = test.copy()

    categorical_feats = [ f for f in train.columns if train[f].dtype == 'object' or train[f].dtype.name == 'category' ]
    print('---------------------')
    print(categorical_feats)
    for f_ in categorical_feats:
        train[f_], indexer = pd.factorize(train[f_])
        # Reuse the train vocabulary so both frames share one code per value.
        test[f_] = indexer.get_indexer(test[f_])
    print('Memory usage of train increases from {:.2f} to {:.2f} MB'.format(mem_orig_train, 
                                                                            train.memory_usage().sum() / 1024**2))
    print('Memory usage of test increases from {:.2f} to {:.2f} MB'.format(mem_orig_test, 
                                                                            test.memory_usage().sum() / 1024**2))
    return categorical_feats, train, test

# Integer-encode the categorical columns of both frames. This rebinds
# application_train_ohe / application_test_ohe, so from here on those names
# refer to the integer-coded frames (despite the "_ohe" suffix).
categorical_feats, application_train_ohe, application_test_ohe = cat_to_int(application_train, application_test)

## Deal with category imbalance
Use a standard library (`imblearn`) to do random undersampling of the dominating category. Use it if you want to repeat the HP optimisation.

In [None]:

# Feature matrix: every column except the row identifier and the label.
X_rus = application_train_ohe.drop(['SK_ID_CURR', 'TARGET'], axis=1)
# Label vector.
y_rus = application_train_ohe['TARGET']

In [None]:

# Keep the label column under its own name and show how many rows it has.
target = application_train_ohe['TARGET']
target.shape

In [None]:
# Import libraries
from matplotlib import pyplot as plt
import numpy as np

# Class counts are computed from the label column instead of being typed in
# by hand, so the chart stays correct when the filtering above changes.
class_counts = target.value_counts()
total = int(class_counts.sum())

# Creating dataset (label order and data order must match: class 1, class 0)
counts = ['1_count' , '0_count']

data = [int(class_counts.get(1, 0)), int(class_counts.get(0, 0))]

# Creating plot
fig = plt.figure(figsize =(10, 7))
plt.pie(data, labels = counts)

# show plot
plt.show()



# Burda modeli train ve test set diye ayiriyoruz ayrica targeti da ayriyoruz.
# In this part we split the data set into training and validation sets, and we separate the target from the features.

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable.
# NOTE(review): no `stratify=` argument is passed, so the class ratio of the
# two parts is left to chance. Given the imbalance discussed in this notebook,
# consider stratify=y_rus -- this changes the split and every number derived
# from it, so confirm before switching.
X_train, X_val, y_train, y_val = train_test_split(X_rus, y_rus, test_size=0.20, random_state=314)

# Ikinci asamada validation için Kfold mu stratifiedKfold mu kullanıcağımızı öğrenmek için Targettaki data dağılımına baktık ve dağılımda büyük bir eşitsizlik olduğundan dolayı startifiedKfold kullanmaya karar verdik.

# In the second part, we choose the validation scheme: KFold or StratifiedKFold. To decide, we look at the class balance of the target variable. The two classes have very different shares, so we decided to use StratifiedKFold.

In [None]:
# Class counts of the training labels, computed from the data instead of
# being typed in by hand.
train_class_counts = y_train.value_counts()
print(train_class_counts)

count1 = int(train_class_counts.get(1, 0))
count0 = int(train_class_counts.get(0, 0))

total = count0 + count1


# Import libraries
from matplotlib import pyplot as plt
import numpy as np


# Creating dataset (label order and data order must match: class 1, class 0)
counts = ['1_count' , '0_count']

data = [count1, count0]

# Creating plot
fig = plt.figure(figsize =(10, 7))
plt.pie(data, labels = counts)

# show plot
plt.show()

# Comment: the two classes have very different shares of the target, which is
# why StratifiedKFold is the validation strategy chosen here.

In [None]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from imblearn.datasets import make_imbalance

In [None]:
# Bar chart of how many training rows carry each label value.
y_train.value_counts().plot.bar().set(
    title='label balance',
    xlabel='label values',
    ylabel='amount per label',
)
plt.show()

In [None]:

from sklearn.impute import SimpleImputer

## Filling the nan values with most frequent value.

In [None]:
# Fill missing values with the most frequent value of each column.
imputer = SimpleImputer(strategy='most_frequent')
# fit_transform returns a bare array. Rebuild the frame with the original
# column names AND the original row index, so the rows stay aligned with
# y_train by label as well as by position.
imputed_X_train = pd.DataFrame(imputer.fit_transform(X_train),
                               columns=X_train.columns,
                               index=X_train.index)

In [None]:
# Apply the imputer that was fitted on X_train in the previous cell. Fitting a
# second imputer on the validation rows would fill them with statistics taken
# from the validation data itself, so the two sets would be preprocessed
# differently. The original row index is kept so the rows stay aligned with
# y_val.
imputed_X_val = pd.DataFrame(imputer.transform(X_val),
                             columns=X_val.columns,
                             index=X_val.index)

# StratifiedKfold algoritmasi ile validation, bu asama da accuracy score yüzde 90 üzeri çıktığından dolayı burda overfitting yaptığımızı düşünüyoruz ama nedenini çözemedik.

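# We use StratifiedKFold. However, we observe very high accuracy scores (above 90%). We thought that we had made a mistake that causes overfitting, but we could not find the reason. Note that with a target this imbalanced (about 92% of the training labels are 0), a model that always predicts the majority class already reaches about 92% accuracy, so a high accuracy on its own does not indicate overfitting.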

In [None]:
# 5-fold stratified cross-validation of `model` on the imputed, unbalanced
# validation split.
# NOTE(review): `model` is only created in a later cell
# ("model = lgb.LGBMClassifier(...)"), so that cell has to be run first.
# NOTE(review): no `scoring` is passed; presumably the estimator's default
# score (accuracy) is reported, which the majority class dominates -- confirm.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
cross_val_score(model, imputed_X_val, y_val, cv=skf) #overfitting

In [None]:
# Same 5-fold stratified cross-validation, this time on the class-balanced
# validation sample.
# NOTE(review): X_val_B / y_val_B and `model` are created in later cells
# (the make_imbalance and LGBMClassifier cells), so those have to run first.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
cross_val_score(model, X_val_B, y_val_B, cv=skf)

# We tried other methods, but we could not find anything.

In [None]:
# fold_no = 1
# for train_index, test_index in skf.split(application_train_ohe, target):
#     train = application_train_ohe.loc[train_index,:]
#     test = application_train_ohe.loc[test_index,:]
#     #print(sum(test['TARGET']))
#     print('Fold',str(fold_no),'Class Ratio:',(test['TARGET'].count()/len(application_train_ohe['TARGET'])))
#     fold_no += 1

In [None]:
# data = application_train_ohe.dropna(subset = ["TARGET"])
# data["TARGET"].isnull().sum()

In [None]:
# from sklearn import preprocessing
# from sklearn import utils




In [None]:
# def train_model(train, test, fold_no):
#     y_train = train['TARGET']
#     X_train = train.drop(["TARGET" , "SK_ID_CURR"],axis=1)
#     y_test = test['TARGET']
#     X_test = test.drop(["TARGET" ,"SK_ID_CURR" ],axis=1)
    
    
    
#     lab_enc = preprocessing.LabelEncoder()
#     y_train = lab_enc.fit_transform(y_train)
#     print(y_train.shape , X_train.shape)
    
#     model.fit(X_train,y_train)
#     predictions = model.predict(X_test)
#     print('Fold',str(fold_no),'Accuracy:',accuracy_score(y_test,predictions))

In [None]:
# fold_no = 1
# for train_index, test_index in skf.split(data, target):
#     train = data.loc[train_index,:]
#     test = data.loc[test_index,:]
#     train_model(train,test,fold_no)
#     fold_no += 1

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# fold_no = 1
# for train_index, test_index in skf.split(application_train_ohe, target):
#     train = application_train_ohe.loc[train_index,:]
#     test = application_train_ohe.loc[test_index,:]

#     fold_no += 1 
#     #in order to build folds and 

# Balance the data
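Bizde stratified k-fold u uygulayamayinca benzerini ama daha simple versyonunu uygulamaya karar verdik ve datayi oranlarini esitledik.

Since we could not get stratified k-fold to work, we decided to apply a similar but simpler approach and equalised the class ratios in the data.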

In [None]:
# Draw a class-balanced training sample: 19000 rows per class, capped at the
# size of the smaller class so the request can never exceed the rows that are
# actually available.
n_train_per_class = min(19000, int(y_train.value_counts().min()))
X_train_B, y_train_B = make_imbalance(imputed_X_train, y_train,
                                      sampling_strategy={0: n_train_per_class, 1: n_train_per_class},
                                      random_state=14)


In [None]:
# Draw a class-balanced validation sample: 5010 rows per class, capped at the
# size of the smaller class. With a 20% hold-out the minority class may have
# fewer than 5010 rows, in which case the fixed request would be rejected.
n_val_per_class = min(5010, int(y_val.value_counts().min()))
X_val_B, y_val_B = make_imbalance(imputed_X_val, y_val,
                                  sampling_strategy={0: n_val_per_class, 1: n_val_per_class},
                                  random_state=14)

# Modeli default parametrelerle oluşturduk.

# We create the model with default parameters.

In [None]:
# Baseline LightGBM classifier with basic parameters, trained on the balanced
# training sample; `fit` returns the estimator itself, so the calls are chained.
model = lgb.LGBMClassifier(
    max_depth=-1,
    n_estimators=100,
    n_jobs=4,
).fit(X_train_B, y_train_B)

# Accuracy on the balanced validation sample.
predictions = model.predict(X_val_B)
print('Accuracy:', accuracy_score(y_val_B, predictions))


### Prepare learning rate shrinkage

In [None]:
def learning_rate_010_decay_power_099(current_iter):
    """Return 0.1 * 0.99**current_iter, never going below 1e-3."""
    floor = 1e-3
    decayed = 0.1 * np.power(.99, current_iter)
    # The floor is listed first so that it is the value returned whenever the
    # decayed rate is not strictly larger.
    return max(floor, decayed)

def learning_rate_010_decay_power_0995(current_iter):
    """Return 0.1 * 0.995**current_iter, never going below 1e-3."""
    floor = 1e-3
    decayed = 0.1 * np.power(.995, current_iter)
    # The floor is listed first so that it is the value returned whenever the
    # decayed rate is not strictly larger.
    return max(floor, decayed)

def learning_rate_005_decay_power_099(current_iter):
    """Return 0.05 * 0.99**current_iter, never going below 1e-3."""
    floor = 1e-3
    decayed = 0.05 * np.power(.99, current_iter)
    # The floor is listed first so that it is the value returned whenever the
    # decayed rate is not strictly larger.
    return max(floor, decayed)

### Use test subset for early stopping criterion 
This allows us to avoid overtraining and we do not need to optimise the number of trees

In [None]:
import lightgbm as lgb
# Keyword arguments forwarded to every `fit` call (the random search and the
# final model): AUC is evaluated on the balanced validation sample, which is
# registered under the name 'valid', with an early-stopping setting of 30 rounds.
# NOTE(review): newer LightGBM releases reportedly no longer accept
# `early_stopping_rounds` / `verbose` as fit() arguments and expect callbacks
# instead -- confirm against the installed version. A 'callbacks' entry added
# here would clash with the explicit `callbacks=` keyword of the final fit call.
fit_params={"early_stopping_rounds":30, 
            "eval_metric" : 'auc', 
            "eval_set" : [(X_val_B,y_val_B)],
            'eval_names': ['valid'],
            #'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099)],
            'verbose': 100,
            'categorical_feature': 'auto'}

### Set up HyperParameter search
We use random search, which is more flexible and more efficient than a grid search

In [None]:
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
# Search space for the random search: scipy distributions are sampled, plain
# lists are drawn from uniformly.
# sp_uniform(loc, scale) covers [loc, loc + scale], i.e. subsample in [0.2, 1.0]
# and colsample_bytree in [0.4, 1.0].
# NOTE(review): presumably `subsample` only takes effect when a bagging
# frequency is also set -- confirm against the LightGBM documentation.
param_test ={'num_leaves': sp_randint(6, 50), 
             'min_child_samples': sp_randint(100, 500), 
             'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             'subsample': sp_uniform(loc=0.2, scale=0.8), 
             'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
             'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}

In [None]:
# Number of hyper-parameter combinations the random search will sample.
n_HP_points_to_test = 5

import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Base estimator for the search. n_estimators=200 is the upper bound on the
# number of trees; the early-stopping setting in fit_params can end training
# sooner.
clf = lgb.LGBMClassifier(max_depth=-1, random_state=314, silent=True, metric='None', n_jobs=4, n_estimators=200)
# Quick sanity check of the base estimator before the search: fit on the
# balanced training sample and report accuracy on the balanced validation sample.
clf.fit(X_train_B,y_train_B)
predictions = clf.predict(X_val_B)
print('Accuracy:',accuracy_score(y_val_B,predictions))

# Random search over param_test, scored by ROC AUC with 3-fold CV; refit=True
# retrains the best combination on the full data passed to gs.fit.
gs = RandomizedSearchCV(
    estimator=clf, param_distributions=param_test, 
    n_iter=n_HP_points_to_test,
    scoring='roc_auc',
    cv=3,
    refit=True,
    random_state=314,
    verbose=True)

In [None]:
# Run the random search on the balanced training sample, then report the best
# cross-validated score and the parameters that produced it.
gs.fit(X_train_B, y_train_B, **fit_params)
print(f'Best score reached: {gs.best_score_} with params: {gs.best_params_} ')

In [None]:
# Hard-coded hyper-parameters, used to configure the final model without
# re-running the search.
opt_parameters = {
    'colsample_bytree': 0.950,
    'min_child_samples': 301,
    'min_child_weight': 0.1,
    'num_leaves': 28,
    'reg_alpha': 0,
    'reg_lambda': 100,
    'subsample': 0.93264,
}

In [None]:
# Display the best hyper-parameter combination found by the random search.
gs.best_params_

# Lofo importance hyperparametreleri optimize etmek icin kullanmaya calistik ancak hata aldik.

# We tried to use LOFO importance to optimise the hyperparameters, but we got an error.

In [None]:
# !pip install git+https://github.com/aerdem4/lofo-importance

In [None]:
# from lofo.lofo_importance import LOFOImportance, plot_importance

In [None]:
# lofo_imp = LOFOImportance(X_train, model = clf,scoring='neg_mean_squared_error')
# importance_df = lofo_imp.get_importance()
# plot_importance(importance_df, figsize=(12, 12))

## FINAL MODEL
We set the best parameters.


In [None]:
# The final model starts from the base estimator's configuration and is then
# overridden with the hard-coded optimal parameters (`set_params` returns the
# estimator itself). To configure it from the search result instead, build it
# from gs.best_estimator_.get_params().
clf_final = lgb.LGBMClassifier(**clf.get_params()).set_params(**opt_parameters)

# Train the final model with learning rate decay
clf_final.fit(
    X_train_B,
    y_train_B,
    callbacks=[lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_0995)],
    **fit_params,
)