# Light Gradient Boosting Model testing
The aim of this notebook is to review the Light Gradient Boosting Machine (LightGBM) model, which can be used in a binary classification challenge.

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Print the full path of every file available under the Kaggle input directory
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root_dir, file_name)
        print(full_path)

# Light Gradient Boosting

In [None]:
# Import modules for model analysis
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Import lightgbm modules
import lightgbm as lgb

In [None]:
# Read in the data; the first CSV column (index_col=0) is used as the DataFrame index
train = pd.read_csv('../input/tabular-playground-series-sep-2021/train.csv',index_col=0)
test  = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv', index_col=0)

# Preview the first rows of the training data
train.head()

In [None]:
# Check the total memory consumed by the train DataFrame
# (memory_usage='deep' requests pandas' deep memory introspection)
train.info(memory_usage='deep')

In [None]:
# Memory usage by variable: bytes reported by pandas, scaled by 1e-6 to MB
train.memory_usage(deep=True) * 1e-6

In [None]:
# Lets reduce the memory usage of the features
# First - check the integer values and downcast
def int_downcast(df):
    """Downcast every int64 column of ``df`` to a smaller integer dtype.

    Each int64 column has its minimum and maximum printed and is then passed
    through ``pd.to_numeric(..., downcast='integer')``. The frame is modified
    in place and the same object is returned.
    """
    wide_int_names = df.select_dtypes(include=['int64']).columns

    for name in wide_int_names:
        values = df[name]
        print(name, 'min:', values.min(), '; max:', values.max())
        df[name] = pd.to_numeric(values, downcast='integer')
    return df

# Downcast the integer columns of train in place, then re-check the per-column memory usage in MB
int_downcast(train)
train.memory_usage(deep=True) * 1e-6

In [None]:
# Second - check the float values and downcast. Method will have to be applied to the train and test DataFrames
def float_downcast(df):
    """Downcast every float64 column of ``df`` to a smaller float dtype.

    Each float64 column is passed through ``pd.to_numeric(..., downcast='float')``.
    The frame is modified in place and the same object is returned.
    """
    for name in df.select_dtypes(include=['float64']).columns:
        df[name] = pd.to_numeric(df[name], downcast='float')
    return df

# Downcast the float columns of both frames in place.
# The DataFrame returned by the last call is what the cell displays.
float_downcast(train)
float_downcast(test)

In [None]:
# Check the memory usage by feature (MB) for both frames.
# A notebook cell only renders its last expression, so two bare expressions would
# silently drop the train result; both are combined into a single table instead.
pd.concat(
    [train.memory_usage(deep=True) * 1e-6, test.memory_usage(deep=True) * 1e-6],
    axis=1,
    keys=['train_MB', 'test_MB'],
)

In [None]:
# Review the total memory usage of each DataFrame (info() prints its own report, so both are shown)
train.info(memory_usage='deep')
test.info(memory_usage='deep')

# Missing value treatment

In [None]:
# Check for missing values per column in both frames.
# The counts are stored and shown as the cell's last expression; as bare expressions
# followed by assignments they would be computed and then thrown away.
missing_counts = pd.concat(
    [train.isnull().sum(), test.isnull().sum()],
    axis=1,
    keys=['train', 'test'],
)

# Add the number of missing feature values in each row as an extra column
features = [x for x in train.columns.values if x[0]=="f"]
train['n_missing'] = train[features].isna().sum(axis=1)
test['n_missing'] = test[features].isna().sum(axis=1)

missing_counts

# Model Analysis

In [None]:
# Feature matrix X: every column of train except the target; y: the 'claim' target column
X = train.drop('claim', axis=1)
y = train['claim']

In [None]:
# Hold out 30% of the rows for validation, stratified on the target so both splits keep its class balance.
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=6, stratify=y)

# Using LGB dataset method and train

In [None]:
# Review using the LGB dataset and model build methods
lgb_train = lgb.Dataset(X_train, label=y_train)
# The validation Dataset references the training Dataset
lgb_eval = lgb.Dataset(X_test, label=y_test, reference=lgb_train)

print(type(lgb_train))
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print(f'...') appended a stray "None" to the output.
lgb_train.data.info()

In [None]:
# Confirm the Dataset type and preview the first rows of the data it was built from
print(type(lgb_train))
lgb_train.data.head()

In [None]:
# Specify the configurations as a dict
params = {
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': 'true',
    'boosting': 'gbdt',
    'num_leaves': 31,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.05,
    'verbose': 0,
    'device': 'gpu'
}

# train - early stopping is configured through the callbacks API: the
# `early_stopping_rounds` / `verbose_eval` keyword arguments were removed from
# lgb.train() in LightGBM 4.0. No log_evaluation callback is registered, so
# per-iteration metrics are not printed.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=5000,
    valid_sets=[lgb_eval],
    callbacks=[lgb.early_stopping(stopping_rounds=100)],
)

# predict on the hold-out split using the best iteration found by early stopping
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# Compute and print metrics
print(f"AUC : {roc_auc_score(y_test, y_pred)}")

In [None]:
# Feature importance: plot at most 15 features of the trained booster
# (the trailing ';' suppresses the Axes repr in the cell output)
lgb.plot_importance(gbm, max_num_features=15);
plt.show()

In [None]:
# Let's create a function to allow for future quick reviews of the same baseline model. Will allow for easy review of feature engineering and selection processing steps
def base_model(train, dep, params=None):
    """Train the baseline LightGBM binary classifier and print its hold-out AUC.

    The data is split 70/30 (stratified on ``dep``, ``random_state=6``). The
    model is trained for up to 5000 boosting rounds and stops early once the
    validation AUC has not improved for 100 rounds.

    Note that the same 30% hold-out drives early stopping and produces the
    printed AUC, so that score is an optimistic estimate.

    Parameters
    ----------
    train : DataFrame of feature columns.
    dep : target values aligned row-by-row with ``train``.
    params : dict, optional
        LightGBM parameters merged over the baseline configuration, e.g.
        ``{'device': 'cpu'}`` to run without a GPU. ``None`` keeps the
        baseline unchanged.

    Returns
    -------
    The trained ``lgb.Booster``.
    """
    # Baseline configuration shared by all feature-engineering experiments
    model_params = {
        'objective': 'binary',
        'metric': 'auc',
        'is_unbalance': 'true',
        'boosting': 'gbdt',
        'num_leaves': 31,
        'feature_fraction': 0.5,
        'bagging_fraction': 0.5,
        'bagging_freq': 20,
        'learning_rate': 0.05,
        'verbose': 0,
        'device': 'gpu'
    }
    if params:
        model_params.update(params)

    # Stratified hold-out split, reproducible through the fixed random_state
    X_train, X_test, y_train, y_test = train_test_split(
        train, dep, test_size=0.3, random_state=6, stratify=dep
    )

    # Wrap both splits as LightGBM Datasets; the validation set references the training set
    lgb_train = lgb.Dataset(X_train, label=y_train)
    lgb_eval = lgb.Dataset(X_test, label=y_test, reference=lgb_train)

    # Early stopping goes through the callbacks API: the `early_stopping_rounds` /
    # `verbose_eval` keyword arguments were removed from lgb.train() in LightGBM 4.0.
    # No log_evaluation callback is registered, so per-iteration metrics are not printed.
    model = lgb.train(
        model_params,
        lgb_train,
        num_boost_round=5000,
        valid_sets=[lgb_eval],
        callbacks=[lgb.early_stopping(stopping_rounds=100)],
    )

    # predict on the hold-out split using the best iteration found by early stopping
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    # Compute and print metrics
    print(f"AUC : {roc_auc_score(y_test, y_pred)}")
    return model

# Make submission

In [None]:
def submission_sample(model, df_test, model_name):
    """Write a submission CSV holding ``model``'s predictions for ``df_test``.

    The sample-solution file is loaded as a template, its 'claim' column is
    overwritten with ``model.predict(df_test)``, and the result is saved as
    ``submission_<model_name>.csv`` without the index. Returns whatever
    ``DataFrame.to_csv`` returns.
    """
    submission = pd.read_csv('../input/tabular-playground-series-sep-2021/sample_solution.csv')
    predictions = model.predict(df_test)
    submission['claim'] = predictions
    output_path = f'submission_{model_name}.csv'
    return submission.to_csv(output_path, index=False)

## Feature Engineering
***
After creating the initial baseline model we can start to perform some feature engineering steps. With feature engineering we are aiming to see if additional variables can be created that will help to improve the model.
***
1. Binning
    * Create binned values (quantiles, deciles)
2. Feature scaling
    * MinMax scaling
    * Standardization
    * Winsorizing 
3. Statistical transformations
    * Log
    * Polynomials
4. Feature Interactions
    * Use PolynomialFeatures
***
Prior to this feature engineering we can review the missing value replacement assessment.
* Replace with mean / median / mode
* End of tail imputation - works best with normally distributed features

Let's go back to reviewing the Train and Test DataFrames.


In [None]:
# Let's confirm the feature data types: number of columns per dtype in each frame
print(f'Train : \n{train.dtypes.value_counts()}')
print(f'Test : \n{test.dtypes.value_counts()}')

### Review missing value replacement

In [None]:
# Column names of the feature matrix, kept as a plain list for later review
column_names = list(X.columns)

In [None]:
# Create function for the missing value review
def impute_miss_values(df_train, df_test, strategy='mean'):
    """Fill missing values in the train and test frames.

    A ``SimpleImputer`` is fitted on ``df_train`` only and then applied to
    both frames, so the test data never influences the fill values.

    The output frames take their column labels from ``df_train`` and keep the
    row index of their respective input. Previously the labels came from the
    notebook-global ``column_names`` and the row index was reset.

    Parameters
    ----------
    df_train : DataFrame used to fit the imputer.
    df_test : DataFrame with the same columns as ``df_train``.
    strategy : str, default 'mean'
        Passed straight to ``SimpleImputer`` (e.g. 'mean' or 'median').

    Returns
    -------
    (train_imp, test_imp) : tuple of imputed DataFrames.
    """
    # create the imputer, the strategy can be mean and median.
    imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)

    # fit the imputer to the train data only
    imputer.fit(df_train)

    # apply the transformation to the train and test, restoring labels and row index
    train_imp = pd.DataFrame(
        imputer.transform(df_train), columns=df_train.columns, index=df_train.index
    )
    test_imp = pd.DataFrame(
        imputer.transform(df_test), columns=df_train.columns, index=df_test.index
    )
    return train_imp, test_imp

Median value replacement improved the AUC score the most. Let's review end-of-tail imputation as a comparison.

In [None]:
# Update the train and test set to have the missing values as median
# (X and test are rebound to the imputed frames returned by impute_miss_values)
X, test = impute_miss_values(X, test, strategy='median')
# Confirm the model output still aligns to previous versions
lgb_median = base_model(X, dep=train['claim'])

### Binning

* Making use of the quartile binning, none of the variables appear to have added to the most important features
* Appears to be a slight improvement by adding decile values for the features
***
They could be a feature to add in future iterations of the model but they are not adding a lot to the final AUC improvements

### Feature scaling

In [None]:
# Aiming to review the impact of using the scaling features
# MinMaxScaler()
# Create function for the scaling review
# def impute_scaler(df_train, df_test, scaler=MinMaxScaler()):
    
#     # apply the transformation to the train and test
#     train_imp = pd.DataFrame(scaler.fit_transform(df_train), columns=column_names)
#     test_imp = pd.DataFrame(scaler.fit_transform(df_test), columns=column_names)
#     return train_imp, test_imp

In [None]:
# # Update the train and test set with the scaled features
# X, test = impute_scaler(X, test, scaler=StandardScaler())
# # Confirm the model output still aligns to previous versions
# lgb_standard = base_model(X, dep=train['claim'])

In [None]:
# submission_sample(lgb_standard, test, 'lgb_standard')

### Winsorizing

In [None]:
# def impute_winsor(df, prob=0.01):
    
#     # Review each of the columns and apply the clipping
#     for col in df.columns:
#         quant = np.quantile(df[col], [prob, (1-prob)])
#         df.loc[(df[col] <= quant[0]), col] = quant[0]
#         df.loc[(df[col] >= quant[1]), col] = quant[1]
    
#     return df

In [None]:
# X_new = impute_winsor(X)
# test_new = impute_winsor(test)
# # Review the model output
# lgb_winsor = base_model(X_new, dep=train['claim'])
# submission_sample(lgb_winsor, test_new, 'lgb_winsor')

Making use of the clipping didn't really benefit the model

### Polynomial Features

In [None]:
# from sklearn.preprocessing import PolynomialFeatures

# # Interactions between features
# def poly_interactions(df_train, df_test):
    
#     # Set-up the interactions feature
#     interactions = PolynomialFeatures(interaction_only=True)
    
#     # apply the transformation to the train and test
#     train_imp = pd.DataFrame(interactions.fit_transform(df_train), columns=column_names)
#     test_imp = pd.DataFrame(interactions.fit_transform(df_test), columns=column_names)
#     return train_imp, test_imp

# Polynomial Features
# def poly_features(df_train, df_test, num_features=2):
    
#     # Set-up the interactions feature
#     poly = PolynomialFeatures(num_features)
    
#     # apply the transformation to the train and test
#     train_imp = pd.DataFrame(poly.fit_transform(df_train), columns=column_names)
#     test_imp = pd.DataFrame(poly.fit_transform(df_test), columns=column_names)
#     return train_imp, test_imp

In [None]:
# Update the train and test set
# X_pi, test_pi = poly_interactions(X, test)
# # Run the model
# lgb_poly_i = base_model(X_pi, dep=train['claim'])
# submission_sample(lgb_poly_i, test_pi, 'lgb_poly_i')

In [None]:
# Update the train and test set
# X_pf, test_pf = poly_features(X, test)
# # Run the model
# lgb_poly_f = base_model(X_pf, dep=train['claim'])
# submission_sample(lgb_poly_f, test_pf, 'lgb_poly_f')

Doesn't appear to like creating the polynomial features. May have to try in the future with fewer input features.

# Feature Selection
***
Aims to reduce the dimensionality of the dataset
***
1. Remove co-linear features
2. Remove features with large number of missing values
3. Keep important features

In [None]:
# Threshold for removing correlated variables
threshold = 0.9

# Absolute value correlation matrix
corr_matrix = X.corr().abs()

# Upper triangle of correlations (k=1 excludes the diagonal); everything else becomes NaN.
# The mask is built with the builtin bool: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# Select columns with correlations above threshold
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
to_drop

# Remove the columns from the train and test set
# X = X.drop(columns = to_drop)

In [None]:
# Let's review thresholds for removing co-linear features:
# for each candidate threshold, list the columns that have at least one
# upper-triangle correlation above it
threshold_range = np.arange(0, 0.91, 0.025)

to_drop_dict = {}
for thres in threshold_range:
    exceeds = (upper > thres).any(axis=0)
    to_drop_dict[thres] = upper.columns[exceeds.to_numpy()].tolist()

to_drop_dict

In [None]:
# Let's review a heatmap of the upper-triangle absolute correlations
# (the trailing ';' suppresses the Axes repr in the cell output)
sns.heatmap(upper);

It appears that the majority of the features are randomly correlated

In [None]:
# Remove features with zero importance - not working yet: `feature_importance` is referenced below without being called (it needs `()`). Review in future work.
# feature_importances = pd.DataFrame({'feature': list(X.columns), 
#                             'importance': lgb_median.feature_importance})
# # Find the features with zero importance
# zero_features = list(feature_importances[feature_importances['importance'] == 0.0]['feature'])
# print('\nThere are %d features with 0.0 importance' % len(zero_features))

### Hyperparameter tuning

In [None]:
# Randomised hyperparameter search around the baseline LightGBM configuration, evaluated on the same stratified hold-out split as the baseline model
def model_tuning(train, dep, n_iter=100, cv=3):
    """Randomised hyperparameter search for the LightGBM binary classifier.

    The data is split 70/30 (stratified on ``dep``, ``random_state=6``). A
    ``RandomizedSearchCV`` is fitted on the 70% training part, the best
    configuration is refitted on that part, and its AUC on the 30% hold-out
    is printed.

    Candidates are trained with the wrapper's default number of boosting
    rounds; early stopping is not wired into the cross-validation.

    Parameters
    ----------
    train : DataFrame of feature columns.
    dep : target values aligned row-by-row with ``train``.
    n_iter : int, default 100
        Number of parameter settings sampled by the search.
    cv : int, default 3
        Number of cross-validation folds.

    Returns
    -------
    The ``lgb.Booster`` of the refitted best estimator, so the result can be
    used wherever a booster returned by ``lgb.train`` is expected.
    """
    # Stratified hold-out split, reproducible through the fixed random_state
    X_train, X_test, y_train, y_test = train_test_split(
        train, dep, test_size=0.3, random_state=6, stratify=dep
    )

    # RandomizedSearchCV needs a scikit-learn estimator it can clone and fit.
    # A raw lgb.Booster from lgb.train() does not provide that interface, so
    # the search runs on LightGBM's sklearn wrapper with the fixed settings.
    estimator = lgb.LGBMClassifier(
        objective='binary',
        boosting_type='gbdt',
        is_unbalance=True,
        subsample_freq=20,  # sklearn-API name for 'bagging_freq'
        device='gpu',
        verbose=0,
    )

    # Hyperparameter tuning (sklearn-API parameter names)
    parameters = {
        'num_leaves': [20, 40, 60, 80, 100],
        'min_child_samples': [5, 10, 15],
        'max_depth': [-1, 5, 10, 20],
        'learning_rate': [0.05, 0.1, 0.2],
        'reg_alpha': [0, 0.01, 0.03],
        'colsample_bytree': [0.5, 0.6, 0.7],  # alias of 'feature_fraction'
        'subsample': [0.5, 0.6, 0.7],  # alias of 'bagging_fraction'
    }

    # Setup the random grid search
    gs = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=parameters,
        n_iter=n_iter,
        scoring='roc_auc',
        cv=cv,
        refit=True,
        random_state=6,
        verbose=True)

    # The search has to be fitted before best_params_ / best_estimator_ exist
    gs.fit(X_train, y_train)
    print(f"Best params : {gs.best_params_}")
    print(f"Best CV AUC : {gs.best_score_}")

    # predict the positive-class probability on the hold-out split
    y_pred = gs.best_estimator_.predict_proba(X_test)[:, 1]
    # Compute and print metrics
    print(f"AUC : {roc_auc_score(y_test, y_pred)}")
    return gs.best_estimator_.booster_

In [None]:
# Confirm the model output - doesn't seem to be working
# lgb_median_hyper = model_tuning(X, dep=train['claim'])