In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# standard library: timing the search and pretty-printing the results
import pprint
from time import time

# training the LightGBM model
import lightgbm as lgb
from lightgbm import LGBMClassifier

# evaluation metrics
from sklearn.metrics import get_scorer
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score

# cross-validation splitter used for model selection
from sklearn.model_selection import StratifiedKFold

# Skopt functions
from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, DeltaYStopper
from skopt.space import Real, Categorical, Integer

In [None]:
# Load the competition's train and test tables into pandas DataFrames
train_path = '/kaggle/input/tabular-playground-series-may-2022/train.csv'
test_path = '/kaggle/input/tabular-playground-series-may-2022/test.csv'

train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

In [None]:
# Preview the first five rows of the training frame
train_data.head(n=5)

In [None]:
# Show the column labels of the training frame
train_data.columns

In [None]:
# Show the dtype of every column in the training frame
train_data.dtypes

In [None]:
# Count how many rows fall into each class of the 'target' column
train_data.loc[:, 'target'].value_counts()

In [None]:
# Total number of missing cells across the whole training frame
train_data.isna().sum().sum()

In [None]:
# Expand the string column 'f_27' into categorical, integer-coded and count features
def convert_f27_numeric(df):
    """Derive model-ready features from the letter string in column ``f_27``.

    For each of the first 10 character positions ``i`` two columns are added:
    ``p_{i}`` (pandas Categorical over the letters A..T) and ``encode_col_{i}``
    (the letter mapped to 0..19). For every letter A..T a ``Count_{letter}``
    column holds how many times that letter occurs in the string.

    The input frame is not modified; a new frame with the extra columns is
    returned. The ``f_27`` column itself is kept so callers can drop it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a string column named ``f_27``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the ``p_*``, ``encode_col_*`` and ``Count_*``
        columns appended, in that order.
    """
    n_positions = 10
    letters = [chr(code) for code in range(ord("A"), ord("T") + 1)]
    letter_codes = {letter: code for code, letter in enumerate(letters)}

    # Work on a copy so the caller's frame (possibly already displayed in an
    # earlier cell) is never silently changed.
    out = df.copy()
    text = out["f_27"]

    # Vectorised per-position character extraction; .str.get yields NaN rather
    # than raising when a value is missing or shorter than expected.
    chars_by_position = [text.str.get(pos) for pos in range(n_positions)]

    # Fixed category list keeps the categorical codes identical for train and test.
    for pos, chars in enumerate(chars_by_position):
        out[f"p_{pos}"] = pd.Categorical(chars, categories=letters)

    for pos, chars in enumerate(chars_by_position):
        out[f"encode_col_{pos}"] = chars.map(letter_codes)

    for letter in letters:
        out[f"Count_{letter}"] = text.str.count(letter)

    return out

In [None]:
# Expand 'f_27' into numeric features on the training frame
train_data = train_data.pipe(convert_f27_numeric)

# Split into label vector y and feature matrix X
# NOTE(review): only 'target' and the raw 'f_27' string are removed here, so any
# row-identifier column present in the CSV stays in X as a feature — confirm this is intended.
y = train_data['target']
X = train_data.drop(columns=['target', 'f_27'])

In [None]:
# wrapper for measuring time and performance of different optimizers
def performance_report(optimizer, X, y, title="model", callbacks=None):
    """Fit a search optimizer, print a short summary and return its best parameters.

    Parameters
    ----------
    optimizer : object
        Search object exposing ``fit``, ``cv_results_``, ``best_score_``,
        ``best_index_`` and ``best_params_``.
    X, y :
        Training features and labels, passed straight to ``optimizer.fit``.
    title : str, default "model"
        Label printed at the start of the summary line.
    callbacks : optional
        When not None, forwarded to ``optimizer.fit`` as ``callback=``.

    Returns
    -------
    The optimizer's ``best_params_``.
    """
    start = time()
    if callbacks is not None:
        optimizer.fit(X, y, callback=callbacks)
    else:
        optimizer.fit(X, y)
    # Stop the clock right after fitting so the reported time covers the search only.
    elapsed = time() - start

    cv_results = optimizer.cv_results_
    best_score = optimizer.best_score_
    best_score_std = cv_results['std_test_score'][optimizer.best_index_]
    best_params = optimizer.best_params_
    n_candidates = len(cv_results['params'])

    # f-string instead of %-formatting: the title is inserted as data, so a
    # title containing '%' can no longer corrupt the format string.
    print(f"{title} took {elapsed:.2f} seconds,  candidates checked: {n_candidates}, "
          f"best CV score: {best_score:.3f} \u00B1 {best_score_std:.3f}")
    print('Best parameters:')
    pprint.pprint(best_params)
    print()
    return best_params

In [None]:
# ROC AUC scorer used for model selection.
# Taken from scikit-learn's scorer registry rather than built with
# make_scorer(..., needs_threshold=True): `needs_threshold` was deprecated in
# favour of `response_method`, while the registered "roc_auc" scorer is
# available regardless of which of the two spellings the installed version uses.
roc_auc = get_scorer("roc_auc")

In [None]:
# Base LGBMClassifier whose remaining hyperparameters are tuned by the search.
# `boosting_type` is the estimator's own parameter name (the final model below
# uses it as well); the alias `boosting` is avoided so that only one spelling
# of the setting ends up in the estimator's parameters.
clf_model = LGBMClassifier(
    objective='binary',
    metric='auc',
    boosting_type='gbdt',
    device='cpu',
    n_jobs=-1,
    verbose=-1,
    random_state=0)

In [None]:
# Hyperparameter search space: one skopt dimension per tunable LightGBM parameter
search_spaces = dict(
    learning_rate=Real(0.01, 1.0, prior='log-uniform'),      # boosting learning rate
    n_estimators=Integer(30, 5000),                          # number of boosted trees to fit
    num_leaves=Integer(2, 512),                              # maximum tree leaves for base learners
    max_depth=Integer(-1, 256),                              # maximum tree depth, <=0 means no limit
    min_child_samples=Integer(1, 256),                       # minimal number of data points in one leaf
    max_bin=Integer(100, 1000),                              # max number of bins feature values are bucketed into
    subsample=Real(0.01, 1.0, prior='uniform'),              # row subsample ratio of the training instances
    subsample_freq=Integer(0, 10),                           # frequency of subsampling, <=0 disables it
    colsample_bytree=Real(0.01, 1.0, prior='uniform'),       # column subsample ratio per tree
    min_child_weight=Real(0.01, 10.0, prior='uniform'),      # minimum sum of instance weight (hessian) in a leaf
    reg_lambda=Real(1e-9, 100.0, prior='log-uniform'),       # L2 regularization
    reg_alpha=Real(1e-9, 100.0, prior='log-uniform'),        # L1 regularization
    scale_pos_weight=Real(1.0, 500.0, prior='uniform'),      # weight of the positive class (binary classification only)
)

In [None]:
# Stratified, shuffled 5-fold splitter with a fixed seed, used as the CV scheme of the search
skf = StratifiedKFold(
    n_splits=5,
    random_state=0,
    shuffle=True,
)

In [None]:
# Bayesian search (BayesSearchCV) for tuning the hyperparameters.
# The `iid` argument is intentionally not passed: it is deprecated in
# scikit-optimize (ignored with a warning, and liable to be rejected once removed).
opt = BayesSearchCV(estimator=clf_model,
                    search_spaces=search_spaces,
                    scoring=roc_auc,
                    cv=skf,
                    n_iter=3000,                                      # max number of trials
                    n_points=3,                                       # number of hyperparameter sets evaluated at the same time
                    n_jobs=-1,                                        # number of jobs
                    return_train_score=False,
                    refit=False,
                    optimizer_kwargs={'base_estimator': 'GP'},        # optimizer parameters: we use a Gaussian Process (GP)
                    random_state=0)

In [None]:
# Early-stopping callbacks for the search:
#   - stop once the gain of the optimization becomes too small
#   - stop once the time budget (60 minutes, in seconds) is used up
overdone_control = DeltaYStopper(delta=1e-4)
time_limit_control = DeadlineStopper(total_time=3600)

search_callbacks = [overdone_control, time_limit_control]
best_params = performance_report(opt, X, y, title='LightGBM', callbacks=search_callbacks)

In [None]:
# Build the final classifier: fixed settings plus the tuned hyperparameters
fixed_params = dict(
    device='cpu',
    boosting_type='gbdt',
    metric='auc',
    objective='binary',
    n_jobs=1,
    verbose=-1,
    random_state=0,
)
model = LGBMClassifier(**fixed_params, **best_params)

In [None]:
# Fit the final model on the full training set
model.fit(X=X, y=y)

In [None]:
# Apply the same 'f_27' feature expansion to the test frame
test_data = test_data.pipe(convert_f27_numeric)

In [None]:
# Remove the raw 'f_27' string column now that its derived features exist
test_data = test_data.drop(columns='f_27')

In [None]:
# Predict the positive-class probability for every test row.
# The test frame is indexed by the training feature columns so the model
# always receives the same columns, in the same order, as it was fitted on.
predicted_data = model.predict_proba(test_data[X.columns])[:, 1]

In [None]:
# Assemble the submission table (row id + predicted probability) and write it to disk
submission_df = pd.DataFrame({
    'id': test_data['id'],
    'target': predicted_data,
})
submission_df.to_csv('submission.csv', index=False)