## Importing important libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from numbers import Real
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing
from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection
from sklearn import decomposition
from sklearn import preprocessing
from functools import partial
import optuna
from skopt import space
from skopt import gp_minimize 
from hyperopt import hp, fmin, tpe, Trials
from hyperopt.pyll.base import scope
from lightgbm import LGBMClassifier
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

## Reading the train and test datasets

In [None]:
# Load the three competition CSVs in one pass: the training rows, the test
# rows and the sample submission file.
train, X_test, sample_solution = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-oct-2021/train.csv',
        "../input/tabular-playground-series-oct-2021/test.csv",
        "../input/tabular-playground-series-oct-2021/sample_submission.csv",
    )
)

In [None]:
# Preview the first five rows of the training frame.
train.head(n=5)

In [None]:
# Histogram of the label column to see how the target values are distributed.
train['target'].hist()

The histogram of the target column above shows that this is a classification problem, so a classification model is suitable for making the predictions.

In [None]:
# Preview the first five rows of the test frame.
X_test.head(n=5)

## Checking Null values, if any, in both training and testing data

In [None]:
# Count missing values per column in the training frame.
train.isna().sum()

In [None]:
# Count missing values per column in the test frame.
X_test.isna().sum()

There are no null values in either the training data or the testing data.

In [None]:
# Show (rows, columns) for the training frame and then the test frame, one per line.
print(train.shape, X_test.shape, sep='\n')

## Dividing dependent and independent variables and adding new features
It can be seen that columns like "id" are unique for every row, and hence won't contribute to our predictions. Therefore, they must be removed from both the training and testing datasets.

In [None]:
# Pull the label out of the training frame (pop removes the column in place
# and returns it), then drop the `id` column from both frames.
y = train.pop('target')
del train['id'], X_test['id']

# What remains of `train` is the feature matrix; rebind it as X and drop the
# old name so only X refers to it.
X = train
del train

In [None]:
# Show (rows, columns) for the feature matrix and then the test frame, one per line.
print(X.shape, X_test.shape, sep='\n')

In [None]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

## Hyperparameter Tuning begins here
### Important note
The following cell can be uncommented to enable the hyperparameter tuning process, which uses the hyperopt library. It defines the objective function that the search minimizes.

In [None]:
"""def optimize(params, x, y):
    
    model = LGBMClassifier(**params)
    kf = model_selection.StratifiedKFold(n_splits=5)
    accuracies = []
    for idx in kf.split(X=x, y = y):
        train_idx, test_idx = idx[0], idx[1]
      
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]

        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

       
        model.fit(X_train, y_train, eval_set = [(X_test, y_test)], 
                  early_stopping_rounds = 300, verbose = False)
        
        preds = model.predict_proba(X_test)[:,1]
        fold_acc = metrics.roc_auc_score(y_test, preds)
        accuracies.append(fold_acc)
        
    return -1.0*np.mean(accuracies)"""

### Important note
The following cell can be uncommented to run the hyperparameter tuning process: it defines the parameter search space for the model and then runs hyperopt's `fmin` function over it.

In [None]:
"""param_space = {
        "max_depth": scope.int(hp.quniform("max_depth", 3, 18, 1)),
        "n_estimators": scope.int(hp.quniform("n_estimators", 100, 600, 1)),
        'min_child_weight' : scope.int(hp.quniform('min_child_weight', 0, 10, 1)),
        'reg_alpha' : scope.int(hp.quniform('reg_alpha', 40,180,1)),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        
        'random_state':42,
    }
        
optimization_function = partial(optimize,  x = X, y = y)

trials = Trials()

result = fmin(
        fn = optimization_function,
        space = param_space,
        algo = tpe.suggest, 
        max_evals = 15,
        trials = trials, 
    )
print(result)"""


### Results obtained from hyperparameter tuning using hyperopt, with a score of 0.854 (the tuning objective is the mean ROC AUC across folds, not classification accuracy), are given below:
{'colsample_bytree': 0.8637743391307819, 
 'max_depth': 16.0, 'min_child_weight': 6.0, 
 'n_estimators': 576.0, 'reg_alpha': 78.0, 
 'reg_lambda': 0.7199776533647606}

## Performing 5 fold cross validation using LGBMClassifier

In [None]:
# LightGBM settings: the tuned values reported by the hyperopt search, plus
# fixed options (seed, parallelism, eval metric, silent logging).
params_lgbm = dict(
    boosting_type='gbdt',
    max_depth=16,
    n_estimators=576,
    min_child_weight=6,
    reg_alpha=78.0,
    reg_lambda=0.7199,
    colsample_bytree=0.863,
    random_state=42,
    n_jobs=-1,
    metric='AUC',
    verbosity=-1,
)

# Shuffled, seeded 5-fold stratified split.
folds = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = []                        # validation ROC AUC of each fold
y_pred = np.zeros(len(X_test))     # running average of test-set probabilities

for fold, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # A fresh model per fold, early-stopped on that fold's validation split.
    # NOTE(review): passing `early_stopping_rounds` / `verbose` directly to
    # fit() depends on the installed LightGBM version (newer releases expect
    # callbacks instead) — confirm against the environment in use.
    model = LGBMClassifier(**params_lgbm)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=300,
        verbose=False,
    )

    # Score the fold on the positive-class probability.
    final_preds = model.predict_proba(X_val)[:, 1]
    fold_score = metrics.roc_auc_score(y_val, final_preds)
    scores.append(fold_score)

    # Each fold contributes 1/n_splits of its test prediction, so y_pred ends
    # up as the mean over all folds.
    y_pred += model.predict_proba(X_test)[:, 1] / folds.n_splits

print(scores)

### Final Submission

In [None]:
# Preview the first five rows of the sample submission frame.
sample_solution.head(n=5)

In [None]:
# Fill the `target` column with the fold-averaged test predictions and write
# the submission file without the index column.
sample_solution = sample_solution.assign(target=y_pred)
sample_solution.to_csv('Submission.csv', index=False)
