In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

!pip install AutoViz
!pip install xlrd
from autoviz.AutoViz_Class import AutoViz_Class

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score, log_loss, accuracy_score

from sklearn.preprocessing import QuantileTransformer, StandardScaler, PolynomialFeatures, LabelEncoder
from sklearn.feature_selection import VarianceThreshold, SelectKBest

import lightgbm as lgb
from lightgbm import LGBMClassifier

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

from IPython.display import display

import warnings
warnings.filterwarnings('ignore')


In [None]:
# Read the competition train and test tables; the first CSV column of each
# file is used as the row index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../input/tabular-playground-series-mar-2021/train.csv',
        '../input/tabular-playground-series-mar-2021/test.csv',
    )
)

train.head()

In [None]:
# AutoViz: run the automated exploration straight from the training CSV,
# passing 'target' as the dependent variable. The returned object is kept
# in `dft`.
AV = AutoViz_Class()
dft = AV.AutoViz(
    '../input/tabular-playground-series-mar-2021/train.csv',
    depVar='target',
)

In [None]:
# Label-encode the categorical (object-dtype) columns.
#
# Train and test are stacked first so that each column's encoder is fitted on
# every category occurring in either file. ignore_index=True gives the stacked
# frame a fresh 0..n-1 index.
alldata = pd.concat([train, test], axis=0, ignore_index=True)
lentrain = len(train)

label = LabelEncoder()

catcols = train.select_dtypes(include=['object']).columns.tolist()

# Positions of the categorical columns inside the stacked frame. The earlier
# `catcols.index(i)` looked every name up in `catcols` itself, so it always
# yielded 0..len(catcols)-1 regardless of where the columns actually sit.
catindices = [alldata.columns.get_loc(col) for col in catcols]

# 'target' goes through the encoder as well.
# NOTE(review): presumably the test file has no 'target' column, in which case
# the test rows hold NaN here and receive their own encoded class -- confirm
# that nothing downstream relies on the test rows' target values.
catcols.append('target')
for col in catcols:
    # fit_transform refits the shared encoder from scratch for each column.
    alldata[col] = label.fit_transform(alldata[col])

In [None]:
# Undo the train/test stacking: the first `lentrain` rows are the training
# rows, everything after them came from the test file.
train_preprocessed = alldata.iloc[:lentrain]
test_preprocessed = alldata.iloc[lentrain:]

# The label is taken to be the last column; all columns before it are features.
target = train_preprocessed.columns[-1]
predictors = train_preprocessed.columns[:-1]

X, y = train_preprocessed[predictors], train_preprocessed[target]
X_test, y_test = test_preprocessed[predictors], test_preprocessed[target]

In [None]:
# Search / validation budget.
N_SPLITS = 5        # number of stratified CV folds
N_TRIALS = 5        # trial budget for an n_trials-based study.optimize call
TIME = 1.5 * 3600   # 5400.0 seconds, used as the study.optimize timeout

# LightGBM settings that are not tuned; they are merged into every trial's
# parameter dict and into the final parameter set.
FIXED_PARAMS = dict(
    n_estimators=10000,
    learning_rate=0.05,
    metric='auc',
    verbosity=-1,
    n_jobs=-1,
)

In [None]:
# Sanity check after encoding: print the column dtypes and non-null counts
# of the training slice.
train_preprocessed.info()

In [None]:
# A fixed seed makes every cv.split(X, y) call return the same folds, so all
# trials are scored on identical splits and their mean AUCs are comparable.
skfold = StratifiedKFold(N_SPLITS, shuffle=True, random_state=42)


def objective(trial, cv=skfold):
    """Optuna objective: mean validation AUC of LightGBM over the CV folds.

    Samples one set of hyperparameters from `trial`, merges in FIXED_PARAMS,
    trains a booster on each fold with early stopping and scores it on that
    fold's held-out part.

    Reads the module-level `X`, `y` and `FIXED_PARAMS`.

    Args:
        trial: Optuna trial used to sample the hyperparameters and to report
            intermediate values for pruning.
        cv: Splitter exposing `split(X, y)`; defaults to the module-level
            stratified k-fold.

    Returns:
        The mean of the per-fold validation ROC AUC scores.
    """
    # NOTE(review): per the LightGBM parameter docs, `subsample` only takes
    # effect together with a non-zero bagging frequency, and `cat_smooth` /
    # `cat_l2` only apply to features treated as categorical. Neither a
    # bagging frequency nor categorical features are configured here --
    # confirm whether these three should stay in the search space.
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024),
        'max_depth': trial.suggest_int('max_depth', 2, 32),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.001, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 10),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.9),
        'subsample': trial.suggest_float('subsample', 0.01, 0.9),
        'cat_smooth': trial.suggest_float('cat_smooth', 10, 100.0),
        'cat_l2': trial.suggest_int('cat_l2', 1, 20),
    }
    params.update(FIXED_PARAMS)

    # lgb.train falls back to LightGBM's default objective (regression) when
    # none is given, whereas the final LGBMClassifier optimises a binary
    # objective. Tune under the objective that will actually be used;
    # setdefault keeps any objective explicitly placed in FIXED_PARAMS.
    params.setdefault('objective', 'binary')

    # 'valid_1' is the name lgb.train gives the second entry of valid_sets,
    # i.e. the held-out fold.
    # NOTE(review): one callback instance is shared by all folds, so the
    # boosting-round step numbers repeat from fold to fold -- confirm how the
    # installed Optuna version treats values reported for a repeated step.
    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, 'auc', valid_name='valid_1')

    auclist = []
    for train_idx, val_idx in cv.split(X, y):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        d_train = lgb.Dataset(X_train, label=y_train)
        d_valid = lgb.Dataset(X_val, label=y_val)

        model = lgb.train(params,
                          train_set=d_train,
                          valid_sets=[d_train, d_valid],
                          verbose_eval=0,
                          early_stopping_rounds=100,
                          callbacks=[pruning_callback])

        preds = model.predict(X_val)
        auclist.append(roc_auc_score(y_val, preds))

    return np.mean(auclist)

In [None]:
# Maximise the mean CV AUC returned by `objective`. The median pruner is
# configured with 25 warm-up steps.
study = optuna.create_study(
    study_name='lgbm_parameter_opt',
    direction='maximize',
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=25),
)

# The search is bounded by wall-clock time (TIME seconds) rather than by a
# trial count.
study.optimize(objective, timeout=TIME, show_progress_bar=True)

trial = study.best_trial

print(f"  Value: {trial.value}")
print("  Params: ")
for param_name, param_value in trial.params.items():
    print(f"    {param_name}: {param_value}")

# Fixed settings first, then the tuned values on top of them.
best_params = {**FIXED_PARAMS, **trial.params}

In [None]:
# import joblib
# joblib.dump(study, 'study.pkl')

In [None]:
# Show the merged parameter set (FIXED_PARAMS plus the best trial's values).
best_params

In [None]:
# Best objective value found by the study (the objective returns mean CV AUC).
study.best_value

In [None]:
# Plot the study's optimization history.
plot_optimization_history(study)

In [None]:
# Plot the hyperparameter importances computed from the study's trials.
plot_param_importances(study)

In [None]:
# Classifier configured with the fixed settings plus the best trial's
# hyperparameters.
final_model = LGBMClassifier(**best_params)

In [None]:
# Cross-validate the tuned classifier and collect one set of test-set
# probabilities per fold (averaged later for the submission).
test_preds = []
accuracies = []
aucs = []

# The fold count is N_SPLITS; N_TRIALS is the Optuna trial budget and only
# happened to hold the same value. The seed keeps the reported CV numbers
# reproducible between runs.
skfold = StratifiedKFold(N_SPLITS, shuffle=True, random_state=42)

for kfold, (train_idx, val_idx) in enumerate(skfold.split(X, y)):
    # split() yields positional indices, so use .iloc consistently instead of
    # relying on the frame's labels matching row positions.
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # Stop on the held-out fold exactly as the tuning objective does, rather
    # than always growing the full n_estimators upper bound. The keyword
    # arguments match the LightGBM API generation already used by the
    # lgb.train call in `objective`.
    final_model.fit(X_train, y_train,
                    eval_set=[(X_val, y_val)],
                    early_stopping_rounds=100,
                    verbose=False)
    print('Fitted {}'.format(type(final_model).__name__))

    preds = final_model.predict(X_val)
    probs = final_model.predict_proba(X_val)[:, 1]

    accuracy = accuracy_score(y_val, preds)
    accuracies.append(accuracy)
    print('Fold: {}\t Validation Accuracy: {}\n'.format(kfold, accuracy))

    auc = roc_auc_score(y_val, probs)
    aucs.append(auc)

    print('Fold: {}\t Validation AUC: {}\n'.format(kfold, auc))

    # Keep the full (n_rows, n_classes) probability array for this fold.
    test_preds.append(final_model.predict_proba(X_test))

print("Best Parameters mean Accuracy: {}".format(np.mean(accuracies)))
print("Best Parameters mean AucScore: {}".format(np.mean(aucs)))

In [None]:
# Average the per-fold probability arrays; column 1 is the probability of the
# second class returned by predict_proba.
test_predictions = np.mean(test_preds, axis=0)

# Only the ids are needed from the raw file. They are read under their own
# name so that `test` (loaded earlier with the id as its index) is not rebound
# to a differently shaped frame, which would break re-running earlier cells.
test_ids = pd.read_csv('../input/tabular-playground-series-mar-2021/test.csv',
                       usecols=['id'])['id']
assert len(test_ids) == len(test_predictions), "id / prediction row count mismatch"

# 'id' is written first, then 'target'.
# NOTE(review): presumably this matches the competition's sample submission
# layout -- confirm.
predictions_df = pd.DataFrame({'id': test_ids,
                               'target': test_predictions[:, 1]})

predictions_df.to_csv("TPS_MAR_optuna_pruning.csv", index = False)