In [None]:
from datetime import datetime
import numpy as np
import pandas as pd

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.utils import shuffle

import optuna
import seaborn as sns

from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import RobustScaler

In [None]:
# Run configuration.
# `datenow` is a day-month-year_hour-minute-second stamp that is embedded in
# every output file name written by this notebook.
datenow = format(datetime.now(), '%d%m%Y_%H%M%S')

# Label embedded in output file names and checked before the final fits.
# The alternative label used during experimentation was 'lgbm'.
modelname = 'xgboost'

In [None]:
# Training rows (indexed by `id`); later cells read a 'Fold' column and a
# 'claim' column from this frame.
train_df = pd.read_csv(
    '../input/sep2021-tps-stratifiedkfold/StratifiedKFold_SEP2021_TPS.csv',
    index_col='id',
)

# Competition test rows, indexed by `id` as well.
test_df = pd.read_csv(
    '../input/tabular-playground-series-sep-2021/test.csv',
    index_col='id',
)

# Submission template; kept with its default integer index so predictions can
# be written into it positionally later on.
submission_df = pd.read_csv(
    '../input/tabular-playground-series-sep-2021/sample_solution.csv'
)

In [None]:
# Predictor columns: every float64 column of the training frame except the
# fold assignment.  `list.remove` raises ValueError if 'Fold' is not among
# the float64 columns.
features = list(train_df.select_dtypes(include='float64').columns)
features.remove('Fold')

# Target is kept as a one-element list so that selecting it yields a DataFrame.
target = ['claim']

In [None]:
# Row-level summary features, computed for both frames from the original
# predictor columns only.
#
# The derived names are stripped from `features` first so that re-executing
# this cell neither appends duplicate names to `features` nor lets the derived
# columns leak into the row-wise standard deviation.  On a fresh kernel the
# result is the same as simply appending the two names.
derived_features = ['n_missing', 'std']
features = [col for col in features if col not in derived_features]

for frame in (train_df, test_df):
    # Both statistics are taken across the predictor columns of each row.
    missing_per_row = frame[features].isna().sum(axis=1)
    spread_per_row = frame[features].std(axis=1)
    frame['n_missing'] = missing_per_row
    frame['std'] = spread_per_row

# From here on the derived columns are treated as ordinary predictors.
features += derived_features

In [None]:
# Mean imputation of missing predictor values.  The column means are learned
# from the training frame only and then applied to both frames.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(train_df[features])

train_df[features] = imputer.transform(train_df[features])
test_df[features] = imputer.transform(test_df[features])

In [None]:
# Scale the predictors with a RobustScaler whose statistics come from the
# training frame only; the same fitted scaler is applied to the test frame.
rsk = RobustScaler()
rsk.fit(train_df[features])

train_df[features] = rsk.transform(train_df[features])
test_df[features] = rsk.transform(test_df[features])

In [None]:
def get_nth_fold(n=0):
    """Split the global ``train_df`` into training and validation parts for fold ``n``.

    Rows whose 'Fold' value equals ``n`` form the validation part; all other
    rows form the training part.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val)`` where the ``X`` frames hold the
        columns listed in the global ``features`` and the ``y`` frames hold the
        columns listed in the global ``target``.
    """
    held_out = train_df['Fold'] == n

    fit_rows = train_df.loc[~held_out]
    val_rows = train_df.loc[held_out]

    return (
        fit_rows[features],
        fit_rows[target],
        val_rows[features],
        val_rows[target],
    )


In [None]:
def fit_n_folds(model,n=10,optimize=False):
    """Refit ``model`` on folds ``0 .. n-1`` and score each fold by validation ROC AUC.

    For every fold the split comes from ``get_nth_fold``; the model is fitted
    on the training part with early stopping (50 rounds, 'auc') monitored on
    the validation part, and the validation AUC is recorded.

    Parameters
    ----------
    model : estimator
        Must accept ``fit(X, y, eval_set=..., eval_metric=...,
        early_stopping_rounds=..., verbose=...)`` and provide ``predict_proba``.
        The same instance is refitted for every fold.
    n : int, default 10
        Number of folds to process, starting at fold 0.
    optimize : bool, default False
        If True, run silently and return only the mean validation AUC (used as
        the hyper-parameter search objective).  Training-set scoring, test-set
        prediction and file output are skipped in this mode.

    Returns
    -------
    float or list
        ``optimize=True``: mean validation AUC over the processed folds.
        ``optimize=False``: list with one array of test-set probabilities per
        fold; additionally the out-of-fold predictions are written to
        ``oof_{modelname}_{datenow}.csv`` and per-fold / average AUCs are printed.
    """
    val_auc_scores = []
    test_preds = []
    final_valid_predictions = {}

    for fold in range(n):
        if not optimize:
            print(f'** Processing Fold {fold} ***')

        X_train, y_train, X_val, y_val = get_nth_fold(n=fold)
        y_train_flat = y_train.values.ravel()
        y_val_flat = y_val.values.ravel()

        # NOTE(review): passing eval_metric / early_stopping_rounds to fit() is
        # the older XGBoost sklearn-API convention; newer releases expect them
        # on the estimator constructor — confirm against the installed version.
        model.fit(
            X_train,
            y_train_flat,
            eval_set=[(X_val, y_val_flat)],
            eval_metric='auc',
            early_stopping_rounds=50,
            verbose=False,
        )

        y_val_pred = model.predict_proba(X_val)[:, 1]
        val_auc_score = roc_auc_score(y_val_flat, y_val_pred)
        val_auc_scores.append(val_auc_score)

        if optimize:
            # Only the validation score is needed by the search objective, so
            # skip scoring the (much larger) training part and the test set.
            continue

        # Out-of-fold predictions keyed by the validation rows' index labels.
        final_valid_predictions.update(zip(y_val.index.values, y_val_pred))

        train_auc_score = roc_auc_score(y_train_flat, model.predict_proba(X_train)[:, 1])

        test_pred = model.predict_proba(test_df[features])[:, 1]
        test_preds.append(test_pred)
        print(f'Fold {fold} Train AUC - {train_auc_score},Val AUC - {val_auc_score}')

    if optimize:
        return np.mean(val_auc_scores)

    oof_frame = pd.DataFrame.from_dict(final_valid_predictions, orient='index').reset_index()
    oof_frame.to_csv(f'oof_{modelname}_{datenow}.csv', index=False)
    print(f'Average Val AUC across folds - {np.mean(val_auc_scores)} std - {np.std(val_auc_scores)}')
    return test_preds

In [None]:
def objective(trial):
    """Optuna objective: validation AUC of an XGBoost model on fold 0.

    Samples a hyper-parameter set from ``trial``, builds an ``XGBClassifier``
    from it and returns the score of ``fit_n_folds(model, n=1, optimize=True)``.
    """
    def pinned(name, value):
        # Single-choice categorical: the value never varies, but it is still
        # requested through the trial like every other parameter.
        return trial.suggest_categorical(name, [value])

    # Requests are issued in a fixed order; searched and pinned parameters are
    # interleaved exactly as the model keyword arguments are listed.
    sampled = {}
    sampled['n_estimators'] = pinned('n_estimators', 10000)
    sampled['learning_rate'] = trial.suggest_float('learning_rate', 1e-3, 5e-1, log=True)
    sampled['max_depth'] = trial.suggest_int('max_depth', 3, 12)
    sampled['colsample_bytree'] = trial.suggest_float('colsample_bytree', 0.2, 0.99, log=True)
    sampled['subsample'] = trial.suggest_float('subsample', 0.2, 0.99, log=True)
    sampled['eval_metric'] = pinned('eval_metric', 'auc')
    sampled['use_label_encoder'] = pinned('use_label_encoder', False)
    sampled['gamma'] = trial.suggest_categorical('gamma', [0, 0.25, 0.5, 1.0])
    sampled['reg_lambda'] = trial.suggest_categorical('reg_lambda', [0.1, 1.0, 5.0, 10.0, 50.0, 100.0])
    sampled['tree_method'] = pinned('tree_method', 'gpu_hist')
    sampled['gpu_id'] = pinned('gpu_id', 0)
    sampled['predictor'] = pinned('predictor', 'gpu_predictor')
    sampled['random_state'] = pinned('random_state', 42)

    candidate = XGBClassifier(**sampled)

    # Only fold 0 is evaluated per trial.
    return fit_n_folds(candidate, n=1, optimize=True)

In [None]:
# Hyper-parameter search: 100 trials maximising the value returned by
# `objective`, then report the best trial found.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

print('**BEST TRIAL**', study.best_trial, sep='\n')

In [None]:
# Final fit with the best hyper-parameters found by the study: train on all
# 10 folds, average the per-fold test probabilities and write the submission.
if modelname == 'xgboost':
    params = study.best_trial.params
    model = XGBClassifier(**params)

    preds = fit_n_folds(model, n=10, optimize=False)

    # Every column after the first of the template receives the fold-averaged
    # probabilities (written positionally).
    submission_df.iloc[:, 1:] = np.stack(preds, axis=0).mean(axis=0)
    submission_df.to_csv(f'submission_{modelname}_{datenow}.csv', index=False)

In [None]:
# Distribution of the predicted claim probabilities in the submission frame.
sns.histplot(data=submission_df, x='claim');

In [None]:
# Pseudo-labelling: test rows whose predicted probability is very low (< 0.10)
# or very high (> 0.80) are kept and given a hard label of 0 or 1.
#
# The masks are computed once from the submission frame.  They are matched to
# the test rows by index label, which is why the test frame's `id` index is
# reset to a default integer index first.
confident_negative = submission_df['claim'] < 0.10
confident_positive = submission_df['claim'] > 0.80
confident_any = confident_negative | confident_positive

test_df_copy = test_df.reset_index().copy()

# Explicit copy: the label column is added to an independent frame rather than
# to a slice of `test_df_copy`, which avoids pandas' SettingWithCopy warning.
test_df_pseudo = test_df_copy.loc[confident_any, features].copy()

# Every kept row is either confidently positive (1.0) or confidently negative
# (0.0); the two thresholds cannot both hold.  Stored as float64.
test_df_pseudo['claim'] = confident_positive.loc[test_df_pseudo.index].astype('float64')

test_df_pseudo = shuffle(test_df_pseudo, random_state=42)

In [None]:
# How many pseudo-labelled rows fall into each class.
test_df_pseudo.loc[:, 'claim'].value_counts()

In [None]:
def fit_n_folds_pseudo(model,n=10,optimize=False):
    """Like ``fit_n_folds``, but with the pseudo-labelled test rows added to every training part.

    For every fold the split comes from ``get_nth_fold``; the rows of the
    global ``test_df_pseudo`` are appended to the training part only, so the
    validation part consists of original training rows.  The model is fitted
    with early stopping (50 rounds, 'auc') monitored on the validation part.

    Parameters
    ----------
    model : estimator
        Must accept ``fit(X, y, eval_set=..., eval_metric=...,
        early_stopping_rounds=..., verbose=...)`` and provide ``predict_proba``.
        The same instance is refitted for every fold.
    n : int, default 10
        Number of folds to process, starting at fold 0.
    optimize : bool, default False
        If True, run silently and return only the mean validation AUC.
        Training-set scoring, test-set prediction and file output are skipped
        in this mode.

    Returns
    -------
    float or list
        ``optimize=True``: mean validation AUC over the processed folds.
        ``optimize=False``: list with one array of test-set probabilities per
        fold; additionally the out-of-fold predictions are written to
        ``oof_pseudo_{modelname}_{datenow}.csv`` and per-fold / average AUCs
        are printed.  The printed train AUC includes the pseudo-labelled rows.
    """
    val_auc_scores = []
    test_preds = []
    final_valid_predictions = {}

    # The pseudo-labelled rows are the same for every fold; select them once.
    pseudo_X = test_df_pseudo[features]
    pseudo_y = test_df_pseudo[target]

    for fold in range(n):
        if not optimize:
            print(f'** Processing Fold {fold} ***')

        X_train, y_train, X_val, y_val = get_nth_fold(n=fold)

        X_train = pd.concat([X_train, pseudo_X], axis=0)
        y_train = pd.concat([y_train, pseudo_y], axis=0)

        y_train_flat = y_train.values.ravel()
        y_val_flat = y_val.values.ravel()

        # NOTE(review): passing eval_metric / early_stopping_rounds to fit() is
        # the older XGBoost sklearn-API convention; newer releases expect them
        # on the estimator constructor — confirm against the installed version.
        model.fit(
            X_train,
            y_train_flat,
            eval_set=[(X_val, y_val_flat)],
            eval_metric='auc',
            early_stopping_rounds=50,
            verbose=False,
        )

        y_val_pred = model.predict_proba(X_val)[:, 1]
        val_auc_score = roc_auc_score(y_val_flat, y_val_pred)
        val_auc_scores.append(val_auc_score)

        if optimize:
            # Only the validation score is needed by the search objective, so
            # skip scoring the (much larger) training part and the test set.
            continue

        # Out-of-fold predictions keyed by the validation rows' index labels.
        final_valid_predictions.update(zip(y_val.index.values, y_val_pred))

        train_auc_score = roc_auc_score(y_train_flat, model.predict_proba(X_train)[:, 1])

        test_pred = model.predict_proba(test_df[features])[:, 1]
        test_preds.append(test_pred)
        print(f'Fold {fold} Train AUC - {train_auc_score},Val AUC - {val_auc_score}')

    if optimize:
        return np.mean(val_auc_scores)

    oof_frame = pd.DataFrame.from_dict(final_valid_predictions, orient='index').reset_index()
    oof_frame.to_csv(f'oof_pseudo_{modelname}_{datenow}.csv', index=False)
    print(f'Average Val AUC across folds - {np.mean(val_auc_scores)} std - {np.std(val_auc_scores)}')
    return test_preds

In [None]:
def objective(trial):
    """Optuna objective for the pseudo-label stage: validation AUC on fold 0.

    Replaces the earlier ``objective`` of the same name.  Samples a
    hyper-parameter set from ``trial``, builds an ``XGBClassifier`` from it and
    returns the score of ``fit_n_folds_pseudo(model, n=1, optimize=True)``.
    """
    def pinned(name, value):
        # Single-choice categorical: the value never varies, but it is still
        # requested through the trial like every other parameter.
        return trial.suggest_categorical(name, [value])

    # Requests are issued in a fixed order; searched and pinned parameters are
    # interleaved exactly as the model keyword arguments are listed.
    sampled = {}
    sampled['n_estimators'] = pinned('n_estimators', 10000)
    sampled['learning_rate'] = trial.suggest_float('learning_rate', 1e-3, 5e-1, log=True)
    sampled['max_depth'] = trial.suggest_int('max_depth', 3, 12)
    sampled['colsample_bytree'] = trial.suggest_float('colsample_bytree', 0.2, 0.99, log=True)
    sampled['subsample'] = trial.suggest_float('subsample', 0.2, 0.99, log=True)
    sampled['eval_metric'] = pinned('eval_metric', 'auc')
    sampled['use_label_encoder'] = pinned('use_label_encoder', False)
    sampled['gamma'] = trial.suggest_categorical('gamma', [0, 0.25, 0.5, 1.0])
    sampled['reg_lambda'] = trial.suggest_categorical('reg_lambda', [0.1, 1.0, 5.0, 10.0, 50.0, 100.0])
    sampled['tree_method'] = pinned('tree_method', 'gpu_hist')
    sampled['gpu_id'] = pinned('gpu_id', 0)
    sampled['predictor'] = pinned('predictor', 'gpu_predictor')
    sampled['random_state'] = pinned('random_state', 42)

    candidate = XGBClassifier(**sampled)

    # Only fold 0 is evaluated per trial.
    return fit_n_folds_pseudo(candidate, n=1, optimize=True)

In [None]:
# Second hyper-parameter search (pseudo-label stage): 100 trials maximising
# the value returned by the current `objective`, then report the best trial.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

print('**BEST TRIAL**', study.best_trial, sep='\n')

In [None]:
# Final pseudo-label fit with the best hyper-parameters of the latest study:
# train on all 10 folds (each extended by the pseudo-labelled rows), average
# the per-fold test probabilities and write the submission.
if modelname == 'xgboost':
    params = study.best_trial.params
    model = XGBClassifier(**params)

    preds = fit_n_folds_pseudo(model, n=10, optimize=False)

    # Every column after the first of the template receives the fold-averaged
    # probabilities (written positionally).
    submission_df.iloc[:, 1:] = np.stack(preds, axis=0).mean(axis=0)
    submission_df.to_csv(f'submission_pseudo_{modelname}_{datenow}.csv', index=False)