In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for input_path in (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Number of Optuna trials to run in the hyperparameter search.
n_Trials = 5

In [None]:
# Training data; later cells read its `id`, `target` and `kfold` columns
# and treat every other column as a feature.
train_folds_csv = '../input/train-folds/train_folds.csv'
df = pd.read_csv(train_folds_csv)

In [None]:
import joblib
import lightgbm as lgb
import optuna
import sklearn
import xgboost as xg
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier

In [None]:
def objective(trial, eval_folds=(0,)):
    """Optuna objective: mean validation ROC AUC of an XGBoost classifier.

    Samples one hyperparameter configuration from ``trial``. For every fold in
    ``eval_folds`` a fresh classifier is trained on the rows of the
    module-level ``df`` whose ``kfold`` differs from that fold and scored on
    the rows whose ``kfold`` equals it.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to draw the hyperparameters.
    eval_folds : iterable of int, optional
        ``kfold`` labels to hold out in turn. Defaults to ``(0,)``, i.e. a
        single train/validation split on fold 0.

    Returns
    -------
    float
        ROC AUC averaged over ``eval_folds``.

    Raises
    ------
    ValueError
        If ``eval_folds`` is empty.
    """
    eval_folds = list(eval_folds)
    if not eval_folds:
        raise ValueError("eval_folds must contain at least one fold")

    # `suggest_float(..., log=True)` draws from the same log-uniform range as
    # the deprecated `suggest_loguniform`.
    # NOTE(review): 'gpu_hist' and 'use_label_encoder' are reported as
    # deprecated by recent XGBoost releases - confirm against the installed
    # version before changing them.
    # NOTE(review): the random seed is itself tuned here; it is kept so that
    # `study.best_params` continues to expose a 'random_state' key.
    param = {
        'tree_method': 'gpu_hist',
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008, 0.009, 0.01, 0.012, 0.014, 0.016, 0.018, 0.02]),
        'n_estimators': 4000,
        'max_depth': trial.suggest_categorical('max_depth', [5, 7, 9, 11, 13, 15, 17, 20]),
        'random_state': trial.suggest_categorical('random_state', [24, 48, 2020]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'use_label_encoder': False,
    }

    # Columns that identify a row or hold the label/fold; everything else is a feature.
    non_feature_cols = ['target', 'id', 'kfold']

    fold_scores = []
    for fold in eval_folds:
        df_train = df[df.kfold != fold]
        df_valid = df[df.kfold == fold]

        x_train = df_train.drop(columns=non_feature_cols).values
        y_train = df_train.target.values

        x_valid = df_valid.drop(columns=non_feature_cols).values
        y_valid = df_valid.target.values

        # Build the estimator inside the loop so no fitted state is carried
        # from one fold to the next.
        clf = xg.XGBClassifier(**param)
        clf.fit(x_train, y_train)

        # Column 1 of `predict_proba` is used as the score passed to ROC AUC.
        y_pred = clf.predict_proba(x_valid)[:, 1]
        fold_scores.append(roc_auc_score(y_valid, y_pred))

    # With the default single fold this equals that fold's ROC AUC.
    return sum(fold_scores) / len(fold_scores)

In [None]:
# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# when the notebook is restarted and re-run. TPESampler is the sampler Optuna
# uses by default for single-objective studies, so only the seeding changes.
# Results can still vary if model training itself is not deterministic.
SAMPLER_SEED = 42

study = optuna.create_study(
    direction='maximize',  # the objective returns ROC AUC, higher is better
    sampler=optuna.samplers.TPESampler(seed=SAMPLER_SEED),
)
study.optimize(objective, n_trials=n_Trials)

In [None]:
# Display the hyperparameters of the best trial found by the study.
study.best_params

In [None]:
# Count the distinct fold labels present in the `kfold` column.
n_folds = df['kfold'].nunique()
print(n_folds)

In [None]:
def run_folds(df, fold):
    """Train the tuned XGBoost classifier with one fold held out and predict on it.

    Rows of ``df`` with ``kfold != fold`` are used for training and rows with
    ``kfold == fold`` for validation. Hyperparameters come from the best trial
    of the module-level ``study``. The fitted model is saved with joblib to
    ``model_xgbclf<fold>`` (relative path) and the fold's ROC AUC is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``id``, ``target`` and ``kfold``; all other columns are
        used as features.
    fold : int
        ``kfold`` label to hold out.

    Returns
    -------
    pandas.DataFrame
        The held-out rows' ``id``, ``target`` and ``kfold`` columns plus
        ``xgbclf_pred``, taken from column 1 of ``predict_proba``.

    Raises
    ------
    ValueError
        If no row of ``df`` has ``kfold == fold``.
    """
    non_feature_cols = ['target', 'id', 'kfold']

    df_train = df[df.kfold != fold]
    df_test = df[df.kfold == fold]
    if df_test.empty:
        # Fail before the expensive fit rather than after it.
        raise ValueError(f"no rows with kfold == {fold}")

    x_train = df_train.drop(columns=non_feature_cols).values
    y_train = df_train.target.values

    x_test = df_test.drop(columns=non_feature_cols).values
    y_test = df_test.target.values

    # Build a new dict instead of mutating whatever `study.best_params`
    # returns; the fixed settings are not part of the sampled parameters and
    # have to be added back.
    param = {
        **study.best_params,
        'n_estimators': 4000,
        'tree_method': 'gpu_hist',
        'use_label_encoder': False,
    }

    clf = xg.XGBClassifier(**param)
    clf.fit(x_train, y_train)

    y_pred = clf.predict_proba(x_test)[:, 1]
    auc = roc_auc_score(y_test, y_pred)
    print(f"Fold={fold}, roc_auc_score={auc}")

    # Persist the fitted model for this fold.
    joblib.dump(clf, f"model_xgbclf{fold}")

    # `assign` returns a new frame, so nothing is written into a slice of `df`
    # (the original `.loc` assignment raised SettingWithCopyWarning).
    return df_test[['id', 'target', 'kfold']].assign(xgbclf_pred=y_pred)


# Run every fold in turn and stack the per-fold prediction frames row-wise.
dfs = [run_folds(df, fold_id) for fold_id in range(n_folds)]
df_pred = pd.concat(dfs)

print(df_pred.shape)

In [None]:
# Save the stacked predictions as CSV without writing the index column.
df_pred.to_csv(path_or_buf="xgb_clf_predictions.csv", index=False)

In [None]:
# Preview the first rows of the stacked per-fold predictions.
df_pred.head()

In [None]:
# Preview the first rows of the training frame.
df.head()