## Hey everyone! In this notebook, we'll see how to use Optuna for hyperparameter tuning and how to use K-Fold cross-validation to make predictions.
## Let's go!

In [None]:
import numpy as np
import pandas as pd
# Read the competition's train, test and sample-submission tables in one pass.
train_data,test_data,ss=map(
    pd.read_csv,
    (
        '../input/tabular-playground-series-sep-2021/train.csv',
        '../input/tabular-playground-series-sep-2021/test.csv',
        '../input/tabular-playground-series-sep-2021/sample_solution.csv',
    ),
)

In [None]:
from sklearn.model_selection import KFold
# Tag every training row with the id of the fold in which it is held out.
# -1 marks "not assigned yet"; every row is overwritten by the loop below.
train_data['fold']=-1
kf=KFold(n_splits=5,shuffle=True,random_state=42)
# KFold.split yields *positional* row indices, so write through .iloc rather
# than label-based .loc; this stays correct even when train_data does not
# carry a default 0..n-1 index.
fold_col=train_data.columns.get_loc('fold')
for fold,(ti,vi) in enumerate(kf.split(train_data)):
    train_data.iloc[vi,fold_col]=fold

In [None]:
# Number of rows assigned to each fold id.
train_data['fold'].value_counts()

# EDA

In [None]:
# Summary statistics of the training frame (pandas `describe` defaults).
train_data.describe()

In [None]:
# Per column: does it contain at least one missing value?
train_data.isna().any()

In [None]:
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Bivariate histogram of the row id against the `claim` target.
sns.histplot(data=train_data,x='id',y='claim')

In [None]:
# Bar chart of how many training rows take each value of `claim`.
sns.countplot(data=train_data,x='claim')

In [None]:
# normalize data
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
# Standardise each feature using statistics taken from the training frame
# only, applying the same shift and scale to the test frame. Work on copies
# so the raw frames are left untouched.
traindf=train_data.copy()
testdf=test_data.copy()
from sklearn.preprocessing import StandardScaler
# Features are the test columns minus the bookkeeping columns.
useful_cols=[name for name in testdf.columns if name not in ('id','fold')]

for feature in useful_cols:
    # Both statistics are read before the training column is overwritten.
    center,spread=traindf[feature].mean(),traindf[feature].std()
    traindf[feature]=traindf[feature].sub(center).div(spread)
    testdf[feature]=testdf[feature].sub(center).div(spread)
#     min_=traindf[col].min()
#     max_ =traindf[col].max()
#     traindf[col]=(traindf[col]-min_)/(max_-min_)
#     testdf[col]=(testdf[col]-min_)/(max_-min_)
    
#     traindf[col]=traindf[col].fillna(traindf[col].mean())
#     testdf[col]=testdf[col].fillna(traindf[col].mean())


In [None]:
# Total number of missing cells left in the standardised test frame.
testdf.isna().sum().sum()

## It's good that the two classes are pretty balanced

# Training

In [None]:
from sklearn import metrics

# Let's first define a function that tells us how well (or badly) our model is doing
def get_scores(y_preds,y):
    """Summarise binary-classification quality of hard-label predictions.

    Parameters
    ----------
    y_preds : array-like
        Predicted class labels.
    y : array-like
        Ground-truth class labels.

    Returns
    -------
    dict
        Maps 'Accuracy', 'Precision', 'Recall', 'F1' and 'ROC_AUC' to the
        corresponding scikit-learn score.

    Notes
    -----
    'ROC_AUC' is computed from the labels passed in ``y_preds``, not from
    predicted probabilities, because labels are all this function receives.
    """
    # scikit-learn metrics take (y_true, y_pred). Precision, recall and
    # ROC AUC are not symmetric, so the ground truth must come first.
    return {
        'Accuracy':metrics.accuracy_score(y,y_preds),
        'Precision':metrics.precision_score(y,y_preds),
        'Recall':metrics.recall_score(y,y_preds),
        'F1':metrics.f1_score(y,y_preds),
        'ROC_AUC': metrics.roc_auc_score(y,y_preds)
    }

In [None]:
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix and target, then a fixed 70/30 hold-out split used by the
# model comparison and the Optuna objective.
X,y=traindf[useful_cols],traindf['claim']

X_train,X_val,y_train,y_val=train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
def train_model(model):
    """Fit ``model`` on the hold-out training split and score it.

    The model is fitted in place on the module-level ``X_train``/``y_train``
    and evaluated on ``X_val``/``y_val`` through ``get_scores``.

    Returns
    -------
    tuple
        ``(model, scores)``: the same (now fitted) model object and the
        dictionary produced by ``get_scores``.
    """
    model.fit(X_train,y_train)
    val_labels=model.predict(X_val)
    scores=get_scores(val_labels,y_val)
    return model,scores

## Here I created a list of models and trained all of them to see how they are performing

In [None]:
# Candidate classifiers keyed by display name. The mapping keeps insertion
# order, so model_names[i] always labels model_list[i].
named_models={
    'Decision Tree':DecisionTreeClassifier(random_state=42),
    'Random Forest':RandomForestClassifier(random_state=42),
    'XG Boost':XGBClassifier(random_state=42,tree_method='gpu_hist'),
    'Light GBM':LGBMClassifier(random_state=42),
    'Logistic Regression':LogisticRegression(random_state=42),
    'SVM':svm.SVC(random_state=42),
    'CatBoost':CatBoostClassifier(random_state=42,verbose=100),
    'AdaBoost':AdaBoostClassifier(random_state=42),
}
model_list=list(named_models.values())
model_names=list(named_models)


In [None]:
# # Now lets train all the models and see how are they doing
# model_store=[]
# scores = pd.DataFrame(columns=['Name','Accuracy','Precision',
#                                 'Recall',
#                                 'F1',
#                                 'ROC_AUC'])
# for i in range(len(model_list)):
#     model,score=train_model(model_list[i])
#     scores.loc[i]=[model_names[i]]+list(score.values())
#     model_store.append(model)
#     print(model_list[i], ' done')

In [None]:
# figure, axis = plt.subplots(2, 3)
# figure.set_figheight(15)
# figure.set_figwidth(20)

# for i in range(2):
#     for j in range(3):
#         axis[i,j].set_xlim([.3,.9])
        
# axis[0, 0].barh(scores['Name'],scores['Accuracy'],height=.5)
# axis[0, 0].set_title("Accuracy Score")
  
# axis[0, 1].barh(scores['Name'],scores['Precision'],height=.5)
# axis[0, 1].set_title("Precision")

# axis[1, 0].barh(scores['Name'],scores['Recall'],height=.5)
# axis[1, 0].set_title("Recall")

# axis[1, 2].barh(scores['Name'],scores['F1'],height=.5)
# axis[1, 2].set_title("F1")

# axis[0, 2].barh(scores['Name'],scores['ROC_AUC'],height=.5)
# axis[0, 2].set_title('ROC_AUC')

# axis[1, 1].set_visible(False)

# plt.show()

# Hyperparameter Tuning using Optuna

In [None]:
import optuna
import sklearn
def objective(trial):
    """Optuna objective: validation ROC AUC of an XGBoost classifier.

    Samples one hyperparameter configuration from ``trial``, fits an
    ``XGBClassifier`` on the module-level ``X_train``/``y_train`` split and
    scores it on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        ROC AUC on the validation split; the study should maximise it.
    """
    params={
        'n_estimators':trial.suggest_int('n_estimators', 10, 1000),
        'max_depth':trial.suggest_int('max_depth', 1, 27),
        # suggest_float(..., log=True) samples the same log-uniform range as
        # the deprecated suggest_loguniform.
        'reg_lambda':trial.suggest_float('reg_lambda', 0.1, 5, log=True),
        'alpha':trial.suggest_float('alpha', .1, 5, log=True),
        'min_child_weight':trial.suggest_float('min_child_weight', 1, 50, log=True),
    }
    clf = XGBClassifier(**params, tree_method='gpu_hist',random_state=42)
    clf.fit(X_train[useful_cols],y_train)
    # ROC AUC is a ranking metric: score the positive-class probability, and
    # pass the ground truth first as roc_auc_score(y_true, y_score) expects.
    val_proba=clf.predict_proba(X_val[useful_cols])[:,1]
    return metrics.roc_auc_score(y_val,val_proba)

In [None]:
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=10)

In [None]:
# Hyperparameters hard-coded here so the notebook does not need to re-run the
# study. To use a fresh study instead: best_params=study.best_trial.params
best_params=dict(
    n_estimators=433,
    max_depth=27,
    reg_lambda=0.5955576227964456,
    alpha=3.8018858996918654,
    min_child_weight=5.2345922504984905,
)

In [None]:
## Finally, train one model per K-Fold split and average their test predictions to get the final submission
import sklearn
# One model per fold: fit on the other four folds, predict test-set
# probabilities, and report scores on the held-out fold.
predictions=[]
for fold_id in range(5):
    held_out=traindf.fold==fold_id
    fit_part,val_part=traindf.loc[~held_out],traindf.loc[held_out]
    # The seed varies with the fold, so each model is seeded differently.
    clf=XGBClassifier(**best_params,tree_method='gpu_hist',random_state=fold_id)
    clf.fit(fit_part[useful_cols],fit_part['claim'])
    # Keep the positive-class probability for the test rows.
    predictions.append(clf.predict_proba(testdf[useful_cols])[:,1])
    print(f'fold {fold_id}',get_scores(clf.predict(val_part[useful_cols]),val_part['claim']))

In [None]:
# Average the per-fold test probabilities element-wise into the final prediction.
preds=np.mean(predictions,axis=0)
ss['claim']=preds
# Write the submission under a proper .csv file name, without the index column.
ss.to_csv('submission.csv',index=False)