## We combine three different models (XGBoost, LightGBM, CatBoost) for our final predictions. This is called model blending.

## Import libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
import optuna

In [None]:
# Load the competition files from the Kaggle input directory.
DATA_DIR = "../input/tabular-playground-series-nov-2021"
df = pd.read_csv(f"{DATA_DIR}/train.csv")
df_test = pd.read_csv(f"{DATA_DIR}/test.csv")
sample_submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")
# Keep a full copy of the test set (with its "id" column) for the later merge
# on "id"; df_test itself is reduced to feature columns further down.
test_df = df_test.copy()


In [None]:
# Feature columns: every training column except the identifier, the label
# and the fold marker.
excluded_cols = {"id", "target", "kfold"}
useful_cols = [col for col in df.columns if col not in excluded_cols]
# Restrict the test frame to exactly those feature columns.
df_test = df_test.loc[:, useful_cols]

## xgboost

In [None]:
# Level-1 model 1: XGBoost, evaluated with 5-fold cross-validation.
# Every training row is assigned to one fold, so each row receives an
# out-of-fold prediction from a model that did not train on it.
kf = KFold(n_splits=5, random_state=42, shuffle=True)
for fold, (train_index, valid_index) in enumerate(kf.split(X=df)):
    df.loc[valid_index, "kfold"] = fold

final_test_pred = []   # one array of test-set probabilities per fold
final_valid_pred = {}  # id -> out-of-fold probability
scores = []            # validation ROC AUC per fold

for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)  # training set = 4 folds
    xvalid = df[df.kfold == fold].reset_index(drop=True)  # validation set = 1 fold
    xtest = df_test.copy()

    # The split is shuffled, so keep the ids to map predictions back to rows.
    valid_id = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    # Keep only the feature columns.
    xtrain = xtrain[useful_cols]
    xvalid = xvalid[useful_cols]

    model = XGBClassifier(random_state=fold,
                          predictor="gpu_predictor",
                          tree_method='gpu_hist')

    model.fit(xtrain, ytrain)
    # Use the positive-class probability instead of hard labels: ROC AUC is a
    # ranking metric, and the second-level model needs continuous inputs.
    # NOTE(review): assumes a binary target, so column 1 is the positive class.
    pred_valid = model.predict_proba(xvalid)[:, 1]
    pred_test = model.predict_proba(xtest)[:, 1]
    final_test_pred.append(pred_test)
    final_valid_pred.update(dict(zip(valid_id, pred_valid)))
    score = roc_auc_score(yvalid, pred_valid)
    print(score, fold)
    scores.append(score)

print(np.mean(scores))

# Save the out-of-fold predictions keyed by id.
final_valid_pred = pd.DataFrame(list(final_valid_pred.items()),
                                columns=['id', "preds_1"])
final_valid_pred.to_csv('train_pred_1.csv', index=False)

# Save the fold-averaged test predictions. A fresh frame is built rather than
# renaming sample_submission's columns in place: after such a rename a later
# `sample_submission.target = ...` would set an attribute, not a column.
# Assumes sample_submission rows are in the same order as df_test.
test_pred_1 = pd.DataFrame({
    "id": sample_submission["id"].values,
    "preds_1": np.mean(np.column_stack(final_test_pred), axis=1),
})
test_pred_1.to_csv("test_pred_1.csv", index=False)
    
    

## lightgbm

In [None]:
# Level-1 model 2: LightGBM, evaluated with the same 5-fold scheme
# (same seed, so the fold assignment is reproducible).
kf = KFold(n_splits=5, random_state=42, shuffle=True)
for fold, (train_index, valid_index) in enumerate(kf.split(X=df)):
    df.loc[valid_index, "kfold"] = fold

final_test_pred = []   # one array of test-set probabilities per fold
final_valid_pred = {}  # id -> out-of-fold probability
scores = []            # validation ROC AUC per fold

for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    # The split is shuffled, so keep the ids to map predictions back to rows.
    valid_id = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_cols]
    xvalid = xvalid[useful_cols]

    model = LGBMClassifier(device="gpu",
                           gpu_platform_id=0,
                           gpu_device_id=0)

    model.fit(xtrain, ytrain)
    # Use the positive-class probability instead of hard labels: ROC AUC is a
    # ranking metric, and the second-level model needs continuous inputs.
    # NOTE(review): assumes a binary target, so column 1 is the positive class.
    pred_valid = model.predict_proba(xvalid)[:, 1]
    pred_test = model.predict_proba(xtest)[:, 1]
    final_test_pred.append(pred_test)
    final_valid_pred.update(dict(zip(valid_id, pred_valid)))
    score = roc_auc_score(yvalid, pred_valid)
    print(score, fold)
    scores.append(score)

print(np.mean(scores))

# Save the out-of-fold predictions keyed by id.
final_valid_pred = pd.DataFrame(list(final_valid_pred.items()),
                                columns=['id', "preds_2"])
final_valid_pred.to_csv('train_pred_2.csv', index=False)

# Save the fold-averaged test predictions in a fresh frame. Assigning through
# `sample_submission.target` is unsafe: if the frame has no "target" column
# (e.g. its columns were renamed earlier) pandas sets an attribute instead of
# a column, and the CSV would contain stale values.
# Assumes sample_submission rows are in the same order as df_test.
test_pred_2 = pd.DataFrame({
    "id": sample_submission["id"].values,
    "preds_2": np.mean(np.column_stack(final_test_pred), axis=1),
})
test_pred_2.to_csv("test_pred_2.csv", index=False)
    

## catboost

In [None]:
# Level-1 model 3: CatBoost, evaluated with the same 5-fold scheme
# (same seed, so the fold assignment is reproducible).
kf = KFold(n_splits=5, random_state=42, shuffle=True)
for fold, (train_index, valid_index) in enumerate(kf.split(X=df)):
    df.loc[valid_index, "kfold"] = fold

final_test_pred = []   # one array of test-set probabilities per fold
final_valid_pred = {}  # id -> out-of-fold probability
scores = []            # validation ROC AUC per fold

for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    # The split is shuffled, so keep the ids to map predictions back to rows.
    valid_id = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_cols]
    xvalid = xvalid[useful_cols]

    model = CatBoostClassifier(task_type="GPU")

    model.fit(xtrain, ytrain)
    # Use the positive-class probability instead of hard labels: ROC AUC is a
    # ranking metric, and the second-level model needs continuous inputs.
    # NOTE(review): assumes a binary target, so column 1 is the positive class.
    pred_valid = model.predict_proba(xvalid)[:, 1]
    pred_test = model.predict_proba(xtest)[:, 1]
    final_test_pred.append(pred_test)
    final_valid_pred.update(dict(zip(valid_id, pred_valid)))
    score = roc_auc_score(yvalid, pred_valid)
    print(score, fold)
    scores.append(score)

print(np.mean(scores))

# Save the out-of-fold predictions keyed by id.
final_valid_pred = pd.DataFrame(list(final_valid_pred.items()),
                                columns=['id', "preds_3"])
final_valid_pred.to_csv('train_pred_3.csv', index=False)

# Save the fold-averaged test predictions in a fresh frame. Assigning through
# `sample_submission.target` is unsafe: if the frame has no "target" column
# (e.g. its columns were renamed earlier) pandas sets an attribute instead of
# a column, and the CSV would contain stale values.
# Assumes sample_submission rows are in the same order as df_test.
test_pred_3 = pd.DataFrame({
    "id": sample_submission["id"].values,
    "preds_3": np.mean(np.column_stack(final_test_pred), axis=1),
})
test_pred_3.to_csv("test_pred_3.csv", index=False)
    
    
    

In [None]:
# Read the out-of-fold (validation) predictions of the three level-1 models.
df1 = pd.read_csv("./train_pred_1.csv")
df2 = pd.read_csv("./train_pred_2.csv")
df3 = pd.read_csv("./train_pred_3.csv")

# Read the test-set predictions of the three level-1 models.
df_test1 = pd.read_csv("./test_pred_1.csv")
df_test2 = pd.read_csv("./test_pred_2.csv")
df_test3 = pd.read_csv("./test_pred_3.csv")

# Attach the out-of-fold predictions to the training frame as new features.
for oof_frame in (df1, df2, df3):
    df = df.merge(oof_frame, on="id", how="left")

# Attach the *test* predictions to the test frame. Merging the training
# out-of-fold frames here would leave preds_1..preds_3 empty (NaN) for any
# test id that does not occur in the training data.
for test_frame in (df_test1, df_test2, df_test3):
    test_df = test_df.merge(test_frame, on="id", how="left")



In [None]:
def objective(trial):
    """Cross-validate the second-level XGBoost model on the level-1 predictions.

    Trains one XGBClassifier per fold on the columns ``preds_1``, ``preds_2``
    and ``preds_3`` of the module-level ``df`` and scores it with ROC AUC on
    the held-out fold. The fold-averaged predictions for ``test_df`` are
    stored on the trial under the user attribute ``"test_pred"`` so they can
    be retrieved later, e.g. via ``study.best_trial.user_attrs["test_pred"]``.

    Args:
        trial: The Optuna trial being evaluated.

    Returns:
        The mean validation ROC AUC over the 5 folds. Optuna needs this
        return value; without it every trial is recorded as failed.
    """
    # These are the features the second-level model is trained on.
    final_useful_features = ['preds_1', 'preds_2', 'preds_3']

    # Same seed as the level-1 models; writes the fold id into df["kfold"].
    kf = KFold(n_splits=5, random_state=42, shuffle=True)
    for fold, (train_index, valid_index) in enumerate(kf.split(X=df)):
        df.loc[valid_index, "kfold"] = fold

    # NOTE(review): the hyperparameters are fixed and `trial` is not used to
    # suggest any, so every trial evaluates the same configuration. Replace
    # the literals with trial.suggest_* calls if a search is intended.
    params = {
        'learning_rate': 0.07853392035787837,
        'reg_lambda': 1.7549293092194938e-05,
        'reg_alpha': 14.68267919457715,
        'subsample': 0.8031450486786944,
        'colsample_bytree': 0.170759104940733,
        'max_depth': 3
    }

    final_pred = []  # one array of test-set probabilities per fold
    scores = []      # validation ROC AUC per fold

    for fold in range(5):
        xtrain = df[df.kfold != fold].reset_index(drop=True)
        xvalid = df[df.kfold == fold].reset_index(drop=True)

        ytrain = xtrain.target
        yvalid = xvalid.target

        xtrain = xtrain[final_useful_features]
        xvalid = xvalid[final_useful_features]
        xtest = test_df[final_useful_features]

        model = XGBClassifier(
            random_state=fold,
            n_jobs=4,
            n_estimators=5000,
            **params
        )

        model.fit(xtrain, ytrain)
        # ROC AUC ranks scores, so use the positive-class probability rather
        # than hard labels.
        # NOTE(review): assumes a binary target (column 1 = positive class).
        pred_valid = model.predict_proba(xvalid)[:, 1]
        pred_test = model.predict_proba(xtest)[:, 1]
        final_pred.append(pred_test)
        score = roc_auc_score(yvalid, pred_valid)
        print(score, fold)
        scores.append(score)

    mean_score = float(np.mean(scores))
    print(mean_score)

    # Keep the blended test predictions with the trial; a plain list of
    # floats stays serializable for any Optuna storage backend.
    trial.set_user_attr(
        "test_pred",
        np.mean(np.column_stack(final_pred), axis=1).tolist(),
    )
    return mean_score
    
    

In [None]:
# Create a study that maximizes the value returned by `objective`
# and evaluate a fixed number of trials.
N_TRIALS = 5
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=N_TRIALS)

## Submission

In [None]:
# Pick the test predictions to submit. Prefer blended predictions recorded by
# the best Optuna trial under the user attribute "test_pred"; if no trial
# completed (ValueError) or none recorded that attribute (KeyError), fall back
# to the fold-averaged predictions currently held in `final_test_pred`.
try:
    test_target = np.asarray(study.best_trial.user_attrs["test_pred"])
except (ValueError, KeyError):
    # NOTE(review): `final_test_pred` is whatever the last executed
    # cross-validation cell left behind, i.e. a single level-1 model.
    test_target = np.mean(np.column_stack(final_test_pred), axis=1)

# Build the submission in a fresh frame. `sample_submission.target = ...`
# only writes a column if a "target" column still exists; otherwise pandas
# sets a plain attribute and the CSV would contain stale values.
submission = pd.DataFrame({
    "id": sample_submission["id"].values,
    "target": test_target,
})
submission.to_csv("my_output1.csv", index=False)