<b>Problem Statement:</b>  The dataset used for this competition is synthetic, but based on a real dataset and generated using a CTGAN. The original dataset deals with predicting the biological response of molecules given various chemical properties. We have to predict the molecule response probability.

<b>Problem type:</b> A binary classification problem.

<b>Evaluation metric:</b> Submissions are evaluated on the area under the <b>ROC (receiver operating characteristic)</b> curve between the predicted probability and the observed target.

<h2 id="Approach">Approach to the problem</h2>
Idea is to develop a generalized approach for solving any binary classification problem
<ol>
    <li>Performing exploratory data analysis (EDA) and Data Preparation (DP).</li>
    <ol>
        <li><a href="#FeatureSummary">Understanding Training dataset features (EDA)</a></li>
        <li><a href="#Downcasting">Down Casting Training and Testing datasets (DP)</a></li>
        <li><a href="#Target">Understanding Target feature distribution (EDA)</a></li>
        <li><a href="#Corr">Correlation check (EDA)</a></li>
        <li><a href="#TrainVisual">Visualizing Training dataset (EDA)</a></li>
        <li><a href="#Normalization">Normalizing dataset (DP)</a></li>
    </ol>
    <li>Feature Engineering.</li>
    <ol>
        <li><a href="#AggFeatures">Creating Aggregated features</a></li>
    </ol>
    <li>Training Linear and Gradient Boost Base models.</li>
    <ol>
        <li><a href="#LogisticRegression">Logistic Regression</a></li>
         <li><a href="#Ridge">Ridge Classifier</a></li>
        <li><a href="#LGBM">LGBM Classification</a></li>
        <li><a href="#CatBoost">CatBoost Classification</a></li>
        <li><a href="#XGBoost">XGBoost Classification</a></li>
    </ol>
<!--    <li>Basic Blending.</li>
    <ol>
        <li><a href="#Ratios">Calculating best blending Ratios (using training preditions to calculate blending ratios)</a></li>
        <li><a href="#FinalPred">Calculating blended prediction</a></li>
    </ol> -->
</ol>

In [None]:
#REQUIRED LIBRARIES

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
import os
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression,RidgeClassifier
# from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import roc_auc_score,accuracy_score
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.cluster import KMeans
from sklearn.utils.extmath import softmax
# import pickle
# from sklearn.externals import joblib

# import pandas_profiling as pp

warnings.filterwarnings('ignore')
gc.enable()
%matplotlib inline

In [None]:
#CHECKING ALL AVAILABLE FILES
# Root of the competition input; later cells build file paths by appending to it.
path='/kaggle/input/tabular-playground-series-oct-2021/'
data_files=list(os.listdir(path))
df_files=pd.DataFrame(data_files,columns=['file_name'])


def _entry_size_gb(name):
    # os.path.getsize reports bytes; convert to GB and keep 4 decimals.
    return round(os.path.getsize(path+name)/(1024*1024*1024),4)


def _entry_type(name):
    # Anything that is not a regular file is labelled as a directory.
    if os.path.isfile(path+name):
        return 'file'
    return 'directory'


def _entry_child_count(row):
    # Files have no children; directories report their direct entry count.
    if row['type']=='file':
        return 0
    return len(os.listdir(path+row['file_name']))


df_files['size_in_gb']=df_files.file_name.apply(_entry_size_gb)
df_files['type']=df_files.file_name.apply(_entry_type)
df_files['file_count']=df_files[['file_name','type']].apply(_entry_child_count,axis=1)

print('Following files are available under path:',path)
display(df_files)

In [None]:
#ALL CUSTOM FUNCTIONS

#FUNCTION FOR PROVIDING FEATURE SUMMARY
def feature_summary(df_fa):
    """Print the frame's shape, show its head, and return a per-column summary.

    The returned DataFrame is indexed by the columns of ``df_fa`` and holds,
    for every column: null count, number of distinct values, dtype and the
    list of distinct values.  Columns whose dtype name contains ``float`` or
    ``int`` additionally get max/min, mean, median, mode, std and skewness;
    ``datetime64[ns]`` columns get max/min only.  Cells that were never
    filled are returned as ``'-'``.
    """
    n_rows,n_cols=df_fa.shape
    print('DataFrame shape')
    print('rows:',n_rows)
    print('cols:',n_cols)

    col_list=['null','unique_count','data_type','max/min','mean','median','mode','std','skewness','sample_values']
    df=pd.DataFrame(index=df_fa.columns,columns=col_list)

    # Whole-column facts that apply regardless of dtype.
    df['null']=[df_fa[col].isnull().sum() for col in df_fa.columns]
    df['unique_count']=[df_fa[col].nunique(dropna=False) for col in df_fa.columns]
    df['data_type']=[df_fa[col].dtype for col in df_fa.columns]

    for col in df_fa.columns:
        series=df_fa[col]
        dtype_name=str(series.dtype)
        is_numeric='float' in dtype_name or 'int' in dtype_name
        if is_numeric:
            df.at[col,'max/min']=str(round(series.max(),2))+'/'+str(round(series.min(),2))
            df.at[col,'mean']=round(series.mean(),4)
            df.at[col,'median']=round(series.median(),4)
            # mode() can return several values; the first one is reported.
            df.at[col,'mode']=round(series.mode()[0],4)
            df.at[col,'std']=round(series.std(),4)
            df.at[col,'skewness']=round(series.skew(),4)
        elif 'datetime64[ns]' in dtype_name:
            df.at[col,'max/min']=str(series.max())+'/'+str(series.min())
        df.at[col,'sample_values']=list(series.unique())

    display(df_fa.head())
    return(df.fillna('-'))

#EXTENDING RIDGE CLASSIFIER WITH PREDICT PROBABILITY FUNCITON

class RidgeClassifierwithProba(RidgeClassifier):
    """RidgeClassifier extended with a ``predict_proba`` method.

    The scores come from ``decision_function`` and are squashed with a
    softmax, so they are not calibrated probabilities.
    """

    def predict_proba(self, X):
        """Return the softmax of the stacked columns ``[-score, score]``.

        ``score`` is the value of ``decision_function(X)``; with a 1-D score
        vector the result has two columns and callers read column 1.
        """
        scores = self.decision_function(X)
        stacked = np.column_stack((-scores, scores))
        return softmax(stacked)

#PREDICTION FUNCTIONS

def response_predictor(X,y,test,model,model_name,splits=5,random_state=200):
    """Cross-validate ``model`` and return fold-averaged test predictions.

    For every stratified fold a fresh, unfitted clone of ``model`` is trained
    on the training part of the fold and scored with ROC AUC on the held-out
    part.  Each fold model also predicts the full ``test`` array and the full
    ``X`` array.

    Parameters
    ----------
    X, y : numpy arrays indexed positionally (``X[idx, :]``, ``y[idx]``).
    test : array passed to ``predict_proba`` for the final predictions.
    model : estimator template exposing ``fit`` / ``predict_proba`` and
        compatible with ``sklearn.base.clone``.  It is never fitted itself.
    model_name : prefix used for column names and for the two CSV files.
    splits : number of stratified folds (default 5, as before).
    random_state : seed for the fold shuffling (default 200, as before).

    Returns
    -------
    preds : pandas Series, mean over folds of the positive-class
        probability on ``test``.
    best_model : the fitted fold model with the highest validation AUC.
    x_preds : pandas Series, mean over folds of the positive-class
        probability on ``X``.  NOTE(review): every row of ``X`` was part of
        the training data for all but one fold, so these are mostly
        in-sample predictions, not out-of-fold ones.

    Side effects
    ------------
    Writes ``<model_name>.csv`` (per-fold test predictions) and
    ``<model_name>_.csv`` (per-fold predictions on ``X``) to the working
    directory and prints per-fold and summary scores.
    """
    # Local import keeps this block self-contained; sklearn is already a
    # dependency of the notebook.
    from sklearn.base import clone

    df_preds=pd.DataFrame()
    df_preds_x=pd.DataFrame()
    preds=None
    best_model=None
    score=None
    avg_score=0

    #CREATING STRATIFIED FOLDS
    skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=random_state)
    print('\nStarting KFold iterations...')
    for k,(train_index,test_index) in enumerate(skf.split(X,y),start=1):
        df_X=X[train_index,:]
        df_y=y[train_index]
        val_y=y[test_index]

        #FITTING MODEL
        # A new clone per fold: refitting one shared estimator would make
        # every stored "best model" reference point at the last fit.
        fold_model=clone(model)
        fold_model.fit(df_X,df_y)

        #PREDICTING ON TRAINING DATA (ALL ROWS) AND SLICING OUT VALIDATION ROWS
        # One pass over X serves both purposes; the validation rows are a
        # subset of X, so a second predict_proba call is unnecessary.
        full_train_preds=fold_model.predict_proba(X)[:,1]
        df_preds_x[model_name+'xpreds_'+str(k)]=pd.Series(full_train_preds)
        preds_x=pd.Series(full_train_preds[test_index])

        #CALCULATING ROC AUC ON THE HELD-OUT FOLD
        acc=roc_auc_score(val_y,preds_x)
        print('Iteration:',k,'  roc_auc_score:',acc)

        #PREDICTING ON TEST DATA
        fold_preds=pd.Series(fold_model.predict_proba(test)[:,1])
        df_preds[model_name+'preds_'+str(k)]=fold_preds
        preds=fold_preds if preds is None else preds+fold_preds

        # Strict '<' keeps the earliest fold on ties, as before.
        if score is None or score<acc:
            score=acc
            best_model=fold_model
        avg_score=avg_score+acc

    print('\n Best score:',score,' Avg Score:',avg_score/splits)
    #TAKING AVERAGE OF PREDICTIONS
    preds=preds/splits

    print('Saving test and train predictions per iteration...')
    df_preds.to_csv(model_name+'.csv',index=False)
    df_preds_x.to_csv(model_name+'_.csv',index=False)
    x_preds=df_preds_x.mean(axis=1)
    del df_preds,df_preds_x
    gc.collect()
    return preds,best_model,x_preds

In [None]:
%%time
#READING TRAIN DATASET
# `path` already ends with a separator, so the file name is appended directly.
train_csv=path+'train.csv'
df_train=pd.read_csv(train_csv)

<h2 id="FeatureSummary">Understanding Training dataset features</h2>
Understanding Training dataset features using basic statistical measures

<h4>Observations</h4>
<ul>
    <li>No missing values in train dataset</li>
</ul>

<br><a href="#Approach">back to main menu</a>

In [None]:
# The summary has one row per column, so raise the row display limit to the
# column count to avoid truncation.
n_summary_rows=df_train.shape[1]
pd.set_option('display.max_rows', n_summary_rows)
feature_summary(df_train)

#EXPAND BELOW SECTION TO SEE DETAILED ANALYSIS

In [None]:
#CREATING LIST OF CATEGORICAL AND CONTINUOUS FEATURES
# 'id' and 'target' are never model inputs.
non_feature_cols=('id','target')
features=[col for col in df_train.columns if col not in non_feature_cols]
# int64 columns are treated as categorical, float64 columns as continuous.
# This relies on the dtypes as read from CSV, so it must run before downcasting.
cat_features=[col for col in features if df_train[col].dtype=='int64']
cont_features=[col for col in df_train.columns if df_train[col].dtype=='float64']

print(
    'Number of all features excluding id and target: ',len(features),
    '\nNumber of Categorical features excluding id and target: ',len(cat_features),
    '\nNumber of Continuous features: ',len(cont_features),
)

In [None]:
gc.collect()

<h2 id="Downcasting">Down Casting Training and Testing datasets</h2>
Checking possibility for down casting dataset datatypes. This will help in reducing overall dataset size.

<h4>Observations</h4>
<ul>
    <li>We have only two data types in dataset float64 and int64</li>
    <li>As in most of the cases values range between 0 to 1. Therefore this is an ideal case for downcasting data types</li>
    <li>With downcasting able to reduce training dataset size from 2.1 GB to 963.2 MB</li>
    <li>With downcasting able to reduce testing dataset size from 1.1 GB to 481.1 MB</li>
</ul>

<br><a href="#Approach">back to main menu</a>

In [None]:
#CHECKING TRAINING DATASET MEMORY USAGE BEFORE DOWNCASTING
df_train.info(memory_usage='deep')

In [None]:
%%time
#DOWNCASTING TRAINING DATASET
# Maps the source dtype to the `downcast` mode handed to pd.to_numeric;
# columns of any other dtype are left untouched.
train_downcast_modes={'float64':'float','int64':'integer'}
for column in df_train.columns:
    mode=train_downcast_modes.get(str(df_train[column].dtype))
    if mode is None:
        continue
    df_train[column]=pd.to_numeric(df_train[column], downcast=mode)

In [None]:
##CHECKING TRAINING DATASET MEMORY USAGE AFTER DOWNCASTING
df_train.info(memory_usage='deep')

In [None]:
gc.collect()

In [None]:
%%time
#READING TEST DATASET AND SUBMISSION FILE
# Both files sit next to train.csv under `path`.
test_csv,submission_csv=path+'test.csv',path+'sample_submission.csv'
df_test=pd.read_csv(test_csv)
df_submission=pd.read_csv(submission_csv)

In [None]:
#CHECKING TESTING DATASET MEMORY USAGE BEFORE DOWNCASTING
df_test.info(memory_usage='deep')

In [None]:
%%time
#DOWNCASTING TESTING DATASET
# Same rule as for the training frame: float64 -> smallest float,
# int64 -> smallest integer, everything else untouched.
test_downcast_modes={'float64':'float','int64':'integer'}
for column in df_test.columns:
    mode=test_downcast_modes.get(str(df_test[column].dtype))
    if mode is None:
        continue
    df_test[column]=pd.to_numeric(df_test[column], downcast=mode)

In [None]:
#CHECKING TESTING DATASET MEMORY USAGE AFTER DOWNCASTING
df_test.info(memory_usage='deep')

In [None]:
gc.collect()

<h2 id="Target">Understanding Target feature distribution</h2>
Lets visualize Target feature.

<h4>Observation</h4>
The counts of response and no-response observations are almost equal, so this is a balanced dataset.

<br><a href="#Approach">back to main menu</a>

In [None]:
%%time
#Understanding Target feature distribution
# Count each class once and reuse the counts for both labels and shares.
target_col=df_train['target']
n_response=target_col[target_col==1].count()
n_no_response=target_col[target_col==0].count()
n_total=target_col.count()

pie_labels=['Response-'+str(n_response),'No Response-'+str(n_no_response)]
pie_share=[n_response/n_total,n_no_response/n_total]
pie_colors=('lightblue','lightgrey')
pie_explode=(.01,.01)

figureObject, axesObject = plt.subplots(figsize=(6,6))
axesObject.pie(
    pie_share,
    labels=pie_labels,
    explode=pie_explode,
    autopct='%.2f%%',
    colors=pie_colors,
    startangle=30,
    shadow=True,
)
# Equal aspect ratio so the pie is drawn as a circle.
axesObject.axis('equal')
plt.title('Percentage of Response - No Response Observations',color='blue',fontsize=12)
plt.show()

<h2 id="Corr">Correlation Check</h2>
Let's check whether there are any correlated features. If two features are highly correlated, we can remove one of them.
This will help in dimensionality reduction.

<h4>Observation</h4>
<ul>
    <li>We can see some reasonable correlation with Target. Therefore, plotting target correlation with other features</li>
    <li>No correlation is observed among Training dataset features. Target has considerable correlation with feature f22</li>
</ul>

<br><a href="#Approach">back to main menu</a>

<h3>CORRELATION CHECK CATEGORICAL FEATURES</h3>

In [None]:
#CORRELATION CHECK CATEGORICAL FEATURES
corr = df_train[cat_features+['target']].corr()

# Generate a mask for the upper triangle (diagonal included): the matrix is
# symmetric, so only the lower triangle carries information.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Plotting correlation heatmap
fig,ax=plt.subplots(figsize=(20,20))
# Draw on the axes created above and use seaborn's documented `linewidths`
# keyword for the cell separators.
sns.heatmap(corr,mask=mask,cmap='tab20c',linewidths=0.1,ax=ax)
# Tick styling has to come after the heatmap call: seaborn installs its own
# ticks and tick labels, which discards anything set on the axes beforehand.
ax.tick_params(axis='both',labelsize=12)
plt.show()

In [None]:
del corr
gc.collect()

<h3>CORRELATION CHECK CONTINUOUS FEATURES</h3>

In [None]:
#CORRELATION CHECK CONTINUOUS FEATURES
corr = df_train[cont_features+['target']].corr()

# Generate a mask for the upper triangle (diagonal included): the matrix is
# symmetric, so only the lower triangle carries information.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Plotting correlation heatmap
fig,ax=plt.subplots(figsize=(30,30))
# Draw on the axes created above and use seaborn's documented `linewidths`
# keyword for the cell separators.
sns.heatmap(corr,mask=mask,cmap='tab20c',linewidths=0.5,ax=ax)
# Tick styling has to come after the heatmap call: seaborn installs its own
# ticks and tick labels, which discards anything set on the axes beforehand.
ax.tick_params(axis='both',labelsize=12)
plt.show()

In [None]:
del corr
gc.collect()

In [None]:
#CORRELATION CHECK BETWEEN TARGET AND CATEGORICAL FEATURES
# One-column frame: correlation of each categorical feature with the target.
corr=df_train[cat_features].corrwith(df_train['target']).to_frame('target')
df=corr.sort_values(by='target', ascending=False)

# Tall, narrow figure: one heatmap row per feature, strongest positive on top.
plt.subplots(figsize=(3,15))
heatmap = sns.heatmap(
    df,
    annot=True,
    cmap='tab20c',
    linewidth=0.5,
    xticklabels=df.columns,
    yticklabels=df.index,
)
heatmap.set_title('Categorical Features Correlating with target', fontdict={'fontsize':18}, pad=16)
plt.show()

In [None]:
#CORRELATION CHECK BETWEEN TARGET AND CONTINUOUS FEATURES
# One-column frame: correlation of each continuous feature with the target.
corr=df_train[cont_features].corrwith(df_train['target']).to_frame('target')
df=corr.sort_values(by='target', ascending=False)

# Tall, narrow figure: one heatmap row per feature, strongest positive on top.
plt.subplots(figsize=(3,50))
heatmap = sns.heatmap(
    df,
    annot=True,
    cmap='tab20c',
    linewidth=0.5,
    xticklabels=df.columns,
    yticklabels=df.index,
)
heatmap.set_title('Continous Features Correlating with target', fontdict={'fontsize':18}, pad=16)
plt.show()

In [None]:
del corr
gc.collect()

<h2 id="TrainVisual">Visualizing Training dataset</h2>
We are making use of PCA, a dimensionality reduction technique, to visualize the Training dataset.<br>
Visualization is also helpful in understanding any grouping or patterns within dataset.
<h4>Observation</h4>
No pattern or grouping observed in training dataset

<br><a href="#Approach">back to main menu</a>

In [None]:
%%time
# Project every column except 'id' and 'target' onto three principal
# components and colour the 3-D scatter by target.
X=df_train.drop(['id','target'],axis=1)

pca = PCA(n_components=3,random_state=200)
principalComponents = pca.fit_transform(X)

component_names=['principal_component_'+str(i) for i in (1,2,3)]
principalDf = pd.DataFrame(data = principalComponents,columns = component_names)
principalDf['target']=df_train['target']

fig = plt.figure(figsize=(15,15))
ax = fig.add_subplot(111, projection = '3d')
ax.set_xlabel(component_names[0])
ax.set_ylabel(component_names[1])
ax.set_zlabel(component_names[2])

sc=ax.scatter(
    xs=principalDf[component_names[0]],
    ys=principalDf[component_names[1]],
    zs=principalDf[component_names[2]],
    c=principalDf['target'],
    cmap='OrRd',
)
# Legend entries come from the colour-mapped target values; it is anchored
# outside the axes on the right.
handles,labels=sc.legend_elements()
plt.legend(handles, labels, bbox_to_anchor=(1.05, 1), loc=2)
plt.show()

In [None]:
del X
gc.collect()

<h2 id="AggFeatures">Creating Aggregated features</h2>
Creating aggregated features
<h4>Observation</h4>


<br><a href="#Approach">back to main menu</a>

In [None]:
%%time
#SIMPLE FEATURE ENGINEERING, CREATING SOME AGGREGATION FEATURES
# Constant that 'cat_sum' is subtracted from to form 'rcat_sum'.
# NOTE(review): presumably the number of categorical columns — confirm
# against len(cat_features) printed earlier.
RCAT_BASE=45
agg_features=['cat_sum','rcat_sum','cont_std','cont_mean']

# The same four row-wise aggregates are added to train and test; one loop
# keeps the two frames from drifting apart.
for frame in (df_train,df_test):
    frame['cat_sum']=frame[cat_features].sum(axis=1)
    # Vectorised subtraction instead of a per-row Python lambda.
    frame['rcat_sum']=RCAT_BASE-frame['cat_sum']
    frame['cont_std']=frame[cont_features].std(axis=1)
    frame['cont_mean']=frame[cont_features].mean(axis=1)

# Guarded so that re-running this cell does not append duplicate names,
# which would break the later scaling and feature-importance cells.
features += [name for name in agg_features if name not in features]

In [None]:
gc.collect()

<h2 id="Normalization">Normalizing dataset</h2>
Using Standard Scaler to normalize dataset

<br><a href="#Approach">back to main menu</a>


In [None]:
%%time
# Fit the scaler on the training features only, then apply the same
# transformation to both frames.
scaler = StandardScaler().fit(df_train[features])
df_train[features] = scaler.transform(df_train[features])
df_test[features] = scaler.transform(df_test[features])

In [None]:
# Plain numpy arrays for modelling: response_predictor indexes them
# positionally.
y=df_train['target'].values
X=df_train.drop(columns=['id','target']).to_numpy()
test=df_test.drop(columns=['id']).to_numpy()

In [None]:
del df_train,df_test,scaler
gc.collect()

In [None]:
X.shape,y.shape,test.shape

<h2 id="LogisticRegression">LogisticRegression</h2>
Starting with base Linear Model, without any hyperparameter tuning.

<h4>Observations</h4>

<br><a href="#Approach">back to main menu</a>

In [None]:
%%time
# Baseline linear model with library-default hyperparameters.
model=LogisticRegression()
print('Logistic Regression parameters:\n',model.get_params())

logistic_result=response_predictor(X=X,y=y,test=test,model=model,model_name='LR')
logistic_predictions,best_logistic_model,LRpreds=logistic_result

In [None]:
gc.collect()

In [None]:
# Put the fold-averaged logistic-regression test probabilities into the
# submission frame.
df_submission['target']=logistic_predictions
#SAVING LOGISTIC PREDICTIONS
df_submission.to_csv('logistic_submission.csv',index=False)
# Last expression of the cell: rendered by the notebook as a preview.
df_submission

In [None]:
gc.collect()

In [None]:
# Signed coefficients (first row of coef_) of the model returned by
# response_predictor, sorted from most positive to most negative.
df_feature_impt=pd.DataFrame({'features':features,'importance':best_logistic_model.coef_[0]})
df_feature_impt=df_feature_impt.sort_values(by=['importance'],ascending=False)

plt.figure(figsize = (15,50))
ax=sns.barplot(data=df_feature_impt,x='importance',y='features')
# Annotate each bar with its value; the trailing ';' hides the cell output.
ax.bar_label(ax.containers[0]);

<h2 id="Ridge">Ridge Classifier</h2>
Starting with base Ridge model, without any hyperparameter tuning.

<br><a href="#Approach">back to main menu</a>

In [None]:
%%time
# Ridge baseline with library-default hyperparameters; the subclass adds the
# predict_proba method that response_predictor calls.
model=RidgeClassifierwithProba()
print('Ridge Classifier parameters:\n',model.get_params())

ridge_result=response_predictor(X=X,y=y,test=test,model=model,model_name='RC')
ridge_predictions,best_ridge_model,ridge_preds=ridge_result

In [None]:
gc.collect()

In [None]:
# Put the fold-averaged ridge test predictions into the submission frame.
df_submission['target']=ridge_predictions
#SAVING RIDGE PREDICTIONS
df_submission.to_csv('ridge_submission.csv',index=False)
# Last expression of the cell: rendered by the notebook as a preview.
df_submission

In [None]:
# Signed coefficients (first row of coef_) of the model returned by
# response_predictor, sorted from most positive to most negative.
df_feature_impt=pd.DataFrame({'features':features,'importance':best_ridge_model.coef_[0]})
df_feature_impt=df_feature_impt.sort_values(by=['importance'],ascending=False)

plt.figure(figsize = (15,50))
ax=sns.barplot(data=df_feature_impt,x='importance',y='features')
# Annotate each bar with its value; the trailing ';' hides the cell output.
ax.bar_label(ax.containers[0]);

In [None]:
gc.collect()

<h2 id="LGBM">LGBMClassifier</h2>
Starting with base LGBM model, without any hyperparameter tuning.

<br><a href="#Approach">back to main menu</a>

In [None]:
%%time
# Fixed LightGBM configuration. No early stopping is configured here, so all
# n_estimators rounds are trained in every fold.
lgbm_params = dict(
    metric='auc',
    objective='binary',
    device_type='gpu',
    n_estimators=20000,
    learning_rate=0.01,
    min_child_weight=256,
    min_child_samples=20,
    reg_alpha=10,
    reg_lambda=0.1,
    subsample=0.6,
    subsample_freq=1,
    colsample_bytree=0.4,
)

model=lgb.LGBMClassifier(**lgbm_params)
print('LGBM parameters:\n',model.get_params())

lgb_result=response_predictor(X=X,y=y,test=test,model=model,model_name='LGB')
lgb_predictions,best_lgb_model,LGBpreds=lgb_result

In [None]:
gc.collect()

In [None]:
# Put the fold-averaged LightGBM test probabilities into the submission frame.
df_submission['target']=lgb_predictions
#SAVING LGBM PREDICTIONS
df_submission.to_csv('lgb_submission.csv',index=False)
# Last expression of the cell: rendered by the notebook as a preview.
df_submission

In [None]:
# feature_importances_ of the model returned by response_predictor, sorted
# from largest to smallest.
df_feature_impt=pd.DataFrame({'features':features,'importance':best_lgb_model.feature_importances_})
df_feature_impt=df_feature_impt.sort_values(by=['importance'],ascending=False)

plt.figure(figsize = (15,50))
ax=sns.barplot(data=df_feature_impt,x='importance',y='features')
# Annotate each bar with its value; the trailing ';' hides the cell output.
ax.bar_label(ax.containers[0]);

In [None]:
gc.collect()

<h2 id="CatBoost">CatBoostClassifier</h2>

Simple CatBoost without any hyperparameter tuning.

<br><a href="#Approach">back to main menu</a>

In [None]:
%%time

# Fixed CatBoost configuration (GPU device 0, silent training).
# NOTE(review): od_wait is set but response_predictor calls fit() without an
# eval set — confirm whether the overfitting detector is meant to be active.
catb_params = dict(
    eval_metric='AUC',
    iterations=15585,
    objective='CrossEntropy',
    bootstrap_type='Bernoulli',
    od_wait=1144,
    learning_rate=0.023575206684596582,
    reg_lambda=36.30433203563295,
    random_strength=43.75597655616195,
    depth=7,
    min_data_in_leaf=11,
    leaf_estimation_iterations=1,
    subsample=0.8227911142845009,
    task_type='GPU',
    devices='0',
    verbose=0,
)

model=CatBoostClassifier(**catb_params)
print('CatBoost paramters:\n',model.get_params())

catb_result=response_predictor(X=X,y=y,test=test,model=model,model_name='CB')
catb_predictions,best_catb_model,CBpreds=catb_result

In [None]:
gc.collect()

In [None]:
# Put the fold-averaged CatBoost test probabilities into the submission frame.
df_submission['target']=catb_predictions
#SAVING CATBOOST PREDICTIONS
df_submission.to_csv('catb_submission.csv',index=False)
# Last expression of the cell: rendered by the notebook as a preview.
df_submission

In [None]:
# feature_importances_ of the model returned by response_predictor, sorted
# from largest to smallest.
df_feature_impt=pd.DataFrame({'features':features,'importance':best_catb_model.feature_importances_})
df_feature_impt=df_feature_impt.sort_values(by=['importance'],ascending=False)

plt.figure(figsize = (15,50))
ax=sns.barplot(data=df_feature_impt,x='importance',y='features')
# Annotate each bar with its value; the trailing ';' hides the cell output.
ax.bar_label(ax.containers[0]);

In [None]:
gc.collect()

<h2 id="XGBoost">XGBClassifier</h2>

Simple XGBoost without any hyperparameter tuning.

<br><a href="#Approach">back to main menu</a>

In [None]:
%%time
# Fixed XGBoost configuration, grouped by purpose. 'lambda' and 'alpha' are
# passed under these exact key names, so the dict literal form is kept.
xgb_params = {
    # task and evaluation
    'objective': 'binary:logistic',
    'eval_metric' : 'auc',
    'tree_method': 'gpu_hist',
    # boosting schedule
    'learning_rate': 0.02,
    'n_estimators': 20000,
    'random_state': 42,
    # regularisation
    'lambda': 0.003731399945310043,
    'alpha': 0.1660536107526955,
    # row / column sampling
    'colsample_bytree': 0.5164889907489927,
    'subsample': 0.5869840790716806,
    # tree shape
    'max_depth': 18,
    'min_child_weight': 142,
}

model=xgb.XGBClassifier(**xgb_params)
print('XGB parameters:\n',model.get_params())

xgb_result=response_predictor(X=X,y=y,test=test,model=model,model_name='XGB')
xgb_predictions,best_xgb_model,XGBpreds=xgb_result

In [None]:
gc.collect()

In [None]:
# Put the fold-averaged XGBoost test probabilities into the submission frame.
df_submission['target']=xgb_predictions
#SAVING XGBOOST PREDICTIONS
df_submission.to_csv('xgb_submission.csv',index=False)
# Last expression of the cell: rendered by the notebook as a preview.
df_submission

In [None]:
# feature_importances_ of the model returned by response_predictor, sorted
# from largest to smallest.
df_feature_impt=pd.DataFrame({'features':features,'importance':best_xgb_model.feature_importances_})
df_feature_impt=df_feature_impt.sort_values(by=['importance'],ascending=False)

plt.figure(figsize = (15,55))
ax=sns.barplot(data=df_feature_impt,x='importance',y='features')
# Annotate each bar with its value; the trailing ';' hides the cell output.
ax.bar_label(ax.containers[0]);

In [None]:
gc.collect()

In [None]:
#BLENDING PREDICTIONS
df_submission['target']=logistic_predictions*0.05+ridge_predictions*0.05+lgb_predictions*0.1+catb_predictions*0.6+xgb_predictions*0.2
#CREATING SUMBISSION FILE
df_submission.to_csv('submission.csv',index=False)

In [None]:
df_submission