<b>Problem Statement:</b>  The dataset used for this competition is synthetic, but based on a real dataset and generated using a CTGAN. The original dataset deals with <b>identifying spam emails</b> via various features extracted from the email. We have to predict the probability that an email is spam.

<b>Problem type:</b> A binary classification problem.

<b>Evaluation metric:</b> Submissions are evaluated on the area under the <b>ROC (receiver operating characteristic)</b> curve between the predicted probability and the observed target.

<h2 id="Approach">Approach to the problem</h2>
Idea is to develop a generalized approach for solving any binary classification problem
<ol>
    <li>Performing exploratory data analysis (EDA) and Data Preparation (DP).</li>
    <ol>
        <li><a href="#FeatureSummary">Understanding Train and Test dataset features (EDA)</a></li>
        <li><a href="#Downcasting">Down Casting Training and Testing datasets (DP)</a></li>
        <li><a href="#Target">Understanding Target feature distribution (EDA)</a></li>
        <li><a href="#Corr">Correlation check (EDA)</a></li>
        <li><a href="#TrainVisual">Visualizing Training dataset (EDA)</a></li>
        <li><a href="#Normalization">Normalizing dataset (DP)</a></li>
    </ol>
    <li>Feature Engineering.</li>
    <ol>
        <li><a href="#AggFeatures">Creating Aggregated features</a></li>
    </ol>
    <li>Training Linear,Gradient Boost and Ensemble models.</li>
    <ol>
        <li><a href="#LogisticRegression">Logistic Regression</a></li>
        <li><a href="#Ridge">Ridge Classifier</a></li>
        <li><a href="#LGBM">LGBM Classification</a></li>
    </ol>
    <li><a href="#Blending">Blending</a></li>
    <li><a href="#Stacking">Stacking</a></li>
   </ol>

<h4>Observations</h4>
<ul>
    <li>Linear models performing better than gradient boost models. Best <b>Logistic Regression Public Score of 0.74553</b> submitted by this notebook </li>
    <li>Couldn't find a blending score better than the single-model Logistic Regression score.</li>
    <li>Achieved best <b>Stacking Public Score of 0.74635</b> by stacking Logistic Regression and Ridge outputs to build a Catboost model.</li>
   
</ul>

In [None]:
#REQUIRED LIBRARIES

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
import os
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression,RidgeClassifier,Lasso
# from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score,accuracy_score
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.cluster import KMeans
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier
import lightgbm as lgb
from sklearn.utils.extmath import softmax
# import pickle
# from sklearn.externals import joblib

# import pandas_profiling as pp

warnings.filterwarnings('ignore')
gc.enable()
%matplotlib inline

In [None]:
#CHECKING ALL AVAILABLE FILES
# Lists every entry directly under the competition input folder with its size,
# whether it is a file or a directory, and (for directories) how many entries it holds.
path='/kaggle/input/tabular-playground-series-nov-2021/'
data_files=list(os.listdir(path))
df_files=pd.DataFrame(data_files,columns=['file_name'])

bytes_per_gb=1024*1024*1024
df_files['size_in_gb']=df_files['file_name'].apply(lambda name: round(os.path.getsize(path+name)/bytes_per_gb,4))
df_files['type']=df_files['file_name'].apply(lambda name: 'file' if os.path.isfile(path+name) else 'directory')
# Directories report their number of entries; plain files report 0.
df_files['file_count']=df_files[['file_name','type']].apply(
    lambda row: len(os.listdir(path+row['file_name'])) if row['type']!='file' else 0,
    axis=1)

print('Following files are available under path:',path)
display(df_files)

In [None]:
#ALL CUSTOM FUNCTIONS

#FUNCTION FOR PROVIDING FEATURE SUMMARY
def feature_summary(df_fa):
    """Print the shape of ``df_fa``, display its head and return a per-column summary.

    The returned DataFrame is indexed by the columns of ``df_fa`` and holds the
    null count, unique count, dtype, the list of unique values and, for columns
    whose dtype name contains 'float' or 'int', max/min, mean, median, mode,
    std and skewness. Datetime columns only get max/min. Cells that were never
    filled (or are NaN) are returned as '-'.
    """
    n_rows,n_cols=df_fa.shape
    print('DataFrame shape')
    print('rows:',n_rows)
    print('cols:',n_cols)

    stat_names=['null','unique_count','data_type','max/min','mean','median','mode','std','skewness','sample_values']
    summary=pd.DataFrame(index=df_fa.columns,columns=stat_names)

    # Column-wide statistics that apply to every dtype.
    summary['null']=[len(df_fa[name][df_fa[name].isnull()]) for name in df_fa.columns]
    summary['unique_count']=[len(df_fa[name].unique()) for name in df_fa.columns]
    summary['data_type']=[df_fa[name].dtype for name in df_fa.columns]

    for name in df_fa.columns:
        kind=str(df_fa[name].dtype)
        if 'float' in kind or 'int' in kind:
            highest=round(df_fa[name].max(),2)
            lowest=round(df_fa[name].min(),2)
            summary.at[name,'max/min']=str(highest)+'/'+str(lowest)
            summary.at[name,'mean']=round(df_fa[name].mean(),4)
            summary.at[name,'median']=round(df_fa[name].median(),4)
            # mode() can return several values; the first one is reported.
            summary.at[name,'mode']=round(df_fa[name].mode()[0],4)
            summary.at[name,'std']=round(df_fa[name].std(),4)
            summary.at[name,'skewness']=round(df_fa[name].skew(),4)
        elif 'datetime64[ns]' in kind:
            summary.at[name,'max/min']=str(df_fa[name].max())+'/'+str(df_fa[name].min())
        summary.at[name,'sample_values']=list(df_fa[name].unique())

    display(df_fa.head())
    return summary.fillna('-')


def feature_compare(df_fa,df_ft):
    """Build a side-by-side statistical summary of a train and a test DataFrame.

    Parameters
    ----------
    df_fa : pandas.DataFrame
        Train dataset.
    df_ft : pandas.DataFrame
        Test dataset. It may lack some train columns (e.g. the target) or
        carry extra ones.

    Returns
    -------
    pandas.DataFrame
        Indexed by a (features, dataset) MultiIndex with one 'train' and one
        'test' row per feature, train columns first. Columns are null count,
        unique count, dtype, max/min, mean, median, mode, std, skewness and the
        stringified list of unique values. Statistics that do not apply, are
        NaN, or belong to a column missing from that dataset are '-'.
    """
    print('Train DataFrame shape')
    print('rows:',df_fa.shape[0])
    print('cols:',df_fa.shape[1])

    print('Test DataFrame shape')
    print('rows:',df_ft.shape[0])
    print('cols:',df_ft.shape[1])

    col_list=['null','unique_count','data_type','max/min','mean','median','mode','std','skewness','sample_values']

    def _describe(frame,col):
        # Summarise one column of one dataset; every statistic defaults to '-'.
        stats=dict.fromkeys(col_list,'-')
        if col not in frame.columns:
            return stats
        values=frame[col]
        # The dtype of the frame being summarised decides which statistics apply.
        dtype_name=str(values.dtype)
        uniques=values.unique()
        stats['null']=int(values.isnull().sum())
        stats['unique_count']=len(uniques)
        stats['data_type']=values.dtype
        if 'float' in dtype_name or 'int' in dtype_name:
            stats['max/min']=str(round(values.max(),2))+'/'+str(round(values.min(),2))
            stats['mean']=round(values.mean(),4)
            stats['median']=round(values.median(),4)
            modes=values.mode()
            # mode() is empty for an all-null column; leave '-' in that case.
            if len(modes)>0:
                stats['mode']=round(modes.iloc[0],4)
            stats['std']=round(values.std(),4)
            stats['skewness']=round(values.skew(),4)
        elif 'datetime64[ns]' in dtype_name:
            stats['max/min']=str(values.max())+'/'+str(values.min())
        stats['sample_values']=str(list(uniques))
        return stats

    # Use the columns of the frames passed in (not a notebook global), and
    # append any test-only columns so nothing is silently dropped.
    feature_names=list(df_fa.columns)+[col for col in df_ft.columns if col not in df_fa.columns]
    index=pd.MultiIndex.from_product([feature_names,['train','test']],names=['features','dataset'])
    # Row order matches from_product: for each feature, train first and then test.
    rows=[_describe(frame,col) for col in feature_names for frame in (df_fa,df_ft)]
    df=pd.DataFrame(rows,index=index,columns=col_list)

    return(df.fillna('-'))

#EXTENDING RIDGE CLASSIFIER WITH PREDICT PROBABILITY FUNCITON

class RidgeClassifierwithProba(RidgeClassifier):
    """RidgeClassifier with a ``predict_proba`` derived from its decision scores."""

    def predict_proba(self, X):
        """Return softmax-normalised scores built from ``decision_function(X)``.

        The decision scores ``d`` are paired column-wise as ``[-d, d]`` and
        passed through a row-wise softmax.
        """
        scores = self.decision_function(X)
        paired_scores = np.column_stack((-scores, scores))
        return softmax(paired_scores)

    
#PREDICTION FUNCTIONS

def response_predictor(X,y,test,iterations,model,model_name):
    """Fit ``model`` on stratified K folds and average its test predictions.

    Parameters
    ----------
    X : array supporting ``X[index_array, :]`` (e.g. a 2-D numpy array)
        Training features.
    y : array supporting ``y[index_array]``
        Binary training target.
    test : array
        Test features passed to ``model.predict_proba``.
    iterations : int
        Number of stratified folds.
    model : estimator with ``fit`` and ``predict_proba``
        Refitted in place on every fold.
    model_name : str
        Prefix for prediction column names and for the two CSV files written
        to the working directory: ``<model_name>.csv`` (per-fold test
        predictions) and ``<model_name>_.csv`` (per-fold predictions on ``X``).

    Returns
    -------
    preds : pandas.Series
        Mean of the per-fold positive-class probabilities on ``test``.
    best_model : estimator
        Snapshot of the model fitted on the fold with the highest validation
        ROC AUC.
    x_preds : pandas.Series
        Row-wise mean of the per-fold predictions on the full ``X``.

    Notes
    -----
    The per-fold predictions on ``X`` cover every row, including the rows the
    fold model was trained on, so they are not out-of-fold predictions.
    """
    from copy import deepcopy

    df_preds=pd.DataFrame()
    df_preds_x=pd.DataFrame()
    splits=iterations
    avg_score=0

    #CREATING STRATIFIED FOLDS
    skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=200)
    print('\nStarting KFold iterations...')
    for k,(train_index,test_index) in enumerate(skf.split(X,y),start=1):
        val_X=X[test_index,:]
        val_y=y[test_index]

        #FITTING MODEL
        model.fit(X[train_index,:],y[train_index])

        #PREDICTING ON VALIDATION DATA AND ON THE FULL TRAINING MATRIX
        preds_x=pd.Series(model.predict_proba(val_X)[:,1])
        df_preds_x[model_name+'xpreds_'+str(k)]=pd.Series(model.predict_proba(X)[:,1])

        #CALCULATING VALIDATION ROC AUC
        acc=roc_auc_score(val_y,preds_x)
        print('Iteration:',k,'  roc_auc_score:',acc)

        #PREDICTING ON TEST DATA
        fold_preds=pd.Series(model.predict_proba(test)[:,1])
        df_preds[model_name+'preds_'+str(k)]=fold_preds

        if k==1:
            preds=fold_preds
            score=acc
            # `model` is refitted on the next fold, so keep an independent
            # snapshot; a plain reference would always end up as the last fit.
            best_model=deepcopy(model)
        else:
            preds=preds+fold_preds
            if score<acc:
                score=acc
                best_model=deepcopy(model)
        avg_score=avg_score+acc
    print('\n Best score:',score,' Avg Score:',avg_score/splits)
    #TAKING AVERAGE OF PREDICTIONS
    preds=preds/splits

    print('Saving test and train predictions per iteration...')
    df_preds.to_csv(model_name+'.csv',index=False)
    df_preds_x.to_csv(model_name+'_.csv',index=False)
    x_preds=df_preds_x.mean(axis=1)
    del df_preds,df_preds_x
    gc.collect()
    return preds,best_model,x_preds 

In [None]:
%%time
#READING TRAIN DATASET, TEST DATASET AND SUBMISSION FILE
# All three CSVs live directly under `path`; they are loaded in the order
# train, test, sample submission.
df_train,df_test,df_submission=(
    pd.read_csv(path+file_name)
    for file_name in ('train.csv','test.csv','sample_submission.csv')
)

<h2 id="FeatureSummary">Understanding Train and Test dataset features</h2>
Understanding Train and Test dataset features in comparative view, using basic statistical measures.

<h4>Observations</h4>
<ul>
    <li>No missing values in train or test dataset</li>
</ul>

<br><a href="#Approach">back to main menu</a>

In [None]:
%%time
#UNDERSTANDING TRAIN AND TEST DATASET USING FEATURE BY FEATURE COMPARISON
# The comparison table has two rows (train/test) per train column, so raise the
# display limit enough for the whole table to render.
pd.set_option('display.max_rows', len(df_train.columns)*2)
feature_compare(df_train,df_test)

In [None]:
gc.collect()

In [None]:
#CREATING A FEATURE LIST EXCLUDING ID AND TARGET
non_feature_columns=('id','target')
features=[col for col in df_train.columns if col not in non_feature_columns]

In [None]:
%%time
#VISUALIZING TRAIN AND TEST FEATURE DISTRIBUTION
# One KDE panel per feature, train (red) against test (grey), with dashed
# vertical lines at the respective means. The grid is sized from the number
# of features instead of a fixed 20x5 layout, and every panel is drawn on the
# axes returned by plt.subplots (no stray empty figure, no re-created subplots).
n_plot_cols=5
n_plot_rows=max(1,-(-len(features)//n_plot_cols))  # ceiling division
fig,axes=plt.subplots(n_plot_rows,n_plot_cols,figsize=(20,3.5*n_plot_rows),squeeze=False)
axes=axes.ravel()

for i,feature in enumerate(features):
    ax=axes[i]
    # Pass the series once, by keyword, so the call is valid whether the first
    # positional parameter of kdeplot is `x` or `data`.
    sns.kdeplot(x=df_train[feature],color='red',label='train',ax=ax)
    ax.axvline(x=df_train[feature].mean(),color='yellow',linestyle='--',label='train mean')
    sns.kdeplot(x=df_test[feature],color='grey',label='test',ax=ax)
    ax.axvline(x=df_test[feature].mean(),color='orange',linestyle='--',label='test mean')
    ax.set_xlabel(feature,color='blue')
    # Only the first panel of each row keeps its y label.
    if i%n_plot_cols!=0:
        ax.set_ylabel('')
    else:
        ax.set_ylabel('Density',color='blue')
    ax.legend(loc=1,fontsize='x-small')

# Hide panels left over when the feature count does not fill the grid.
for unused_ax in axes[len(features):]:
    unused_ax.set_visible(False)

plt.show();

<h2 id="Downcasting">Down Casting Training and Testing datasets</h2>
Checking possibility for down casting dataset datatypes. This will help in reducing overall dataset size.

<h4>Observations</h4>
<ul>
    <li>We have only two data types in dataset float64 and int64</li>
    <li>It is always a good idea to reduce overall dataset size by finding correct datatypes</li>
    <li>With downcasting able to reduce training dataset size from 466.9 MB to 231.7 MB</li>
    <li>With downcasting able to reduce testing dataset size from 416.1 MB to 208.1 MB</li>
</ul>

<br><a href="#Approach">back to main menu</a>

In [None]:
#CHECKING TRAIN AND TEST DATASET MEMORY USAGE BEFORE DOWNCASTING
print('train dataset data usage information\n')
df_train.info(memory_usage='deep')
print('\ntest dataset data usage information\n')
df_test.info(memory_usage='deep')

In [None]:
%%time
#DOWNCASTING TRAIN AND TEST DATASETS
# float64 columns are downcast with downcast="float" and int64 columns with
# downcast="integer"; columns of any other dtype are left untouched.
downcast_by_dtype={'float64':'float','int64':'integer'}
for frame in (df_train,df_test):
    for column in frame.columns:
        downcast_kind=downcast_by_dtype.get(str(frame[column].dtype))
        if downcast_kind is not None:
            frame[column]=pd.to_numeric(frame[column], downcast=downcast_kind)

In [None]:
#CHECKING TRAIN AND TEST DATASET MEMORY USAGE AFTER DOWNCASTING
print('train dataset data usage information\n')
df_train.info(memory_usage='deep')
print('\ntest dataset data usage information\n')
df_test.info(memory_usage='deep')

In [None]:
gc.collect()

<h2 id="Target">Understanding Target feature distribution</h2>
Lets visualize Target feature.

<h4>Observation</h4>
As the spam and no-spam classes have almost equal counts, this is a balanced dataset.

<br><a href="#Approach">back to main menu</a>

In [None]:
#UNDERSTANDING TARGET FEATURE DISTRIBUTION
# Pie chart of the share of spam (target==1) and no-spam (target==0) rows.
# Counts are computed once and reused for both the labels and the shares.
spam_count=(df_train['target']==1).sum()
no_spam_count=(df_train['target']==0).sum()
total_count=df_train['target'].count()

pie_labels=['Spam-'+str(spam_count),'No Spam-'+str(no_spam_count)]
pie_share=[spam_count/total_count,no_spam_count/total_count]
figureObject, axesObject = plt.subplots(figsize=(6,6))
pie_colors=('yellow','lightgrey')
pie_explode=(.01,.01)
axesObject.pie(pie_share,labels=pie_labels,explode=pie_explode,autopct='%.2f%%',colors=pie_colors,startangle=30,shadow=True)
axesObject.axis('equal')
# Title now matches the Spam / No Spam slice labels.
plt.title('Percentage of Spam - No Spam Observations',color='blue',fontsize=12)
plt.show()

<h2 id="Corr">Correlation Check</h2>
Let's check whether there are any correlated features. If two features are highly correlated, we can remove one of them.
This will help in dimensionality reduction.

<h4>Observation</h4>
<ul>
    <li>No reasonable correlation is observed</li>
   
</ul>

<br><a href="#Approach">back to main menu</a>

In [None]:
#CORRELATION CHECK ON FEATURES AND TARGET
corr = df_train[features+['target']].corr()

# Generate a mask for the upper triangle (the matrix is symmetric)
mask = np.triu(np.ones_like(corr, dtype=bool))

# Plotting correlation heatmap
fig,ax=plt.subplots(figsize=(20,20))
sns.heatmap(corr,mask=mask,cmap='tab20c',linewidths=0.1,ax=ax)
# Tick styling is applied after the heatmap is drawn: heatmap sets its own tick
# labels, so labels set beforehand are replaced.
ax.tick_params(axis='both',labelsize=12)
ax.set_title('Correlation Map',color='blue',fontsize=12)
plt.show()

In [None]:
del corr
gc.collect()

<h2 id="TrainVisual">Visualizing Training dataset</h2>
We are making use of PCA, a dimensionality reduction technique, to visualize the Training dataset.<br>
Visualization is also helpful in understanding any grouping or patterns within dataset.
<h4>Observation</h4>
No pattern or grouping observed in training dataset

<br><a href="#Approach">back to main menu</a>

In [None]:
%%time
# Project the training features (everything except id and target) onto the
# first two principal components and colour the points by target.
X=df_train.drop(columns=['id','target'])

component_names=['principal_component_1','principal_component_2']
pca = PCA(n_components=len(component_names),random_state=200)
principalComponents = pca.fit_transform(X)
principalDf = pd.DataFrame(data=principalComponents,columns=component_names)
principalDf['target']=df_train['target']

fig = plt.figure(figsize=(15,15))
sc=plt.scatter(
    x=principalDf[component_names[0]],
    y=principalDf[component_names[1]],
    c=principalDf['target'],
    cmap='Accent',
)
plt.legend(*sc.legend_elements(),bbox_to_anchor=(1.05, 1), loc=2)
plt.title('2D Visualization of train Dataset',color='blue',fontsize=12)
plt.show()

In [None]:
%%time
# Same projection as the 2D view, with a third principal component, drawn on
# a 3D axis and coloured by target.
X=df_train.drop(columns=['id','target'])

component_names=['principal_component_1','principal_component_2','principal_component_3']
pca = PCA(n_components=len(component_names),random_state=200)
principalComponents = pca.fit_transform(X)
principalDf = pd.DataFrame(data=principalComponents,columns=component_names)
principalDf['target']=df_train['target']

fig = plt.figure(figsize=(15,15))
ax = fig.add_subplot(111, projection = '3d')

ax.set_xlabel("principal_component_1")
ax.set_ylabel("principal_component_2")
ax.set_zlabel("principal_component_3")

sc=ax.scatter(
    xs=principalDf[component_names[0]],
    ys=principalDf[component_names[1]],
    zs=principalDf[component_names[2]],
    c=principalDf['target'],
    cmap='Accent',
)
plt.legend(*sc.legend_elements(), bbox_to_anchor=(1.05, 1), loc=2)
plt.title('3D Visualization of train Dataset',color='blue',fontsize=12)
plt.show()

In [None]:
del X
gc.collect()

<h2 id="AggFeatures">Creating Aggregated features</h2>
Creating aggregated features
<h4>Observation</h4>


<br><a href="#Approach">back to main menu</a>

In [None]:
%%time
#SIMPLE FEATURE ENGINEERING, CREATING SOME AGGREGATION FEATURES
# Each aggregate is a row-wise statistic over the original `features` columns
# only, so aggregates added earlier never feed into later ones. Every name in
# agg_features is also the name of the DataFrame method that computes it.
agg_features= ['sum','mean','std','max','min','kurt']

for frame in (df_train,df_test):
    for agg_name in agg_features:
        frame[agg_name]=getattr(frame[features],agg_name)(axis=1)

In [None]:
gc.collect()

<h2 id="Normalization">Normalizing dataset</h2>
Using Standard Scaler to normalize dataset

<br><a href="#Approach">back to main menu</a>


In [None]:
%%time
# Fit the scaler on the training columns only, then apply the same scaling to
# both the training and the test matrices.
model_columns=features+agg_features
scaler = StandardScaler().fit(df_train[model_columns])
X = scaler.transform(df_train[model_columns])
test = scaler.transform(df_test[model_columns])
y=df_train['target'].to_numpy()

In [None]:
gc.collect()

In [None]:
#FINAL DATASET SHAPES
X.shape,y.shape,test.shape

<h2 id="LogisticRegression">LogisticRegression</h2>
Starting with base Linear Model, with basic hyperparameter tuning.

<h4>Observations</h4>

<br><a href="#Approach">back to main menu</a>

In [None]:
%%time
lr_params={'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
model=LogisticRegression(**lr_params)
print('Logistic Regression parameters:\n',model.get_params())

logistic_predictions,best_logistic_model,LRpreds=response_predictor(X,y,test,10,model,'LR')

In [None]:
gc.collect()

In [None]:
df_submission['target']=logistic_predictions
#SAVING LOGISTIC PREDICTIONS
df_submission.to_csv('logistic_submission.csv',index=False)
df_submission

In [None]:
gc.collect()

In [None]:
# Coefficients of the returned logistic model, one per input column, sorted
# from the largest to the smallest and drawn as a horizontal bar chart.
df_feature_impt=pd.DataFrame({
    'features':features+agg_features,
    'importance':best_logistic_model.coef_[0],
}).sort_values(by=['importance'],ascending=False)

plt.figure(figsize = (15,50))
ax=sns.barplot(x=df_feature_impt['importance'],y=df_feature_impt['features'],data=df_feature_impt)
plt.title('Feature importance Logistic Regression Model',color='blue',fontsize=12)
ax.bar_label(ax.containers[0]);

<h2 id="Ridge">Ridge Classifier</h2>
Starting with base Ridge model, without any hyperparameter tuning.

<br><a href="#Approach">back to main menu</a>

In [None]:
%%time
model=RidgeClassifierwithProba()
print('Ridge Classifier parameters:\n',model.get_params())

ridge_predictions,best_ridge_model,ridge_preds=response_predictor(X,y,test,10,model,'RC')

In [None]:
gc.collect()

In [None]:
df_submission['target']=ridge_predictions
#SAVING RIDGE PREDICTIONS
df_submission.to_csv('ridge_submission.csv',index=False)
df_submission

In [None]:
# Coefficients of the returned ridge model, one per input column, sorted from
# the largest to the smallest and drawn as a horizontal bar chart.
df_feature_impt=pd.DataFrame({
    'features':features+agg_features,
    'importance':best_ridge_model.coef_[0],
}).sort_values(by=['importance'],ascending=False)

plt.figure(figsize = (15,50))
ax=sns.barplot(x=df_feature_impt['importance'],y=df_feature_impt['features'],data=df_feature_impt)
plt.title('Feature importance Ridge Model',color='blue',fontsize=12)
ax.bar_label(ax.containers[0]);

In [None]:
gc.collect()

<h2 id="LGBM">LGBM Classification</h2>

Simple LGBMClassifier without any hyperparameter tuning.

<br><a href="#Approach">back to main menu</a>

In [None]:
%%time
lgbm_params = {
    'metric' : 'auc',
    'objective' : 'binary',
   }

model=lgb.LGBMClassifier(**lgbm_params)
print('LGBM parameters:\n',model.get_params())

lgb_predictions,best_lgb_model,LGBpreds=response_predictor(X,y,test,10,model,'LGB')

In [None]:
gc.collect()

In [None]:
df_submission['target']=lgb_predictions
#SAVING LGBM PREDICTIONS
df_submission.to_csv('lgb_submission.csv',index=False)
df_submission

In [None]:
# Feature importances reported by the returned LGBM model, one per input
# column, sorted from the largest to the smallest and drawn as a bar chart.
df_feature_impt=pd.DataFrame({
    'features':features+agg_features,
    'importance':best_lgb_model.feature_importances_,
}).sort_values(by=['importance'],ascending=False)

plt.figure(figsize = (15,50))
ax=sns.barplot(x=df_feature_impt['importance'],y=df_feature_impt['features'],data=df_feature_impt)
plt.title('Feature importance LGB Model',color='blue',fontsize=12)
ax.bar_label(ax.containers[0]);

In [None]:
gc.collect()

<h2 id="Blending">Blending</h2>

Simple Blending approach, based on observation

<br><a href="#Approach">back to main menu</a>

In [None]:
#BLENDING PREDICTIONS
# Weighted average of the three models' test predictions; the weights
# (0.6 logistic, 0.2 ridge, 0.2 LGBM) sum to 1.
df_submission['target']=logistic_predictions*0.6+ridge_predictions*0.2+lgb_predictions*0.2
#CREATING SUBMISSION FILE
df_submission.to_csv('submission.csv',index=False)

In [None]:
df_submission

<h2 id="Stacking">Stacking approach</h2>

<ol>
    <li>Stacking Logistic Regression, Ridge and LGBM. Using Catboost on stacked dataset</li>
    <li>Stacking Logistic Regression and Ridge. Using LGBM on stacked dataset</li>
    <li>Stacking Logistic Regression and Ridge. Using Catboost on stacked dataset.</li>
</ol>
<h3>Observations</h3>
<ul>
    <li>Getting best <b>Public Score of 0.74635</b> using 3rd approach</li>
</ul>
<br><a href="#Approach">back to main menu</a>

In [None]:
%%time
#READING PREDICTED VALUES
# These CSVs are written by response_predictor, one column per fold:
#   '<name>_.csv' holds the predictions on the full training matrix,
#   '<name>.csv'  holds the predictions on the test matrix.
df_LR_=pd.read_csv('./LR_.csv')
df_LR=pd.read_csv('./LR.csv')
df_RC_=pd.read_csv('./RC_.csv')
df_RC=pd.read_csv('./RC.csv')
df_LGB_=pd.read_csv('./LGB_.csv')
df_LGB=pd.read_csv('./LGB.csv')


In [None]:
%%time
X=pd.concat([df_LR_,df_RC_,df_LGB_],axis=1).to_numpy()
test=pd.concat([df_LR,df_RC,df_LGB],axis=1).to_numpy()

In [None]:
%%time
catb_params = {
    'eval_metric' : 'AUC',
    'verbose' : 0,
     'learning_rate':0.01,
    'n_estimators':200
    }
model=CatBoostClassifier(**catb_params)
print('CatBoost paramters:\n',model.get_params())

catb_predictions,best_catb_model,CBpreds=response_predictor(X,y,test,10,model,'CBS')

In [None]:
df_submission['target']=catb_predictions
#SAVING CATBOOST PREDICTIONS
df_submission.to_csv('catbstack_mixed_submission.csv',index=False)
df_submission

In [None]:
%%time
#STACKING OUTCOMES FROM LINEAR MODELS
X=pd.concat([df_LR_,df_RC_],axis=1).to_numpy()
test=pd.concat([df_LR,df_RC],axis=1).to_numpy()

In [None]:
%%time
# LGBM meta-model trained on the stacked linear-model predictions.
lgbm_params = {
    'metric' : 'auc',
    'objective' : 'binary',
   }

model=lgb.LGBMClassifier(**lgbm_params)
print('LGBM parameters:\n',model.get_params())

# response_predictor writes '<model_name>.csv' and '<model_name>_.csv'. The
# stacker gets its own name ('LGBS') so it does not overwrite the base LGBM
# model's LGB.csv / LGB_.csv, which are read back as stacking inputs.
lgb_predictions,best_lgb_model,LGBpreds=response_predictor(X,y,test,10,model,'LGBS')

In [None]:
df_submission['target']=lgb_predictions
#SAVING LGBM STACKING PREDICTIONS
df_submission.to_csv('lgbstack_linear_submission.csv',index=False)
df_submission

In [None]:
%%time
# CatBoost meta-model trained on the stacked linear-model predictions only.
catb_params = {
    'eval_metric' : 'AUC',
    'verbose' : 0,
     'learning_rate':0.01,
    'n_estimators':200
    }
model=CatBoostClassifier(**catb_params)
print('CatBoost paramters:\n',model.get_params())

# response_predictor writes '<model_name>.csv' and '<model_name>_.csv'. A
# distinct name ('CBSL') keeps this run's per-fold predictions from overwriting
# the CBS.csv / CBS_.csv files of the earlier CatBoost stacking run.
catb_predictions,best_catb_model,CBpreds=response_predictor(X,y,test,10,model,'CBSL')

In [None]:
df_submission['target']=catb_predictions
#SAVING CATBOOST PREDICTIONS
df_submission.to_csv('catbstack_linear_submission.csv',index=False)
df_submission