In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input directory, then print each one
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Introduction

#### This is a simple stacked-ensemble solution based on XGBoost (XGB), LightGBM (LGBM) and CatBoost (CATB).
#### Thank you for checking out my notebook and if you like it or even copy some code, please leave an upvote.

# Importing libraries

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

import gc
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from statistics import mean

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Reducing memory usage

In [None]:
# credits -- https://www.kaggle.com/gemartin/load-data-reduce-memory-usage

def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to reduce its memory usage.

    * Signed / unsigned integer columns are cast to the narrowest integer type
      of the same signedness whose range contains the column's min and max
      (bounds are inclusive, so e.g. a max of 127 still fits ``int8``).
    * Float columns are cast to ``float16`` or ``float32`` when their value
      range fits, otherwise to ``float64``.
    * ``object`` columns are converted to ``category``.
    * Every other dtype (bool, datetime, timedelta, categorical, pandas
      extension dtypes) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.
    use_float16 : bool, default True
        When False, ``float32`` is the narrowest float type used. The range
        check only looks at min/max, so a ``float16`` cast can silently drop
        precision; pass False to avoid that.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    signed_candidates = (np.int8, np.int16, np.int32, np.int64)
    unsigned_candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns are range-checked. Bools,
        # datetimes and extension dtypes must not be pushed through the float
        # branch (it would turn them into floats or raise on comparison).
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            # Falls back to float64 when nothing narrower fits (this includes
            # all-NaN columns, for which every comparison below is False).
            new_type = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    new_type = candidate
                    break
        else:
            # None means "no candidate fits" (e.g. empty column): keep dtype.
            new_type = None
            candidates = signed_candidates if col_type.kind == 'i' else unsigned_candidates
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    new_type = candidate
                    break

        if new_type is not None:
            df[col] = df[col].astype(new_type)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-byte frame so the percentage never divides by zero.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

def import_data(file):
    """Read the CSV at ``file`` into a dataframe and return it after memory reduction."""
    # Parse the CSV, then hand the frame straight to the dtype downcasting helper
    return reduce_mem_usage(pd.read_csv(file))

# Reading data

In [None]:
# Load the three competition files in order (train, test, sample submission),
# each passed through the memory-reducing loader
train, test, sample_submission = (
    import_data(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-oct-2021/train.csv',
        '../input/tabular-playground-series-oct-2021/test.csv',
        '../input/tabular-playground-series-oct-2021/sample_submission.csv',
    )
)

# Checking for null values

In [None]:
# Total number of missing cells in each frame, keyed by the message printed with it
null_counts = {
    "Null values in train data": train.isnull().sum().sum(),
    "Null values in test data": test.isnull().sum().sum(),
}
for message, null_total in null_counts.items():
    print(message, null_total)

#### No null values in the dataset. We are lucky :-)

# Preprocessing

In [None]:
# f22 and f43 fall in the low index range but are listed with the categorical features
relocated_cols = ['f22', 'f43']

# f0 .. f241 without the two relocated columns
continous_cols = [
    name for name in map('f{}'.format, range(242)) if name not in relocated_cols
]

# f242 .. f284 followed by the two relocated columns
categorical_cols = list(map('f{}'.format, range(242, 285)))
categorical_cols.extend(relocated_cols)

# Full feature list: continuous block first, categorical block second
cols = [*continous_cols, *categorical_cols]

In [None]:
# # Normalizing the features
# scaler = MinMaxScaler()

# train[continous_cols] = scaler.fit_transform(train[continous_cols])
# test[continous_cols] = scaler.transform(test[continous_cols])

# Train base model
#### Good Blog for understanding ROC curve 
#### https://towardsdatascience.com/understanding-auc-roc-curve-68b2303cc9c5

In [None]:
def Stacker(model, model_name, train_data, test_data, fold):
    """Cross-validate ``model`` and return its stacking features.

    The model is refit on every stratified fold. For each fold, the rows held
    out of training receive that fold's predicted probability (out-of-fold
    predictions), so no training row is ever scored by a model that was fitted
    on it. Test-set probabilities are averaged over the folds.

    Reads the module-level feature list ``cols`` and expects ``train_data`` to
    contain those columns plus a ``'target'`` column; ``test_data`` must
    contain the ``cols`` columns.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict_proba``
    model_name : str
        ``'catb'`` selects the CatBoost-style ``fit`` call; any other value
        uses the call with ``early_stopping_rounds`` / ``eval_metric``.
    train_data, test_data : pandas.DataFrame
    fold : int
        Number of stratified folds.

    Returns
    -------
    (train_preds, test_preds) : tuple of numpy.ndarray
        Out-of-fold class-1 probabilities for every row of ``train_data`` and
        fold-averaged class-1 probabilities for every row of ``test_data``.
    """
    # Slice once instead of rebuilding the feature frame several times per fold,
    # and use the frame passed by the caller rather than the global `train`.
    features = train_data[cols]
    target = train_data['target']
    test_features = test_data[cols]

    test_preds = np.zeros(test_data.shape[0])
    train_preds = np.zeros(train_data.shape[0])

    kf = StratifiedKFold(n_splits=fold, random_state=48, shuffle=True)
    auc = []

    for n, (train_index, valid_index) in enumerate(kf.split(features, target)):

        X_train, X_valid = features.iloc[train_index], features.iloc[valid_index]
        y_train, y_valid = target.iloc[train_index], target.iloc[valid_index]

        if model_name == 'catb':
            model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], silent=True)
        else:
            # NOTE(review): newer LightGBM / XGBoost releases reportedly no longer
            # accept early_stopping_rounds / eval_metric in fit() - confirm against
            # the library versions installed in this environment.
            model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=100, eval_metric="auc", verbose=False)

        test_preds += model.predict_proba(test_features)[:, 1] / kf.n_splits

        # Out-of-fold prediction: each training row is filled exactly once, by the
        # model that did not train on it. The same array is reused for the fold AUC.
        valid_preds = model.predict_proba(X_valid)[:, 1]
        train_preds[valid_index] = valid_preds

        auc.append(roc_auc_score(y_valid, valid_preds))
        gc.collect()

        print(f"fold: {n+1}, auc: {auc[n]}")
    print("Average =", mean(auc))
    return train_preds, test_preds

In [None]:
# Base learners for the stack: LightGBM and XGBoost are configured with GPU
# options, CatBoost uses its constructor defaults
xgb_gpu_options = dict(tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')

lgbm = LGBMClassifier(device='gpu')
xgb = XGBClassifier(**xgb_gpu_options)
catb = CatBoostClassifier()

In [None]:
# Run the 5-fold cross-validation routine for the LightGBM model
lgbm_train, lgbm_test = Stacker(
    model=lgbm,
    model_name='lgbm',
    train_data=train,
    test_data=test,
    fold=5,
)

# The model object is not used again; drop it and reclaim memory
del lgbm
gc.collect()

# Write the LightGBM test predictions out as a standalone submission file
sample_submission['target'] = lgbm_test
sample_submission.to_csv('lgbm_test.csv', index=False)

In [None]:
# Run the 5-fold cross-validation routine for the CatBoost model
catb_train, catb_test = Stacker(
    model=catb,
    model_name='catb',
    train_data=train,
    test_data=test,
    fold=5,
)

# The model object is not used again; drop it and reclaim memory
del catb
gc.collect()

# Write the CatBoost test predictions out as a standalone submission file
sample_submission['target'] = catb_test
sample_submission.to_csv('catb_test.csv', index=False)

In [None]:
# Run the 5-fold cross-validation routine for the XGBoost model
xgb_train, xgb_test = Stacker(
    model=xgb,
    model_name='xgb',
    train_data=train,
    test_data=test,
    fold=5,
)

# The model object is not used again; drop it and reclaim memory
del xgb
gc.collect()

# Write the XGBoost test predictions out as a standalone submission file
sample_submission['target'] = xgb_test
sample_submission.to_csv('xgb_test.csv', index=False)

In [None]:
# Column order of the meta-features: CatBoost, LightGBM, XGBoost
stack_columns = ['catb', 'lgbm', 'xgb']

# Turn each 1-D prediction vector into a column and join the columns side by side
stack_train = np.concatenate(
    [preds[:, np.newaxis] for preds in (catb_train, lgbm_train, xgb_train)],
    axis=1,
)
stack_test = np.concatenate(
    [preds[:, np.newaxis] for preds in (catb_test, lgbm_test, xgb_test)],
    axis=1,
)

# The individual vectors now live inside the stacked arrays; release them
del catb_train, lgbm_train, xgb_train, catb_test, lgbm_test, xgb_test
gc.collect()

# Wrap the stacked arrays in labelled dataframes
stack_train = pd.DataFrame(stack_train, columns=stack_columns)
stack_test = pd.DataFrame(stack_test, columns=stack_columns)

In [None]:
# Persist both meta-feature frames, row index included
for stack_frame, stack_path in ((stack_train, 'stack_train.csv'), (stack_test, 'stack_test.csv')):
    stack_frame.to_csv(stack_path, index=True)

#### Create another dataframe whose input columns are the outputs of the three classifiers above, keeping the same target `y`, and train a final model on it. This idea is called stacking. You can see the increase in the ROC AUC score.

In [None]:
# Second-level (meta) model: a logistic regression fitted on the three base-model
# prediction columns, evaluated with 5-fold stratified cross-validation.
y = train['target'].copy()

from sklearn.linear_model import LogisticRegression

train_preds = np.zeros(stack_train.shape[0])
test_preds = np.zeros(stack_test.shape[0])

kf = StratifiedKFold(n_splits=5, random_state=2021, shuffle=True)
auc = []

for n, (train_index, valid_index) in enumerate(kf.split(stack_train, y)):

    X_train, X_valid = stack_train.iloc[train_index], stack_train.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    # A fresh estimator per fold; no instance is created outside the loop
    # because it would be overwritten here before ever being used.
    lr = LogisticRegression(random_state=42)
    lr.fit(X_train, y_train)

    # Out-of-fold predictions: each training row is scored only by the fold
    # model that did not see it. The same array is reused for the fold AUC.
    valid_preds = lr.predict_proba(X_valid)[:, 1]
    train_preds[valid_index] = valid_preds

    # Test predictions are averaged over the folds
    test_preds += lr.predict_proba(stack_test)[:, 1] / kf.n_splits

    auc.append(roc_auc_score(y_valid, valid_preds))
    gc.collect()

    print(f"fold: {n+1}, auc: {auc[n]}")

# Summary line in the same format as the base-model cross-validation
print("Average =", mean(auc))

In [None]:
# Replace the target column with the meta-model's fold-averaged test
# predictions and write the submission file
sample_submission = sample_submission.assign(target=test_preds)
sample_submission.to_csv('submission.csv', index=False)

# Weighted average

In [None]:
# Blend weights per base model, in the order the terms are added together
blend_weights = {'lgbm': 1, 'xgb': 2, 'catb': 3}

# Weighted sum of the base-model test predictions, divided by the weight total (6)
weighted_total = sum(stack_test[model_key] * weight for model_key, weight in blend_weights.items())
sample_submission = sample_submission.assign(target=weighted_total / sum(blend_weights.values()))
sample_submission.to_csv('average.csv', index=False)

### If you like my notebook, please upvote it and share it. If you see any mistakes, you are always welcome to point them out in the comment section.
### I am also a beginner with a curiosity for learning new ideas. We will learn together.

# Thank you!