<div style="background-color:rgba(205, 29, 31, 0.5);">
    <h1><center>Importing Libraries and Data</center></h1>
</div>

**Please have a look at the key take-aways from this run, at the very end of the code.**

**They may save you some time when you try simple feature engineering on your end :)**

In [None]:
import random
random.seed(123)

import pandas as pd
import numpy as np
import datatable as dt
import warnings
warnings.filterwarnings("ignore")

# importing feature selection and processing packages

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import RobustScaler

# importing modelling packages

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

In [None]:
# Read the three competition CSVs with datatable (used for faster loading)
# and convert each one to a pandas DataFrame: train, test, sample submission.

train, test, sub = (
    dt.fread(csv_path).to_pandas()
    for csv_path in (
        r'../input/tabular-playground-series-oct-2021/train.csv',
        r'../input/tabular-playground-series-oct-2021/test.csv',
        r'../input/tabular-playground-series-oct-2021/sample_submission.csv',
    )
)

<div style="background-color:rgba(205, 29, 31, 0.5);">
    <h1><center>Reducing Memory Usage</center></h1>
</div>

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    Integer columns (int16/int32/int64) are converted to the narrowest of
    int8/int16/int32/int64 whose range contains the column's min and max.
    Float columns (float16/float32/float64) become float16 when their range
    fits, otherwise float32. Columns of any other dtype (e.g. bool, object)
    are left untouched.

    The frame is modified IN PLACE and the same object is returned, so
    ``reduce_mem_usage(df) is df``.

    Note: the float check only looks at the value range, not at precision;
    float16 keeps roughly three significant decimal digits, so values are
    rounded when a column is downcast to it.

    Args:
        df: pandas DataFrame to shrink.
        verbose: when True, print the final memory footprint and the
            percentage saved.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Tried in order, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float16_limits = np.finfo(np.float16)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column spanning exactly -128..127 does
                # fit in int8 (the strict comparison used to reject it).
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No candidate matches when min/max are NaN (empty column);
            # the column is then left as it is.
        else:
            if c_min >= float16_limits.min and c_max <= float16_limits.max:
                df[col] = df[col].astype(np.float16)
            else:
                df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero starting footprint to avoid ZeroDivisionError.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

<div style="background-color:rgba(205, 29, 31, 0.5);">
    <h1><center>Data Split and Feature Creation</center></h1>
</div>

In [None]:
# Take independent copies of the loaded frames to work on.
train_data, test_data = train.copy(), test.copy()

In [None]:
# Print the column dtypes and memory footprint of the training copy
# (this runs before reduce_mem_usage() has been applied).
train_data.info()

In [None]:
# splitting data into float and boolean
#
# NOTE: this cell runs before reduce_mem_usage() is called, so on a
# top-to-bottom run the float columns are typically still float64 here.
# Selecting only 'float16' would then return an empty frame and
# columns_to_use would silently shrink to just 'f22'. np.floating matches
# float16, float32 and float64 alike, so the selection works either way.

train_data_boolean = train_data.select_dtypes(include='bool')
test_data_boolean = test_data.select_dtypes(include='bool')

train_data_float = train_data.select_dtypes(include=np.floating)
test_data_float = test_data.select_dtypes(include=np.floating)

In [None]:
# 'ones' = per-row count of True values across the boolean columns,
# leaving out f22 (and, on the training side, the target).
# NOTE(review): train_data / test_data are re-created from train_1 / test_1
# in a later cell, so as the notebook stands this column is discarded
# before modelling.

train_data['ones'] = train_data_boolean.drop(columns=['f22', 'target']).sum(axis=1)
test_data['ones'] = test_data_boolean.drop(columns='f22').sum(axis=1)

In [None]:
# reducing memory
# reduce_mem_usage() downcasts its argument in place and returns that same
# object, so train_1 / test_1 refer to the (now smaller) train / test frames.

train_1, test_1 = (reduce_mem_usage(frame) for frame in (train, test))

In [None]:
# memory reduced - float columns are downcast to float16 when their value
# range fits (float32 otherwise); integer columns shrink to the narrowest
# integer type that holds their min and max

train_1.info()

In [None]:
# Rebuild the working frames from the memory-reduced data without the 'id'
# column. DataFrame.drop returns a new frame, so train_1 / test_1 keep 'id'.

train_data = train_1.drop(columns='id')
test_data = test_1.drop(columns='id')

In [None]:
# Separate the label from the features.

y = train_data['target']  # the target variable
X = train_data.drop(columns='target')

In [None]:
# Keep the float features plus the single boolean column f22; every other
# boolean column is dropped ('ones' is not part of this selection).

columns_to_use = [*train_data_float.columns, 'f22']
#columns_to_use = ['f179','f22'] # keeping only decently correlated variables

X = X.loc[:, columns_to_use]
test_data = test_data.loc[:, columns_to_use]

In [None]:
# Preview the first rows of the selected feature matrix.
X.head()

In [None]:
# scaling data for a faster run - disabled for now because of a memory problem

#rs = RobustScaler()
#X = rs.fit_transform(X)
#test_data = rs.transform(test_data)

<div style="background-color:rgba(205, 29, 31, 0.5);">
    <h1><center>Baseline CatBoost</center></h1>
</div>

In [None]:
# 10-fold stratified cross-validation. Each fold fits a fresh CatBoost model
# (GPU, otherwise default settings), stores its out-of-fold probabilities in
# cat_oof and adds 1/n_splits of its test prediction to predictions_cb, so
# predictions_cb ends up as the average over all folds.
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

cat_oof = np.zeros(len(X))
predictions_cb = np.zeros(test_data.shape[0])

for fold, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    print(f"Fold: {fold+1}")
    X_train = X.iloc[trn_idx]
    y_train = y.iloc[trn_idx]
    X_val = X.iloc[val_idx]
    y_val = y.iloc[val_idx]

    model_cb = CatBoostClassifier(task_type='GPU', verbose=0)
    model_cb.fit(X_train, y_train)

    # Column 1 of predict_proba: probability of the second class
    # (presumably target == 1 - confirm against model_cb.classes_).
    pred_cb = model_cb.predict_proba(X_val)[:, 1]
    cat_oof[val_idx] = pred_cb
    print('ROC: ', roc_auc_score(y_val, pred_cb))

    print("-" * 50)

    # Equal-weight contribution of this fold to the test prediction.
    predictions_cb += model_cb.predict_proba(test_data)[:, 1] / folds.n_splits

In [None]:
# Put the fold-averaged CatBoost predictions into the sample-submission
# frame and write it out without the index column.

sub['target'] = predictions_cb
sub.to_csv(path_or_buf='baseline_cb.csv', index=False)

<div style="background-color:rgba(205, 29, 31, 0.5);">
    <h1><center>Take-aways</center></h1>
</div>

**Results Summary**

1. Scored 0.85294 with no added features
2. Scored 0.85303 with 'ones' added alongside all the other features - **BEST**
3. Scored 0.85177 when only f22 and 'ones' were kept among the binary variables
4. Scored 0.85111 when only f22 was kept among the binary variables - **WORST**

**What next?**

1. Will try LGBM and XGBoost
2. May do ensembling (weighted or power averaging)
3. Will try stacking with Log Regression or a Ridge Classifier as my meta-model
4. Will tune the best 2-3 models I get, using Optuna
5. I may have a look at GAMs - Please share any references you have :)