In [None]:
import gc
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psutil
from lightgbm import LGBMClassifier, plot_importance
from optuna.integration import lightgbm as lgb
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import RobustScaler

In [None]:
# Load the training table, the test table and the sample-submission template.
train, test, sample = map(
    pd.read_csv,
    (
        "../input/tabular-playground-series-oct-2021/train.csv",
        "../input/tabular-playground-series-oct-2021/test.csv",
        "../input/tabular-playground-series-oct-2021/sample_submission.csv",
    ),
)

In [None]:
# Preview the first five rows of the training table.
train.head(5)

To make the code more efficient, we should reduce memory usage...

In [None]:
def cpu_stats():
    """Return the current process's memory use as a ``'memory GB:<value>'`` string.

    The value is the first field of ``psutil.Process(...).memory_info()`` for
    this process, divided by ``2**30`` and rounded to two decimals.
    """
    process = psutil.Process(os.getpid())
    gib_in_use = process.memory_info()[0] / 2. ** 30
    return 'memory GB:' + str(np.round(gib_in_use, 2))

def reduce_memory_usage(df, verbose=True, allow_float16=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    The frame is modified in place and also returned, so both
    ``reduce_memory_usage(df)`` and ``df = reduce_memory_usage(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``int8/16/32/64`` and ``float16/32/64`` columns are
        re-typed. Columns of any other dtype are left untouched.
    verbose : bool, default True
        Print the final memory footprint and the percentage saved.
    allow_float16 : bool, default True
        When True (the historical behaviour) float columns whose range fits
        are stored as ``float16``. Half precision keeps only about three
        significant decimal digits, so pass False to make ``float32`` the
        narrowest float type used.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    # Candidate dtypes, narrowest first; the first one whose range contains
    # the column's [min, max] wins.
    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32) if allow_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == "int":
            # Bounds are inclusive: a column spanning exactly [-128, 127]
            # is representable as int8 and must not be pushed to int16.
            target = next(
                (
                    t
                    for t in int_types
                    if np.iinfo(t).min <= c_min and c_max <= np.iinfo(t).max
                ),
                None,
            )
            if target is None:
                # min/max are NaN (e.g. an empty column): nothing to decide on.
                continue
        else:
            # Infinite or NaN-only columns fit no candidate and stay float64.
            target = next(
                (
                    t
                    for t in float_types
                    if np.finfo(t).min <= c_min and c_max <= np.finfo(t).max
                ),
                np.float64,
            )
        df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes does not divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

# Downcast both tables (train first, then test), then report process memory.
train, test = (
    reduce_memory_usage(frame, verbose=True) for frame in (train, test)
)
print(cpu_stats())
print('Memory reduced')

Identify the categorical and continuous features of the dataset.

In [None]:
# Split the feature columns (everything except the id and the target) by
# dtype family: integer columns are treated as categorical, floating-point
# columns as numerical. Matching on the family rather than on the exact
# 'int8' / 'float16' dtypes means a feature stored in a wider dtype is still
# classified instead of silently missing from both lists.
features = []
categorical = []
numerical = []
for feature in train.columns:
    if feature in ('id', 'target'):
        continue
    features.append(feature)
    feature_dtype = train[feature].dtype
    if pd.api.types.is_integer_dtype(feature_dtype):
        categorical.append(feature)
    elif pd.api.types.is_float_dtype(feature_dtype):
        numerical.append(feature)
print("Size of train dataframe",train.shape)
print("Total number of categorical features is ", len(categorical))
print("Total number of numerical features is", len(numerical))

In [None]:
# Number of rows per distinct value of the target column.
train.loc[:, "target"].value_counts()

Check if there are any missing values...

In [None]:
# Missing-value count per column of the training table.
train.isna().sum()

In [None]:
# Missing-value count per column of the test table.
test.isna().sum()

In [None]:
# Separate the label from the predictors; the id column carries no signal
# for the model and is dropped together with the target.
y = train.loc[:, "target"]
train = train.drop(["target", "id"], axis=1)

RobustScaler scales features using statistics that are robust to outliers. It removes the median and scales the data according to a quantile range (by default the IQR, the interquartile range). The IQR is the range between the 1st quartile (25th percentile) and the 3rd quartile (75th percentile).

Robust scaler is used here for non-categorical columns...

In [None]:
# Learn the scaling statistics from the training features only, then apply
# that same fitted scaler to both tables so test is scaled consistently with
# train. (RobustScaler is imported with the other imports at the top.)
scaler = RobustScaler().fit(train[numerical])
train[numerical] = scaler.transform(train[numerical])
test[numerical] = scaler.transform(test[numerical])

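We will keep only the most informative features and drop the others: each feature is scored with SelectKBest (using the `f_classif` ANOVA F-test), and a feature is kept only if its p-value is at or below a chosen threshold.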

In [None]:
def SelectKBestFeatures(features, target, threshold):
    """Keep the columns of ``features`` whose F-test p-value is <= ``threshold``.

    ``SelectKBest`` is fitted with ``f_classif`` and ``k="all"`` purely to
    obtain one p-value per column; the selection itself is the p-value cut.

    Parameters
    ----------
    features : pandas.DataFrame
        Candidate feature columns.
    target : pandas.Series or pandas.DataFrame
        Class labels, one per row of ``features``.
    threshold : float
        Largest p-value for which a feature is kept. A NaN p-value never
        satisfies the comparison, so such a feature is dropped.

    Returns
    -------
    (pandas.DataFrame, list)
        The selected columns of ``features`` (original column order and
        original row index preserved) and the list of their names.
    """
    kbest = SelectKBest(score_func=f_classif, k="all")
    kbest.fit(features, target.values.ravel())
    print('Before the SelectKBest =',features.shape)

    selected_features = [
        column
        for column, p_value in zip(features.columns, kbest.pvalues_)
        if p_value <= threshold
    ]

    # Slice the input frame instead of rebuilding one from a bare ndarray:
    # this keeps the caller's row index, so the result still aligns with the
    # target and with other frames when concatenated column-wise.
    X_selected = features[selected_features]

    print('After the SelectKBest = ', X_selected.shape)

    return X_selected, selected_features


In [None]:
# p-value cut-off for feature selection: a feature is kept only if its
# F-test p-value is at or below this value. (SelectKBest / f_classif are
# already imported with the other imports at the top.)
p_feature = 0.0001
train_numerical, selected_numerical = SelectKBestFeatures(train[numerical], y, p_feature)

In [None]:
# Same p-value screening, applied to the categorical columns.
train_categorical, selected_categorical = SelectKBestFeatures(
    features=train[categorical],
    target=y,
    threshold=p_feature,
)

In [None]:
# Final design matrix: selected numerical columns followed by the selected
# categorical ones; the test table is cut down to the same columns in the
# same order.
X = pd.concat((train_numerical, train_categorical), axis="columns")
cols = [*selected_numerical, *selected_categorical]
test = test.loc[:, cols]
Y = y
print("Shape of Final X ", X.shape)
print("Shape of final Y ", Y.shape)

In [None]:
# Keyword arguments forwarded to LGBMClassifier for every CV fold.
# NOTE(review): per the LightGBM docs, `subsample` (bagging_fraction) only
# takes effect when `subsample_freq` > 0, which is not set here -- confirm
# whether row bagging was intended.
params = dict(
    reg_alpha=8.158768860412389,
    reg_lambda=8.793022151019823,
    colsample_bytree=0.2,
    subsample=0.4,
    learning_rate=0.02,
    max_depth=100,
    num_leaves=12,
    min_child_samples=68,
    cat_smooth=91,
    objective='binary',
    random_state=48,
    n_estimators=20000,
    n_jobs=-1,
)

StratifiedKFold - Provides train/test indices to split data in train/test sets.
This cross-validation object is a variation of KFold that returns stratified folds. The folds are made by preserving the percentage of samples for each class.


In [None]:
# Test-set probabilities, accumulated as the average over the CV folds.
preds = np.zeros(test.shape[0])

kf = StratifiedKFold(n_splits=5, random_state=20210, shuffle=True)

# Validation AUC of each fold, in fold order.
auc = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X, Y), start=1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = Y.iloc[train_idx], Y.iloc[val_idx]

    model = LGBMClassifier(**params)
    # `verbose` must be the boolean False: the string "False" is a non-empty
    # (truthy) string and does not mean "silent".
    # NOTE(review): `early_stopping_rounds` / `verbose` are fit() keywords of
    # the LightGBM 3.x scikit-learn API; newer LightGBM releases expect the
    # early-stopping / logging callbacks instead -- confirm against the
    # installed version.
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=100,
        eval_metric="auc",
        verbose=False,
    )

    # Each fold contributes 1/n_splits of the final test prediction.
    preds += model.predict_proba(test)[:, 1] / kf.n_splits

    # The fold used for early stopping is also the one scored here, so the
    # reported AUC may be slightly optimistic.
    fold_auc = roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])
    auc.append(fold_auc)
    gc.collect()
    print(f"fold: {fold}, auc: {fold_auc}")

In [None]:
# Average validation AUC across the folds.
np.asarray(auc).mean()

In [None]:
# Feature importances of `model`, i.e. the classifier fitted in the last CV
# fold only. LightGBM's own plot_importance is called directly rather than
# through the optuna integration module, which is not the owner of this API.
plot_importance(model, max_num_features=40, figsize=(10, 10))
plt.show()

In [None]:
# Fill the submission template with the fold-averaged probabilities and
# write it out without the index column.
sample["target"] = preds
sample.to_csv(path_or_buf='submission.csv', index=False)