# Tabular Playground Series - Sep 2021

![](https://storage.googleapis.com/kaggle-competitions/kaggle/28009/logos/header.png?)

In [None]:
import warnings
# NOTE(review): the second positional argument of `filterwarnings` is the
# `message` regex (matched against the warning text), not the warning
# category, so this presumably does not silence pandas'
# SettingWithCopyWarning -- confirm and switch to `category=` if intended.
warnings.filterwarnings('ignore', 'SettingWithCopyWarning')

## Imports

In [None]:
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import datatable as dt
import scipy.stats as stats
import statsmodels.api as sm

from lightgbm import LGBMClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn import metrics, model_selection
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import QuantileTransformer
from sklearn.svm import LinearSVC
from sklearn.inspection import permutation_importance
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.preprocessing import StandardScaler,RobustScaler
from sklearn.ensemble import StackingClassifier

from eli5.sklearn import PermutationImportance
from IPython.display import display, Markdown, Latex

## Configuration

In [None]:
# seaborn
# NOTE: `sns.set` rewrites matplotlib's rc parameters (including the font
# sizes), so it has to run *before* the explicit matplotlib overrides below;
# in the opposite order the `plt.rc` settings are silently discarded.
sns.set(style="whitegrid", font_scale=1.2)

# matplotlib (applied last so these values win)
plt.rc('font', size=15)
plt.rc('axes', titlesize=18)
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)

In [None]:
class Config:
    """Central place for paths, column names and constants of the notebook."""

    # seed shared by every randomised step
    RANDOM_STATE = 2021

    # competition files; all of them live in the Sep 2021 dataset
    TRAIN_DATA = '../input/tabular-playground-series-sep-2021/train.csv'
    TEST_DATA = '../input/tabular-playground-series-sep-2021/test.csv'
    # was pointing at the Aug 2021 competition, whose sample submission does
    # not belong to the train/test files above
    SUBMISSION = '../input/tabular-playground-series-sep-2021/sample_submission.csv'
    SUBMISSION_FILE = 'submission.csv'

    # fraction of the training data held out for validation
    TEST_SIZE = 0.3
    SAMPLE_FRAC = 0.02

    # column layout of the competition data
    INDEX = 'id'
    TARGET = 'claim'
    FEATURES = ['f{}'.format(i) for i in range(1, 119)]
    COLUMNS = FEATURES + [TARGET]

    @staticmethod
    def set_seed():
        """Seed Python's and NumPy's global random number generators."""
        random.seed(Config.RANDOM_STATE)
        np.random.seed(Config.RANDOM_STATE)


Config.set_seed()

## Import Data

In [None]:
%%time

train_data = pd.read_csv(Config.TRAIN_DATA).set_index(Config.INDEX)
train_data

In [None]:
%%time

test_data = pd.read_csv(Config.TEST_DATA).set_index(Config.INDEX)
test_data

In [None]:
# Print the memory footprint (in MB) of both frames before downcasting.
for template, frame in (
    ('Memory (train): {:.2f} MB', train_data),
    ('Memory (test) : {:.2f} MB', test_data),
):
    memory_usage = frame.memory_usage(deep=True) / 1024 ** 2
    print(template.format(memory_usage.sum()))

In [None]:
# reduce memory usage: store every feature column as 32-bit float
for frame in (train_data, test_data):
    frame[Config.FEATURES] = frame[Config.FEATURES].astype(np.float32)

In [None]:
# Print the memory footprint (in MB) of both frames after downcasting.
report = [
    ('Memory (train): {:.2f} MB', train_data),
    ('Memory (test) : {:.2f} MB', test_data),
]
for template, frame in report:
    memory_usage = frame.memory_usage(deep=True) / 1024 ** 2
    print(template.format(memory_usage.sum()))

## Exploratory data analysis (EDA)

In [None]:
# One row per column of the training frame, showing its dtype.
dtype_columns = {
    'feature': train_data.columns,
    'dtype': train_data.dtypes,
}
dtypes = pd.DataFrame(dtype_columns).set_index('feature')

dtypes

In [None]:
# Summary statistics per column; the mean is drawn as an in-cell bar and the
# median is shaded with the 'Blues' colormap.
# `Styler.bar` expects a CSS colour (not a colormap name); the previous value
# 'Bules' was a typo and not a valid colour.
train_data[Config.COLUMNS].describe().T.style.bar(
    subset=['mean'], color='lightblue'
).background_gradient(subset=['50%'], cmap='Blues')

## Target `claim`

In [None]:
# Class balance of the target column.
fig, ax = plt.subplots(1, 1, figsize=(7, 5))
sns.countplot(
    x=Config.TARGET,  # same column as before ('claim'), taken from Config
    palette='Blues_r',
    data=train_data,
    ax=ax,  # draw on the axes created above instead of the implicit current one
)

# `plt.show()` matches the other cells of this file; `fig.show()` is meant
# for GUI backends.
plt.show()

In [None]:
def plot_pdf(
    data:pd.DataFrame, 
    feature:str, 
    title='Histplot',
    bins=50,
    ax=None):
    """Plot a histogram of one column.

    Args:
        data: frame containing the column `feature`.
        feature: name of the column to plot.
        title: title of the axes.
        bins: number of histogram bins.
        ax: matplotlib axes to draw on; a new figure is created when omitted.

    Returns:
        The axes the histogram was drawn on.
    """
    # identity check: `== None` is the wrong comparison for arbitrary objects
    if ax is None:
        _, ax = plt.subplots(1, 1)

    # `palette` was dropped: without a `hue` variable seaborn ignores it
    sns.histplot(
        data=data[feature],
        bins=bins,
        shrink=.8,
        legend=True,
        ax=ax
    )

    ax.set_title(title)

    ax.set_xlabel('Feature {}'.format(feature))
    ax.set_ylabel('Count')

    return ax

In [None]:
def plot_boxplot(
    data:pd.DataFrame, 
    feature:str, 
    title='Boxplot',
    ax=None):
    """Plot the distribution of one column per target class as box plots.

    Args:
        data: frame containing `feature` and the target column.
        feature: name of the column shown on the y axis.
        title: title of the axes.
        ax: matplotlib axes to draw on; a new figure is created when omitted.

    Returns:
        The axes the box plot was drawn on.
    """
    if ax is None:
        _, ax = plt.subplots(1, 1)

    # The axes must be handed to seaborn explicitly; previously `ax` was
    # ignored and the plot landed on whatever axes happened to be current.
    sns.boxplot(
        x=Config.TARGET,
        y=feature,
        palette='Blues_r',
        data=data,
        ax=ax
    )

    ax.set_title(title)

    ax.set_xlabel('Target {}'.format(Config.TARGET))
    ax.set_ylabel('Feature {}'.format(feature))

    return ax

In [None]:
# Plot every feature on a 1% sample of the training data.
data = train_data.sample(frac=0.01, random_state=Config.RANDOM_STATE)

for feature in Config.FEATURES:
    display(Markdown('#### Plot feature `{}`'.format(feature)))

    fig, ax = plt.subplots(1, 2, figsize=(18, 5))

    # use the sample drawn above; the full frame was plotted before, which
    # left `data` unused in this loop
    plot_pdf(data, feature, ax=ax[0])
    plot_boxplot(data, feature, ax=ax[1])

    plt.show()

## Missing values

In [None]:
# Total number of missing cells in each data set.
n_missing_train = train_data.isna().sum().sum()
n_missing_test = test_data.isna().sum().sum()

pd.DataFrame({
    'data_set': ['train', 'test'],
    'missing_values': [n_missing_train, n_missing_test]
}).set_index('data_set')

In [None]:
# Show the sampled rows that contain at least one missing feature value.
# `.isna().index` returned every label of the sample (no filtering), and
# `.iloc` treated those labels as positions; select by mask and label instead.
has_missing = data[Config.FEATURES].isna().any(axis=1)
idx = data.index[has_missing]
train_data.loc[idx]

In [None]:
def add_na_count(data):
    """Add row-wise NaN count and standard deviation features.

    Args:
        data: frame containing the columns listed in `Config.FEATURES`.

    Returns:
        A copy of `data` with two extra columns: `nan_count`, the number of
        missing feature values per row, and `std_dev`, the standard deviation
        of the feature values per row.
    """
    df = data.copy()

    # compute both statistics from the raw features before adding columns
    features = df[Config.FEATURES]

    df['nan_count'] = features.isna().sum(axis=1)
    # std of the feature values; the std of the boolean NaN mask used before
    # looks like a copy-paste slip, since it is determined by the NaN count
    df['std_dev'] = features.std(axis=1)

    return df

train_data = FunctionTransformer(add_na_count).fit_transform(train_data)

In [None]:
# Preview the two columns added by `add_na_count`.
train_data.loc[:, ['nan_count', 'std_dev']]

In [None]:
# Compare the distribution of `nan_count` between the two target classes.
idx0 = train_data[train_data['claim'] == 0].index
idx1 = train_data[train_data['claim'] == 1].index

fig, ax = plt.subplots(1, 3, figsize=(20, 5))

plot_pdf(train_data.loc[idx0], 'nan_count', title='Histplot (claim = 0)', ax=ax[0])
plot_pdf(train_data.loc[idx1], 'nan_count', title='Histplot (claim = 1)', ax=ax[1])

# third panel; `ax[1]` was passed before, which is already used by the
# histogram of the positive class
plot_boxplot(train_data, 'nan_count', ax=ax[2])

plt.show()

## Correlation

In [None]:
# Pairwise Pearson correlation between all features and the target.
corr_matrix = train_data[Config.COLUMNS].corr(method='pearson')

In [None]:
# Heatmap of the lower triangle of the correlation matrix.
plt.figure(figsize = (20, 15))

# Boolean mask of the upper triangle (diagonal included). Deriving the mask
# from the correlation values themselves, as before, leaves a cell visible
# whenever its correlation is exactly zero.
upper_triangle = np.triu(np.ones_like(corr_matrix, dtype=bool))

sns.heatmap(
    corr_matrix,
    annot = False,
    cmap = 'Blues',
    mask = upper_triangle,
    linewidths = 0.1,
    linecolor = 'white',
    cbar = True
)

plt.show()

## Feature Engineering

In [None]:
def add_features(data):
    """Add row-wise summary statistics of the raw features.

    Args:
        data: frame containing the columns listed in `Config.FEATURES`.

    Returns:
        A copy of `data` with the extra float64 columns `med`, `mean`, `max`,
        `min`, `max2` / `min2` (max / min of the absolute values) and `skew`.
    """
    df = data.copy()

    # all statistics are computed from the raw features only
    features = df[Config.FEATURES]
    abs_features = features.abs()

    # `np.float` was removed from NumPy; `np.float64` is the type it aliased
    df['med'] = features.median(axis=1).astype(np.float64)
    df['mean'] = features.mean(axis=1).astype(np.float64)

    df['max'] = features.max(axis=1).astype(np.float64)
    df['min'] = features.min(axis=1).astype(np.float64)

    df['max2'] = abs_features.max(axis=1).astype(np.float64)
    df['min2'] = abs_features.min(axis=1).astype(np.float64)

    df['skew'] = features.skew(axis=1).astype(np.float64)

    return df

train_data = FunctionTransformer(add_features).fit_transform(train_data)

In [None]:
# Preview the columns added by `add_features`.
train_data.loc[:, ['med', 'mean', 'max', 'min', 'max2', 'min2', 'skew']]

## Principal component analysis (PCA)

In [None]:
# Fit a PCA on a 20% sample of the training data.
data = train_data.sample(frac=0.2, random_state=Config.RANDOM_STATE)
n_components = 35

pca = make_pipeline(
    FunctionTransformer(add_na_count),
    FunctionTransformer(add_features),
    SimpleImputer(strategy='mean'),
    # seeded so the internal subsampling of the transformer is reproducible
    QuantileTransformer(output_distribution='normal',
                        random_state=Config.RANDOM_STATE),
    RobustScaler(),

    PCA(n_components=n_components,
        random_state=Config.RANDOM_STATE)
)

pca_cols = ['pc{}'.format(i) for i in range(1, n_components + 1)]

# The target must not be part of the PCA input: it would leak into the
# components that are later coloured by that same target.
components = pca.fit_transform(data.drop(columns=[Config.TARGET]))

In [None]:
# Collect the target and one column per principal component in one frame.
pca_data = pd.DataFrame({Config.TARGET: data[Config.TARGET]})

for column_name, scores in zip(pca_cols, components.T):
    pca_data[column_name] = scores

In [None]:
# Explained variance per component (left) and cumulative percentage (right).
variance = pca['pca'].explained_variance_ratio_
var = np.cumsum(np.round(variance, decimals=3) * 100)

# 1-based component numbers so the x axis matches its "# of Components" label
# (plotting the arrays alone starts the axis at 0)
component_numbers = np.arange(1, len(variance) + 1)

fig, ax = plt.subplots(1, 2, figsize=(12, 5))

ax[0].plot(component_numbers, variance)
ax[0].set_xlabel('# of Components')
ax[0].set_ylabel('Explained variance')
ax[0].set_title("PCA Analysis")

ax[1].plot(component_numbers, var)
ax[1].set_ylabel('% Variance Explained')
ax[1].set_xlabel('# of Components')

fig.tight_layout()
# `plt.show()` matches the other cells; `fig.show()` is meant for GUI backends
plt.show()

In [None]:
def plot_pca(data, x, y, ax=None):
    """Scatter two columns of `data` against each other, coloured by target.

    Args:
        data: frame containing the columns `x`, `y` and the target column.
        x: name of the column shown on the x axis.
        y: name of the column shown on the y axis.
        ax: matplotlib axes to draw on; a new figure is created when omitted.

    Returns:
        The axes the scatter plot was drawn on (previously nothing was
        returned; callers that ignore the result are unaffected).
    """
    if ax is None:
        _, ax = plt.subplots(1, 1)

    sns.scatterplot(
        data=data,
        x=x,
        y=y,
        hue=Config.TARGET,
        palette=sns.color_palette(['red', 'blue']),
        alpha=0.3,
        ax=ax)

    return ax

In [None]:
# Scatter plots of consecutive principal component pairs, one per panel.
fig, ax = plt.subplots(1, 4, figsize=(25, 5))

component_pairs = [
    ('pc1', 'pc2'),
    ('pc2', 'pc3'),
    ('pc3', 'pc4'),
    ('pc4', 'pc5'),
]
for panel, (x_col, y_col) in zip(ax, component_pairs):
    plot_pca(pca_data, x_col, y_col, ax=panel)

plt.show()

## Baseline model

In [None]:
# Shuffle the training rows once with a fixed seed.
shuffled = train_data.sample(frac=1, random_state=Config.RANDOM_STATE)
idx = shuffled.index

# Take the columns from the shuffled frame directly. `train_data.iloc[idx]`
# used the index *labels* as positions, which is only correct when the ids
# happen to be 0..n-1.
X_data = shuffled[Config.FEATURES]
y_data = shuffled[Config.TARGET]

In [None]:
# split data into train and validation data sets
split_parts = train_test_split(
    X_data, y_data,
    test_size=Config.TEST_SIZE,
    random_state=Config.RANDOM_STATE,
)
X_train, X_val, y_train, y_val = split_parts

In [None]:
# Report the number of rows in the training and validation partitions.
print(f'train size: {X_train.shape[0]} rows')
print(f'val size  : {X_val.shape[0]} rows')

In [None]:
def create_baseline_model():
    """Build the baseline model: preprocessing plus a stacked classifier.

    The pipeline adds the engineered features (`add_na_count`,
    `add_features`), imputes missing values with the column mean, applies a
    quantile transform and a robust scaler, and feeds the result to a
    `StackingClassifier` (LightGBM, SGD, logistic regression and ridge as
    base estimators, logistic regression as final estimator).

    Returns:
        The unfitted scikit-learn pipeline.
    """
    # Every seeded component uses Config.RANDOM_STATE (LightGBM used a
    # hard-coded 42 and the others were unseeded), so one constant controls
    # reproducibility.
    estimators = [
        ('lgbm', LGBMClassifier(
            max_depth = 3,
            num_leaves = 7,
            n_estimators = 2000,
            colsample_bytree = 0.3,
            # NOTE(review): presumably has no effect unless `subsample_freq`
            # is also set -- verify against the LightGBM documentation
            subsample = 0.5,
            random_state = Config.RANDOM_STATE,
            reg_alpha=18,
            reg_lambda=17,
            learning_rate = 0.095,
            objective= 'binary')
        ),
        ('sgd', SGDClassifier(random_state=Config.RANDOM_STATE)),
        ('lr', LogisticRegression()),
        ('ridge', RidgeClassifier())
    ]

    model = make_pipeline(
        FunctionTransformer(add_na_count),
        FunctionTransformer(add_features),
        SimpleImputer(strategy='mean'),
        QuantileTransformer(
            output_distribution='normal',
            random_state=Config.RANDOM_STATE
        ),
        RobustScaler(),
        StackingClassifier(
            estimators=estimators,
            final_estimator=LogisticRegression()
        )
    )
    return model

model = create_baseline_model()

In [None]:
%%time

y_pred = model.fit(X_train, y_train).predict(X_val)
print('ROC: {}'.format(roc_auc_score(y_val, y_pred)))

In [None]:
# ROC curve of the fitted model on the validation partition.
fig, ax = plt.subplots(figsize=(8, 8))

# `plot_roc_curve` was removed from recent scikit-learn releases in favour of
# `RocCurveDisplay.from_estimator`; use the new API when it exists and fall
# back to the legacy helper otherwise.
if hasattr(metrics.RocCurveDisplay, 'from_estimator'):
    metrics.RocCurveDisplay.from_estimator(model, X_val, y_val, ax=ax)
else:
    metrics.plot_roc_curve(model, X_val, y_val, ax=ax)

plt.show()

## Confusion Matrix

In [None]:
# Row-normalised confusion matrix of the fitted model on the validation data.
# `plot_confusion_matrix` was removed from recent scikit-learn releases in
# favour of `ConfusionMatrixDisplay.from_estimator`; use the new API when it
# exists and fall back to the legacy helper otherwise.
if hasattr(metrics.ConfusionMatrixDisplay, 'from_estimator'):
    metrics.ConfusionMatrixDisplay.from_estimator(
        model,
        X_val,
        y_val,
        cmap=plt.cm.Blues,
        normalize='true')
else:
    plot_confusion_matrix(
        model,
        X_val,
        y_val,
        cmap=plt.cm.Blues,
        normalize='true')

plt.show()

## Permutation Importance

In [None]:
def plot_feature_importances(
    feature_imp,
    feature_names,
    num=20,
    ax=None,
    title="Permutation Importance of each feature"):
    """Plot the `num` largest importance values as a horizontal bar chart.

    Args:
        feature_imp: importance value per feature.
        feature_names: feature names, in the same order as `feature_imp`.
        num: number of top features to show.
        ax: matplotlib axes to draw on; a new figure is created when omitted.
        title: title of the axes (defaults to the previously hard-coded text).

    Returns:
        The axes the bar chart was drawn on.
    """
    if ax is None:
        _, ax = plt.subplots(1, 1)

    # keep only the `num` most important features, largest first
    df = pd.DataFrame({
        'feature': feature_names,
        'value': feature_imp
    }).sort_values('value', ascending=False).head(num)

    sns.barplot(
        x='value',
        y='feature',
        palette='Blues_r',
        data=df,
        ax=ax
    )

    ax.set_title(title)
    ax.set_ylabel("Features")

    return ax

In [None]:
# Importances come from the first fitted base estimator of the stack; the
# names are the raw features followed by the columns added in the pipeline.
stacking_step = model['stackingclassifier']
first_base_estimator = stacking_step.estimators_[0]
feature_imp = first_base_estimator.feature_importances_

engineered_names = ['nan_count', 'std_dev', 'med', 'mean', 'max', 'min', 'max2', 'min2', 'skew']
feature_names = Config.FEATURES + engineered_names

In [None]:
# Bar chart of the 30 largest importance values.
fig, ax = plt.subplots(figsize=(12, 8))
plot_feature_importances(feature_imp, feature_names, num=30, ax=ax)

# The values plotted here are the `feature_importances_` of the LightGBM base
# estimator, not permutation importances, so the default title is replaced.
ax.set_title("LightGBM feature importance of each feature")

fig.tight_layout()
plt.show()

## Submission

In [None]:
# Predicted probability (second column of `predict_proba`) for each test row.
test_probabilities = model.predict_proba(test_data[Config.FEATURES])
y_pred_submission = test_probabilities[:, 1]

In [None]:
# Submission frame: one predicted value per test id, indexed by id.
submission_columns = {
    Config.INDEX: test_data.index,
    Config.TARGET: y_pred_submission,
}
submission_data = pd.DataFrame(submission_columns).set_index(Config.INDEX)

submission_data

In [None]:
# write the submission frame (index included) to the configured CSV file
submission_data.to_csv(Config.SUBMISSION_FILE, index=True)