In [None]:
import numpy as np
import pandas as pd
import warnings
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import ticker
import seaborn as sns
import gc

# Notebook-wide display settings: show every row and column, print floats in fixed-point notation.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('float_format', '{:f}'.format)
# NOTE(review): this silences every warning for the whole session, including library deprecation notices.
warnings.filterwarnings('ignore')

In [None]:
import missingno as msno
%matplotlib inline

In [None]:
# Read the competition files; the `id` column is discarded from train and test right after loading.
train = pd.read_csv('../input/tabular-playground-series-nov-2021/train.csv').drop(columns='id')
test = pd.read_csv('../input/tabular-playground-series-nov-2021/test.csv').drop(columns='id')
ss = pd.read_csv('../input/tabular-playground-series-nov-2021/sample_submission.csv')

In [None]:
# Preview the first 10 rows of the training frame.
train.head(10)

In [None]:
# (rows, columns) of the training and test frames.
train.shape, test.shape

Let's check whether there are any missing values.

In [None]:
# missingno matrix plot of the training frame, used here to look for missing values.
msno.matrix(train)

* We can clearly see that there are no missing values in the dataset.

In [None]:
# Summary statistics for columns f0..f99 (one row per feature) with the
# mean, std and median columns visually highlighted.
feature_summary = train.loc[:, 'f0':'f99'].describe().T
(
    feature_summary.style
    .bar(subset=['mean'], color='#206ff2')
    .background_gradient(subset=['std'], cmap='Reds')
    .background_gradient(subset=['50%'], cmap='coolwarm')
)


Let's check the distribution of **target** in training dataset.

In [None]:
# Count of rows per target class. The data is passed explicitly as `x=`:
# positional data arguments are deprecated/re-interpreted by newer seaborn releases.
sns.countplot(x=train['target'], palette='Set3')

This plot shows that the **target** variable is almost equally distributed between the two classes.

# Density plots of features

Here we show the distributions of **train** and **test** in different colors.

In [None]:
# Overlay the train and test density of each of the first 100 columns on a 10x10 grid.
features = train.columns.values[0:100]
sns.set_style('whitegrid')
# plt.subplots already creates the figure; an extra plt.figure() call would only
# emit an additional empty figure in the notebook output.
fig, ax = plt.subplots(10, 10, figsize=(18, 22))

# ax.ravel() walks the grid row by row, matching the 1-based plt.subplot index order.
for panel, feature in zip(ax.ravel(), features):
    sns.distplot(train[feature], hist=False, label='train', ax=panel)
    sns.distplot(test[feature], hist=False, label='test', ax=panel)
    panel.set_xlabel(feature, fontsize=9)
    panel.tick_params(axis='x', which='major', labelsize=6, pad=-6)
    panel.tick_params(axis='y', which='major', labelsize=6)
plt.show()

Train and test seem to have almost the same distribution.

# Distribution of mean and std

Let's check the distribution of the mean values per row in the train and test set.

In [None]:
# Per-row mean over the first 100 columns, train versus test.
features = train.columns.values[0:100]
plt.figure(figsize=(16, 6))
plt.title("Distribution of mean values per row in the train and test set")
for frame, shade, tag in ((train, "green", 'train'), (test, "blue", 'test')):
    sns.distplot(frame[features].mean(axis=1), color=shade, kde=True, bins=120, label=tag)
plt.legend()
plt.show()

Distribution of the **mean** values per **columns** in the train and test set.

In [None]:
# Per-column mean of the `features` columns, train versus test.
plt.figure(figsize=(16, 6))
plt.title("Distribution of mean values per column in the train and test set")
for frame, shade, tag in ((train, "magenta", 'train'), (test, "darkblue", 'test')):
    sns.distplot(frame[features].mean(axis=0), color=shade, kde=True, bins=120, label=tag)
plt.legend()
plt.show()

Distribution of **standard deviation** of values per **row** for train and test datasets.

In [None]:
# Per-row standard deviation of the `features` columns, train versus test.
plt.figure(figsize=(16, 6))
plt.title("Distribution of std values per row in the train and test set")
for frame, shade, tag in ((train, "black", 'train'), (test, "red", 'test')):
    sns.distplot(frame[features].std(axis=1), color=shade, kde=True, bins=120, label=tag)
plt.legend()
plt.show()

Distribution of the **standard deviation** of values per **columns** in the train and test datasets.

In [None]:
# Per-column standard deviation of the `features` columns, train versus test.
plt.figure(figsize=(16, 6))
plt.title("Distribution of std values per column in the train and test set")
for frame, shade, tag in ((train, "blue", 'train'), (test, "green", 'test')):
    sns.distplot(frame[features].std(axis=0), color=shade, kde=True, bins=120, label=tag)
plt.legend()
plt.show()

Distribution of the **mean** value per **row** in the train dataset, grouped by value of **target**.

In [None]:
# Split the training rows by target class, then compare the per-row mean of the `features` columns.
t0 = train.loc[train['target'].eq(0)]
t1 = train.loc[train['target'].eq(1)]
plt.figure(figsize=(16, 6))
plt.title("Distribution of mean values per row in the train set")
for frame, shade, tag in ((t0, "red", 'target = 0'), (t1, "blue", 'target = 1')):
    sns.distplot(frame[features].mean(axis=1), color=shade, kde=True, bins=120, label=tag)
plt.legend()
plt.show()

Distribution of the **mean** value per **column** in the train dataset, grouped by value of **target**.

In [None]:
# Per-column mean of the `features` columns for each target class (t0 / t1 from the previous cell).
plt.figure(figsize=(16, 6))
plt.title("Distribution of mean values per column in the train set")
for frame, shade, tag in ((t0, "green", 'target = 0'), (t1, "darkblue", 'target = 1')):
    sns.distplot(frame[features].mean(axis=0), color=shade, kde=True, bins=120, label=tag)
plt.legend()
plt.show()

# Distribution of min and max

Let's check the distribution of min per row in the train and test set.

In [None]:
# Per-row minimum over the first 100 columns, train versus test.
features = train.columns.values[0:100]
plt.figure(figsize=(16, 6))
plt.title("Distribution of min values per row in the train and test set")
for frame, shade, tag in ((train, "red", 'train'), (test, "orange", 'test')):
    sns.distplot(frame[features].min(axis=1), color=shade, kde=True, bins=120, label=tag)
plt.legend()
plt.show()

Distribution of **min** per **column** in the train and test set.

In [None]:
# Per-column minimum over the first 100 columns, train versus test.
features = train.columns.values[0:100]
plt.figure(figsize=(16, 6))
plt.title("Distribution of min values per column in the train and test set")
for frame, shade, tag in ((train, "magenta", 'train'), (test, "darkblue", 'test')):
    sns.distplot(frame[features].min(axis=0), color=shade, kde=True, bins=120, label=tag)
plt.legend()
plt.show()

Distribution of **max** values per **rows** for train and test set.

In [None]:
# Per-row maximum over the first 100 columns, train versus test.
features = train.columns.values[0:100]
plt.figure(figsize=(16, 6))
plt.title("Distribution of max values per row in the train and test set")
for frame, shade, tag in ((train, "brown", 'train'), (test, "yellow", 'test')):
    sns.distplot(frame[features].max(axis=1), color=shade, kde=True, bins=120, label=tag)
plt.legend()
plt.show()

Now showing the max distribution on columns for train and test set.

In [None]:
# Per-column maximum over the first 100 columns, train versus test.
features = train.columns.values[0:100]
plt.figure(figsize=(16, 6))
plt.title("Distribution of max values per column in the train and test set")
for frame, shade, tag in ((train, "blue", 'train'), (test, "red", 'test')):
    sns.distplot(frame[features].max(axis=0), color=shade, kde=True, bins=120, label=tag)
plt.legend()
plt.show()

Distributions of **min** values per **row** in train set, grouped by value of **target**.

In [None]:
# Split the training rows by target class, then compare the per-row minimum of the `features` columns.
t0 = train.loc[train['target'].eq(0)]
t1 = train.loc[train['target'].eq(1)]
plt.figure(figsize=(16, 6))
plt.title("Distribution of min values per row in the train set")
for frame, shade, tag in ((t0, "orange", 'target = 0'), (t1, "darkblue", 'target = 1')):
    sns.distplot(frame[features].min(axis=1), color=shade, kde=True, bins=120, label=tag)
plt.legend()
plt.show()

Distribution of **min** values per **columns** in train set, grouped by **target**.

In [None]:
# Per-column minimum of the `features` columns for each target class.
plt.figure(figsize=(16, 6))
plt.title("Distribution of min values per column in the train set")
for frame, shade, tag in ((t0, "red", 'target = 0'), (t1, "blue", 'target = 1')):
    sns.distplot(frame[features].min(axis=0), color=shade, kde=True, bins=120, label=tag)
plt.legend()
plt.show()

Distribution of **max** values per **row** in the train set, grouped by **target**.

In [None]:
# Per-row maximum of the `features` columns for each target class.
plt.figure(figsize=(16, 6))
plt.title("Distribution of max values per row in the train set")
for frame, shade, tag in ((t0, "gold", 'target = 0'), (t1, "darkblue", 'target = 1')):
    sns.distplot(frame[features].max(axis=1), color=shade, kde=True, bins=120, label=tag)
plt.legend()
plt.show()

Distribution of **max** values per **columns** in the train set, grouped by **target**.

In [None]:
# Per-column maximum of the `features` columns for each target class.
plt.figure(figsize=(16, 6))
plt.title("Distribution of max values per column in the train set")
for frame, shade, tag in ((t0, "red", 'target = 0'), (t1, "blue", 'target = 1')):
    sns.distplot(frame[features].max(axis=0), color=shade, kde=True, bins=120, label=tag)
plt.legend()
plt.show()

# Distribution of skew and kurtosis

Distribution of **skewness** calculated per **rows** in train and test sets.

In [None]:
# Per-row skewness of the `features` columns, train versus test.
plt.figure(figsize=(16, 6))
plt.title("Distribution of skew per row in the train and test set")
for frame, shade, tag in ((train, "red", 'train'), (test, "orange", 'test')):
    sns.distplot(frame[features].skew(axis=1), color=shade, kde=True, bins=120, label=tag)
plt.legend()
plt.show()

Distribution of **skewness** calculated per **columns** in train and test sets.

In [None]:
# Per-column skewness of the `features` columns, train versus test.
plt.figure(figsize=(16, 6))
plt.title("Distribution of skew per column in the train and test set")
for frame, shade, tag in ((train, "magenta", 'train'), (test, "darkblue", 'test')):
    sns.distplot(frame[features].skew(axis=0), color=shade, kde=True, bins=120, label=tag)
plt.legend()
plt.show()

Distribution of **kurtosis** calculated per **rows** in train and test sets.

In [None]:
# Per-row kurtosis of the `features` columns, train versus test.
plt.figure(figsize=(16, 6))
plt.title("Distribution of kurtosis per row in the train and test set")
for frame, shade, tag in ((train, "darkblue", 'train'), (test, "yellow", 'test')):
    sns.distplot(frame[features].kurtosis(axis=1), color=shade, kde=True, bins=120, label=tag)
plt.legend()
plt.show()

Distribution of **kurtosis** calculated per **columns** in train and test sets.

In [None]:
# Per-column kurtosis of the `features` columns, train versus test.
plt.figure(figsize=(16, 6))
plt.title("Distribution of kurtosis per column in the train and test set")
for frame, shade, tag in ((train, "magenta", 'train'), (test, "green", 'test')):
    sns.distplot(frame[features].kurtosis(axis=0), color=shade, kde=True, bins=120, label=tag)
plt.legend()
plt.show()

Distribution of **skewness** values per **row** in the train set, grouped by **target**.

In [None]:
# Split the training rows by target class, then compare the per-row skewness of the `features` columns.
t0 = train.loc[train['target'].eq(0)]
t1 = train.loc[train['target'].eq(1)]
plt.figure(figsize=(16, 6))
plt.title("Distribution of skew values per row in the train set")
for frame, shade, tag in ((t0, "red", 'target = 0'), (t1, "blue", 'target = 1')):
    sns.distplot(frame[features].skew(axis=1), color=shade, kde=True, bins=120, label=tag)
plt.legend()
plt.show()

Distribution of **skewness** values per **column** in the train set, grouped by **target**.

In [None]:
# Split the training rows by target class, then compare the per-column skewness of the `features` columns.
t0 = train.loc[train['target'].eq(0)]
t1 = train.loc[train['target'].eq(1)]
plt.figure(figsize=(16, 6))
plt.title("Distribution of skew values per column in the train set")
for frame, shade, tag in ((t0, "red", 'target = 0'), (t1, "blue", 'target = 1')):
    sns.distplot(frame[features].skew(axis=0), color=shade, kde=True, bins=120, label=tag)
plt.legend()
plt.show()

Distribution of **kurtosis** values per **row** in the train set, grouped by **target**.

In [None]:
# Split the training rows by target class, then compare the per-row kurtosis of the `features` columns.
t0 = train.loc[train['target'].eq(0)]
t1 = train.loc[train['target'].eq(1)]
plt.figure(figsize=(16, 6))
plt.title("Distribution of kurtosis values per row in the train set")
for frame, shade, tag in ((t0, "red", 'target = 0'), (t1, "blue", 'target = 1')):
    sns.distplot(frame[features].kurtosis(axis=1), color=shade, kde=True, bins=120, label=tag)
plt.legend()
plt.show()

Distribution of **kurtosis** values per **column** in the train set, grouped by **target**.

In [None]:
# Split the training rows by target class, then compare the per-column kurtosis of the `features` columns.
t0 = train.loc[train['target'].eq(0)]
t1 = train.loc[train['target'].eq(1)]
plt.figure(figsize=(16, 6))
plt.title("Distribution of kurtosis values per column in the train set")
for frame, shade, tag in ((t0, "red", 'target = 0'), (t1, "blue", 'target = 1')):
    sns.distplot(frame[features].kurtosis(axis=0), color=shade, kde=True, bins=120, label=tag)
plt.legend()
plt.show()

# Correlation

Below is a correlation heatmap of the first 25 features (f0-f24) and the target variable.

In [None]:
# Correlation heatmap of columns 0..24 plus the target; the upper triangle
# (including the diagonal) is masked so each pair is shown once.
columns = train.columns[0:25].to_list() + ['target']
corr = train[columns].corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)

f, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(corr, annot=True, mask=mask, cmap=cmap)

**f25-f49** and **target** variable.

In [None]:
# Correlation heatmap of columns 25..49 plus the target; the upper triangle
# (including the diagonal) is masked so each pair is shown once.
columns = train.columns[25:50].to_list() + ['target']
corr = train[columns].corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)

f, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(corr, annot=True, mask=mask, cmap=cmap)

**f50-f74** and **target** variable.

In [None]:
# Correlation heatmap of columns 50..74 plus the target; the upper triangle
# (including the diagonal) is masked so each pair is shown once.
columns = train.columns[50:75].to_list() + ['target']
corr = train[columns].corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)

f, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(corr, annot=True, mask=mask, cmap=cmap)

**f75-f99** and **target** variable.

In [None]:
# Correlation heatmap of columns 75..99 plus the target; the upper triangle
# (including the diagonal) is masked so each pair is shown once.
columns = train.columns[75:100].to_list() + ['target']
corr = train[columns].corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)

f, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(corr, annot=True, mask=mask, cmap=cmap)

Several features appear to share a similar correlation coefficient (about **0.11**) with the target variable.

# Feature Engineering

In [None]:
# Separate the predictors from the label and take working copies, so the
# feature-engineering step can add columns without touching the loaded frames.
X = train.drop('target', axis=1).copy()
y = train['target']
X_test = test.copy()

# Drop the original frames and actually run a collection pass.
# `gc.collect` without parentheses only references the function and frees nothing.
del train
del test
gc.collect();

In [None]:
def feature_eng(df):
    """Append row-wise summary statistics of the existing columns to ``df``.

    Adds the columns ``sum``, ``min``, ``max``, ``mean``, ``std``, ``skew``
    and ``kurt`` (in that order), each computed across the columns that are
    present in ``df`` when the function is called.

    All statistics are computed *before* any of them is written back.
    Assigning them one at a time would make every later statistic include
    the previously added columns (e.g. ``max`` would pick up ``sum``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame of numeric columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the seven statistic columns appended.
    """
    row_stats = {
        'sum': df.sum(axis=1),
        'min': df.min(axis=1),
        'max': df.max(axis=1),
        'mean': df.mean(axis=1),
        'std': df.std(axis=1),
        'skew': df.skew(axis=1),
        'kurt': df.kurtosis(axis=1),
    }
    # Write back only after every statistic has been derived from the original columns.
    for stat_name, stat_values in row_stats.items():
        df[stat_name] = stat_values
    return df

# Append the row-wise statistic columns to both the training features and the test set.
X = feature_eng(X)
X_test = feature_eng(X_test)

In [None]:
# Show the first rows of the engineered train and test frames, one after the other.
display(X.head(), X_test.head())

Let's check the distribution of these new engineered features.

In [None]:
def plot_new_feature_distribution(df1, df2, label1, label2, features):
    """Plot overlaid KDE curves of each feature for two data sets.

    One panel is drawn per entry of ``features`` on a grid with four columns
    (two rows for up to eight features, more rows when needed). Each panel
    shows the density of ``df1[feature]`` and ``df2[feature]`` with a legend.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames that both contain every column named in ``features``.
    label1, label2 : str
        Legend labels for the curves of ``df1`` and ``df2``.
    features : sequence of str
        Column names to plot.
    """
    sns.set_style('whitegrid')
    n_cols = 4
    # Ceil-divide to get enough rows, but keep the original 2x4 layout as the minimum.
    n_rows = max(2, -(-len(features) // n_cols))
    # plt.subplots creates the figure itself; a separate plt.figure() call
    # would only emit an extra empty figure.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 4 * n_rows))
    panels = axes.ravel()

    for panel, feature in zip(panels, features):
        # `bw_method` is the current name of the bandwidth argument; `bw` is deprecated.
        sns.kdeplot(df1[feature], bw_method=0.5, label=label1, ax=panel)
        sns.kdeplot(df2[feature], bw_method=0.5, label=label2, ax=panel)
        panel.set_xlabel(feature, fontsize=11)
        panel.tick_params(axis='x', which='major', labelsize=8)
        panel.tick_params(axis='y', which='major', labelsize=8)
        # Without a legend the two curves cannot be told apart.
        panel.legend(fontsize=8)

    # Hide grid cells that have no feature to show.
    for panel in panels[len(features):]:
        panel.set_visible(False)
    plt.show()

In [None]:
# Columns 100..106 of X — presumably the seven statistics appended by feature_eng — compared between train and test.
features = X.columns.values[100:107]
plot_new_feature_distribution(X, X_test, 'train', 'test', features)

# Model

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc

In [None]:
# Keyword arguments forwarded to XGBClassifier in the cross-validation cell.
params = dict(
    objective='binary:logistic',
    gpu_id=0,
    n_estimators=10000,
    learning_rate=0.01,
    gamma=0.25,
    max_depth=4,
    min_child_weight=366,
    subsample=0.64,
    colsample_bytree=0.78,
    colsample_bylevel=0.86,
    reg_lambda=0,
    reg_alpha=10,
)

In [None]:
%%time
# 5-fold stratified cross-validation: fit one XGBoost model per fold, score it
# on the held-out part with ROC AUC, and keep its test-set probabilities.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

preds = []             # one array of test-set probabilities per fold
scores = []            # validation AUC per fold
fold_importances = []  # one importance frame per fold, concatenated once after the loop

for fold, (idx_train, idx_valid) in enumerate(kf.split(X, y)):
    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

    model = XGBClassifier(**params,
                          booster='gbtree',
                          eval_metric='auc',
                          tree_method='gpu_hist',
                          predictor="gpu_predictor",
                          use_label_encoder=False)

    # Early stopping monitors AUC on the validation fold.
    model.fit(X_train, y_train,
              eval_set=[(X_valid, y_valid)],
              early_stopping_rounds=100,
              verbose=False)

    pred_valid = model.predict_proba(X_valid)[:, 1]
    fpr, tpr, _ = roc_curve(y_valid, pred_valid)
    score = auc(fpr, tpr)
    scores.append(score)

    fold_importance_df = pd.DataFrame({
        "Feature": X.columns,
        "importance": model.feature_importances_,
        "fold": fold + 1,
    })
    fold_importances.append(fold_importance_df)

    print(f"Fold: {fold + 1} Score: {score}")
    print('||'*40)

    test_preds = model.predict_proba(X_test)[:, 1]
    preds.append(test_preds)

# Build the combined frame in one step instead of re-concatenating inside the loop.
feature_importance_df = pd.concat(fold_importances, axis=0)

print(f"Overall Validation Score: {np.mean(scores)}")

In [None]:
# Rank features by their mean importance across folds and keep the top 107.
mean_importance = (
    feature_importance_df[["Feature", "importance"]]
    .groupby("Feature")
    .mean()
    .sort_values(by="importance", ascending=False)
)
cols = mean_importance[:107].index
best_features = feature_importance_df.loc[feature_importance_df.Feature.isin(cols)]
ranked_importances = best_features.sort_values(by="importance", ascending=False)

# Horizontal bar chart of the per-fold importances, also saved to FI.png.
plt.figure(figsize=(14, 28))
sns.barplot(x="importance", y="Feature", data=ranked_importances)
plt.title('Features importance (averaged/folds)')
plt.tight_layout()
plt.savefig('FI.png')

# Submission

In [None]:
# Average the per-fold test probabilities (one column per fold) into a single prediction per row.
fold_matrix = np.column_stack(preds)
predictions = fold_matrix.mean(axis=1)

# Write the averaged predictions into the sample submission and save it.
ss['target'] = predictions
ss.to_csv('./xgb.csv', index=False)
ss.head()

# Reference
* https://www.kaggle.com/gpreda/santander-eda-and-prediction
* https://www.kaggle.com/subinium/tps-oct-simple-eda