### 2 key components are:
* Scatterplots between continuous features
* Bar plots of number of differences between binary features and target

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.utils import shuffle
from sklearn.metrics import mean_absolute_error, accuracy_score, roc_auc_score
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier

In [None]:
# Load the competition training table into memory.
train = pd.read_csv(
    filepath_or_buffer='../input/tabular-playground-series-oct-2021/train.csv',
)

In [None]:
# Report the (rows, columns) tuple of the loaded training frame.
train_shape = train.shape
print(f'Shape of train dataset: {train_shape}')

#### Dividing continuous and binary features

In [None]:
# Split the feature columns (everything except 'id' and 'target') by dtype:
# float64 columns are treated as continuous, int64 columns as binary.
feature_dtypes = train.drop(columns=['id', 'target']).dtypes
con_features = feature_dtypes[feature_dtypes == 'float64'].index.to_numpy()
bin_features = feature_dtypes[feature_dtypes == 'int64'].index.to_numpy()
print(f'Number of feature columns dtype float: {len(con_features)}')
print(f'Number of feature columns dtype int: {len(bin_features)}')
# The dtype table is only needed to build the two name arrays.
del feature_dtypes

In [None]:
# Per-group views of the training data: continuous columns and binary columns.
con_df, bin_df = train[con_features], train[bin_features]

#### Number of target data

In [None]:
# Distinct target values, in order of first appearance.
labels = train['target'].unique().tolist()

sns.set_palette('Blues_d')
plt.rcParams['figure.figsize'] = (16, 2)

# One horizontal stacked bar whose segments are the per-class row counts.
ax = (
    train['target']
    .value_counts()
    .to_frame()
    .T
    .plot.barh(stacked=True)
)
ax.set_xlabel('Number of Targets')
plt.show()

### LightGBM Classification
Simple classification with a LightGBM model. For simplicity, the number of examples was limited to 10,000 (1%).

In [None]:
FOLDS = 10
BATCH_SIZE = 100
EPOCHS = 70

# Cross-validate on a small leading subsample to keep the run fast.
N_SAMPLES = 10000
X, y = con_df.iloc[:N_SAMPLES, :], train['target'][:N_SAMPLES]

# Out-of-fold scores, sized from the rows actually selected so the buffer
# still matches if the frame holds fewer than N_SAMPLES rows.
train_oof = np.zeros(len(X))
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=2021)

for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y)):

    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    model = LGBMClassifier(random_state=2021, n_estimators=200,
                           learning_rate=1e-3, objective='binary')

    # No eval_set is passed, so the fit-time `verbose` flag had no effect;
    # it is dropped because LightGBM 4.x rejects it.
    model = model.fit(X_train, y_train)

    # ROC AUC ranks examples by score, so it needs the positive-class
    # probability rather than the hard 0/1 labels from predict().
    oof = model.predict_proba(X_valid)[:, 1]
    train_oof[valid_idx] = oof

    # The metric is ROC AUC (not MAE), so label it as such.
    print(f'Fold {fold + 1} AUC: ', roc_auc_score(y_valid, oof))

print('K-Fold AUC: ', roc_auc_score(y, train_oof))

In [None]:
# Same cross-validation as for the continuous features, but on the binary
# (int64) columns only, again restricted to a small leading subsample.
N_SAMPLES = 10000
X, y = bin_df.iloc[:N_SAMPLES, :], train['target'][:N_SAMPLES]

# Out-of-fold scores, sized from the rows actually selected.
train_oof = np.zeros(len(X))
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=2021)

for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y)):

    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    model = LGBMClassifier(random_state=2021, n_estimators=100, learning_rate=1e-3)

    # No eval_set is passed, so the fit-time `verbose` flag had no effect;
    # it is dropped because LightGBM 4.x rejects it.
    model = model.fit(X_train, y_train)

    # ROC AUC ranks examples by score, so it needs the positive-class
    # probability rather than the hard 0/1 labels from predict().
    oof = model.predict_proba(X_valid)[:, 1]
    train_oof[valid_idx] = oof

    # The metric is ROC AUC (not MAE), so label it as such.
    print(f'Fold {fold + 1} AUC: ', roc_auc_score(y_valid, oof))

print('K-Fold AUC: ', roc_auc_score(y, train_oof))

Without feature engineering, we will not get a good score.

### Drawing Scatterplots 
Scatterplots for continuous features. 
* For simplicity, plots are drawn with first 10,000 examples. 
* Plots only show the relation between adjacent features (each feature against the next one).

In [None]:
X, y = con_df.iloc[:10000, :], train['target'][:10000]

# Each panel plots column p against column p + 1, so there is one panel per
# adjacent pair. The figure count is derived from the data instead of being
# hard-coded, so the cell works for any number of continuous columns.
N_ROWS, N_COLS = 3, 4
PANELS_PER_FIG = N_ROWS * N_COLS
n_features = X.shape[1]
n_pairs = n_features - 1  # the last column has no right-hand neighbour

for first in range(0, n_pairs, PANELS_PER_FIG):
    fig, axes = plt.subplots(N_ROWS, N_COLS, figsize=(18, 18))
    last = min(first + PANELS_PER_FIG, n_features) - 1
    fig.suptitle(f'Scatter plots of features {X.columns[first]} to {X.columns[last]}')
    # axes.flat walks the grid row by row, matching the 4*i + j panel order.
    for offset, ax in enumerate(axes.flat):
        p = first + offset
        if p >= n_pairs:
            # Remove panels that have no pair left to draw.
            fig.delaxes(ax)
            continue
        sns.scatterplot(ax=ax, data=X, x=X.iloc[:, p], y=X.iloc[:, p + 1], hue=y, s=5, palette='Set2')
    # Render each figure as soon as it is complete instead of keeping all
    # of them open until the end of the cell.
    plt.show()

Some characteristics are:
* lines (sharp boundaries)
* lines with variance
* good round shape

### Drawing Barplots 
Bar plots showing how closely each binary feature matches the target. 
* Most rows have a unique combination of binary feature values (only 72 out of 1,000,000 rows are duplicated).
* The lower the values in the second plot, the closer a feature is to target.

In [None]:
# Occurrence count of every distinct combination of binary feature values.
var_count = bin_df.value_counts()
# Total rows belonging to combinations that occur exactly twice.
var_count.loc[var_count.eq(2)].sum()

In [None]:
# Column-wise sum of (target - feature), laid out as a single-row frame so
# that seaborn draws one bar per binary feature.
num_bin_df = (
    bin_df
    .rsub(train['target'], axis=0)  # target - feature, aligned on the row index
    .sum(axis=0)
    .to_frame()
    .T
)
plt.figure(figsize=(18, 6)).suptitle('Sum of values subtracted from target')
sns.barplot(data=num_bin_df, palette='Set2')

In [None]:
# Sum of |target - feature| per binary feature. It is recomputed from the raw
# columns: the previous cell leaves `num_bin_df` as a single row of already
# summed signed differences, and taking abs() of that row would plot
# |sum(target - x)| rather than sum(|target - x|) as the title states.
# Assuming 0/1 features and a 0/1 target, each bar is the number of rows
# where the feature disagrees with the target.
num_bin_df = (
    bin_df
    .sub(train['target'], axis=0)  # row-aligned difference; sign is irrelevant under abs()
    .abs()
    .sum(axis=0)
    .to_frame()
    .T
)
plt.figure(figsize=(18,6))
plt.suptitle('Sum of absolute values subtracted from target')
sns.barplot(data=num_bin_df, palette='Set2')

Could f25 and f264 be major features that describe the target? Alternatively, we might suspect data leakage.