In [None]:
import gc
import numpy as np
import os
import pandas as pd
import pickle
import sys
from time import time
from tqdm import tqdm
import warnings
from sklearn.exceptions import ConvergenceWarning

pd.set_option('display.max_columns', None)
warnings.filterwarnings(action="ignore", category=ConvergenceWarning)
warnings.filterwarnings(action="ignore", category=UserWarning)
warnings.filterwarnings(action="ignore", category=FutureWarning)
warnings.filterwarnings(action="ignore", category=RuntimeWarning)

# Utils
from IPython.display import display
import lightgbm as lgb
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import RocCurveDisplay
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from statsmodels.graphics.gofplots import qqplot

# Model
from imblearn.over_sampling import RandomOverSampler as ROS
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold

In [None]:
# LightGBM settings shared by every experiment in this notebook.
PARAMS = dict(
    # Binary classification, evaluated with ROC AUC.
    objective='binary',
    metric='auc',
    # Histogram resolution and tree-size limits.
    max_bin=31,
    num_leaves=17,
    max_depth=5,
    path_smooth=0.5,
    min_data_in_leaf=16,
    min_sum_hessian_in_leaf=1e-5,
    # Ask LightGBM to compensate for the skewed target distribution.
    is_unbalance=True,
    learning_rate=0.01,
    force_col_wise=True,
    # Both sampling fractions are set to 1.0.
    feature_fraction=1.0,
    bagging_fraction=1.0,
    bagging_freq=10,
    # L1 / L2 regularisation strengths.
    lambda_l1=5,
    lambda_l2=10,
    # NOTE(review): requires a GPU-enabled LightGBM build/runtime -- confirm
    # before running on a CPU-only machine.
    device_type='gpu',
    boosting='gbdt',
    # One thread per CPU reported by the OS.
    num_thread=os.cpu_count(),
)

# Utils

In [None]:
def plot_data_stats(X):
    """Plot per-column summary statistics of a 2-D feature matrix.

    Three panels are drawn:

    * distribution of the column means,
    * distribution of the column standard deviations,
    * per-column percentage of rows whose absolute z-score exceeds 3,
      with a dashed line at the average percentage.

    The number of columns with an above-average outlier percentage is also
    printed.

    Parameters
    ----------
    X : pandas.DataFrame or numpy.ndarray
        2-D data, rows are samples and columns are features. Both types are
        passed by the callers in this notebook.
    """
    fig = plt.figure(figsize=(24, 12))
    ax1 = plt.subplot2grid((3, 3), (0, 0), colspan=1, fig=fig)
    ax2 = plt.subplot2grid((3, 3), (0, 1), colspan=1, fig=fig)
    ax3 = plt.subplot2grid((3, 3), (1, 0), colspan=2, fig=fig)

    # One histogram bin per feature.
    bins = X.shape[1]

    # `sns.distplot` is deprecated; `histplot` with stat='density' reproduces
    # the normalised histogram + KDE that distplot drew when kde=True.
    sns.histplot(X.mean(axis=0), bins=bins, color='blue', kde=True,
                 stat='density', ax=ax1)
    sns.histplot(X.std(axis=0), bins=bins, color='red', kde=True,
                 stat='density', ax=ax2)

    # Flag values more than 3 standard deviations away from their column mean.
    outlier = np.asarray(np.abs(stats.zscore(X)) > 3)
    outlier_col_count = outlier.sum(axis=0)
    outlier_col_perc = outlier_col_count / X.shape[0] * 100

    pd.Series(outlier_col_perc).plot.bar(color='purple', ax=ax3)
    ax3.axes.get_xaxis().set_ticklabels([])
    ax3.axhline(y=outlier_col_perc.mean(), color='orange', ls='--', lw=2.0)

    ax1.set_title('Mean')
    ax2.set_title('Standard deviation')
    ax3.set_title('Z-score')

    outlier_col_above_avg = np.sum(outlier_col_perc > outlier_col_perc.mean())
    print(f'{outlier_col_above_avg} features with more outliers than average.')

    plt.show()

In [None]:
def train_eval_model(Xtrain, ytrain, Xvalid, yvalid,
                     parameters=PARAMS, verbose=False,
                     num_boost_round=1000, early_stopping_rounds=50):
    """Train a LightGBM booster and evaluate it on a validation set.

    Prints the validation ROC AUC and a classification report (probabilities
    rounded to 0/1). When ``verbose`` is true, also plots the recorded metric
    history and the train / validation ROC curves.

    Parameters
    ----------
    Xtrain, ytrain : training features and labels.
    Xvalid, yvalid : validation features and labels; also used for early
        stopping.
    parameters : dict
        LightGBM parameters, forwarded to ``lgb.train``.
    verbose : bool
        Whether to draw the diagnostic plots.
    num_boost_round : int
        Maximum number of boosting iterations.
    early_stopping_rounds : int or None
        Stop when the validation metric has not improved for this many
        rounds. ``None`` or a non-positive value disables early stopping.

    Returns
    -------
    (model, evals_result, score)
        The trained booster, the metric history filled in by
        ``lgb.record_evaluation`` and the validation ROC AUC.
    """
    train_data = lgb.Dataset(Xtrain, label=ytrain)
    valid_data = lgb.Dataset(Xvalid, label=yvalid, reference=train_data)

    evals_result = {}
    callbacks = [lgb.record_evaluation(evals_result)]
    # Early stopping is configured through a callback: the
    # `early_stopping_rounds=` / `verbose_eval=` keywords of `lgb.train`
    # are no longer accepted by recent LightGBM releases.
    if early_stopping_rounds is not None and early_stopping_rounds > 0:
        callbacks.append(
            lgb.early_stopping(
                stopping_rounds=early_stopping_rounds, verbose=False))

    model = lgb.train(
        parameters,
        train_data,
        valid_sets=[valid_data],
        num_boost_round=num_boost_round,
        callbacks=callbacks
    )

    valid_pred = model.predict(Xvalid)
    score = roc_auc_score(yvalid, valid_pred)

    print(f'\n\nScore = {score:.5f}', end='\n\n')
    print(classification_report(yvalid, np.round(valid_pred)))

    if verbose:
        fig, [ax1, ax3] = plt.subplots(1, 2, figsize=(12, 5))

        lgb.plot_metric(evals_result, ax=ax1)

        # Train ROC curve (solid blue).
        fpr, tpr, _ = roc_curve(ytrain, model.predict(Xtrain))
        RocCurveDisplay(fpr=fpr, tpr=tpr).plot(ax=ax3, color='blue')

        # Validation ROC curve (dashed orange), reusing the predictions
        # computed above instead of predicting a second time.
        fpr, tpr, _ = roc_curve(yvalid, valid_pred)
        RocCurveDisplay(fpr=fpr, tpr=tpr).plot(
            ax=ax3, color='orange', linestyle="--")

        plt.show()
    return model, evals_result, score

# Data loading

In [None]:
# Full path of every file found under the Kaggle input directory.
fnames = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]

# The first path containing 'train' is the training set; its ID column is
# dropped right away.
train_path = [fname for fname in fnames if 'train' in fname][0]
train = pd.read_csv(train_path).drop(columns='ID_code')

# The first path containing 'test' is the test set; its IDs are kept aside
# for the submission file.
test_path = [fname for fname in fnames if 'test' in fname][0]
test = pd.read_csv(test_path)
test_id = test.ID_code
test = test.drop(columns='ID_code')

# Feature columns are those whose name contains 'var'.
FEATURES = [column for column in train.columns if 'var' in column]
LABEL = 'target'

# Analysis

In [None]:
# Print a concise summary of the training frame.
train.info()

In [None]:
# Descriptive statistics of the training frame's columns.
train.describe()

### Dataset imbalanced

In [None]:
# Number of training rows per target class.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x=train[LABEL], ax=ax);

### Normally distributed

In [None]:
# Normality test for every feature (scipy.stats.normaltest).
# The null hypothesis is "the sample comes from a normal distribution", so a
# p-value BELOW alpha rejects normality. The previous `> alpha` comparison was
# inverted and flagged exactly the features for which normality could not be
# rejected.
alpha = 1e-3
all_normal = True
for feature in tqdm(FEATURES):
    if stats.normaltest(train[feature].values).pvalue < alpha:
        all_normal = False
        print(f'{feature} may not be normal')
if all_normal:
    print('All features are normally distributed')

### No null values

In [None]:
# Total number of missing values across all columns of the training frame.
np.sum(train.isna().sum().values)

### Outliers

In [None]:
# Mean / standard deviation / outlier overview of the raw feature columns.
plot_data_stats(train[FEATURES])

### Features are independent

In [None]:
# Pairwise correlation of all training columns (target included).
correlation = train.corr()

#### Changing signs of features inversely correlated to the target

In [None]:
# Negate, in both train and test, every feature that is negatively correlated
# with the target.
# NOTE(review): this mutates `train` / `test` in place while `correlation` is
# computed in a separate cell, so re-running this cell on its own flips the
# same features back.
target_corr = correlation.loc['target']
for feature in tqdm(FEATURES):
    if target_corr[feature] < 0:
        train[feature] = train[feature] * -1
        test[feature] = test[feature] * -1

In [None]:
# Recompute the correlations now that some features had their sign flipped.
correlation = train.corr()

In [None]:
# Distribution (log-scaled axis) of each feature's correlation with the target.
fig, ax = plt.subplots(figsize=(12, 4))
feature_target_corr = correlation.loc['target', FEATURES]
sns.histplot(
    feature_target_corr,
    bins=len(FEATURES),
    color='blue',
    log_scale=True,
    kde=True,
    ax=ax,
);

### Removing synthetic rows from test dataset

In [None]:
# Flag the test rows considered "real": a row is kept when at least one of its
# feature values occurs only once across train + test. Rows whose every value
# is repeated are treated as synthetic.
df_hist = test.copy()
for var in tqdm(FEATURES):
    # `Series.append` was removed in pandas 2.0; `pd.concat` is the replacement.
    value_counts = pd.concat([train[var], test[var]]).value_counts()
    # True where the value is shared with at least one other row.
    df_hist[var] = df_hist[var].map(value_counts) > 1
# Compare against the actual number of features instead of a hard-coded 200.
real_idxs = df_hist[FEATURES].sum(axis=1) != len(FEATURES)

del df_hist

### Standard scaler

In [None]:
# Fit the scaler on the training features plus the test rows flagged as real.
# `DataFrame.append` was removed in pandas 2.0, so the frames are stacked with
# `pd.concat`; selecting FEATURES on both sides keeps the columns aligned.
scl = StandardScaler()
scl.fit(pd.concat([train[FEATURES], test.loc[real_idxs, FEATURES]]))
# Names of the scaled copies of the features.
scl_features = [f'{col}_scl' for col in FEATURES]

### Counting occurrences of values

In [None]:
# For every feature, map each observed value to the number of times it occurs
# in train + real test rows.
var_stats = {}
# Filter the real test rows once instead of once per feature.
test_real = test.loc[real_idxs, FEATURES]
for var in tqdm(FEATURES):
    # `Series.append` was removed in pandas 2.0; `pd.concat` is the replacement.
    var_stats[var] = pd.concat(
        [train[var], test_real[var]]
    ).value_counts().to_dict()

del test_real

### Data transform

In [None]:
def reduce_dtype(df):
    """Return a copy of ``df`` with every column except LABEL cast to float32.

    The input frame is left untouched. The previous implementation cast the
    columns of the caller's frame in place before returning a copy, which
    silently changed the caller's data.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and index as ``df``.
    """
    float_columns = [column for column in df.columns if column != LABEL]
    return df.astype({column: np.float32 for column in float_columns})

def transform_dataframe(df):
    """Build the model input: raw features, value-count features, scaled features.

    For every column in FEATURES three versions are returned side by side:

    * the original value, cast to float32 by ``reduce_dtype``,
    * ``<name>_stats``: how often that value occurs according to ``var_stats``,
    * ``<name>_scl``: the value standardised by the fitted scaler ``scl``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the FEATURES columns.

    Returns
    -------
    pandas.DataFrame
        Column-wise concatenation of the three blocks, indexed like ``df``.
    """
    # Scale first, from the original-precision values, and keep df's index so
    # that the column-wise concat below aligns row by row even when `df` does
    # not carry a default RangeIndex.
    scaled = pd.DataFrame(
        scl.transform(df[FEATURES]),
        columns=scl_features,
        index=df.index
    )

    # Creating features based on occurrences of their values
    counts = df.copy()
    for var in tqdm(FEATURES):
        # Values absent from var_stats get a count of 0. int32 is used because
        # a count can exceed the int8 range (max 127).
        counts[var] = (
            counts[var].map(var_stats[var]).fillna(0).astype(np.int32))
    counts.columns = [f'{col}_stats' for col in counts.columns]

    return pd.concat(
        [reduce_dtype(df), counts, reduce_dtype(scaled)], axis=1)

# Estimators

### RAW data

In [None]:
# Baseline: hold out 20% of the raw training rows for validation.
X_train, X_valid, y_train, y_valid = train_test_split(
    train.drop(columns=LABEL), train[LABEL],
    test_size=0.2, random_state=42,
)

t0 = time()
model, evals_result, score = train_eval_model(
    X_train, y_train, X_valid, y_valid, num_boost_round=40000)
t1 = time()

# Releasing memory
del X_train, X_valid, y_train, y_valid
gc.collect()

print('Total time', t1-t0)

In [None]:
# Metric history recorded while training the raw-data model.
lgb.plot_metric(evals_result, figsize=(20,6));

### New features

In [None]:
# Engineered feature matrix (raw + value counts + scaled) and its labels.
X = transform_dataframe(train.drop(columns=LABEL))
y = train[LABEL]
plot_data_stats(X)

In [None]:
# Hold out 20% of the engineered rows for validation.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42)

# The full matrix is no longer needed once it has been split.
del X, y

t0 = time()
model, evals_result, score = train_eval_model(
    X_train, y_train, X_valid, y_valid, num_boost_round=40000)
t1 = time()

# Releasing memory
del X_train, X_valid, y_train, y_valid
gc.collect()

print('Total time', t1-t0)

In [None]:
# Metric history recorded while training the engineered-features model.
lgb.plot_metric(evals_result, figsize=(20,6));

Without resampling the dataset, the F1-score for target 1 is barely above 0.5. We address the class imbalance with random oversampling in the ROS section below.

### New features + PCA

In [None]:
# Project the engineered features with PCA; n_components=0.98 is a fraction,
# which scikit-learn interprets as the share of variance to retain.
pca = PCA(n_components=0.98)
X = pca.fit_transform(transform_dataframe(train.drop(columns=LABEL)))
y = train[LABEL]
plot_data_stats(X)

In [None]:
# Hold out 20% of the PCA-projected rows for validation.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42)

# The full matrix is no longer needed once it has been split.
del X, y

t0 = time()
model, evals_result, score = train_eval_model(
    X_train, y_train, X_valid, y_valid, num_boost_round=40000)
t1 = time()

# Releasing memory
del X_train, X_valid, y_train, y_valid
gc.collect()

print('Total time', t1-t0)

In [None]:
# Metric history recorded while training the PCA model.
lgb.plot_metric(evals_result, figsize=(20,6));

### ROS

<h5>Scores for this model:</h5>
<ul>
    <li>Public Score: 0.90492</li>
    <li>Private score: 0.90284</li>
</ul>

In [None]:
# Balance the classes by randomly oversampling the engineered training rows.
# NOTE(review): the sampler is created without a random_state, so the
# resampled set differs between runs.
oversampler = ROS()
X, y = oversampler.fit_resample(
    transform_dataframe(train.drop(columns=LABEL)), train[LABEL])
plot_data_stats(X)

In [None]:
# Hold out 20% of the oversampled rows for validation.
# NOTE(review): X was oversampled before this split, so copies of the same
# minority row can end up in both the training and the validation part; the
# validation score is therefore likely optimistic.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Releasing memory
del X, y

t0 = time()
model, evals_result, score = train_eval_model(
    X_train, y_train, X_valid, y_valid, num_boost_round=40000)
t1 = time()

# Releasing memory
del X_train, X_valid, y_train, y_valid
gc.collect()

print('Total time', t1-t0)

In [None]:
# Metric history recorded while training the oversampled model.
lgb.plot_metric(evals_result, figsize=(20,6));

In [None]:
# Draw the tree at index best_iteration - 1.
# NOTE(review): presumably the last tree kept by early stopping -- confirm
# that best_iteration is 1-based.
lgb.plot_tree(model, figsize=(20,6), tree_index=model.best_iteration-1, dpi=300, show_info='data_percentage');

In [None]:
# Feature importances of the oversampled model, limited to 50 features.
lgb.plot_importance(model, figsize=(12,10), max_num_features=50);

In [None]:
# Releasing memory
del(model)
del(evals_result)
del(score)

gc.collect()

### ROS + Stacking estimators

We use StratifiedKFold to create 10 folds and train one model per fold, so each model learns from a different subset of the training data and predicts the same test set. Averaging the 10 sets of predicted probabilities is roughly equivalent to a VotingClassifier with soft voting.

In [None]:
# Engineered features for train and test. Oversampling is deliberately NOT
# applied here: resampling before the fold split would place copies of the
# same minority row in both the training and the validation part of a fold,
# inflating the validation score and misleading early stopping.
X = transform_dataframe(train.drop(columns=[LABEL]))
y = train[LABEL]
test = transform_dataframe(test)

# Releasing memory
del train
gc.collect()

# Running mean of the fold models' predicted probabilities on the test set.
prediction = np.zeros(test.shape[0])

# Seeded so that folds and resampling are reproducible.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for train_index, valid_index in skf.split(X, y):
    # Oversample only the training part of the fold; the validation part
    # keeps the original class distribution.
    X_fold, y_fold = ROS(random_state=42).fit_resample(
        X.iloc[train_index], y.iloc[train_index])
    X_fold_valid = X.iloc[valid_index]
    y_fold_valid = y.iloc[valid_index]

    train_set = lgb.Dataset(X_fold, label=y_fold, free_raw_data=True)
    valid_set = lgb.Dataset(
        X_fold_valid,
        label=y_fold_valid,
        reference=train_set,
        free_raw_data=True
    )

    # Early stopping goes through a callback: the `early_stopping_rounds=` /
    # `verbose_eval=` keywords of `lgb.train` are no longer accepted by
    # recent LightGBM releases.
    model = lgb.train(
        PARAMS,
        train_set,
        valid_sets=[valid_set],
        num_boost_round=40000,
        callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
    )

    y_pred = model.predict(X_fold_valid)
    score = roc_auc_score(y_fold_valid, y_pred)

    print(f'Score = {score:.5f}')
    print(classification_report(y_fold_valid, np.round(y_pred)))

    prediction += model.predict(test)/skf.n_splits

    del X_fold, y_fold, train_set, valid_set
    gc.collect()

The variable `prediction` stores the mean of the probabilities predicted by the 10 fold models, which is our final result.

# Submission

In [None]:
# Attach the averaged predictions and the saved IDs, then write the two
# submission columns to disk.
test = test.assign(target=prediction, ID_code=test_id)

test[['ID_code', 'target']].to_csv('submission.csv', index=False)

# References

https://www.kaggle.com/code/fatemetardasti/santander-transaction-prediction-lgbm

https://www.kaggle.com/code/alirezahanifi/santander-customer-pca-dae-using-lr-lgbm

https://www.kaggle.com/code/ricardopennaleite/internship-work

https://www.tensorflow.org/tutorials/structured_data/imbalanced_data

https://www.tensorflow.org/tutorials/customization/custom_layers

https://matplotlib.org/3.1.1/gallery/userdemo/demo_gridspec01.html#sphx-glr-gallery-userdemo-demo-gridspec01-py

https://github.com/microsoft/LightGBM/blob/master/examples/python-guide/plot_example.py

https://neptune.ai/blog/lightgbm-parameters-guide

https://coderzcolumn.com/tutorials/machine-learning/lightgbm-an-in-depth-guide-python#9