## Imports

In [None]:
import numpy as np
import os
import pandas as pd
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler
from sklearn.feature_selection import RFE, RFECV
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import make_pipeline, FeatureUnion, Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV, cross_validate
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, make_scorer 
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_curve, roc_auc_score, auc, log_loss
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import lightgbm as lgb
import seaborn as sns
from tqdm import tqdm
from scipy.stats import ks_2samp

## Loading the data

In [None]:
# Read the competition's training and test tables from the Kaggle input directory.
train = pd.read_csv('../input/dont-overfit-ii/train.csv')
test = pd.read_csv('../input/dont-overfit-ii/test.csv')

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Count missing values per column, then show only the columns that have any.
null_counts = train.isnull().sum()
null_counts[null_counts > 0]

In [None]:
# Print the column dtypes, non-null counts and memory usage of the training table.
train.info()

## Preprocessing

In [None]:
#from kernel  "https://www.kaggle.com/nanomathias/distribution-of-test-vs-training-data"
def get_diff_columns(train_df, test_df, show_plots=True, show_all=False, threshold=0.1):
    """Find columns whose train and test distributions differ, using the KS test.

    A two-sample Kolmogorov-Smirnov test is run for every column of
    ``train_df`` against the same-named column of ``test_df``.  A column is
    reported when the test is significant (``p <= 0.05``) AND the absolute KS
    statistic is larger than ``threshold``.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames to compare; every column of ``train_df`` must exist in
        ``test_df``.
    show_plots : bool
        When true and at least one column is reported, draw overlaid
        train/test histograms for up to 25 reported columns (5 x 5 grid).
    show_all : bool
        Unused; kept so existing callers keep working.
    threshold : float
        Minimum absolute KS statistic for a column to be reported.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature``, ``p`` and ``statistic``, sorted by ``statistic``
        in descending order.  Empty (but with those columns) when no column
        qualifies.
    """
    # Collect the columns where the two samples are significantly different.
    diff_data = []
    for col in tqdm(train_df.columns):
        statistic, pvalue = ks_2samp(train_df[col].values, test_df[col].values)
        # A small p-value rejects "same distribution"; the threshold on the
        # statistic additionally requires the gap to be large enough to matter.
        if pvalue <= 0.05 and np.abs(statistic) > threshold:
            diff_data.append({
                'feature': col,
                'p': np.round(pvalue, 5),
                'statistic': np.round(np.abs(statistic), 2),
            })

    # Explicit columns keep sort_values working when nothing was selected.
    diff_df = pd.DataFrame(diff_data, columns=['feature', 'p', 'statistic'])
    diff_df = diff_df.sort_values(by='statistic', ascending=False)
    print(f"number of features with diff distribution : {len(diff_df)}")

    if show_plots and not diff_df.empty:
        # Plot the reported columns to confirm visually that they differ.
        n_cols = 5
        n_rows = 5
        _, axes = plt.subplots(n_rows, n_cols, figsize=(20, 3*n_rows))
        axes = axes.ravel()

        for i, (_, row) in enumerate(diff_df.iterrows()):
            if i >= len(axes):
                break
            # Symmetric bin range wide enough for both samples' raw values.
            extreme = np.max(np.abs(train_df[row.feature].tolist() + test_df[row.feature].tolist()))
            bins = np.arange(-extreme, extreme, 0.25)
            # NOTE(review): log1p yields NaN for inputs below -1; confirm this
            # transform suits the value range of the features being compared.
            train_df.loc[:, row.feature].apply(np.log1p).hist(
                ax=axes[i], alpha=0.5, label='Train', density=True, bins=bins
            )
            test_df.loc[:, row.feature].apply(np.log1p).hist(
                ax=axes[i], alpha=0.5, label='Test', density=True, bins=bins
            )
            axes[i].set_title(f"Statistic = {row.statistic}, p = {row.p}")
            axes[i].set_xlabel(f'Log({row.feature})')
            axes[i].legend()

        plt.tight_layout()
        plt.show()

    return diff_df

# Compare feature distributions between train and test; the identifier and
# label columns are removed first so only the features are tested.
train_features = train.drop(['id','target'], axis=1)
test_features = test.drop(['id'], axis=1)
diff_df = get_diff_columns(train_features, test_features)

In [None]:
# Absolute correlation of every feature with the target, strongest first.
corr_with_y = (
    train.drop(['id','target'], axis=1)
    .corrwith(train["target"])
    .abs()
    .rename("Correlation with Target")
    .rename_axis("Feature")
    .reset_index()
    .sort_values(by="Correlation with Target", ascending=False)
)
# Show the ten features most correlated with the target.
corr_with_y.head(10)

In [None]:
# Separate the feature columns from the identifier and the label.
# variables_train / variables_test hold features only; var_resp is an
# independent copy of the training labels.
variables_train = train.drop(['id','target'], axis=1)
var_resp = train["target"].copy()
variables_test = test.drop(['id'], axis=1)

In [None]:
# Share of each class in the response variable, as a percentage of the
# non-null label count.
(var_resp.value_counts()/var_resp.count())*100

In [None]:
# Hold out 20% of the labelled rows for validation before any preprocessing.
# The split is stratified on the target and uses a fixed random seed.
x_train, x_test, y_train, y_test = train_test_split(variables_train, var_resp, test_size=0.2, random_state=2, stratify=var_resp)

## Feature engineering

In [None]:
def with_statistics(X):
    """Return ``X`` with row-wise summary features appended as extra columns.

    Added columns, in order:

    * ``mean``, ``std``, ``kurt``, ``mad``, ``median``, ``max``, ``min``,
      ``skew``, ``sem`` -- per-row statistics over all columns of ``X``;
    * ``minDist``, ``maxDist``, ``meanDist`` -- minimum, maximum and mean
      distance from each row to its nearest neighbours among the rows of the
      same ``X`` (the closest hit is discarded);
    * ``mean_sin`` ... ``mean_x4`` -- per-row mean of an element-wise
      transform (trigonometric, hyperbolic, exponential, powers 2 to 4).

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature table; it is not modified.

    Returns
    -------
    pandas.DataFrame
        ``X`` concatenated column-wise with the new feature columns.
    """
    from sklearn.neighbors import NearestNeighbors

    # Share X's index so both Series and plain arrays line up with its rows.
    statistics = pd.DataFrame(index=X.index)

    row_mean = X.mean(axis=1)
    statistics['mean']   = row_mean
    statistics['std']    = X.std(axis=1)
    statistics['kurt']   = X.kurt(axis=1)
    # DataFrame.mad was removed in pandas 2.0; this is the same quantity
    # (mean absolute deviation around the row mean) and works on old versions too.
    statistics['mad']    = X.sub(row_mean, axis=0).abs().mean(axis=1)
    statistics['median'] = X.median(axis=1)
    statistics['max']    = X.max(axis=1)
    statistics['min']    = X.min(axis=1)
    statistics['skew']   = X.skew(axis=1)
    statistics['sem']    = X.sem(axis=1)

    # Neighbour distances are computed within X itself, using the estimator's
    # default neighbour count.
    neigh = NearestNeighbors(n_jobs=-1)
    neigh.fit(X)
    dists, _ = neigh.kneighbors(X)
    # Drop the first (closest) column: when querying the fitted rows, that is
    # normally the row itself at distance 0.
    dists = dists[:, 1:]
    statistics['minDist'] = dists.min(axis=1)
    statistics['maxDist'] = dists.max(axis=1)
    statistics['meanDist'] = dists.mean(axis=1)

    # Row means of element-wise transforms; order defines the column order.
    elementwise = [
        # Trigonometric
        ('sin', np.sin), ('cos', np.cos), ('tan', np.tan),
        # Hyperbolic
        ('sinh', np.sinh), ('cosh', np.cosh), ('tanh', np.tanh),
        # Exponential
        ('exp', np.exp), ('expm1', np.expm1), ('exp2', np.exp2),
    ]
    for name, func in elementwise:
        statistics[f'mean_{name}'] = func(X).mean(axis=1)

    # Polynomial: row means of X**2, X**3 and X**4.
    for power in (2, 3, 4):
        statistics[f'mean_x{power}'] = np.power(X, power).mean(axis=1)

    return pd.concat([X, statistics], axis=1)

In [None]:
# Apply feature engineering to the training split, the validation split and
# the competition test set, then convert each result to a plain NumPy array.
# Note: each call derives its features (including the nearest-neighbour
# distances) from the frame it is given alone.
x_train = with_statistics(x_train).values
x_test = with_statistics(x_test).values
variables_test = with_statistics(variables_test).values

In [None]:
# Preprocessing pipeline that selects features by variance and then rescales them.
# Author's note: the data has no missing values and no categorical columns, so
# no imputation or encoding step is included.
# Author's note: RobustScaler was chosen because 282/301 variables were reported
# as differently distributed between the training and test sets.
# NOTE(review): confirm that figure against the selection condition used in
# get_diff_columns before relying on it.

preprocessor = Pipeline([
        # Feature selection with VarianceThreshold's default settings.
        ('selector', VarianceThreshold()),
        # Scaling with RobustScaler's default settings.
        ('std_scaler', RobustScaler())
    ])

## Modeling with hyperparameter tuning

In [None]:
# ROC AUC that falls back to 0.5 (chance level) when the score is undefined,
# e.g. when the true labels of a fold contain a single class.
def scoring_roc_auc(y, y_pred):
    """Return the ROC AUC of ``y_pred`` against ``y``, or 0.5 if undefined.

    Parameters
    ----------
    y : array-like
        True binary labels.
    y_pred : array-like
        Predicted scores or labels, passed straight to ``roc_auc_score``.

    Returns
    -------
    float
        The ROC AUC, or 0.5 when ``roc_auc_score`` raises ``ValueError`` or
        yields NaN.
    """
    try:
        score = roc_auc_score(y, y_pred)
    except ValueError:
        # Only the "score cannot be computed" case is absorbed; unrelated
        # failures (and KeyboardInterrupt) are no longer hidden.
        return 0.5
    # NOTE(review): some scikit-learn releases reportedly return NaN with a
    # warning instead of raising for single-class input; treat that the same.
    return 0.5 if np.isnan(score) else score

# Wrap scoring_roc_auc as a scorer object (passed as `scoring` to the grid search).
# NOTE(review): make_scorer is called with default options, so the scorer
# presumably receives hard `predict` labels rather than probabilities --
# confirm this is what is wanted for a ROC AUC metric.
robust_roc_auc = make_scorer(scoring_roc_auc)

In [None]:
# Candidate numbers of trees: 4 evenly spaced values from 100 to 10000.
# Author's note: increase to prevent overfitting.
# NOTE(review): confirm -- more boosting rounds normally add model capacity.
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 10000, num = 4)]

# Search grid over the classifier step: tree count and two extreme values each
# for the L2 (reg_lambda) and L1 (reg_alpha) regularisation strengths.
param_grid = [
    {
        'classify__n_estimators': n_estimators,
        'classify__reg_lambda': [0.00001, 1000],
        'classify__reg_alpha': [0.00001, 1000]
    } 
]

# Full model: the preprocessing pipeline followed by a LightGBM classifier.
model = Pipeline([
        ('preprocessor', preprocessor),
        ('classify', lgb.LGBMClassifier(
            objective = 'binary',
            n_jobs = -1,
            boosting_type = 'gbdt',
            metric = 'binary_error',
            class_weight='balanced',
            # Author's note: decrease both to prevent overfitting.
            # max_depth: maximum number of levels in a tree.
            max_depth = 2,
            num_leaves = 2
        ))
])

# 20-fold grid search over param_grid, scored with robust_roc_auc; training
# scores are recorded as well and all available cores are used.
grid_search = GridSearchCV(
    model, param_grid, cv=20, scoring=robust_roc_auc, verbose=1, return_train_score=True, n_jobs=-1)

# Run the search on the training split.
grid_search = grid_search.fit(x_train, y_train)

In [None]:
# Show the hyperparameter combination selected by the grid search.
grid_search.best_params_

In [None]:
# Run the held-out split through the fitted preprocessing step of the best pipeline.
test_prepared = grid_search.best_estimator_.named_steps['preprocessor'].transform(x_test)

In [None]:
# Score the fitted classifier on the preprocessed held-out split
# (the estimator's own default `score`, presumably mean accuracy).
grid_search.best_estimator_.named_steps['classify'].score(test_prepared, y_test)

In [None]:
# From here on, `model` refers to the best fitted pipeline found by the search.
model = grid_search.best_estimator_

In [None]:
# Predictions on the preprocessed held-out split: hard class labels, and the
# second column of predict_proba as a score.
y_test_estimation = model.named_steps['classify'].predict(test_prepared)
y_test_score = model.named_steps['classify'].predict_proba(test_prepared)[:,1]

In [None]:
# Per-class classification report for the held-out split, with four decimal places.
test_report = classification_report(y_test, y_test_estimation, digits=4)
print("Test:\n",test_report)

In [None]:
# Keep an independent copy of the test identifiers for the submission file.
test_ID = test['id'].copy()

In [None]:
# Run the feature-engineered competition test set through the fitted preprocessing step.
final_test = grid_search.best_estimator_.named_steps['preprocessor'].transform(variables_test)

In [None]:
# Score for each test row: the second column of the classifier's predict_proba output.
y_predicted_test = grid_search.best_estimator_.named_steps['classify'].predict_proba(final_test)[:,1]

In [None]:
# Assemble the submission table: one identifier and one predicted score per test row.
# NOTE(review): the identifier column is written as 'Id' while the input column
# is 'id' -- confirm the header the competition expects.
dataset = pd.DataFrame({'Id': test_ID, 'target': y_predicted_test})

In [None]:
# Write the submission file to the working directory, without the DataFrame index.
dataset.to_csv('LightGBM_output.csv', index=False)