# Imports

In [None]:
import numpy as np
import os
import pandas as pd
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import make_pipeline, FeatureUnion, Pipeline
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score, GridSearchCV, cross_validate, RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, make_scorer, mean_squared_error, mean_absolute_error 
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_curve, roc_auc_score, auc, log_loss, r2_score
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso
import seaborn as sns
from tqdm import tqdm
from scipy.stats import ks_2samp
from imblearn.over_sampling import SMOTE

# Loading the data

In [None]:
# Read the competition's train and test tables from the Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/dont-overfit-ii/train.csv',
        '../input/dont-overfit-ii/test.csv',
    )
)

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# (number of rows, number of columns) of the training table.
train.shape

In [None]:
# Missing-value count per column, restricted to columns that have at least one.
train.isnull().sum().loc[lambda counts: counts > 0]

In [None]:
# Print pandas' summary of the training table.
train.info()

# Preprocessing

In [None]:
#from kernel  "https://www.kaggle.com/nanomathias/distribution-of-test-vs-training-data"
def get_diff_columns(train_df, test_df, show_plots=True, show_all=False, threshold=0.1):
    """Use the two-sample KS test to find columns whose train/test distributions differ.

    A column is reported only when the KS test rejects "same distribution" at
    the 5% level (``pvalue <= 0.05``) AND the KS statistic is larger than
    ``threshold``.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames to compare. Every column of ``train_df`` is looked up in
        ``test_df`` as well.
    show_plots : bool
        When true, draw overlaid train/test histograms for up to 25 of the
        reported columns (5 x 5 grid).
    show_all : bool
        Unused; kept so existing callers keep working.
    threshold : float
        Minimum KS statistic for a column to be reported.

    Returns
    -------
    pandas.DataFrame
        One row per reported column with the columns ``feature``, ``p`` and
        ``statistic``, sorted by ``statistic`` in descending order. The frame
        is empty (but has the three columns) when no column qualifies.
    """
    alpha = 0.05  # significance level of the KS test

    # Find the columns where the distributions are very different
    diff_data = []
    for col in tqdm(train_df.columns):
        statistic, pvalue = ks_2samp(train_df[col].values, test_df[col].values)
        statistic = np.abs(statistic)
        # A small p-value together with a large statistic means the two
        # samples are unlikely to come from the same distribution.
        if pvalue <= alpha and statistic > threshold:
            diff_data.append({
                'feature': col,
                'p': np.round(pvalue, 5),
                'statistic': np.round(statistic, 2),
            })

    # Explicit columns keep sort_values working when nothing was flagged
    diff_df = pd.DataFrame(
        diff_data, columns=['feature', 'p', 'statistic']
    ).sort_values(by='statistic', ascending=False)
    print(f"number of features with diff distribution : {len(diff_df)}")

    if show_plots and not diff_df.empty:
        # Let us see the distributions of these columns to confirm they are indeed different
        n_cols = 5
        n_rows = 5
        _, axes = plt.subplots(n_rows, n_cols, figsize=(20, 3*n_rows))

        # zip stops at whichever runs out first: grid cells or flagged columns
        for ax, (_, row) in zip(axes.ravel(), diff_df.iterrows()):
            feature = row['feature']
            extreme = max(train_df[feature].abs().max(), test_df[feature].abs().max())
            bins = np.arange(-extreme, extreme, 0.25)
            # NOTE(review): log1p is undefined for values <= -1, so such
            # values do not show up in the histograms.
            for frame, label in ((train_df, 'Train'), (test_df, 'Test')):
                frame[feature].apply(np.log1p).hist(
                    ax=ax, alpha=0.5, label=label, density=True, bins=bins
                )
            ax.set_title(f"Statistic = {row['statistic']}, p = {row['p']}")
            ax.set_xlabel(f'Log({feature})')
            ax.legend()

        plt.tight_layout()
        plt.show()

    return diff_df

# Run the train-vs-test distribution check on the feature columns only
# (the 'id' and 'target' columns are excluded).
diff_df = get_diff_columns(
    train.drop(columns=['id', 'target']),
    test.drop(columns=['id']),
)

In [None]:
# Absolute correlation of every feature with the target, strongest first.
corr_with_y = (
    train.drop(columns=['id', 'target'])
    .corrwith(train["target"])
    .abs()
    .rename_axis("Feature")
    .reset_index(name="Correlation with Target")
    .sort_values(by="Correlation with Target", ascending=False)
)
corr_with_y.head(10)

In [None]:
# Separate the response from the features; 'id' (and 'target' for train)
# is dropped so only feature columns remain in the matrices.
var_resp = train["target"].copy()
variables_train = train.drop(columns=['id', 'target']).values
variables_test = test.drop(columns=['id']).values

In [None]:
# Share of each class in the response, in percent.
var_resp.value_counts().div(var_resp.count()).mul(100)

## Feature engineering

In [None]:
def with_statistics(X):
    """Return ``X`` with row-wise summary features appended as extra columns.

    For every row the following are computed across its columns and appended,
    in this order: mean, std, kurtosis, mean absolute deviation, median, max,
    min, skew, standard error of the mean; the min / max / mean distance to
    the row's 4 nearest other rows (Euclidean, within ``X`` itself); and the
    row means of sin, cos, tan, sinh, cosh, tanh, exp, expm1, exp2, x**2,
    x**3 and x**4 applied element-wise.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature table; needs at least 5 rows for the neighbour search.

    Returns
    -------
    pandas.DataFrame
        The original columns followed by the new feature columns.
    """
    from sklearn.neighbors import NearestNeighbors

    # Share X's index so the final concat lines rows up whatever the index is
    statistics = pd.DataFrame(index=X.index)
    row_mean = X.mean(axis=1)
    statistics['mean']   = row_mean
    statistics['std']    = X.std(axis=1)
    statistics['kurt']   = X.kurt(axis=1)
    # DataFrame.mad was removed in pandas 2.0; this is the same quantity
    # (mean absolute deviation around the row mean).
    statistics['mad']    = X.sub(row_mean, axis=0).abs().mean(axis=1)
    statistics['median'] = X.median(axis=1)
    statistics['max']    = X.max(axis=1)
    statistics['min']    = X.min(axis=1)
    statistics['skew']   = X.skew(axis=1)
    statistics['sem']    = X.sem(axis=1)

    # n_neighbors must be passed by keyword on current scikit-learn
    neigh = NearestNeighbors(n_neighbors=5, n_jobs=-1)
    neigh.fit(X)

    dists, _ = neigh.kneighbors(X, n_neighbors=5)
    # Column 0 is each row's distance to itself; keep the 4 real neighbours
    dists = dists[:, 1:]
    statistics['minDist'] = dists.min(axis=1)
    statistics['maxDist'] = dists.max(axis=1)
    statistics['meanDist'] = dists.mean(axis=1)

    # Row means of element-wise transforms; list order fixes the column order
    elementwise_features = [
        # Trigonometric FE
        ('mean_sin', np.sin),
        ('mean_cos', np.cos),
        ('mean_tan', np.tan),
        # Hyperbolic FE
        ('mean_sinh', np.sinh),
        ('mean_cosh', np.cosh),
        ('mean_tanh', np.tanh),
        # Exponents FE
        ('mean_exp', np.exp),
        ('mean_expm1', np.expm1),
        ('mean_exp2', np.exp2),
        # Polynomial FE
        ('mean_x2', lambda values: np.power(values, 2)),
        ('mean_x3', lambda values: np.power(values, 3)),
        ('mean_x4', lambda values: np.power(values, 4)),
    ]
    for column_name, transform in elementwise_features:
        statistics[column_name] = np.mean(transform(X), axis=1)

    return pd.concat([X, statistics], axis=1)

In [None]:
# Preprocessing pipeline for the numerical features.
# The data has no missing values and no categorical columns, so neither
# imputation nor encoding is needed here.
# As 282/301 variables were reported with different distributions between the
# training and test data, scaling is done with RobustScaler.
preprocessor = Pipeline(steps=[
    ('selector', VarianceThreshold()),
    ('std_scaler', RobustScaler()),
])

In [None]:
# Fit the preprocessor on train and test rows stacked together, then cut the
# transformed matrix back apart at the original train/test boundary.
data = preprocessor.fit_transform(np.vstack((variables_train, variables_test)))
variables_train, variables_test = np.split(data, [variables_train.shape[0]])

In [None]:
# Append the engineered row-wise features to both matrices (train first, then test).
variables_train, variables_test = (
    with_statistics(pd.DataFrame(matrix)).values
    for matrix in (variables_train, variables_test)
)

# Modeling with hyperparameter tuning

In [None]:
# ROC AUC metric that tolerates folds where the score is undefined
# (only one class present in y)
def scoring_roc_auc(y, y_pred):
    """Return ROC AUC of ``y_pred`` against ``y``, or 0.5 when it is undefined.

    ``roc_auc_score`` cannot be computed when ``y`` contains a single class.
    Depending on the scikit-learn version it then raises ``ValueError`` or
    returns NaN; both cases are mapped to the chance level 0.5.

    Parameters
    ----------
    y : array-like
        True binary labels.
    y_pred : array-like
        Predicted labels or scores.

    Returns
    -------
    float
        The ROC AUC, or 0.5 if it is undefined for this input.
    """
    try:
        score = roc_auc_score(y, y_pred)
    except ValueError:
        return 0.5
    return 0.5 if np.isnan(score) else score

# NOTE(review): with make_scorer's defaults the metric receives the output of
# predict() (hard labels), not probabilities -- confirm this is intended.
robust_roc_auc = make_scorer(scoring_roc_auc)

In [None]:
# Hyperparameters shared by every solver/penalty combination
common_params = {
    'C': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1],
    'tol': [0.00009, 0.0001, 0.00011],
    'max_iter': [int(x) for x in np.linspace(start = 100, stop = 10000, num = 32)],
}

# Only combine each solver with penalties it actually supports: 'liblinear'
# handles l1 and l2, 'sag' handles l2 only. 'elasticnet' is left out because
# neither solver supports it (it needs solver='saga' plus an l1_ratio), so
# those candidates could only ever fail to fit.
param_grid = [
    {**common_params, 'penalty': ['l1', 'l2'], 'solver': ['liblinear']},
    {**common_params, 'penalty': ['l2'], 'solver': ['sag']},
]

model = LogisticRegression(random_state=42, class_weight='balanced')

# Tune the base model on the full training matrix
grid_search = GridSearchCV(model, param_grid=param_grid, verbose=0, n_jobs=-1, scoring=robust_roc_auc, cv=20)
grid_search.fit(variables_train, var_resp)

# Recursive feature elimination (15 features per step, at least 10 kept),
# driven by the tuned estimator found above
feature_selector = RFECV(grid_search.best_estimator_, verbose=0, min_features_to_select=10, scoring=robust_roc_auc, step=15, cv=20, n_jobs=-1)

In [None]:
print("counter | val_mse  |  val_mae  |  val_roc  |  val_cos  |  val_dist  |  val_r2    | feature_count ")
print("-------------------------------------------------------------------------------------------------")

# Heuristic minimum validation R^2 a fold model must reach for its test
# predictions to be kept; any other metric/combination could be used instead.
r2_threshold = 0.185

# Test-set predictions of the fold models that passed the threshold
fold_predictions = []

# Seeded so the folds (and therefore the submission) are reproducible
fold_splitter = StratifiedKFold(n_splits=20, shuffle=True, random_state=42)

# split training data to build one model on each training-data subset
for counter, (train_index, val_index) in enumerate(fold_splitter.split(variables_train, var_resp)):
    X, val_X = variables_train[train_index], variables_train[val_index]
    # The splitter yields row positions, so index the Series positionally
    y, val_y = var_resp.iloc[train_index], var_resp.iloc[val_index]

    # get the best features for this data set
    feature_selector.fit(X, y)

    # remove irrelevant features from X, val_X and test
    X_important_features        = feature_selector.transform(X)
    val_X_important_features    = feature_selector.transform(val_X)
    test_important_features     = feature_selector.transform(variables_test)

    # run grid search to find the best model parameters for this subset of training data and subset of features
    grid_search = GridSearchCV(feature_selector.estimator_, param_grid=param_grid, verbose=0, n_jobs=-1, scoring=robust_roc_auc, cv=20)
    grid_search.fit(X_important_features, y)

    # score our fitted model on validation data
    val_y_pred = grid_search.best_estimator_.predict_proba(val_X_important_features)[:,1]
    val_mse = mean_squared_error(val_y, val_y_pred)
    val_mae = mean_absolute_error(val_y, val_y_pred)
    val_roc = roc_auc_score(val_y, val_y_pred)
    val_cos = cosine_similarity(val_y.values.reshape(1, -1), val_y_pred.reshape(1, -1))[0][0]
    val_dst = euclidean_distances(val_y.values.reshape(1, -1), val_y_pred.reshape(1, -1))[0][0]
    val_r2  = r2_score(val_y, val_y_pred)

    # if model did well on validation, save its prediction on test data, using only important features
    if val_r2 > r2_threshold:
        message = '<-- OK'
        prediction = grid_search.best_estimator_.predict_proba(test_important_features)[:,1]
        fold_predictions.append(pd.DataFrame(prediction))
    else:
        message = '<-- skipping'

    print("{0:2}      | {1:.4f}   |  {2:.4f}   |  {3:.4f}   |  {4:.4f}   |  {5:.4f}    |  {6:.4f}    |  {7:3}         {8}  ".format(counter, val_mse, val_mae, val_roc, val_cos, val_dst, val_r2, feature_selector.n_features_, message))

# One column per accepted fold model; an empty frame if none was accepted
predictions = pd.concat(fold_predictions, axis=1) if fold_predictions else pd.DataFrame()

In [None]:
# Average the accepted fold models' test predictions and write the submission.
if predictions.empty:
    # Without this check a header-only (invalid) submission would be written
    raise ValueError("no fold model passed the validation threshold; nothing to submit")

# Label rows with the ids from the test table rather than a hard-coded offset
mean_pred = pd.DataFrame(
    {'target': predictions.mean(axis=1).values},
    index=test['id'].values,
)
mean_pred.to_csv('submission.csv', index_label='id', index=True)