In [None]:
from copy import deepcopy

import numpy as np
import pandas as pd
from imblearn.over_sampling import ADASYN
from sklearn import metrics
from sklearn.feature_selection import RFECV
from sklearn.linear_model import Lasso
from sklearn.metrics import roc_auc_score, make_scorer, r2_score
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.preprocessing import RobustScaler


In [None]:
# Seed NumPy's global RNG once so the random draws made below are repeatable.
random_state = 213
np.random.seed(random_state)

# Read both competition CSVs, using their `id` column as the row index.
df_test = pd.read_csv('../input/older-dataset-for-dont-overfit-ii-challenge/test.csv', index_col='id')
df_train = pd.read_csv('../input/older-dataset-for-dont-overfit-ii-challenge/train.csv', index_col='id')

In [None]:
# Print pandas' summary of the raw training frame.
df_train.info()

In [None]:
# Display per-column summary statistics of the raw training frame.
df_train.describe()

In [None]:
# Split the training frame into the feature matrix and the label column.
train_columns = df_train.columns.tolist()
train_columns.remove('target')  # every column except the label is a feature
x_train = df_train.loc[:, train_columns]
y_train = df_train['target']

In [None]:
# Oversample the training data with ADASYN.
# - The inputs are rebuilt from `df_train`, so re-running this cell does not
#   resample data that has already been resampled.
# - `random_state` is passed explicitly so the synthetic rows do not depend on
#   how much of NumPy's global random stream earlier cells have consumed.
# NOTE(review): oversampling happens before the train/validation splits made
# later in the notebook, so synthetic rows derived from validation rows can end
# up in a training split; validation scores may be optimistic.
x_train, y_train = ADASYN(random_state=random_state).fit_resample(
    df_train[train_columns], df_train['target']
)


In [None]:
# Print the summary of the feature frame after oversampling.
x_train.info()

In [None]:
# Scale the features with RobustScaler. Train and test rows are stacked before
# fitting, so the scaler's statistics come from both sets together.
n_train_rows = x_train.shape[0]
stacked_features = np.concatenate((x_train, df_test), axis=0)
temp = RobustScaler().fit_transform(stacked_features)
scaled_x_train = temp[:n_train_rows]  # leading rows: training data
scaled_x_test = temp[n_train_rows:]   # remaining rows: test data


In [None]:
# Jitter the scaled training features with zero-mean Gaussian noise (std 0.01).
# The addition is in place: `scaled_x_train` is a slice of `temp`, so `temp` is
# modified too, and running this cell again adds a second layer of noise.
noise = np.random.normal(loc=0, scale=0.01, size=scaled_x_train.shape)
scaled_x_train += noise


In [None]:
# Build the ROC-AUC scorer object that later cells pass as `scoring=`.
# The metric is looked up through the `metrics` module rather than through the
# name being assigned, so re-running this cell rebuilds the same scorer instead
# of wrapping the previous scorer in another make_scorer() call.
# NOTE: after this cell the name `roc_auc_score` refers to the scorer, not to
# the raw metric function; later cells rely on that.
roc_auc_score = make_scorer(metrics.roc_auc_score)


In [None]:
# Recursive feature elimination with cross-validation: a Lasso model ranks the
# features and each candidate subset is scored with the ROC-AUC scorer.
# `max_iter` is written as an int because scikit-learn releases that validate
# estimator parameters reject a float such as 1e5.
feature_selector = RFECV(
    Lasso(alpha=.09, max_iter=100_000),
    min_features_to_select=20,
    scoring=roc_auc_score,
    step=5,
    verbose=0,
    cv=5,
    n_jobs=-1,
)

In [None]:
# Results collected from the splits that pass the overfitting check below.
# The four lists are index-aligned: entry k of each belongs to the same split.
estimators = []
train_indexs = []
feature_selectors = []
diff = []

# The search space is the same for every split, so it is built once.
lasso_param_grid = {
    'alpha': list(np.arange(0.001, 0.1, .001)),
    'tol': list(np.arange(0.001, 0.01, 0.001)),
}

# Labels as a plain array so split indices are always applied by position,
# whether the resampler returned a pandas Series or a NumPy array.
y_train_values = np.asarray(y_train)

# For each stratified shuffle split: select features with RFECV, grid-search a
# Lasso on the selected features, and keep the result only when the train score
# is less than the validation score plus 0.06.
splitter = StratifiedShuffleSplit(n_splits=15, test_size=.2, random_state=123)
for train_index, val_index in splitter.split(scaled_x_train, y_train_values):
    X, val_X = scaled_x_train[train_index], scaled_x_train[val_index]
    y, val_y = y_train_values[train_index], y_train_values[val_index]

    # Feature selection is fitted on this split's training part only.
    feature_selector.fit(X, y)
    X = feature_selector.transform(X)
    val_X = feature_selector.transform(val_X)

    # max_iter is an int: scikit-learn's parameter validation rejects 1e5.
    lasso_grid = GridSearchCV(
        estimator=Lasso(max_iter=100_000),
        cv=5,
        param_grid=lasso_param_grid,
        scoring=roc_auc_score,
        verbose=4,
        n_jobs=-1,
    )

    lasso_grid.fit(X, y)
    print('Initial Best Score:', lasso_grid.best_score_)
    train_score = lasso_grid.score(X, y)
    val_score = lasso_grid.score(val_X, val_y)

    if train_score < val_score + .06:
        diff.append(train_score - val_score)
        print('<-- OK')
        print(f'train_score >> {train_score} || val_score >> {val_score}')
        estimators.append(lasso_grid.best_estimator_)
        train_indexs.append(train_index)
        # Store a snapshot. `feature_selector` is refitted on the next split, so
        # appending the object itself would leave every list entry pointing at
        # the selector fitted on the final split.
        feature_selectors.append(deepcopy(feature_selector))
    else:
        print('<-- skipping')
        print(f'train_score >> {train_score} || val_score >> {val_score}')

In [None]:
# Pick the kept split with the smallest (train - validation) score gap, refit
# its estimator on that split, and write the test predictions to submission.csv.
# NOTE(review): the gap is signed, so a split whose validation score exceeds its
# train score ranks ahead of one with a near-zero gap - confirm this is intended.
if not diff:
    raise ValueError('no split passed the train/validation score check; nothing to submit')

best_gap = min(diff)
print(best_gap)
i = diff.index(best_gap)

X = scaled_x_train[train_indexs[i]]
# Positional indexing, independent of whether y_train is a Series or an array.
y = np.asarray(y_train)[train_indexs[i]]
new_scaled_x_train = feature_selectors[i].transform(X)
new_scaled_x_test = feature_selectors[i].transform(scaled_x_test)
estimators[i].fit(new_scaled_x_train, y)
pred = estimators[i].predict(new_scaled_x_test)

# Label each prediction with the id read from test.csv instead of assuming the
# ids start at 250; rows of scaled_x_test are in the same order as df_test.
mean_pred = pd.DataFrame({'target': pred}, index=df_test.index)
mean_pred.to_csv('submission.csv', index_label='id', index=True)