In [None]:
import sys, os
import pandas as pd
import numpy as np

from sklearn.linear_model import Lasso
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score, r2_score, make_scorer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import matplotlib.pyplot as plt
import warnings


def warn(*args, **kwargs):
    """No-op replacement for ``warnings.warn``: accepts any arguments, does nothing."""
    return None


# Install the no-op so that code which looks up ``warnings.warn`` at call time
# emits nothing for the rest of the session.
warnings.warn = warn

In [None]:
# --- Reproducibility -------------------------------------------------------
random_seed = 213
np.random.seed(random_seed)  # seeds NumPy's global RNG

# --- Recursive feature elimination (RFECV) ---------------------------------
rfe_min_features = 12   # passed as min_features_to_select
rfe_step = 15           # passed as step
rfe_cv = 20             # passed as cv

# --- Outer resampling (StratifiedShuffleSplit) -----------------------------
sss_n_splits = 20
sss_test_size = 0.35

# --- Hyper-parameter search -------------------------------------------------
# NOTE(review): presumably the fold count for the per-split GridSearchCV;
# confirm the search cell actually reads this value.
grid_search_cv = 20

# --- Noise injection / model acceptance -------------------------------------
noise_std = 0.01        # std-dev of the Gaussian noise added to the training features
r2_threshold = 0.185    # minimum validation R^2 for a model to join the ensemble

In [None]:
# Load the competition data.
# train   : full training frame (kept as a DataFrame for the exploration cells)
# train_y : the 'target' column as a pandas Series
# train_X : every column except 'id' and 'target', as a NumPy array
train = pd.read_csv('../input/older-dataset-for-dont-overfit-ii-challenge/train.csv')
train_y = train['target']
train_X = train.drop(columns=['id', 'target']).values

# Despite its name, test_df ends up as a NumPy array: the 'id' column is
# dropped and only the raw values are kept.
test_df = (
    pd.read_csv('../input/older-dataset-for-dont-overfit-ii-challenge/test.csv')
    .drop(columns=['id'])
    .values
)

In [None]:
# Preview the first rows of the training frame (includes the 'id' and 'target' columns).
train.head()

In [None]:
# Print a structural summary of the training frame.
train.info()

In [None]:
# Count the distinct values in each column of the training frame.
train.nunique()

In [None]:
# Compare the number of rows available for training vs. testing
# (the training set is very small relative to the test set).
row_counts = (train_X.shape[0], test_df.shape[0])
positions = range(len(row_counts))

fig, ax = plt.subplots()
ax.bar(positions, row_counts, align='center', alpha=0.8)
ax.set_xticks(positions)
ax.set_xticklabels(('train', 'test'))
ax.set_ylabel('Number of data')
ax.set_title('Can we avoid overfitting')
plt.show()


In [None]:
# Histograms of the first 25 feature columns ('0' .. '24') on a 5x5 grid.
# The author's reading of these plots: each feature looks roughly Gaussian,
# centred near 0 with a standard deviation near 1.
plt.figure(figsize=(15, 15))
for col in range(25):
    plt.subplot(5, 5, col + 1)
    plt.hist(train[str(col)], bins=100)
    plt.title('Column ' + str(col))
plt.show()

In [None]:
# Check the average per-feature mean and standard deviation on the training data
# (expected to be close to 0 and 1 respectively).
#
# 'id' and 'target' are dropped first: they are not features and would otherwise
# be mixed into the averages. The mean over columns is taken with .mean() rather
# than dividing by a hard-coded column count.
train_features = train.drop(columns=['id', 'target'])
print(train_features.mean().mean())
print(train_features.std().mean())

In [None]:
# Check the average per-feature mean and standard deviation on the test data.
#
# test_df holds raw values rather than a DataFrame at this point, so the
# statistics are taken per column explicitly (axis=0) and then averaged.
# Calling .mean()/.std() without an axis on an array collapses everything to one
# scalar, and dividing that scalar by the column count gives a meaningless number.
# ddof=1 matches the sample standard deviation pandas reports for the training frame.
test_values = np.asarray(test_df)
print(test_values.mean(axis=0).mean())
print(test_values.std(axis=0, ddof=1).mean())

In [None]:
# Scale train and test together with a single RobustScaler, then split the
# result back into its two parts.
#
# The split point is taken from the actual number of training rows instead of a
# hard-coded 250, so the cell keeps working if the training set size changes.
# NOTE: this cell overwrites train_X / test_df; re-running it without reloading
# the data scales the already-scaled values and adds noise a second time.
n_train = train_X.shape[0]
data = RobustScaler().fit_transform(np.concatenate((train_X, test_df), axis=0))
train_X = data[:n_train]
test_df = data[n_train:]

# Add a little Gaussian noise to the training features to reduce overfitting.
train_X += np.random.normal(0, noise_std, train_X.shape)


In [None]:
# Fail-safe ROC AUC metric.
def scoring_roc_auc(y, y_pred):
    """Return the ROC AUC of ``y_pred`` against ``y``, or 0.5 when it is undefined.

    Parameters
    ----------
    y : array-like
        True binary labels.
    y_pred : array-like
        Predicted scores.

    Returns
    -------
    float
        ``roc_auc_score(y, y_pred)``; the chance-level value 0.5 if
        ``roc_auc_score`` rejects the inputs with a ``ValueError``
        (e.g. a fold that contains a single class).

    Only ``ValueError`` is treated as "score undefined". Any other exception
    (including ``KeyboardInterrupt``) propagates instead of being silently
    turned into a plausible-looking score.
    """
    try:
        return roc_auc_score(y, y_pred)
    except ValueError:
        return 0.5

# Wrap the fail-safe AUC function as a scorer object; it is passed as `scoring=`
# to the GridSearchCV and RFECV instances created in later cells.
robust_roc_auc = make_scorer(scoring_roc_auc)


In [None]:
from sklearn.ensemble import ExtraTreesClassifier
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Pairwise correlation matrix of the feature columns ('id' and 'target' excluded).
# Note: no heatmap is drawn here; the matrix itself is the cell output.
feature_corr = train.drop(columns=['id', 'target']).corr()
feature_corr

In [None]:
# Grid-search a tree ensemble on the training data.
#
# NOTE: the variable names say "HistGradient", but the estimator actually built
# here is an ExtraTreesClassifier. The names are kept as they are because other
# cells may refer to them.
# NOTE: train_X is whatever the earlier cells left in it; the scaling cell
# overwrites it with scaled, noise-perturbed values.
HistGradient = ExtraTreesClassifier(warm_start=True)

param = dict(
    n_estimators=[250, 500],
    max_depth=[11],
    min_samples_split=[9],
    min_samples_leaf=[9],
)

gridSearch_HistGradient = GridSearchCV(
    estimator=HistGradient,
    param_grid=param,
    scoring=robust_roc_auc,
    cv=7,
    verbose=3,
)
gridSearch_HistGradient.fit(train_X, train_y)

best_HistGradient = gridSearch_HistGradient.best_estimator_
# Despite the "testScore" name, this score is computed on the training data.
bestHistGradient_testScore = best_HistGradient.score(train_X, train_y)


In [None]:
# from sklearn.inspection import permutation_importance
# r = permutation_importance(gridSearch_HistGradient, train_X, train_y,
#                             n_repeats=30)

# for i in r.importances_mean.argsort()[::-1]:

#     print(f"{X_train.columns[i]} "
#            f"{r.importances_mean[i]:.3f} "
#            f" +/- {r.importances_std[i]:.3f}")
# plt.figure(figsize=(10,7))
# plt.barh(X_train.columns, r.importances_mean)

# Feature selection  using RFECV on Lasso 

In [None]:
# Base estimator: Lasso, i.e. linear regression with an L1 regularisation penalty.
model = Lasso(
    alpha=0.031,
    tol=0.01,
    warm_start=True,
    random_state=random_seed,
    selection='random',
)

# Candidate hyper-parameters for the grid search.
param_grid = {
    'alpha': [0.022, 0.021, 0.02, 0.019, 0.023, 0.024, 0.025, 0.026, 0.027, 0.029, 0.031],
    'tol': [0.0013, 0.0014, 0.001, 0.0015, 0.0011, 0.0012, 0.0016, 0.0017],
}

# Recursive feature elimination with cross-validation, driven by the Lasso model
# and scored with the fail-safe ROC AUC scorer.
feature_selector = RFECV(
    estimator=model,
    step=rfe_step,
    min_features_to_select=rfe_min_features,
    cv=rfe_cv,
    scoring=robust_roc_auc,
    verbose=0,
    n_jobs=-1,
)


In [None]:
# Train one model per stratified shuffle split of the training data:
#   1. select features with RFECV on the split's training part,
#   2. grid-search the Lasso hyper-parameters on the selected features,
#   3. score the best model on the split's validation part,
#   4. keep its test-set prediction only if validation R^2 exceeds r2_threshold.
# The kept predictions are assembled into `predictions`, one column per accepted model.
predictions = pd.DataFrame()
accepted_predictions = []  # collected here and concatenated once after the loop
counter = 0
table_rule = "-------------------------------------------------------------------------------------------------"
print("counter | val_mse  |  val_mae  |  val_roc  |  val_cos  |  val_dist  |  val_r2    | feature_count ")
print(table_rule)

splitter = StratifiedShuffleSplit(n_splits=sss_n_splits, test_size=sss_test_size, random_state=random_seed)
for train_index, val_index in splitter.split(train_X, train_y):
    X, val_X = train_X[train_index], train_X[val_index]
    # The splitter yields positional indices, so index the Series by position
    # (.iloc) rather than by label.
    y, val_y = train_y.iloc[train_index], train_y.iloc[val_index]

    # get the best features for this data set
    feature_selector.fit(X, y)
    # remove irrelevant features from X, val_X and test
    X_important_features = feature_selector.transform(X)
    val_X_important_features = feature_selector.transform(val_X)
    test_important_features = feature_selector.transform(test_df)

    # run grid search to find the best Lasso parameters for this subset of
    # training data and subset of features; the fold count comes from the
    # grid_search_cv config constant instead of a hard-coded literal
    grid_search = GridSearchCV(
        feature_selector.estimator_,
        param_grid=param_grid,
        verbose=0,
        n_jobs=-1,
        scoring=robust_roc_auc,
        cv=grid_search_cv,
    )
    grid_search.fit(X_important_features, y)
    best_model = grid_search.best_estimator_

    # score the fitted model on validation data
    val_y_pred = best_model.predict(val_X_important_features)
    val_mse = mean_squared_error(val_y, val_y_pred)
    val_mae = mean_absolute_error(val_y, val_y_pred)
    val_roc = roc_auc_score(val_y, val_y_pred)
    val_y_row = val_y.values.reshape(1, -1)
    val_pred_row = val_y_pred.reshape(1, -1)
    val_cos = cosine_similarity(val_y_row, val_pred_row)[0][0]
    val_dst = euclidean_distances(val_y_row, val_pred_row)[0][0]
    val_r2 = r2_score(val_y, val_y_pred)

    # if the model did well on validation, save its prediction on test data,
    # using only the important features.
    # r2_threshold is a heuristic cut-off on validation R^2; any other
    # metric/metric combination could be used instead.
    if val_r2 > r2_threshold:
        message = '<-- OK'
        prediction = best_model.predict(test_important_features)
        accepted_predictions.append(pd.DataFrame(prediction))
    else:
        message = '<-- skipping'

    print("{0:2}      | {1:.4f}   |  {2:.4f}   |  {3:.4f}   |  {4:.4f}   |  {5:.4f}    |  {6:.4f}    |  {7:3}         {8}  ".format(counter, val_mse, val_mae, val_roc, val_cos, val_dst, val_r2, feature_selector.n_features_, message))

    counter += 1

# Build the prediction table in one go (one column per accepted model) rather
# than re-copying a growing DataFrame on every iteration.
if accepted_predictions:
    predictions = pd.concat(accepted_predictions, axis=1)

print(table_rule)
print("{}/{} models passed validation threshold and will be ensembled.".format(len(predictions.columns), sss_n_splits))


In [None]:
# Average the accepted models' test predictions row-wise and write the submission file.
mean_pred = predictions.mean(axis=1).to_frame(name='target')
# Shift the row index so it can serve as the 'id' column.
# NOTE(review): 250 is presumably the first test id; confirm against test.csv.
mean_pred.index = mean_pred.index + 250
mean_pred.to_csv('submission.csv', index=True, index_label='id')