In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, r2_score, make_scorer
from sklearn.model_selection import cross_val_score
from hyperopt import hp, tpe
from hyperopt.fmin import fmin

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

In [2]:
rfe_min_features = 12
rfe_step = 15
rfe_cv = 20
sss_n_splits = 12
sss_test_size = 0.35
grid_search_cv = 20
noise_std = 0.01
r2_threshold = 0.2
seed = 213

np.random.seed(seed)

In [3]:
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

train_y = train['target']
train_X = train.drop(['id','target'], axis=1).values
test = test.drop(['id'], axis=1).values

In [4]:
def scoring_roc_auc(y, y_pred):
    try:
        return roc_auc_score(y, y_pred)
    except:
        return 0.5

robust_roc_auc = make_scorer(scoring_roc_auc)

In [5]:
# RobustScaler
data = RobustScaler().fit_transform(np.concatenate((train_X, test), axis=0))
train_X = data[:250]
test = data[250:]

In [6]:
model = Lasso(alpha=0.031, tol=0.01, random_state = seed, selection='random')

In [7]:
hp_space = {
    'alpha': hp.uniform('alpha', low=0.012, high=0.051),
    'tol': hp.uniform('tol', low=0.0003, high=0.0027)
}

def hyperopt_xgb_score(params):
    clf = Lasso(**params)
    current_score = cross_val_score(clf, X, y, cv=3,scoring = robust_roc_auc).mean()
    print(current_score, params)
    return -current_score

In [8]:
# Feature selector
feature_selector = RFECV(model, min_features_to_select=rfe_min_features, scoring = robust_roc_auc,
                         step=rfe_step, verbose=0, cv=rfe_cv, n_jobs=-1)

predictions = pd.DataFrame()
counter = 0

# StratifiedShuffleSplit
for train_index, val_index in StratifiedShuffleSplit(n_splits=sss_n_splits, test_size=sss_test_size,
                                                     random_state=seed).split(train_X, train_y):
    X, val_X = train_X[train_index], train_X[val_index]
    y, val_y = train_y[train_index], train_y[val_index]

    # Feature selector
    feature_selector.fit(X, y)
    X_important_features = feature_selector.transform(X)
    val_X_important_features = feature_selector.transform(val_X)
    test_important_features = feature_selector.transform(test)
    
    # HyperPlot
    best = fmin(fn=hyperopt_xgb_score, space=hp_space, algo=tpe.suggest, max_evals=50,verbose=-1)
    
    # Lasso
    clf = Lasso(alpha=best['alpha'], copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=213,
      selection='random', tol=best['tol'], warm_start=False)
    clf.fit(X_important_features, y)
    
    # R2 score
    val_y_pred = clf.predict(val_X_important_features)
    r2  = r2_score(val_y, val_y_pred)

    # Threshold
    if r2 > r2_threshold:
        prediction = clf.predict(test_important_features)
        predictions = pd.concat([predictions, pd.DataFrame(prediction)], axis=1)
        
    counter += 1

print("{}/{} models ensembled".format(len(predictions.columns), counter))

0.6613365183296959
{'alpha': 0.026539178234602105, 'tol': 0.00121815028775451}
0.6649255052398327
{'alpha': 0.01975966621722492, 'tol': 0.002234002466612582}
0.7067518151054539
{'alpha': 0.05093971693728351, 'tol': 0.002638382124996949}
0.67073317163067
{'alpha': 0.037470523189209966, 'tol': 0.002570552286465036}
0.6712814172447051
{'alpha': 0.037094463656423424, 'tol': 0.0025058745640021782}
0.6818157857712763
{'alpha': 0.014381952265706398, 'tol': 0.0020166822088218696}
0.6995862906316123
{'alpha': 0.0472858611705409, 'tol': 0.00121855196305658}
0.6995862906316123
{'alpha': 0.04693933507979599, 'tol': 0.0011019633177982362}
0.6784185373082384
{'alpha': 0.014029225681028011, 'tol': 0.0012387469503489094}
0.6996772888748198
{'alpha': 0.04704338947575226, 'tol': 0.0020753708804304104}
0.6615498117325602
{'alpha': 0.0213698291876998, 'tol': 0.0011788086442504565}
0.6636949058570241
{'alpha': 0.020444760474955696, 'tol': 0.001567578918112136}
0.6608193845117173
{'alpha': 0.034206856614774

In [9]:
mean_pred = pd.DataFrame(predictions.mean(axis=1))
mean_pred.index += 250
mean_pred.columns = ['target']
mean_pred.to_csv('submission.csv', index_label='id', index=True)  