In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which files are present in the "../input" directory.
input_files = os.listdir("../input")
print(input_files)

# Any results you write to the current directory are saved as output.

In [None]:

# Data processing, metrics and modeling
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score, r2_score, make_scorer
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

#ignore warning messages 
import warnings
warnings.filterwarnings('ignore') 

In [None]:
# Read the training and test tables from the input directory.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line.
train.info()

In [None]:
# Pull out the row identifiers and the labels (both stay pandas Series),
# then reduce each frame to a bare NumPy feature matrix.
g_id = test['id']
train_y = train['target']
train_X = train.drop(columns=['id', 'target']).values
# From here on `test` is the feature array, no longer the DataFrame.
test = test.drop(columns=['id']).values

In [None]:
# Standardise the features with statistics computed over the stacked
# train + test rows, then split the result back into the two sets.
# The split point is taken from train_X itself (instead of a hard-coded 250)
# so the slices stay correct if the number of training rows changes.
# NOTE(review): the scaler is fitted on test rows as well as training rows;
# confirm this is intended.
n_train = train_X.shape[0]
scaler = StandardScaler()
data = scaler.fit_transform(np.concatenate((train_X, test), axis=0))
train_X = data[:n_train]
test = data[n_train:]

> 

In [None]:
# add noise
# Gaussian jitter (mean 0, std 0.01) is added to every training feature.
# It is drawn from a dedicated, seeded generator so that a fresh
# "Restart & Run All" reproduces exactly the same perturbed matrix;
# the unseeded global generator gave different results on every run.
noise_rng = np.random.RandomState(0)
train_X += noise_rng.normal(0, 0.01, train_X.shape)

In [None]:
# define roc_auc_metric
def scoring_roc_auc(y, y_pred):
    """Return the ROC AUC of ``y_pred`` against ``y``, or 0.5 if it is undefined.

    Parameters:
        y: ground-truth labels.
        y_pred: predictions to be ranked against ``y``.

    Returns:
        The value of ``roc_auc_score(y, y_pred)``. When that call raises
        ``ValueError``, the chance-level value 0.5 is returned instead so
        that scoring can continue.
    """
    try:
        return roc_auc_score(y, y_pred)
    except ValueError:
        # Only the metric's own "cannot compute" error is absorbed; a bare
        # except would also hide KeyboardInterrupt and genuine bugs.
        return 0.5
# Scorer object wrapping the function above for use as a `scoring=` argument.
# NOTE(review): make_scorer is called with its defaults; check its
# documentation for which estimator output (labels or scores) is passed in.
standard_roc_auc = make_scorer(scoring_roc_auc)

In [None]:
# define model
# Starting estimator: L1-penalised logistic regression on the liblinear solver.
model = LogisticRegression(
    penalty='l1',
    C=0.1,
    solver='liblinear',
    random_state=0,
)

# Candidate hyper-parameter values, one list per LogisticRegression argument.
param_grid = dict(
    class_weight=['balanced', None],
    penalty=['l2', 'l1'],
    C=[0.001, 0.01, 0.08, 0.1, 0.15, 1.0, 10.0, 100.0],
)

In [None]:
# feature select
# Recursive feature elimination with cross-validation (cv=10): one feature
# is removed per step and at least 12 features are always kept.
rfecv_options = dict(
    min_features_to_select=12,
    step=1,
    cv=10,
    scoring=standard_roc_auc,
    verbose=0,
    n_jobs=-1,
)
feature_selector = RFECV(model, **rfecv_options)

In [None]:
# Fit the selector on all training rows and keep only the columns it retains.
X_important_features = feature_selector.fit(train_X, train_y).transform(train_X)

# Search the hyper-parameter grid on the reduced feature set (cv=20),
# starting from the estimator the selector ended up with.
search_options = dict(
    param_grid=param_grid,
    scoring=standard_roc_auc,
    cv=20,
    n_jobs=-1,
    verbose=0,
)
grid_search = GridSearchCV(feature_selector.estimator_, **search_options)
grid_search.fit(X_important_features, train_y)

# Last expression of the cell: displays the winning parameter combination.
grid_search.best_params_


In [None]:
# Build a fresh, unfitted classifier from the parameters the grid search chose.
best_params = grid_search.best_params_
new_model = LogisticRegression(
    penalty=best_params.get('penalty'),
    C=best_params.get('C'),
    class_weight=best_params.get('class_weight'),
    solver='liblinear',
    random_state=0,
)

In [None]:
# Evaluate the tuned model on 10 stratified random 70/30 splits of the
# training data, print the averaged metrics, then predict the test set.
count = 0
val_mse = 0.0
val_mae = 0.0
val_roc = 0.0
val_r2  = 0.0

# The selected test columns do not depend on the split, so compute them once
# instead of on every iteration.
test_important_features = feature_selector.transform(test)

splitter = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=13)
for train_index, val_index in splitter.split(train_X, train_y):
    X, val_X = train_X[train_index], train_X[val_index]
    # The splitter yields positions, so the label Series is indexed with
    # .iloc rather than by label.
    y, val_y = train_y.iloc[train_index], train_y.iloc[val_index]

    X_important_features     = feature_selector.transform(X)
    val_X_important_features = feature_selector.transform(val_X)

    new_model.fit(X_important_features, y)
    val_y_pred = new_model.predict(val_X_important_features)
    # ROC AUC ranks samples by score, so it is computed from the probability
    # of the second class in `classes_`, not from hard labels.
    val_y_score = new_model.predict_proba(val_X_important_features)[:, 1]

    val_mse += mean_squared_error(val_y, val_y_pred)
    val_mae += mean_absolute_error(val_y, val_y_pred)
    val_roc += roc_auc_score(val_y, val_y_score)
    val_r2  += r2_score(val_y, val_y_pred)

    count += 1

# NOTE(review): these labels come from the model fitted on the last split
# only (70% of the training rows); confirm that is the intended final model.
predictions = new_model.predict(test_important_features)
print(val_mse/count, val_mae/count, val_roc/count, val_r2/count)
print(predictions)

In [None]:
# Pair each test id with its predicted target and write the result to disk.
submission_columns = {"id": g_id, "target": predictions}
submission = pd.DataFrame(submission_columns)
print(submission)
# index=False keeps the DataFrame index out of the CSV file.
submission.to_csv('submission.csv', index=False)