In [None]:
import sys, os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.metrics import  roc_auc_score ,make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [None]:
# Training data: the full frame, the label column, and a plain feature matrix
# (every column except 'id' and 'target').
train = pd.read_csv('../input/dont-overfit-ii/train.csv')
train_y = train['target']
train_X = train.drop(columns=['id', 'target']).to_numpy()

# Test data: the full frame (ids included) and a feature-only matrix.
test_df = pd.read_csv('../input/dont-overfit-ii/test.csv')
test = test_df.drop(columns=['id']).to_numpy()

In [None]:
# Class balance: number of training rows per value of 'target'.
train['target'].value_counts()

In [None]:
# Column dtypes, non-null counts and memory usage of the training frame.
train.info()

In [None]:
# Column dtypes, non-null counts and memory usage of the test frame.
test_df.info()

In [None]:
# Count missing values in each column of the training frame.
train.isnull().sum()

# Disabled alternative: missing values per column as a percentage of the row
# count, for either the train or the test frame, printed one value per line.
# s=train.isnull().sum()*100/len(train)
# s=test_df.isnull().sum()*100/len(test_df)

# for i in s:
#     print(i)

In [None]:
# Count missing values in each column of the test frame.
test_df.isnull().sum()

In [None]:
# Number of training rows that are exact duplicates of an earlier row.
print(train.duplicated().sum())
# Disabled: the same check for the test frame.
# print(test_df.duplicated().sum())



In [None]:
# Bar chart comparing how many rows are available for training vs. testing.
split_labels = ('train', 'test')
row_counts = (train.shape[0], test.shape[0])
bar_positions = range(2)

plt.bar(bar_positions, row_counts, align='center', alpha=0.8)
plt.xticks(bar_positions, split_labels)
plt.ylabel('Number of data')
plt.title('train test counts')
plt.show()

In [None]:
# Average per-feature mean and standard deviation of the training features.
# 'id' and 'target' are dropped first so that only feature columns contribute,
# and the average is taken over the actual number of feature columns instead of
# a hard-coded 300.
train_features = train.drop(columns=['id', 'target'])
print(train_features.mean().mean())
print(train_features.std().mean())

In [None]:
# Average per-feature mean and standard deviation of the test features.
# `test` is a NumPy array, so a bare `.mean()` / `.std()` would reduce over the
# whole array to a single scalar; reduce per column (axis=0) first and then
# average across columns. ddof=1 matches the pandas default for `.std()`.
print(test.mean(axis=0).mean())
print(test.std(axis=0, ddof=1).mean())

In [None]:
# The train and test distributions appear to differ.
# This can cause overfitting: the model is fitted on data drawn from one distribution but evaluated on data drawn from another.

In [None]:
# Box plot of the training frame, one box per column, skipping the first two
# columns of the frame.
fig = plt.figure(figsize=(10, 7))
train_columns_to_plot = train.iloc[:, 2:]
plt.boxplot(train_columns_to_plot)
plt.show()

In [None]:
# Box plot of the test frame, one box per column, skipping the first column of
# the frame.
fig = plt.figure(figsize=(10, 7))
test_columns_to_plot = test_df.iloc[:, 1:]
plt.boxplot(test_columns_to_plot)
plt.show()

In [None]:
# Scale train and test features together with a single RobustScaler.
# The raw test matrix is rebuilt from `test_df` so that re-running this cell
# never scales an already scaled `test` a second time.
# NOTE: from here on `train` is a NumPy array of scaled features and no longer
# the DataFrame read from train.csv.
n_train = train_X.shape[0]
test_raw = test_df.drop(['id'], axis=1).values
data = RobustScaler().fit_transform(np.concatenate((train_X, test_raw), axis=0))
train = data[:n_train]
# Add a bit of Gaussian noise to the scaled training rows to reduce overfitting.
# A seeded generator keeps the noise (and everything downstream) reproducible.
noise_rng = np.random.default_rng(213)
train += noise_rng.normal(0, 0.01, train.shape)
test = data[n_train:]

In [None]:
# Show what kind of object `train` refers to at this point in the notebook.
type(train)

In [None]:
# Average per-feature mean and standard deviation of the scaled training data.
# `train` is a NumPy array here, so reduce per column (axis=0) and then average
# across columns; a bare `.mean()` would already be one global scalar and
# dividing it by 300 would shrink it 300-fold.
print(train.mean(axis=0).mean())
print(train.std(axis=0, ddof=1).mean())

In [None]:
# Average per-feature mean and standard deviation of the scaled test data.
# Reduce per column (axis=0) and then average across columns; a bare `.mean()`
# on a NumPy array is already one global scalar, so dividing it by 300 would
# shrink it 300-fold.
print(test.mean(axis=0).mean())
print(test.std(axis=0, ddof=1).mean())

In [None]:
# Wrap the current `train` / `test` arrays in DataFrames for easier inspection,
# then preview the first rows of the training one.
df_train = pd.DataFrame(train)
df_test = pd.DataFrame(test)
df_train.head()

In [None]:

def scoring_roc_auc(y, y_pred):
    """ROC AUC that degrades to chance level instead of raising.

    Parameters
    ----------
    y : array-like
        True labels.
    y_pred : array-like
        Predicted scores or labels.

    Returns
    -------
    float
        ``roc_auc_score(y, y_pred)``, or 0.5 when that call raises
        ``ValueError`` (presumably a split whose labels make the AUC
        undefined, such as a single-class fold).
    """
    try:
        return roc_auc_score(y, y_pred)
    except ValueError:
        # Only swallow the "score is undefined" case; programming errors and
        # KeyboardInterrupt must still propagate.
        return 0.5


In [None]:
# Hyper-parameter grids for GridSearchCV, keyed by the position of the
# matching estimator in `listm` (0: Lasso, 1: LogisticRegression,
# 2: RandomForestClassifier).
ned = {
    0: {
        'alpha': [0.022, 0.021, 0.02, 0.019, 0.023, 0.024, 0.025, 0.026, 0.027, 0.029, 0.031],
        'tol': [0.0013, 0.0014, 0.001, 0.0015, 0.0011, 0.0012, 0.0016, 0.0017],
    },
    1: {
        "C": np.logspace(-3, 3, 7),
        "penalty": ["l1", "l2"],
        'max_iter': [1000, 500],
    },
    2: {
        'n_estimators': [100, 300],
        # 'auto' is rejected by scikit-learn >= 1.3; for classifiers it was an
        # alias of 'sqrt', so dropping it loses no candidate.
        'max_features': ['sqrt', 'log2'],
        'max_depth': [6, 8, 10],
        'criterion': ['gini', 'entropy'],
    },
}
              
  

In [None]:
# Candidate estimators. Their order must match the keys of the `ned`
# parameter-grid dictionary (0: Lasso, 1: LogisticRegression, 2: random forest).
modelLasso = Lasso(alpha=0.031, tol=0.01, random_state=213, selection='random')
lr = LogisticRegression(solver='liblinear', max_iter=1000, class_weight="balanced", C=0.1, penalty="l1")

# Seeded like every other stochastic component so that runs are reproducible.
rfc = RandomForestClassifier(random_state=213)
listm = [modelLasso, lr, rfc]

In [None]:
# Display the second candidate estimator in `listm`.
listm[1]

In [None]:
# Compare the candidate models with repeated stratified hold-out validation.
# For every split: select features with RFECV, tune the model with GridSearchCV
# on the selected features, then report ROC AUC on the held-out rows.
#
# The scaled `train` array is used instead of the raw `train_X`, because the
# `test` matrix passed through the same feature selector has been scaled too.
splitter = StratifiedShuffleSplit(n_splits=20, test_size=0.35, random_state=213)


def _validation_scores(estimator, features):
    """Return continuous scores for ROC AUC, falling back to ``predict``."""
    if hasattr(estimator, "predict_proba"):
        # Probability of the positive class (second column).
        return estimator.predict_proba(features)[:, 1]
    if hasattr(estimator, "decision_function"):
        return estimator.decision_function(features)
    return estimator.predict(features)


for i in range(len(listm)):

    feature_selector = RFECV(listm[i], min_features_to_select=12, scoring=make_scorer(scoring_roc_auc), step=15, verbose=0, cv=20, n_jobs=-1)
    print("model", listm[i], ned[i])

    # A fresh split generator per model; the fixed random_state gives every
    # model the same 20 train/validation partitions.
    kfolds = splitter.split(train, train_y)
    for ti, vi in kfolds:
        X, val_X = train[ti], train[vi]
        # Positional indexing: ti / vi are row positions, not index labels.
        y, val_y = train_y.iloc[ti], train_y.iloc[vi]

        # get the best features for this data set
        feature_selector.fit(X, y)
        # remove irrelevant features from X, val_X and test
        X_selected = feature_selector.transform(X)
        val_X_selected = feature_selector.transform(val_X)
        test_selected = feature_selector.transform(test)

        # run grid search to find the best model parameters for this subset of training data and subset of features
        grid_search = GridSearchCV(feature_selector.estimator_, param_grid=ned[i], verbose=0, n_jobs=-1, scoring=make_scorer(scoring_roc_auc), cv=20)
        grid_search.fit(X_selected, y)

        # score our fitted model on validation data, using continuous scores
        # where the estimator provides them (hard labels flatten the ROC curve)
        val_y_pred = _validation_scores(grid_search.best_estimator_, val_X_selected)
        val_roc = roc_auc_score(val_y, val_y_pred)
        print("model","val_roc",val_roc,"param",grid_search.best_params_)
        print("*--------------------------------------------------------------------------------------*")

    print("*--------------------------------------another model------------------------------------------------*")

In [None]:
# Best Lasso setting recorded from an earlier run of the model comparison:
#model val_roc 0.7819010416666666 param {'alpha': 0.019, 'tol': 0.0013}
modelLasso = Lasso(alpha=0.019, tol=0.0013, random_state=213, selection='random')

# The scaled `train` array is used instead of the raw `train_X`, because the
# `test` matrix passed through the same feature selector has been scaled too.
kfolds = StratifiedShuffleSplit(n_splits=20, test_size=0.35, random_state=213).split(train, train_y)
feature_selector = RFECV(modelLasso, min_features_to_select=12, scoring=make_scorer(scoring_roc_auc), step=15, verbose=0, cv=20, n_jobs=-1)

# NOTE: `modelLasso` and `test_selected` are overwritten on every pass, so after
# the loop they hold the model and the selected test features of the LAST split.
for ti, vi in kfolds:
    X, val_X = train[ti], train[vi]
    # Positional indexing: ti / vi are row positions, not index labels.
    y, val_y = train_y.iloc[ti], train_y.iloc[vi]

    # get the best features for this data set
    feature_selector.fit(X, y)
    # remove irrelevant features from X, val_X and test
    X_selected = feature_selector.transform(X)
    val_X_selected = feature_selector.transform(val_X)
    test_selected = feature_selector.transform(test)

    # fit the fixed-parameter Lasso on the selected features of this split
    modelLasso.fit(X_selected, y)

    # score our fitted model on validation data
    val_y_pred = modelLasso.predict(val_X_selected)
    val_roc = roc_auc_score(val_y, val_y_pred)
    print("model","val_roc",val_roc)
    print("*--------------------------------------------------------------------------------------*")
    
    

In [None]:
# Predict the test set with the current `modelLasso` on the current
# `test_selected` features and write the submission file.
prediction = modelLasso.predict(test_selected)

# Index the predictions by the real ids from the test file rather than assuming
# that they start at a hard-coded offset.
mean_pred = pd.DataFrame(
    {'target': prediction},
    index=pd.Index(test_df['id'], name='id'),
)
mean_pred.to_csv('submission.csv', index_label='id', index=True)

# Preview what was written (last expression, so the notebook displays it).
mean_pred.head()