In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from xgboost import XGBRegressor, XGBClassifier
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, KFold, LeaveOneOut
from sklearn.metrics import mean_squared_error
from keras.layers import Dense, Input
from keras.models import Model
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestClassifier

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# evaluation function

def eval_auc(y_test, pred):
    """Print the area under the ROC curve and draw the curve.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels, forwarded to ``roc_curve`` as its first argument.
    pred : array-like
        Scores for the same samples, forwarded to ``roc_curve`` as its
        second argument.

    Returns
    -------
    None
        The AUC value is printed, not returned.

    Notes
    -----
    Drawing goes through the implicit ``pyplot`` interface and no new figure
    is opened here, so the curve lands on whatever figure is current.
    """
    fpr, tpr, _ = roc_curve(y_test, pred)
    area = auc(fpr, tpr)
    print(area)

    # ROC curve, labelled with the AUC rounded to two decimals
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % area)
    plt.legend(loc='lower right')

    # dashed diagonal = reference line of a random classifier
    plt.plot([0, 1], [0, 1], 'r--')

    # leave some margin around the unit square
    plt.xlim([-0.1, 1.2])
    plt.ylim([-0.1, 1.2])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')

# Read data

In [None]:
# Load the raw train and test tables from the Kaggle input directory.
train_set = pd.read_csv('../input/saftey_efficay_myopiaTrain.csv')
test_set = pd.read_csv('../input/saftey_efficay_myopiaTest.csv')

# Report (rows, columns) of each frame as loaded.
for shape_label, loaded_frame in (('train set shape:', train_set),
                                  ('test set shape:', test_set)):
    print(shape_label, loaded_frame.shape)

# Handle missing values

In [None]:
# Strings that are treated as "no value". They are converted to NaN first so
# that every NaN-based step below sees them as missing.
missing_markers = [' ', '', 'NaN', 'nan', 'None']
train_set = train_set.replace(missing_markers, np.nan)
test_set = test_set.replace(missing_markers, np.nan)

# Drop train rows that carry no data at all. This has to run after the
# normalisation above: before it, a row made up only of placeholder strings
# is not all-NaN yet and would survive the drop.
# Only the train set is filtered; test rows are never removed.
train_set = train_set.dropna(how='all')

# keep only the train columns in which less than half of the values are missing
# (.copy() so later column assignments write to an independent frame)
train_set = train_set.loc[:, train_set.isnull().mean() < 0.5].copy()

# split the remaining columns into categorical (object dtype) and numeric ones;
# the target column 'Class' is excluded from the numeric list
catigorial_columns = train_set.select_dtypes(include='object').columns
numeric_columns = train_set.columns[~train_set.columns.isin(np.append(catigorial_columns, 'Class'))]

# give the test set exactly the columns kept for the train set (minus the
# target), in the same order
test_columns = train_set.columns[~train_set.columns.isin(['Class'])]
test_set = test_set[test_columns].copy()

In [None]:
# Shapes after the missing-value filtering above.
shape_labels = ('train set shape:', 'test set shape:')
for shape_label, filtered_frame in zip(shape_labels, (train_set, test_set)):
    print(shape_label, filtered_frame.shape)

# Preprocess

In [None]:
# Work on private copies so the column assignments below never write into a
# slice of an earlier frame (avoids SettingWithCopyWarning).
train_set = train_set.copy()
test_set = test_set.copy()

# Categorical columns: missing values become their own 'null_value' category,
# then every category is mapped to an integer code.
for col_name in catigorial_columns:
    # Assign the filled column back instead of calling
    # `frame[col].fillna(..., inplace=True)`: the in-place form acts on a
    # temporary Series and is not guaranteed to update the frame (it never
    # does under pandas copy-on-write).
    train_set[col_name] = train_set[col_name].fillna('null_value')
    test_set[col_name] = test_set[col_name].fillna('null_value')

    # Fit on train + test together so a value that only occurs in the test
    # set still gets a code and `transform` does not fail on it.
    le = LabelEncoder()
    le.fit(pd.concat([train_set[col_name], test_set[col_name]]))
    train_set[col_name] = le.transform(train_set[col_name])
    test_set[col_name] = le.transform(test_set[col_name])

# Numeric columns: impute with the mean computed on the train set only; the
# same train mean is reused for the test set.
for col_name in numeric_columns:
    mean_val = train_set[col_name].mean()
    train_set[col_name] = train_set[col_name].fillna(mean_val)
    test_set[col_name] = test_set[col_name].fillna(mean_val)

## Split into features and target

In [None]:
# Select the target by name rather than by position: the earlier cells already
# identify it as 'Class', and a positional split would silently put the label
# into the feature matrix if 'Class' were not the last column.
x_train_set = train_set.drop(columns=['Class'])
y_train_set = train_set['Class']

In [None]:
# Class balance of the training target: how many rows carry each label and
# how many '0' rows there are per '1' row.
positive_count = sum(y_train_set == 1)
negative_count = sum(y_train_set == 0)

print("counts of label '1': {}".format(positive_count))
print("counts of label '0': {} \n".format(negative_count))
print('imbalance ratio:', negative_count / positive_count)


# Get more '1' label records using SMOTE

In [None]:
# print("Before OverSampling, counts of label '1': {}".format(sum(y_train_set==1)))
# print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train_set==0)))

# sm = SMOTE(random_state=2)
# x_train_smote, y_train_smote = sm.fit_sample(x_train_set, y_train_set.ravel())

# print('After OverSampling, the shape of train_X: {}'.format(x_train_smote.shape))
# print('After OverSampling, the shape of train_y: {} \n'.format(y_train_smote.shape))

# print("After OverSampling, counts of label '1': {}".format(sum(y_train_smote==1)))
# print("After OverSampling, counts of label '0': {}".format(sum(y_train_smote==0)))

# Building models

# lgb model with leave-one-out cross validation

In [None]:
# x = x_train_set.values
# y = y_train_set.values

# loo = LeaveOneOut()
# train_predictions_lgb_loo = np.zeros(len(x))
# predictions_lgb_loo = np.zeros(len(test_set))
# n_splits_loo = loo.get_n_splits(x)

# lgb_params = {'num_leaves': 200,
#              'min_data_in_leaf': 149, 
#              'objective':'regression',
#              'max_depth': 7,
#              'learning_rate': 0.01,
#              "boosting": "gbdt",
#              "feature_fraction": 0.7522,
#              "bagging_freq": 1,
#              "bagging_fraction": 0.7083 ,
#              "bagging_seed": 11,
#              "metric": 'rmse',
#              "lambda_l1": 0.2634,
#              "random_state": 133
#              }

# for train_index, test_index in loo.split(x):
#     X_train, X_test = x[train_index], x[test_index]
#     y_train, y_test = y[train_index], y[test_index]
    
#     lgb_loo_train = lgb.Dataset(X_train, y_train)
#     lgb_loo_eval = lgb.Dataset(X_test, y_test, reference=lgb_loo_train)
    
#     lgb_loo_model = lgb.train(lgb_params, lgb_loo_train, num_boost_round=20, valid_sets=lgb_loo_eval, early_stopping_rounds=5)
    
#     train_predictions_lgb_loo[test_index] = lgb_loo_model.predict(X_test, num_iteration=lgb_loo_model.best_iteration)
    
#     predictions_lgb_loo += lgb_loo_model.predict(test_set, num_iteration=lgb_loo_model.best_iteration) / n_splits_loo

# eval_auc(y, train_predictions_lgb_loo)

# XGB regressor and lgb regressor with Stratified-5-fold cross validation

In [None]:
# Stratified 5-fold cross validation of an XGBoost regressor.
# Produces two things:
#   * train_predictions_xgb_skf_reg - out-of-fold predictions for every train row
#   * predictions_xgb_skf_reg       - running weighted sum of the fold models'
#                                     predictions on the test set
# The raw arrays x / y are only used to size the result vectors and to drive
# the splitter; the models themselves are fitted on the DataFrame slices.
x = x_train_set.values
y = y_train_set.values

# shuffle=True with a fixed random_state, so the fold assignment is the same on every run
skf_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)
n_splits_skf = skf_folds.get_n_splits(x)

# one slot per train row, filled fold by fold with that row's held-out prediction
train_predictions_xgb_skf_reg = np.zeros(len(x))
# one slot per test row; each fold model adds its share inside the loop
predictions_xgb_skf_reg = np.zeros(len(test_set))

# train_predictions_lgb_skf_reg = np.zeros(len(x))
# predictions_lgb_skf_reg = np.zeros(len(test_set))

# 1-based fold counter, used only in the progress printout
number_of_fold = 1
for train_index, test_index in skf_folds.split(x, y):
    # positional indices from the splitter -> rows of the feature frame / target series
    X_train, X_test = x_train_set.iloc[train_index], x_train_set.iloc[test_index]
    y_train, y_test = y_train_set.iloc[train_index], y_train_set.iloc[test_index]
    
    # class counts of the training part of this fold
    print('for fold ', number_of_fold, ' count of lable 0 on train set: ', sum(y_train==0))
    print('for fold ', number_of_fold, ' count of lable 1 on train set: ', sum(y_train==1))
    number_of_fold = number_of_fold + 1
    
    # both parts are handed to fit() as evaluation sets; no early-stopping
    # argument is passed in this cell
    eval_set = [(X_train, y_train), (X_test, y_test)]
    
    # a fresh model per fold, same hyper-parameters every time
    xgb_model_reg_skf = XGBRegressor(n_estimators=100, learning_rate=0.01, gamma=0, 
                                     subsample=0.8, colsample_bytree=1, max_depth=7)
    xgb_model_reg_skf.fit(X_train, y_train, eval_set=eval_set, verbose=False)
    
#     lgb_model_reg_skf = lgb.sklearn.LGBMRegressor(is_unbalance=True, n_estimators=200, 
#                                               num_leaves=5, learning_rate =0.01, subsample=0.8, 
#                                               colsample_bytree=0.6, max_depth=7)
#     lgb_model_reg_skf.fit(X_train, y_train, eval_set=eval_set, verbose=False)
    
    # store the held-out predictions at the positions of the held-out rows
    train_predictions_xgb_skf_reg[test_index] = xgb_model_reg_skf.predict(X_test)
    # Each fold model contributes 1 / (n_splits_skf + 1) of the test prediction.
    # The divisor is one larger than the number of folds, so after the loop the
    # accumulated weights sum to n_splits_skf / (n_splits_skf + 1); the
    # following cell adds one more share from a model fitted on all train rows.
    predictions_xgb_skf_reg += (xgb_model_reg_skf.predict(test_set) / (n_splits_skf + 1))
    
#     train_predictions_lgb_skf_reg[test_index] = lgb_model_reg_skf.predict(X_test)
#     predictions_lgb_skf_reg += (lgb_model_reg_skf.predict(test_set) / (n_splits_skf + 1))
    
#     total_pred = (train_predictions_xgb_skf_reg[test_index] + train_predictions_lgb_skf_reg[test_index]) / 2
#     eval_auc(y_test, total_pred)
    # AUC / ROC curve of this fold's held-out part
    eval_auc(y_test, train_predictions_xgb_skf_reg[test_index])

# total_pred = (train_predictions_xgb_skf_reg + train_predictions_lgb_skf_reg) / 2
# eval_auc(y_train_set, total_pred)
# AUC / ROC curve over all out-of-fold predictions together
eval_auc(y_train_set, train_predictions_xgb_skf_reg)

In [None]:
# Final ensemble member: an XGBRegressor fitted on ALL training rows.
# Its test-set prediction is added to the running sum with the same
# 1 / (n_splits_skf + 1) weight that each fold model received.
# NOTE: the accumulator is updated with `+=`, so running this cell more than
# once adds this model's share again each time.
full_fit_params = dict(n_estimators=100, learning_rate=0.01, gamma=0,
                       subsample=0.8, colsample_bytree=1, max_depth=7)
xgb_model_reg = XGBRegressor(**full_fit_params)
xgb_model_reg.fit(x_train_set, y_train_set, verbose=False)

full_fit_share = xgb_model_reg.predict(test_set) / (n_splits_skf + 1)
predictions_xgb_skf_reg += full_fit_share

# lgb_model_reg = lgb.sklearn.LGBMRegressor(is_unbalance=True, n_estimators=200, 
#                                           num_leaves=5, learning_rate =0.01, subsample=0.8, 
#                                           colsample_bytree=0.6, max_depth=7)
# lgb_model_reg.fit(x_train_set, y_train_set, verbose=False)
# predictions_lgb_skf_reg += (lgb_model_reg.predict(test_set) / (n_splits_skf + 1))

# XGB regressor 

In [None]:
# xgb_model_reg = XGBRegressor(n_estimators=100, learning_rate=0.01, gamma=0, subsample=0.8, colsample_bytree=1, max_depth=7, nthread=1)
# xgb_model_reg.fit(x_train_set, y_train_set)
# xgb_reg_pred = xgb_model_reg.predict(test_set)

# Use all models as an ensemble and average their predictions for the test set

In [None]:
# total_test_pred = (predictions_xgb_skf_reg + predictions_lgb_skf_reg) / 2

# Writing results to the sample submission file

In [None]:
# Assemble the submission table: a 1-based Id derived from the test frame's
# index, paired with the ensemble prediction for that row.
sample_submmision = pd.DataFrame({
    'Id': test_set.index + 1,
    'Class': predictions_xgb_skf_reg,  # total_test_pred
})

# Order by Id and write the CSV without the frame's own index column.
sample_submmision = sample_submmision.sort_values(by=['Id'])
sample_submmision.to_csv('sample_submmision.csv', index=False)