In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from sklearn import svm
import xgboost as xgb

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Sanity check: the CSV files read in the next cell are expected to show up in this listing.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Read the three competition CSVs (train, test, sample) in that order.
train_set, test_set, samples_set = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/saftey_efficay_myopiaTrain.csv',
        '../input/saftey_efficay_myopiaTest.csv',
        '../input/saftey_efficay_myopiaSample.csv',
    )
)

In [None]:
# Discard training rows in which every single column is missing.
train_set = train_set.dropna(axis=0, how='all')

In [None]:
# Columns handled as categories further down ('Class' is skipped whenever the
# test frame is processed).
catigorial_columns = [
    'D_L_Sex',
    'D_L_Eye',
    'D_L_Dominant_Eye',
    'Pre_L_Contact_Lens',
    'T_L_Laser_Type',
    'T_L_Treatment_Type',
    'T_L_Cust._Ablation',
    'T_L_Micro',
    'T_L_Head',
    'T_L_Therapeutic_Cont._L.',
    'T_L_Epith._Rep.',
    'Class',
]

# Every training column that is not listed above is treated as numeric.
numeric_columns = train_set.columns[
    np.logical_not(train_set.columns.isin(catigorial_columns))
]

In [None]:
# Column by column, collapse every spelling of "missing" (blank strings, textual
# NaN/None and real NaN) into the single string placeholder 'None' in both frames.
train_set, test_set = (
    frame.apply(lambda column: column.replace([' ', '', 'NaN', 'nan', 'None', np.nan], 'None'))
    for frame in (train_set, test_set)
)

In [None]:
le = LabelEncoder()

for col_name in catigorial_columns:
    # Select by label: `.iloc` only accepts integer positions and raises on a column name.
    column = train_set[col_name]

    # Fit on the distinct string labels that occur in this column.
    string_labels = [value for value in column.unique() if isinstance(value, str)]
    if not string_labels:
        # No string entries in this column, so there is nothing to encode.
        continue
    le.fit(string_labels)

    # `classes_` is sorted and `transform` returns a label's position in it, so a
    # single lookup table replaces one `transform` call per row.
    code_of = {label: code for code, label in enumerate(le.classes_)}

    # Encode the string entries of the *column itself* (not of the unique-value list,
    # whose 0..k-1 index would not line up with the frame); non-string entries such
    # as np.nan are left as they are.
    # NOTE: the 'None' placeholder written by the preceding replace step is a string,
    # so it receives a code like any other label.
    train_set[col_name] = column.map(
        lambda value: code_of[value] if isinstance(value, str) else value
    )

In [None]:
# Column by column, turn blank strings and textual NaN/None markers into real NaN
# in both frames.
train_set, test_set = (
    frame.apply(lambda column: column.replace([' ', '', 'NaN', 'nan', 'None'], np.nan))
    for frame in (train_set, test_set)
)

In [None]:
# One LabelEncoder per column, created on first access. The mapping is deliberately
# not called `dict`, which would shadow the built-in type.
label_encoders = defaultdict(LabelEncoder)
for col_name in catigorial_columns:
    column = train_set[col_name]
    observed = column.notna().values

    # Encode only the observed values and keep missing entries as NaN. Feeding NaN to
    # LabelEncoder either fails (labels mixed with NaN cannot be sorted) or turns NaN
    # into a class of its own, depending on the scikit-learn version; in both cases the
    # missing values could no longer be imputed by a later fillna step.
    codes = np.full(len(column), np.nan)
    if observed.any():
        codes[observed] = label_encoders[col_name].fit_transform(column[observed])
    train_set[col_name] = codes

In [None]:
# Preprocess categorical columns: fill missing values with the most frequent
# training value (the mode), then label-encode.
# The mapping is called `label_encoders` rather than `dict` so the built-in `dict`
# stays usable afterwards.
label_encoders = defaultdict(LabelEncoder)

for col_name in catigorial_columns:
    # value_counts() orders by descending frequency, so index[0] is the mode.
    train_mode = train_set[col_name].value_counts().index[0]
    # Assign the filled column back instead of calling fillna(inplace=True) on the
    # selection: the chained in-place form can operate on a temporary object and
    # leave the frame unchanged (pandas copy-on-write).
    train_set[col_name] = train_set[col_name].fillna(train_mode)
    if col_name != 'Class':
        # The test frame is filled with the *training* mode as well.
        test_set[col_name] = test_set[col_name].fillna(train_mode)

for col_name in catigorial_columns:
    encoder = label_encoders[col_name]
    train_set[col_name] = encoder.fit_transform(train_set[col_name])
    if col_name != 'Class':
        # NOTE(review): this refits the encoder on the test column, so test codes agree
        # with the training codes only if both columns hold the same set of labels --
        # confirm, or fit once on train and only transform the test column.
        test_set[col_name] = encoder.fit_transform(test_set[col_name])

In [None]:
# Preprocess numeric columns: fill missing values with the training mean.

for col_name in numeric_columns:
    # Compute the statistic once, from the training data, and reuse it for both frames.
    train_mean = train_set[col_name].mean()
    # Assign the result back instead of fillna(inplace=True) on a column selection:
    # the chained in-place form can operate on a temporary object and leave the
    # frame unchanged (pandas copy-on-write).
    train_set[col_name] = train_set[col_name].fillna(train_mean)
    test_set[col_name] = test_set[col_name].fillna(train_mean)

In [None]:
# Features are all columns but the last; the last column is used as the label.
x_train, y_train = train_set.iloc[:, :-1], train_set.iloc[:, -1]

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# The call rebinds x_train / y_train, so re-running this cell splits the already
# reduced training part again.
x_train, x_test, y_train, y_test = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Oversample the training split (never the hold-out rows) with SMOTE.
oversampler = SMOTE(random_state=0)
# Newer imbalanced-learn releases expose fit_resample() and no longer provide the old
# fit_sample() name; prefer the new method and fall back only where it does not exist.
_resample = getattr(oversampler, 'fit_resample', None) or oversampler.fit_sample
os_x, os_y = _resample(x_train, y_train)

In [None]:
def eval_auc(y_test, pred):
    """Print the area under the ROC curve and draw the curve with pyplot.

    Parameters
    ----------
    y_test : array-like
        True labels, forwarded to ``roc_curve`` as the ground truth.
    pred : array-like
        Predictions, forwarded to ``roc_curve`` as the scores.

    Returns
    -------
    None
        The AUC is printed and the plot is drawn on the current pyplot figure.
    """
    fpr, tpr, _ = roc_curve(y_test, pred)
    roc_auc = auc(fpr, tpr)
    print(roc_auc)

    # ROC curve, with the AUC shown in the legend.
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    # Dashed red diagonal from (0, 0) to (1, 1) for reference.
    plt.plot([0, 1], [0, 1], 'r--')
    # Axis limits extend a little beyond the unit square.
    plt.xlim([-0.1, 1.2])
    plt.ylim([-0.1, 1.2])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

In [None]:
# RANDOM FOREST

In [None]:
# Random forest with 100 trees and a fixed seed, fitted on the SMOTE-oversampled data.
rf_model = RandomForestClassifier(random_state=0, n_estimators=100)
rf_model.fit(os_x, os_y)

In [None]:
# Score each hold-out row with the predicted probability of class 1 instead of a hard
# 0/1 label: a ROC curve sweeps a threshold over continuous scores, and hard labels
# reduce it to a single operating point, which distorts the AUC.
# Assumes the positive class is labelled 1, as in the SVM's class_weight.
_positive_column = list(rf_model.classes_).index(1)
pred = rf_model.predict_proba(x_test)[:, _positive_column]

In [None]:
# Print the AUC and draw the ROC curve for the random-forest predictions on the hold-out split.
eval_auc(y_test, pred)

In [None]:
# SVM

In [None]:
# Linear-kernel SVM fitted on the (non-oversampled) training split; class 1 is
# given 60 times the weight of class 0.
svm_model = svm.SVC(
    kernel='linear',
    class_weight={0: 1, 1: 60},
    random_state=0,
)
svm_model.fit(x_train, y_train)

In [None]:
# Use the signed distance to the separating hyperplane as the score instead of hard
# 0/1 labels: a ROC curve needs a continuous, non-thresholded score to sweep over.
svm_pred = svm_model.decision_function(x_test)

In [None]:
# Print the AUC and draw the ROC curve for the SVM predictions on the hold-out split.
eval_auc(y_test, svm_pred)

In [None]:
# XGBOOST

In [None]:
# The notebook imports the package as `xgb` only, so the estimator has to be reached
# through that namespace (a bare `XGBRegressor` raises NameError).
xgb_model = xgb.XGBRegressor(n_estimators=1000, learning_rate=0.05, max_depth=8) #objective= 'binary:logistic'

# Train on the training split and monitor the hold-out split; training stops early
# once the monitored metric has not improved for 100 rounds.
xgb_model.fit(x_train, y_train, verbose=True, early_stopping_rounds=100,
             eval_set=[(x_test, y_test)])

In [None]:
# Model outputs for the hold-out rows; they are thresholded at 0.5 in a later cell.
xgb_pred = xgb_model.predict(x_test)

In [None]:
# Display the predictions as this cell's output.
xgb_pred

In [None]:
# Binarise the predictions in place at 0.5. The two masks are disjoint, and writing 1
# into the first group cannot move any element into the second, so computing both
# masks up front gives the same result as evaluating them one after the other.
is_positive = xgb_pred > 0.5
is_negative = xgb_pred <= 0.5
xgb_pred[is_positive] = 1
xgb_pred[is_negative] = 0

In [None]:
# `xgb_pred` has already been thresholded to 0/1 at this point; a ROC curve needs the
# continuous model output, so score the hold-out rows afresh for the AUC.
eval_auc(y_test, xgb_model.predict(x_test))

In [None]:
# Two-column frame built from the hold-out split: 'Id' is the row's index label plus
# one and 'Class' is the thresholded XGBoost prediction; rows are ordered by 'Id'.
sample_submmision = pd.DataFrame({
    'Id': x_test.index + 1,
    'Class': xgb_pred,
}).sort_values(by=['Id'])

In [None]:
# Native XGBoost API: wrap both splits in DMatrix objects, treating NaN as missing.
dtrain = xgb.DMatrix(x_train, label=y_train, missing=np.nan)
dval = xgb.DMatrix(x_test, label=y_test, missing=np.nan)

# Data sets whose metric is reported after every boosting round.
watchlist = [('val', dval), ('train', dtrain)]

# XGBoost's guidance for imbalanced classes is
#     scale_pos_weight = (number of negative rows) / (number of positive rows).
# The previously hard-coded 0.016 is roughly 1/60 -- the inverse of the 1:60 weight
# the SVM cell gives class 1 -- so it shrank the positive class's weight instead of
# boosting it. Deriving the ratio from the training labels keeps it right by
# construction.
n_positive = int((y_train == 1).sum())
n_negative = int((y_train == 0).sum())
# Fall back to a neutral weight if the training split holds no positive rows.
scale_pos_weight = n_negative / n_positive if n_positive else 1.0

params = {
    'eta': 0.03,
    'objective': 'binary:logistic',
    'subsample': 0.5,
    'colsample_bytree': 0.8,
    'scale_pos_weight': scale_pos_weight,
    'base_score': 0.7,
}

# Number of boosting rounds.
num_round = 10
# Note: this rebinds `xgb_model`, replacing the estimator fitted earlier in the notebook.
xgb_model = xgb.train(params, dtrain, num_round, watchlist)
