In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

In [3]:
#import data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
#mapping data

Nkey = train['Neighbourhood'].append(test['Neighbourhood']).dropna().unique()
Gkey = train['Gender'].append(test['Gender']).dropna().unique()
Nval = np.arange(1,len(Nval)+1)
Gval = np.arange(1,len(Gval)+1)
Nmap = dict((key, value) for (key, value) in zip(Nkey,Nval))
Gmap = dict((key, value) for (key, value) in zip(Gkey,Gval))

train['Gender'].replace(Gmap, inplace=True)
train['Neighbourhood'].replace(Nmap, inplace=True)
test['Gender'].replace(Gmap, inplace=True)
test['Neighbourhood'].replace(Nmap, inplace=True)
#set the type of variables
cat_var = ['Gender','Neighbourhood','Scholarship', 'Hipertension',
           'Diabetes', 'Alcoholism', 'Handcap', 'SMS_received', 'No-show']
time_var = ['ScheduledDay','AppointmentDay']
#drop the time of ScheduledDay variables, since AppointmentDay does not have time information
train['ScheduledDay']=train['ScheduledDay'].astype('str').map(lambda x: x[0:10])
test['ScheduledDay']=test['ScheduledDay'].astype('str').map(lambda x: x[0:10])
#change the variable type
for i in time_var:
    train[i]=pd.to_datetime(train[i])
    test[i]=pd.to_datetime(test[i])
for i in cat_var:
    train[i] = train[i].astype('category')
    test[i] = test[i].astype('category')
#make new column day_diff    
train['day_diff'] = (train['AppointmentDay'] - train['ScheduledDay']).map(lambda x : x.days)
train = train.iloc[ :, 3:]

test['day_diff'] = (test['AppointmentDay'] - test['ScheduledDay']).map(lambda x : x.days)
test = test.iloc[ :, 3:]
#remove the ScheduledDay AppointmentDay columns
train = train.drop(columns=['ScheduledDay','AppointmentDay'])
test = test.drop(columns=['ScheduledDay','AppointmentDay'])
if 0:
    train.drop(columns='Neighbourhood')
    test.drop(columns='Neighbourhood')
#imputation mode with categorical variables, median with numerical variables
Imputation = 0
if Imputation:
    for i in cat_var:
        train[i].fillna(train[i].mode()[0], inplace=True)    
        test[i].fillna(test[i].mode()[0], inplace=True)  
    for i in train.columns:
        if i not in cat_var:
            train[i].fillna(train[i].median(), inplace=True)
            test[i].fillna(test[i].median(), inplace=True)
if not Imputation:
    for i in cat_var:
        try:
            test[i].cat.add_categories(0)
        finally:
            test[i].fillna(0,inplace=True)
        try:
            train[i].cat.add_categories(0)
        finally:
            train[i].fillna(0,inplace=True)
        #finally:

#set the X,Y variables
y = train['No-show']
X = train.drop(columns='No-show')
#train test split for CV
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)
#test data
test_X = test.drop(columns='No-show')
test_y = test['No-show']

NameError: name 'Nval' is not defined

In [4]:
#oversampling
X_samp, y_samp = SMOTE(random_state=0).fit_sample(X_train, y_train)
X_samp= pd.DataFrame(X_samp)
X_samp.columns = X_train.columns
X_train, y_train = X_samp, y_samp

NameError: name 'X_train' is not defined

In [107]:
clf = RandomForestClassifier(n_estimators=300, max_depth=10,random_state=0)
clf.fit(X_train, y_train)

def rf_with_cutoff(test_X=test_X,test_y=test_y,clf=clf,silence=True):
    tmp = [x[0] for x in clf.predict_proba(test_X)]
    cutoff = sorted(tmp)[int((train['No-show'].sum()/train.shape[0]) * len(clf.predict(test_X)))]
    sub = pd.Series([x[0] < cutoff for x in clf.predict_proba(test_X)]).map(int)
    print('f1 score : ' + str(f1_score(y_pred=sub,y_true=test_y)))
    if not silence:
        return sub

In [113]:
f1_score(y_pred=clf.predict(X_valid),y_true=y_valid)

0.38455529516794995

In [108]:
f1_score(y_pred=clf.predict(test_X),y_true=test_y)

0.3759567762269248

In [110]:
def f1_eval(y_pred, dtrain):
    y_true = dtrain.get_label()
    err = 1-f1_score(y_true, np.round(y_pred))
    return 'f1_err', err

alg = XGBClassifier(learning_rate=0.01, n_estimators=3000, max_depth=4,
                    min_child_weight=3, colsample_bytree=0.9,
                    objective='binary:logistic', nthread=8, seed=27, random_state = 0)

alg.fit(X_train, y_train,
        eval_set=[(X_train, y_train),(X_valid, y_valid)], #모델에서 자체적으로 평가에 사용할 데이터
        eval_metric=f1_eval, #모델의 목적함수 지정(최소화할 목적함수 1-f1_score)
        early_stopping_rounds=200, #1o0 Interations 동안 최대화 되지 않으면 stop
        verbose=10) #Iteration 과정을 10 단위로 보여준다.

[0]	validation_0-error:0.299609	validation_1-error:0.342236	validation_0-f1_err:0.299164	validation_1-f1_err:0.634018
Multiple eval metrics have been passed: 'validation_1-f1_err' will be used for early stopping.

Will train until validation_1-f1_err hasn't improved in 200 rounds.
[10]	validation_0-error:0.263321	validation_1-error:0.354123	validation_0-f1_err:0.242039	validation_1-f1_err:0.585912
[20]	validation_0-error:0.267669	validation_1-error:0.368298	validation_0-f1_err:0.243762	validation_1-f1_err:0.594922
[30]	validation_0-error:0.267669	validation_1-error:0.368298	validation_0-f1_err:0.243762	validation_1-f1_err:0.594922
[40]	validation_0-error:0.265651	validation_1-error:0.366706	validation_0-f1_err:0.242029	validation_1-f1_err:0.595029
[50]	validation_0-error:0.255944	validation_1-error:0.339451	validation_0-f1_err:0.240368	validation_1-f1_err:0.590653
[60]	validation_0-error:0.255804	validation_1-error:0.339501	validation_0-f1_err:0.240208	validation_1-f1_err:0.590791
[70]

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.9, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=4, min_child_weight=3, missing=None, n_estimators=3000,
       n_jobs=1, nthread=8, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=27, silent=True,
       subsample=1)

In [72]:
def xgb_with_cutoff(test_X=test_X,test_y=test_y,alg=alg,silence = True):
    tmp = [x[0] for x in alg.predict_proba(test_X)]
    cutoff = sorted(tmp)[int((train['No-show'].sum()/train.shape[0]) * len(test_y))]
    sub = pd.Series([x[0] < cutoff for x in alg.predict_proba(test_X)]).map(int)
    print('f1 score : ' + str(f1_score(y_pred=sub,y_true=test_y)))
    if not silence:
        return sub

In [111]:
f1_score(y_pred=alg.predict(X_valid),y_true=y_valid)

0.4176125542774435

In [112]:
f1_score(y_pred=alg.predict(test_X),y_true=test_y)

0.41051505425645735