# 1.data load

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
train_data = pd.read_csv('./data/hospital/train.csv')
test_data = pd.read_csv('./data/hospital/test.csv')

# 2. Data pre-processing

## 2.1 Data Encoding

In [None]:
#train_data의 open close를 1과 0으로 encoding
train_data.loc[train_data["OC"] == "open", "OC"] = 1
train_data.loc[train_data["OC"] == " close", "OC"] = 0
train_data.loc[train_data["ownerChange"] == "same", "ownerChange"] = 1
train_data.loc[train_data["ownerChange"] == "change", "ownerChange"] = 0

train_data['OC'] = pd.to_numeric(train_data['OC'])
train_data['ownerChange'] = pd.to_numeric(train_data['ownerChange'])

list_c=["sido","instkind","inst_id"]
train_data.drop(columns=list_c, inplace=True)

## 2.2 Missing value handling


### 2.2.1 Listwise Deletion (common)
- 결측값 너무 많음

In [None]:
train_data = train_data.dropna(subset = ['salescost1','sga1','interest1','nonCAsset1'])

#### Checking Columns that has Missing Value

In [None]:
na_list =[]
for i in train_data.keys():
    list_index = train_data[train_data[i].isnull()].index
    for j in list_index:
        na_list.append(j)

na_list = list(set(na_list))

In [None]:
na_list

[193, 35, 230, 71, 263, 297, 298, 299, 300, 206, 48, 242, 212, 285, 93, 62]

### 2.2.2  CASE 1 : Missing value handling by Multiple Imputation(MI) - MICE package

- 사용 근거
    1. 15개 밖에 안되는 폐업 케이스에 결측값이 너무 많음 -> 단순삭제시 검정력(설명력)저하 우려 
    2. 평균값, 중앙값등 단순 대체 (imputation)은 p value를 떨어뜨리며 bias가 증가한다.
    

- 따라서 다른 결측치 처리방식인 MI 중 MICE사용
    - multivariate imputation by chained equations (MICE) 
        - 간단한 설명 : https://subinium.github.io/missing-data-handling/
        - 관련 논문 : mice: Multivariate Imputation by Chained Equations in R
            - http://scholar.google.co.kr/scholar_url?url=https://dspace.library.uu.nl/bitstream/handle/1874/44635/SvB-MICE%2520in%2520R%2520-%2520Draft.pdf%3Fsequence%3D1%3Fref%3Ddriverlayer.com/web&hl=ko&sa=X&scisig=AAGBfm2ILB5_fLCvf9S6cbHtO5fcP7GESg&nossl=1&oi=scholarr
            


- 문제점 : 1과 0으로 단순 encoding된 자료들이 수치형으로 나오는 발생...  => 추후 사용전 사용 후를 비교 

In [None]:
from impyute.imputation.cs import mice

# start the MICE training
imputed_training=mice(train_data.values)
train_data1 = train_data
train_data1 = pd.DataFrame(imputed_training, columns=train_data.keys())

In [None]:
train_data1.isnull().sum()

OC                    0
sgg                   0
openDate              0
bedCount              0
revenue1              0
salescost1            0
sga1                  0
salary1               0
noi1                  0
noe1                  0
interest1             0
ctax1                 0
profit1               0
liquidAsset1          0
quickAsset1           0
receivableS1          0
inventoryAsset1       0
nonCAsset1            0
tanAsset1             0
OnonCAsset1           0
receivableL1          0
debt1                 0
liquidLiabilities1    0
shortLoan1            0
NCLiabilities1        0
longLoan1             0
netAsset1             0
surplus1              0
revenue2              0
salescost2            0
sga2                  0
salary2               0
noi2                  0
noe2                  0
interest2             0
ctax2                 0
profit2               0
liquidAsset2          0
quickAsset2           0
receivableS2          0
inventoryAsset2       0
nonCAsset2      

### 2.2.3 CASE 2 : Dropping Variable [employee1, employee2]
- 결측값을 가진 많은 폐업 병원이 두 자료가 missing인 경우가 많으므로 칼럼삭제  (나머지는 na가진 행 전체 삭제) 

In [None]:
train_data2 = train_data

train_data2.drop(columns = ['employee1','employee2'], inplace=True)

### 2.2.4 CASE 3 : 에라모르겠다 MIssing Value 단순삭제 

In [None]:
train_data3 = train_data

train_data3 = train_data3.dropna()

## 2.3 Data Normalization
- 주성분 분석을 위해선 데이터 정규화가 필수 
### 2.3.1 Case 1 : Simple Normalization


### 2.3.2 Case 2 :  isolation forest를 통한 이상치 측정후 정규화

## 2.4 Data Selection
### 2.4.1 Common columns delete (chi-square test based) 
- Failing to Reject the Null Hypothesis (Using SAS) 

### 2.4.2 Case1 :  Data selection by VIF 

- VIF(Variance Inflation Factor) : 가장 의존적인 독립변수를 선택하는 방법
    1. 변수간의 상관관계가 높을 경우 지나치게 over-fitting할 우려가 있음 (다중공선성) 
    2. 통제변수 일경우 상관관계가 높다하더라도 VIF가 낮으면 사용 가능 
    

VIF 공식 및 설명 : https://datascienceschool.net/view-notebook/36176e580d124612a376cf29872cd2f0/

변수 선택참고 : https://ukchanoh.wordpress.com/2015/02/16/multicollinearity/

In [None]:
vif_list= ['sgg', 'openDate', 'bedCount', 'salary1', 'noi1', 'noe1',
       'interest1', 'ctax1', 'profit1', 'quickAsset1', 'receivableS1',
       'inventoryAsset1', 'tanAsset1', 'OnonCAsset1', 'shortLoan1',
       'longLoan1', 'surplus1', 'salary2', 'noi2', 'noe2', 'interest2',
       'ctax2', 'profit2', 'quickAsset2', 'receivableS2',
       'inventoryAsset2', 'tanAsset2', 'OnonCAsset2', 'shortLoan2',
       'longLoan2', 'surplus2', 'ownerChange']

In [None]:
vif_train_data1 =train_data1[vif_list]
vif_train_data2 =train_data2[vif_list]
vif_train_data3 =train_data3[vif_list]

In [None]:
OC_1 = train_data1['OC']
t_xdata = pd.concat([vif_train_data1, OC_1], axis=1)

aug = t_xdata[t_xdata['OC']==0]
t_xdata1 = pd.concat([t_xdata, aug], axis=0)
t_xdata2 = pd.concat([t_xdata1, aug], axis=0)
t_xdata3 = pd.concat([t_xdata2, aug], axis=0)


t_OC_1 = t_xdata3['OC']
t_xdata3.drop(columns='OC',inplace=True)

### 2.4.3 Case 2 : PCA (주성분분석)을 통한 변수선택 

- 몇개로 줄일지? 

In [None]:
from sklearn.decomposition import PCA

p10_xdata = PCA(n_components=10).fit_transform(t_xdata3)
p20_xdata =  PCA(n_components=20).fit_transform(t_xdata3)
p30_xdata =  PCA(n_components=30).fit_transform(t_xdata3)


### 2.4.4 Case 3 : Regularization(정규화) 적용

# 3. Modeling 


## 3.1 Classification Modeling 
    - 사용할 모델 
        1. Rogistic Regression
        2. Random Forest Classification 
        3. XG Boosting 
        4. LightGBM 
        5. Cat Boosting 
        

Data Split 

In [None]:
from sklearn.model_selection import train_test_split

train_Xdata, test_Xdata, train_ydata, test_tdata = train_test_split(p20_xdata,t_OC_1, test_size=0.3, random_state=777)

### 3.1.1 Rogistic Regression

train_data1  88.64

In [None]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(fit_intercept=False)
lr.fit(train_Xdata, train_ydata)
y_pred = lr.predict(test_Xdata)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
from sklearn.metrics import accuracy_score

print('로지스틱 회귀, 정확도 : {:.2f}%'.format(accuracy_score(test_tdata, y_pred)*100))

로지스틱 회귀, 정확도 : 69.61%


### 3.1.2 Random Forest Classification

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

rfc = RandomForestClassifier()
param ={
    'n_estimators'      : [1],
    'max_depth'         : [1,2],
    'min_samples_leaf'  : [1,2],
    'min_samples_split' : [2,3]
}
grid_rfc = GridSearchCV(rfc, param_grid = param)
grid_rfc.fit(train_Xdata, train_ydata)
# 최적의 파라미터로 학습된 트리 Estimator 가져오기
estimator = grid_rfc.best_estimator_

# 검증 데이터로 결과 예측
rf_pred = estimator.predict(test_Xdata)

print('정확도 : {:.2f}%'.format(accuracy_score(test_tdata, rf_pred)*100))

정확도 : 80.39%


# 4. Predict / Submission

앙상블

In [None]:
##l########### logistic Regression
#############################################################################
lr = LogisticRegression(fit_intercept=False)
lr.fit(train_Xdata, train_ydata)
logistic_pred = lr.predict_proba(test_Xdata)


############ XGBOOST
############################################################################
import xgboost as xgb
dtrain_prod = xgb.DMatrix(data = train_Xdata, label = train_ydata)
dtest_prod = xgb.DMatrix(data=test_Xdata)
#Custom error function for the XGB model
threshold = 0.5
param = {'objective' : 'binary:logistic',
         'max_depth' : 6,
         'eta': 0.3,
         'colsample_bytree' : 1,
         'subsample' : 1,
         'silent' : 0
         }
nrounds = 2
np.random.seed(100)
xgb_model = xgb.train(param, 
                      dtrain_prod, 
                      num_boost_round = nrounds 
                      #maximize = True,
                      #early_stopping_rounds = 10,
                      )
XGB_prediction = xgb_model.predict(dtest_prod)


######### Random Forest
#######################################################
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

rfc = RandomForestClassifier()
param ={
    'n_estimators'      : [1],
    'max_depth'         : [1,2],
    'min_samples_leaf'  : [1,2],
    'min_samples_split' : [2,3]
}
grid_rfc = GridSearchCV(rfc, param_grid = param)
grid_rfc.fit(train_Xdata, train_ydata)
# 최적의 파라미터로 학습된 트리 Estimator 가져오기
estimator = grid_rfc.best_estimator_

# 검증 데이터로 결과 예측
RF_prod_prediction = estimator.predict_proba(test_Xdata)



lor = logistic_pred[:,1]
rf = RF_prod_prediction[:,1]
xgb_p = XGB_prediction

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
(pred<0.6).sum()

14

In [None]:
def ensemble(m1,m2,m3,prob):
    pred = (m1+m2+m3)/3
    n=0
    for i in range(len(pred)):
        if pred[i] > prob:
            pred[i] = 1
        else:
            pred[i] = 0 
            n+=1
    
    print("close num ::",n)
    
    return pred

In [None]:
pred = ensemble(lor,rf,xgb_p,0.4)

close num :: 14


In [None]:
from sklearn.metrics import accuracy_score

print('앙상블 정확도 : {:.2f}%'.format(accuracy_score(test_tdata, pred)*100))

앙상블 정확도 : 84.31%


In [None]:
#test_data
test_data.loc[test_data["ownerChange"] == "same", "ownerChange"] = 1
test_data.loc[test_data["ownerChange"] == "change", "ownerChange"] = 0
test_data['ownerChange'] = pd.to_numeric(test_data['ownerChange'])
list_c=["sido","instkind","inst_id"]
test_data.drop(columns=list_c, inplace=True)
vif_test_data =test_data[vif_list]

# start the MICE training
imputed_training=mice(vif_test_data.values)
test_data1 = vif_test_data
test_data1 = pd.DataFrame(imputed_training, columns=vif_test_data.keys())

  res_values = method(rvalues)


KeyError: "['sido' 'instkind' 'inst_id'] not found in axis"

In [None]:
test_prod = xgb.DMatrix(data=test_data1)

logistic_pred = lr.predict_proba(test_Xdata)
XGB_prediction = xgb_model.predict(test_prod)
RF_prod_prediction = estimator.predict_proba(test_Xdata)

lor = logistic_pred[:,1]
rf = RF_prod_prediction[:,1]
xgb_p = XGB_prediction

pred = ensemble(lor,rf,xgb_p,0.4)

submit = pd.read_csv('./data/hospital/submission_sample.csv')
submit['OC'] = pred
submit.to_csv('C:/Users/smj71/Desktop/sub_12.csv', index= False)

ValueError: feature_names mismatch: ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19'] ['sgg', 'openDate', 'bedCount', 'salary1', 'noi1', 'noe1', 'interest1', 'ctax1', 'profit1', 'quickAsset1', 'receivableS1', 'inventoryAsset1', 'tanAsset1', 'OnonCAsset1', 'shortLoan1', 'longLoan1', 'surplus1', 'salary2', 'noi2', 'noe2', 'interest2', 'ctax2', 'profit2', 'quickAsset2', 'receivableS2', 'inventoryAsset2', 'tanAsset2', 'OnonCAsset2', 'shortLoan2', 'longLoan2', 'surplus2', 'ownerChange']
expected f10, f12, f14, f7, f1, f5, f6, f4, f17, f19, f13, f3, f18, f15, f2, f8, f11, f0, f16, f9 in input data
training data did not have the following fields: OnonCAsset1, quickAsset2, surplus2, shortLoan2, noi1, inventoryAsset1, sgg, ownerChange, bedCount, interest2, shortLoan1, longLoan2, salary2, noe2, surplus1, noi2, receivableS1, interest1, profit2, receivableS2, tanAsset1, profit1, ctax2, OnonCAsset2, tanAsset2, quickAsset1, openDate, noe1, inventoryAsset2, longLoan1, ctax1, salary1