# Overview

* **Data Preprocessing**
    - Missing Data Preprocessing
    - One-hot Encoding / Normalization
    - Data Oversampling (Before & After Descriptive Statistics)

* **Descriptive Statistics**
    - Categorical Feature (N)
    - Numerical Feature (Mean, SD, T-Test, $\chi^2$ Test)
    
* **Model**
    - Machine Learning Algorithms
    - Deep Learning Model (Reference Model)

* **Evaluation**
    - Recall, Precision, F1-Score, ROC Curve, AUC

In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf

## 0 - Data Upload

In [2]:
# Load the raw Excel dataset; the path is relative to the notebook's working directory.
df_og = pd.read_excel('./datasets/200525_EMR_V3_Priorities_only.xlsx')
# Print (rows, columns) of the raw table as a quick sanity check.
print(df_og.shape)

(3319, 44)


In [3]:
# Columns listed in `date` are removed from the working frame; the raw frame
# `df_og` is left untouched so it can still be inspected later.
date = ['Operation_date', 'Last_Follow_Up_Date', 'Death_Date']
df = df_og.drop(columns=date)
df.shape

(3319, 41)

## 1 - Data Preprocessing

### 1.1 - Missing Data Preprocessing

In [4]:
# Keep only the rows where 'Postop_Chemo_Regimen' is present (non-null).
df = df[df['Postop_Chemo_Regimen'].notnull()]
df.shape

(1934, 41)

In [5]:
def drop_missing_features(df, threshold=500):
    """Drop every column whose count of missing values exceeds ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified; a new frame is returned.
    threshold : int, default 500
        A column is dropped when its number of null entries is strictly
        greater than this value. The default keeps the original behaviour.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without the dropped columns.

    Side effects
    ------------
    Prints the list of dropped column names.
    """
    # Count nulls once for all columns instead of once per column.
    missing_counts = df.isnull().sum()
    drop_features = missing_counts[missing_counts > threshold].index.tolist()
    print('Has been dropped : {}'.format(drop_features))
    return df.drop(columns=drop_features)


# Remove the sparsely populated columns (the function prints which ones were dropped),
# then display the resulting shape.
df = drop_missing_features(df)
df.shape

Has been dropped : ['Family_history_cancer', 'Location_of_rectal_cancer', 'Radicality', 'MSI_status', 'Recurrence_Type']


(1934, 36)

In [6]:
# Drop every row that still contains at least one missing value.
df = df.dropna()
# Check the invariant explicitly; the former null-count expression was evaluated
# mid-cell, so its result was neither displayed nor stored.
assert not df.isnull().values.any(), 'dropna() left missing values behind'
print(df.shape)

(1549, 36)


### 1.2 - Data Transform

In [7]:
# Name of the label column.
target = 'Postop_Chemo_Regimen'
# Columns treated as continuous values.
numerical = ['Age', 'BMI', 'Initial_CEA', 'Harvested_LN', 'Positive_LN']
# Every remaining column (not numerical, not listed in `date`, not the target) is treated as categorical.
categorical = [nc for nc in df.columns if nc not in numerical and nc not in date and nc != target]

# Report how many columns fall into each group.
print('numerical type : {}, \
       categorical type : {}'.format(len(numerical), len(categorical)))

numerical type : 5,        categorical type : 30


In [8]:
# Number of rows per distinct value of the target column.
df[target].value_counts()

3.0    679
1.0    546
6.0    124
2.0    119
4.0     41
5.0     40
Name: Postop_Chemo_Regimen, dtype: int64

1. 5-FU/LV
2. XELODA
3. FOLFOX
4. XELOX
5. FOLFIRI
6. ERBITUX, Avastin (targeted therapy agents; 표적 치료제)

### 1.3 - Feature Preprocessing

In [9]:
from sklearn.preprocessing import MinMaxScaler

In [10]:
def split_target_feature(df, target_name):
    """Separate the label column from the remaining feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that contains the column ``target_name``.
    target_name : str
        Name of the label column.

    Returns
    -------
    tuple
        ``(features, labels)`` where ``features`` is ``df`` without the label
        column and ``labels`` is that column converted to strings.
    """
    labels = df[target_name].astype(str)
    features = df.drop(columns=[target_name])
    return features, labels


def numerical_preprocessing(df, numerical_cols):
    """Rescale the given columns with a ``MinMaxScaler`` using its default settings.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the columns to scale. NOTE: the scaled values are
        written back into ``df`` itself, so the caller's frame is modified.
    numerical_cols : list of str
        Names of the columns to rescale.

    Returns
    -------
    pandas.DataFrame
        A DataFrame built from the (already modified) ``df``.
    """
    # The scaler is fitted on whatever rows are passed in.
    # NOTE(review): if this runs before a train/test split, the scaling
    # statistics include the held-out rows — confirm this is intended.
    scaled_values = MinMaxScaler().fit_transform(df[numerical_cols])
    df[numerical_cols] = scaled_values
    return pd.DataFrame(df)

def categorical_preprocessing(df, categorical_cols, onehot_cols):
    """Cast categorical columns to strings and one-hot encode a subset of them.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. NOTE: the string conversion is written back into ``df``,
        so the caller's frame is modified.
    categorical_cols : list of str
        Columns converted to ``str``.
    onehot_cols : list of str
        Columns expanded into indicator columns by ``pd.get_dummies``.

    Returns
    -------
    pandas.DataFrame
        New frame in which ``onehot_cols`` are replaced by their dummy columns.
    """
    for col_name in categorical_cols:
        df[col_name] = df[col_name].astype(str)

    return pd.get_dummies(df, columns=onehot_cols)
    

In [11]:
# Split the label column off the feature table.
# NOTE: this rebinds `target` from the column *name* (a str) to the label Series, and
# `df` no longer contains that column afterwards, so this cell cannot be re-run without
# first re-running the cells that define `target` and `df`.
df, target = split_target_feature(df, target)
print(df.shape, target.shape)

(1549, 35) (1549,)


### Continuous and Categorical Feature Preprocessing (Normalization & One-hot Encoding)

In [12]:
# Categorical columns that are kept as a single column instead of being one-hot encoded
# (presumably two-valued flags — TODO confirm against the data dictionary).
not_onehot_features = ['Sex', 'DM_history', 'Pulmonary_disease', 'Liver_disease', 'Hereditary_colorectal_tumor',
                   'Perforation', 'Obstruction', 'Emergency', 'Early_Complication', 'Recurrence']

# All other categorical columns are expanded into indicator columns.
onehot_features = [c for c in categorical if c not in not_onehot_features]

In [13]:
# Step 1: rescale the numerical columns.
X = numerical_preprocessing(df, numerical)
# Step 2: feed the *result of step 1* into the categorical step. Previously `df` was
# passed again, so the first assignment to X was discarded and the pipeline only worked
# because step 1 also happens to modify `df` in place.
X = categorical_preprocessing(X, categorical, onehot_features)
Y = target

  return self.partial_fit(X, y)


In [14]:
# Feature matrix and label vector must have the same number of rows.
X.shape, Y.shape

((1549, 112), (1549,))

## 2 - Deep Learning Model

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
# Hold out 20% of the samples for testing. A fixed seed (the same value this notebook
# uses for its KFold splitter) makes the split, and every metric computed from it,
# reproducible after a kernel restart.
# NOTE(review): the classes are imbalanced; consider `stratify=Y` — confirm with the authors.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=200528)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(1239, 112) (310, 112) (1239,) (310,)


In [17]:
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Dropout, Activation, Dense
from keras.layers import Flatten, Convolution2D, MaxPooling2D
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline


Using TensorFlow backend.


In [18]:
def baseline_model(input_dim=112, n_classes=6):
    """Build and compile the fully connected reference classifier.

    Parameters
    ----------
    input_dim : int, default 112
        Number of input features. The default matches the width that was
        previously hard-coded, so ``baseline_model()`` builds the same network.
    n_classes : int, default 6
        Number of units in the softmax output layer.

    Returns
    -------
    keras.models.Sequential
        Model with ReLU hidden layers of 128, 256, 512, 256 and 128 units,
        compiled with categorical cross-entropy loss, the Adam optimizer and
        the accuracy metric.
    """
    model = Sequential()

    # Only the first layer is told the input width.
    model.add(Dense(128, input_dim=input_dim, activation='relu'))
    for units in (256, 512, 256, 128):
        model.add(Dense(units, activation='relu'))
    model.add(Dense(n_classes, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [21]:
# Wrap the model builder in a scikit-learn compatible estimator
# (200 epochs, batch size 16, no training log output).
estimator = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=16, verbose=0)
# Shuffled 10-fold splitter with a fixed seed so the fold assignment is repeatable.
kfold = KFold(n_splits=10, shuffle=True, random_state=200528)

In [22]:
# Cross-validate the estimator on the training split only, using the `kfold` splitter.
results = cross_val_score(estimator, x_train, y_train, cv=kfold)
# Report the mean and standard deviation of the per-fold scores as percentages.
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Baseline: 65.62% (3.99%)


## Descriptive Statistics (Before Oversampling)