In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import ADASYN

from keras.models import Sequential
from keras.layers import Dropout, Activation, Dense
from keras.layers import Flatten, Convolution2D, MaxPooling2D
from keras.models import load_model

from visualization import *

import openpyxl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%config InlineBackend.figure_format = 'retina'

Using TensorFlow backend.


In [2]:
def drop_na_cols(df):
    """Return cols with more than half missing value"""
    na_cols = [df.columns[i] for i in range(len(df.columns)) if df.count()[i] < (df.shape[0] / 2)]
    df = df.drop(na_cols, axis = 1)
    return df


def drop_na(df, target):
    """Return not include NA rows"""
    df = df.dropna(axis=0, subset=target)
    df = df.dropna(axis=0)
    return df


def drop_specific_value(df, target, value):
    for i in value:
        df = df.drop(df[df[target] == i].index, axis=0)
    return df

In [3]:
"""Load dataset"""
df = pd.read_excel('./datasets/f2_r1_crc_emr_191208ver.xlsx')
print('Original Dataset Shape : {} rows, {} cols'.format(df.shape[0], df.shape[1]))

# df.count()

Original Dataset Shape : 1512 rows, 38 cols


In [4]:
""" Save data summary to excel
df_summary = df.describe()
df_summary.to_excel('summary.xlsx', sheet_name='sheet1')
"""

target = 'Postop_Chemo_Regimen'
classes = np.array(['5-FU/LV', 'XELODA', 'FOLFOX', 'FOLFIRI', 'Surveillance'])

In [5]:
_df = drop_na_cols(df)
print('After drop Columns : {} rows, {} cols'.format(_df.shape[0], _df.shape[1]))

After drop Columns : 1512 rows, 34 cols


In [6]:
_df = drop_na(_df, [target])
print('After drop NAs : {} rows, {} cols'.format(_df.shape[0], _df.shape[1]))

After drop NAs : 1358 rows, 34 cols


In [7]:
"""Drop Special Cases in Postop_Chemo_Regimen"""
remove_1 = ['3 + 7', '3 + 6', '5 + 7', 8, 4, 7]
_df = drop_specific_value(_df, target, remove_1)
print(_df.shape)
#print(df[target_col].value_counts())


"""Drop Special Cases in Heart_disease"""
remove_2 = ['2, 5', '1, 5', '1, 4', '1, 3', '1, 2', 2, 3, 5, 4]
_df = drop_specific_value(_df, 'Heart_disease', remove_2)
print(_df.shape)
#print(df['Heart_disease'].value_counts())


"""Drop Special Cases in Intraoperative_tumor_location"""
remove_3 = ['1, 2', '1', '2, 2', '1, 2, 2']
_df = drop_specific_value(_df, 'Intraoperative_tumor_location', remove_3)
print(_df.shape)
# print(df['Intraoperative_tumor_location'].value_counts())


"""Drop Special Cases in Histologic_type"""
remove_4 = [2, 3]
_df = drop_specific_value(_df, 'Histologic_type', remove_4)
print(_df.shape)
#print(df['Histologic_type'].value_counts())


"""Drop Special Cases in K-ras"""
_df = drop_specific_value(_df, 'K-ras', [22])
print(_df.shape)
#print(df['K-ras'].value_counts())


"""Drop Special Cases in N-ras"""
remove_5 = [21, 22]
_df = drop_specific_value(_df, 'N-ras', remove_5)
print(_df.shape)
#print(df['N-ras'].value_counts())

(1288, 34)
(1231, 34)
(1206, 34)
(1204, 34)
(1197, 34)
(1186, 34)


In [8]:
_df.columns

Index(['Age', 'Sex', 'ASA', 'BMI', 'DM_history', 'Pulmonary_disease',
       'Liver_disease', 'Heart_disease', 'Kidney_disease', 'Smoking_history',
       'Prior_Dx_cancer', 'Initial_CEA', 'Hereditary_colorectal_tumor',
       'Perforation', 'Obstruction', 'Emergency',
       'Intraoperative_tumor_location', 'Tumor_location_pathology',
       'Histologic_type', 'LVI', 'PNI', 'Distal resection margin',
       'Radial margin', 'Harvested_LN', 'Positive_LN', 'pTNM', 'K-ras',
       'N-ras', 'BRAF', 'Early_Complication', 'Postop_Chemotherapy',
       'Postop_Chemo_Regimen', 'Recurrence', 'OS'],
      dtype='object')

In [9]:
numeric_cols = ['Age', 'BMI', 'Initial_CEA', 'Harvested_LN', 'Positive_LN', 'OS']
categorical_cols = [i for i in _df.columns if i not in numeric_cols]
dummy_cols = ['ASA', 'Smoking_history', 'Histologic_type', 'LVI', 'PNI',
              'Intraoperative_tumor_location', 'Tumor_location_pathology', 'Radial margin', 'pTNM']

for i in numeric_cols:
    _df[i] = pd.to_numeric(_df[i])
    
for i in categorical_cols:
    _df[i] = _df[i].astype(str)

print(_df.dtypes.value_counts())

object     28
float64     6
dtype: int64


In [13]:
"""
1 : 5-FU/LV
2 : XELODA
3 : FOLFOX
5 : FOLFIRI
"""
df_m = _df[_df['Sex'] == '0.0'] 
df_fm = _df[_df['Sex'] == '1.0']

des_list = ['1', '2', '3', '5', '9']


for i in des_list:
    print(_df[_df[target] == i].describe()[0:3])
    print('')

#df.describe()


              Age         BMI  Initial_CEA  Harvested_LN  Positive_LN  \
count  398.000000  398.000000   398.000000    398.000000   398.000000   
mean    61.329146   22.977663     9.867261     22.118090     1.379397   
std     11.033711    3.161303    51.640466     18.754106     3.323722   

               OS  
count  398.000000  
mean    90.977387  
std     41.155869  

             Age        BMI  Initial_CEA  Harvested_LN  Positive_LN         OS
count  45.000000  45.000000    45.000000     45.000000    45.000000  45.000000
mean   69.866667  23.585333    20.066000     21.711111     2.444444  63.022222
std    10.039920   3.037705    38.969139     18.121587     3.997474  41.187757

             Age         BMI  Initial_CEA  Harvested_LN  Positive_LN  \
count  331.00000  331.000000   331.000000    331.000000   331.000000   
mean    60.18429   23.237764    15.556949     25.193353     4.057402   
std     10.26671    3.341583    82.033774     16.588023     6.511507   

               OS  


In [9]:
from sklearn.utils import resample

X_boot = resample(df, n_samples=int(len(df) * 5))
# X_boot.describe()

X_boot_xeloda = X_boot[X_boot[target_col] == '2']
X_boot_folfiri = X_boot[X_boot[target_col] == '5']

print(X_boot_xeloda.shape)
print(X_boot_folfiri.shape)


(207, 34)
(173, 34)


In [10]:
X = pd.concat([df, X_boot_xeloda, X_boot_folfiri])
print(df.shape)
print(X.shape)

(1169, 34)
(1549, 34)


In [11]:

"""
1 : 5-FU/LV
2 : XELODA
3 : FOLFOX
5 : FOLFIRI
"""
print(df[target_col].value_counts())
print(X[target_col].value_counts())

1    398
9    371
3    323
2     42
5     35
Name: Postop_Chemo_Regimen, dtype: int64
1    398
9    371
3    323
2    249
5    208
Name: Postop_Chemo_Regimen, dtype: int64
