In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import ADASYN

from keras.models import Sequential
from keras.layers import Dropout, Activation, Dense
from keras.layers import Flatten, Convolution2D, MaxPooling2D
from keras.models import load_model

from visualization import *

import openpyxl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%config InlineBackend.figure_format = 'retina'

Using TensorFlow backend.


In [16]:
def drop_na_cols(df):
    """Drop the columns in which more than half of the values are missing.

    A column is removed when its non-null count is strictly less than half
    the number of rows; a column that is exactly half populated is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without the sparse columns.
    """
    # Count non-null values once. The counts are matched to columns by label
    # (not by position), so integer column labels are handled correctly.
    non_null = df.count()
    sparse_cols = non_null.index[non_null < df.shape[0] / 2]
    return df.drop(sparse_cols, axis=1)


def drop_na(df, target):
    """Drop every row that contains a missing value.

    Rows with a missing value in the `target` column(s) are removed first,
    then rows with a missing value in any remaining column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    target : str or list of str
        Label column name, or a list of column names.

    Returns
    -------
    pandas.DataFrame
        A new frame with no missing values.

    Raises
    ------
    KeyError
        If a name in `target` is not a column of `df`.
    """
    # Accept a single column name as well as a list of names.
    if isinstance(target, str):
        target = [target]
    # The subset pass is kept so that an unknown target column is reported
    # (KeyError) instead of being silently ignored by the full dropna below.
    df = df.dropna(axis=0, subset=target)
    df = df.dropna(axis=0)
    return df


def drop_specific_value(df, target, value):
    """Drop the rows whose `target` column equals any of the given values.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    target : str
        Name of the column to inspect.
    value : scalar or iterable of scalars
        Value(s) to remove. A single string or number is treated as a
        one-element list.

    Returns
    -------
    pandas.DataFrame
        A new frame without the matching rows.
    """
    # A bare string is iterable character by character, which is never what
    # the caller means here, so wrap scalars (including str) in a list.
    if isinstance(value, (str, bytes)) or not hasattr(value, '__iter__'):
        value = [value]
    for item in value:
        # Compare against the current item, not against the whole collection.
        df = df.drop(df[df[target] == item].index, axis=0)
    return df

In [10]:
"""Load dataset"""
# Read the Excel workbook into a DataFrame and report its dimensions.
df = pd.read_excel('./datasets/f2_r1_crc_emr_191208ver.xlsx')
print('Original Dataset Shape : {} rows, {} cols'.format(*df.shape))

# df.count()

Original Dataset Shape : 1512 rows, 38 cols


In [11]:
""" Save data summary to excel
df_summary = df.describe()
df_summary.to_excel('summary.xlsx', sheet_name='sheet1')
"""

# Name of the label column.
target = 'Postop_Chemo_Regimen'
# Other cells in this notebook refer to the label column as `target_col`;
# define it here so they work on a fresh kernel.
target_col = target
# Display names of the regimen classes.
classes = np.array(['5-FU/LV', 'XELODA', 'FOLFOX', 'FOLFIRI', 'Surveillance'])

In [12]:
# Remove the columns where more than half of the values are missing.
_df = drop_na_cols(df)
print('After drop Columns : {} rows, {} cols'.format(*_df.shape))

After drop Columns : 1512 rows, 34 cols


In [13]:
# Remove rows with a missing label, then rows with any other missing value.
_df = drop_na(_df, [target])
print('After drop NAs : {} rows, {} cols'.format(*_df.shape))

After drop NAs : 1358 rows, 34 cols


In [14]:
"""Drop Special Cases in Postop_Chemo_Regimen"""


After drop NAs : 1358 rows, 34 cols


In [None]:
"""Drop Special Cases in Postop_Chemo_Regimen"""
# Rows whose value in a column is one of the listed special-case codes are
# removed. Columns are processed in the order given, and the frame's shape is
# printed after each column so the row loss per column stays visible.
special_cases = [
    (target_col, ['3 + 7', '3 + 6', '5 + 7', 8, 4, 7]),
    ('Heart_disease', ['2, 5', '1, 5', '1, 4', '1, 3', '1, 2', 2, 3, 5, 4]),
    ('Intraoperative_tumor_location', ['1, 2', '1, 1', '2, 2', '1, 2, 2']),
    ('Histologic_type', [2, 3]),
    ('K-ras', [22]),
    ('N-ras', [21, 22]),
]

# Columns whose remaining value distribution is printed after filtering.
show_counts = {'Intraoperative_tumor_location'}

for col, codes in special_cases:
    for code in codes:
        df = df.drop(df[df[col] == code].index, axis=0)
    print(df.shape)
    if col in show_counts:
        print(df[col].value_counts())

In [4]:
def drop_null_cols(df):
    """Drop the columns in which more than half of the values are missing.

    A column is removed when its non-null count is strictly less than half
    the number of rows; a column that is exactly half populated is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without the sparse columns.
    """
    # Compute the per-column non-null counts once instead of once per column,
    # and select the sparse columns by label rather than by integer position.
    non_null = df.count()
    null_cols = non_null.index[non_null < df.shape[0] / 2]
    return df.drop(null_cols, axis=1)


def drop_target_null_rows(df, target):
    """Discard the rows that have a missing value in the `target` column(s).

    `target` is passed straight to ``DataFrame.dropna`` as ``subset``; missing
    values in other columns are left alone. A new frame is returned.
    """
    return df.dropna(axis=0, subset=target)


# Clean-up: drop sparse columns, then rows without a label, then rows with
# any remaining missing value.
df = drop_target_null_rows(drop_null_cols(df), [target_col]).dropna(axis=0)

print(df.shape)
#print(df[target_col].value_counts())

(1358, 34)


(1288, 34)
(1231, 34)
(1188, 34)
1    656
2    532
Name: Intraoperative_tumor_location, dtype: int64
(1186, 34)
(1180, 34)
(1169, 34)


In [7]:
# Columns parsed as numbers; every other column is converted to string.
numeric_cols = ['Age', 'BMI', 'Initial_CEA', 'Harvested_LN', 'Positive_LN', 'OS']
categorical_cols = df.columns[~df.columns.isin(numeric_cols)].tolist()
# NOTE(review): dummy_cols is not used by any cell visible here — presumably
# consumed by a later encoding step; confirm.
dummy_cols = [
    'ASA', 'Smoking_history', 'Histologic_type', 'LVI', 'PNI',
    'Intraoperative_tumor_location', 'Tumor_location_pathology', 'Radial margin', 'pTNM',
]

for col in numeric_cols:
    df[col] = pd.to_numeric(df[col])

for col in categorical_cols:
    df[col] = df[col].astype(str)

# Summary of how many columns ended up with each dtype.
print(df.dtypes.value_counts())

object     28
float64     6
dtype: int64


In [8]:
"""
1 : 5-FU/LV
2 : XELODA
3 : FOLFOX
5 : FOLFIRI
"""
# Split by the string-encoded Sex column.
df_m = df.loc[df['Sex'] == '0.0']
df_fm = df.loc[df['Sex'] == '1.0']

# Regimen codes to summarise.
# NOTE(review): '9' is not in the code legend of this cell — presumably
# 'Surveillance'; confirm.
des_list = ['1', '2', '3', '5', '9']

# Print count / mean / std (the first three rows of describe()) per regimen.
for code in des_list:
    print(df[df[target_col] == code].describe().iloc[0:3])
    print('')

#df.describe()


              Age         BMI  Initial_CEA  Harvested_LN  Positive_LN  \
count  398.000000  398.000000   398.000000    398.000000   398.000000   
mean    61.329146   22.977663     9.867261     22.118090     1.379397   
std     11.033711    3.161303    51.640466     18.754106     3.323722   

               OS  
count  398.000000  
mean    90.977387  
std     41.155869  

             Age        BMI  Initial_CEA  Harvested_LN  Positive_LN         OS
count  42.000000  42.000000    42.000000     42.000000    42.000000  42.000000
mean   69.261905  23.561905    21.081905     22.619048     2.404762  65.238095
std    10.063359   3.084740    40.171566     18.356832     4.096754  40.917412

              Age         BMI  Initial_CEA  Harvested_LN  Positive_LN  \
count  323.000000  323.000000   323.000000    323.000000   323.000000   
mean    60.325077   23.245913    14.963251     25.300310     4.105263   
std     10.339105    3.313012    81.617385     16.724092     6.579759   

               O

In [9]:
from sklearn.utils import resample

# Fixed seed so the bootstrap sample, and the oversampled frame built from it,
# is identical on every Restart & Run All.
BOOTSTRAP_SEED = 42

# Bootstrap sample five times the size of df.
X_boot = resample(df, n_samples=len(df) * 5, random_state=BOOTSTRAP_SEED)

# Keep only the bootstrapped rows of regimen codes '2' (XELODA) and
# '5' (FOLFIRI).
X_boot_xeloda = X_boot[X_boot[target_col] == '2']
X_boot_folfiri = X_boot[X_boot[target_col] == '5']

print(X_boot_xeloda.shape)
print(X_boot_folfiri.shape)


(207, 34)
(173, 34)


In [10]:
# Append the bootstrapped XELODA and FOLFIRI rows to the original rows, then
# show the size before and after.
X = pd.concat([df, X_boot_xeloda, X_boot_folfiri])
for frame in (df, X):
    print(frame.shape)

(1169, 34)
(1549, 34)


In [11]:

"""
1 : 5-FU/LV
2 : XELODA
3 : FOLFOX
5 : FOLFIRI
"""
# Label distribution of the original frame (df), then of the augmented one (X).
for frame in (df, X):
    print(frame[target_col].value_counts())

1    398
9    371
3    323
2     42
5     35
Name: Postop_Chemo_Regimen, dtype: int64
1    398
9    371
3    323
2    249
5    208
Name: Postop_Chemo_Regimen, dtype: int64
