#### Import Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer

In [2]:
working_directory = '/Users/a.su/Documents/MultiClassCancer_RNAseq_CNV_lncRNA/'

#Import Data
y_multi = pd.read_table(working_directory + 'CancerTypes_y_multiClass.txt', sep = '\t', header = None)
x_cnv = pd.read_table(working_directory + 'CNV_processed_multiClass.txt', sep = '\t', header = 0)
x_rna = pd.read_table(working_directory + 'RNAseq_processed_multiClass.txt', sep = '\t', header = 0)
x_lnc = pd.read_table(working_directory + 'lncRNA_processed_multiClass.txt', sep = '\t', header = 0)

  interactivity=interactivity, compiler=compiler, result=result)


#### Process X data

In [34]:
def print_dropped_columns(df, df_dropped, df_name):
    print('Dropped {0} Columns from {1}'.format(len(df.columns) - len(df_dropped.columns), df_name))
    
def count_all_zeros (df, axis = 0):
    return len(df.columns) - np.count_nonzero(df.sum(axis = axis), axis = axis)

def preprocess_x(df, df_name, drop_threshold = 0.5):
    
    drop_theshold = drop_threshold*len(df.index)
    df = df.drop('GeneID', axis = 1)    #Remove GeneID column
    df = df.transpose()    #Transpose
    print('{0} shape is {1}'.format(df_name, df.shape))
    
    #Drop columns with more than drop_threshold NaN values
    df_dropped = df.dropna(thresh = drop_threshold)
    print_dropped_columns(df, df_dropped, df_name)
    
    #Impute
    impute_median = SimpleImputer(strategy = 'median')
    df_imputed = pd.DataFrame(impute_median.fit_transform(df_dropped))
    print('Are there NaN values in {0}? {1}'.format(df_name, np.isnan(df_imputed).all().any()))
    
    #Count columns with all zeros
    print('{0} has {1} column(s) with all zeros'.format(df_name, count_all_zeros(df_imputed, 0)))
    #Delete columns with all zeros
    df_nozero = df_imputed.loc[:, (df_imputed != 0).any(axis = 0)]
    print_dropped_columns(df_imputed, df_nozero, df_name)
    print('{0} has {1} column(s) with all zeros'.format(df_name, count_all_zeros(df_nozero, 0)))
    
    #Scale data
    zscore = lambda x: (x-x.mean())/x.std()
    df_processed = df_nozero.transform(zscore)
    print('Are there NaN values in {0}? {1}'.format(df_name, np.isnan(df_imputed).all().any()))
    
    #Visualise
    rand_columns = np.random.choice(df_processed.columns.values, size = 5, replace = False)
    print(df_processed[rand_columns].describe())
    
    #Reset Index
    df_processed = df.reset_index(drop = True)
          
    return df_processed

In [35]:
x_cnv_processed = preprocess_x(x_cnv, 'x_cnv', 0.5)
x_rna_processed = preprocess_x(x_rna, 'x_rna', 0.5)
x_lnc_processed = preprocess_x(x_lnc, 'x_lnc', 0.5)

x_cnv shape is (668, 26374)
Dropped 0 Columns from x_cnv
Are there NaN values in x_cnv? False
x_cnv has 0 column(s) with all zeros
Dropped 0 Columns from x_cnv
x_cnv has 0 column(s) with all zeros
Are there NaN values in x_cnv? False
              12017         3229          3332          16144         12364
count  6.680000e+02  6.680000e+02  6.680000e+02  6.680000e+02  6.680000e+02
mean   2.371481e-17 -8.443013e-17  2.007709e-16  5.052512e-17  1.003023e-16
std    1.000000e+00  1.000000e+00  1.000000e+00  1.000000e+00  1.000000e+00
min   -2.967428e+00 -3.531920e+00 -3.318285e+00 -3.078642e+00 -3.407484e+00
25%   -6.152205e-01 -6.186611e-01 -5.289799e-01 -5.408054e-01 -6.193547e-01
50%   -5.543567e-02 -1.036978e-01 -6.770077e-02 -1.202260e-01 -9.588946e-02
75%    5.902398e-01  5.991279e-01  5.839361e-01  5.549355e-01  6.274062e-01
max    3.163526e+00  2.890459e+00  3.223672e+00  5.775941e+00  3.270183e+00
x_rna shape is (668, 26094)
Dropped 0 Columns from x_rna
Are there NaN values in x

#### Process Y data

In [24]:
y_condensed = y_multi
#Make a new column containing the number instances a cancer type occurs
y_condensed['Instances'] = y_condensed[0].map(y_condensed[0].value_counts())
#Replace cancer type with 'Other' if that cancer type occurs less than 30 times
y_condensed[0].where(y_condensed['Instances']>=30, 'Other', inplace = True)
#Drop Instances column
y_condensed.drop(columns = ['Instances'], inplace = True)

In [27]:
y_condensed.head()

Unnamed: 0,0
0,Carcinoma Non-Small Cell
1,Other
2,Carcinoma Non-Small Cell
3,Carcinoma Non-Small Cell
4,Carcinoma Squamous Cell


In [33]:
y_condensed[0].value_counts()

Other                       238
Adenocarcinoma              146
Carcinoma                    98
Melanoma                     48
Carcinoma Non-Small Cell     47
Adenocarcinoma Ductal        31
Carcinoma Small Cell         30
Carcinoma Squamous Cell      30
Name: 0, dtype: int64

In [38]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

Using TensorFlow backend.


In [49]:
#Encode Cancer Types as Integers
le = LabelEncoder()
y_integers = le.fit_transform(y_condensed.values)
#One hot encoding 
y_encoded = to_categorical(y_integers)
y_encoded[0:5]

array([[0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0.]], dtype=float32)

In [81]:
y_integers

array([3, 7, 3, 3, 5, 3, 3, 0, 0, 0, 0, 7, 7, 3, 7, 0, 0, 7, 5, 0, 3, 7,
       0, 2, 2, 3, 2, 4, 3, 4, 4, 4, 6, 6, 2, 0, 0, 0, 6, 0, 6, 0, 7, 1,
       3, 3, 1, 2, 0, 0, 3, 3, 0, 7, 3, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0,
       0, 7, 7, 7, 7, 7, 7, 7, 7, 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0,
       2, 0, 0, 5, 0, 7, 7, 0, 2, 7, 2, 0, 0, 2, 0, 0, 6, 5, 7, 7, 5, 7,
       0, 7, 0, 7, 7, 7, 7, 6, 7, 4, 2, 3, 0, 0, 0, 2, 0, 2, 7, 2, 0, 4,
       0, 4, 5, 0, 7, 0, 7, 2, 7, 2, 2, 2, 2, 0, 7, 7, 0, 2, 7, 7, 7, 4,
       4, 7, 6, 7, 6, 7, 7, 7, 7, 7, 4, 4, 4, 4, 7, 7, 7, 2, 0, 7, 0, 2,
       3, 1, 1, 2, 7, 0, 7, 2, 7, 2, 7, 7, 7, 0, 2, 5, 0, 3, 7, 7, 7, 7,
       7, 7, 2, 3, 7, 7, 7, 7, 2, 2, 2, 2, 2, 7, 7, 7, 7, 7, 2, 5, 5, 5,
       5, 5, 7, 5, 5, 5, 5, 5, 5, 2, 7, 7, 6, 0, 2, 0, 0, 3, 3, 0, 7, 7,
       7, 7, 2, 7, 7, 7, 7, 2, 3, 4, 2, 4, 0, 7, 0, 7, 7, 2, 4, 4, 4, 7,
       4, 7, 2, 4, 2, 0, 7, 3, 7, 7, 7, 0, 3, 3, 7, 7, 0, 0, 0, 7, 7, 4,
       4, 4, 0, 7, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 0,

### Neural Network

In [68]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier

In [55]:
x_train, x_test, y_train, y_test = train_test_split(x_rna_processed, y_encoded, test_size = 0.25, random_state = 0)

In [59]:
model = Sequential()
model.add(Dense(32, activation = 'relu'))
model.add(Dense(32, activation = 'relu'))
model.add(Dense(8, activation = 'softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [64]:
model.fit(x_train.values, y_train, epochs = 100, batch_size = 64, verbose = 1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x1a2929d128>

In [65]:
model.evaluate(x_test, y_test)



[9.65155442175037, 0.4011976050581047]

#### Cross Validation

In [82]:
def create_model():
    model = Sequential()
    model.add(Dense(32, activation = 'relu'))
    model.add(Dense(32, activation = 'relu'))
    model.add(Dense(8, activation = 'softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

estimator = KerasClassifier(build_fn = create_model, epochs = 100, batch_size = 64)

In [83]:
kfold = KFold(n_splits=10, shuffle = True)

In [None]:
results = cross_val_score(estimator, x_rna_processed.values, y_encoded, cv = kfold)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [80]:
print('Accuracy: {0}% ({1}%)'.format(results.mean()*100, results.std()*100))

Accuracy: 24.142921763758267% (15.63019025160806%)
