#### Import Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer

In [2]:
working_directory = '/Users/a.su/Documents/MultiClassCancer_RNAseq_CNV_lncRNA/'

#Import Data
y_multi = pd.read_table(working_directory + 'CancerTypes_y_multiClass.txt', sep = '\t', header = None)
x_cnv = pd.read_table(working_directory + 'CNV_processed_multiClass.txt', sep = '\t', header = 0)
x_rna = pd.read_table(working_directory + 'RNAseq_processed_multiClass.txt', sep = '\t', header = 0)
x_lnc = pd.read_table(working_directory + 'lncRNA_processed_multiClass.txt', sep = '\t', header = 0)

  interactivity=interactivity, compiler=compiler, result=result)


#### Process X data

In [3]:
def print_dropped_columns(df, df_dropped, df_name):
    print('Dropped {0} Columns from {1}'.format(len(df.columns) - len(df_dropped.columns), df_name))
    
def count_all_zeros (df, axis = 0):
    return len(df.columns) - np.count_nonzero(df.sum(axis = axis), axis = axis)

def preprocess_x(df, df_name, drop_threshold = 0.5):
    
    drop_theshold = drop_threshold*len(df.index)
    df = df.drop('GeneID', axis = 1)    #Remove GeneID column
    df = df.transpose()    #Transpose
    print('{0} shape is {1}'.format(df_name, df.shape))
    
    #Drop columns with more than drop_threshold NaN values
    df_dropped = df.dropna(thresh = drop_threshold)
    print_dropped_columns(df, df_dropped, df_name)
    
    #Impute
    impute_median = SimpleImputer(strategy = 'median')
    #impute_median = Imputer(strategy = 'median') #Use for delta
    df_imputed = pd.DataFrame(impute_median.fit_transform(df_dropped))
    print('Are there NaN values in {0}? {1}'.format(df_name, np.isnan(df_imputed).any().any()))
    
    #Count columns with all zeros
    print('{0} has {1} column(s) with all zeros'.format(df_name, count_all_zeros(df_imputed, 0)))
    #Delete columns with all zeros
    df_nozero = df_imputed.loc[:, (df_imputed != 0).any(axis = 0)]
    print_dropped_columns(df_imputed, df_nozero, df_name)
    print('{0} has {1} column(s) with all zeros'.format(df_name, count_all_zeros(df_nozero, 0)))
    
    #Scale data
    zscore = lambda x: (x-x.mean())/x.std()
    df_processed = df_nozero.transform(zscore)
    print('Are there NaN values in {0}? {1}'.format(df_name, np.isnan(df_processed).any().any()))
    
    #Visualise
    rand_columns = np.random.choice(df_processed.columns.values, size = 5, replace = False)
    print(df_processed[rand_columns].describe())
    print('Are there NaN values in {0}? {1}'.format(df_name, np.isnan(df_processed).any().any()))
    
    #Reset Index
    df_processed = df_processed.reset_index(drop = True)
    print('Are there NaN values in {0}? {1}'.format(df_name, np.isnan(df_processed).any().any()))
    
    return df_processed

In [4]:
x_cnv_processed = preprocess_x(x_cnv, 'x_cnv', 0.5)
x_rna_processed = preprocess_x(x_rna, 'x_rna', 0.5)
x_lnc_processed = preprocess_x(x_lnc, 'x_lnc', 0.5)

x_cnv shape is (668, 26374)
Dropped 0 Columns from x_cnv
Are there NaN values in x_cnv? False
x_cnv has 0 column(s) with all zeros
Dropped 0 Columns from x_cnv
x_cnv has 0 column(s) with all zeros
Are there NaN values in x_cnv? False
              5852          20791         5638          15875         16031
count  6.680000e+02  6.680000e+02  6.680000e+02  6.680000e+02  6.680000e+02
mean  -7.594557e-16 -2.183882e-16 -5.222037e-16 -3.593267e-16 -8.443013e-17
std    1.000000e+00  1.000000e+00  1.000000e+00  1.000000e+00  1.000000e+00
min   -3.019957e+00 -3.180485e+00 -3.228431e+00 -1.828581e+00 -2.977280e+00
25%   -6.640842e-01 -6.043176e-01 -6.656139e-01 -5.297589e-01 -5.238135e-01
50%    8.612865e-02 -1.137956e-01 -1.468127e-01 -2.142764e-01 -7.449771e-02
75%    5.637543e-01  6.476674e-01  6.053930e-01  3.462438e-01  5.251106e-01
max    3.565380e+00  2.824716e+00  4.453621e+00  8.106310e+00  9.132629e+00
Are there NaN values in x_cnv? False
Are there NaN values in x_cnv? False
x_rna sh

In [23]:
#Combine data as pairs
x_cr = pd.concat([x_cnv_processed, x_rna_processed], axis = 1)
x_cl = pd.concat([x_cnv_processed, x_lnc_processed], axis = 1)
x_rl = pd.concat([x_rna_processed, x_lnc_processed], axis = 1)
print('x_cr shape is:{0}  x_cl shape is:{1}  x_rl shape is:{2}'.format(x_cr.shape, x_cl.shape, x_rl.shape))

#Combine all data 
x_all = pd.concat([x_cnv_processed, x_rna_processed, x_lnc_processed], axis = 1)
print(x_all.shape)

x_cr shape is:(668, 52357)  x_cl shape is:(668, 47770)  x_rl shape is:(668, 47379)
(668, 73753)


#### Process Y data

In [8]:
y_condensed = y_multi
#Make a new column containing the number instances a cancer type occurs
y_condensed['Instances'] = y_condensed[0].map(y_condensed[0].value_counts())
#Replace cancer type with 'Other' if that cancer type occurs less than 30 times
y_condensed[0].where(y_condensed['Instances']>=30, 'Other', inplace = True)
#Drop Instances column
y_condensed.drop(columns = ['Instances'], inplace = True)

In [10]:
y_condensed[0].value_counts()

Other                       238
Adenocarcinoma              146
Carcinoma                    98
Melanoma                     48
Carcinoma Non-Small Cell     47
Adenocarcinoma Ductal        31
Carcinoma Small Cell         30
Carcinoma Squamous Cell      30
Name: 0, dtype: int64

In [11]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

Using TensorFlow backend.


In [12]:
#Encode Cancer Types as Integers
le = LabelEncoder()
y_integers = le.fit_transform(y_condensed.values)
#One hot encoding 
y_encoded = to_categorical(y_integers)
y_encoded[0:5]

  y = column_or_1d(y, warn=True)


array([[0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0.]], dtype=float32)

### Neural Network

In [13]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras import regularizers

In [14]:
x_train, x_test, y_train, y_test = train_test_split(x_all, y_encoded, test_size = 0.20, random_state = 0)

In [24]:
model = Sequential()
model.add(Dense(32, activation = 'relu', kernel_regularizer = regularizers.l1_l2(0.01, 0.01)))
model.add(Dense(128, activation = 'relu'))
model.add(Dense(8, activation = 'softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [None]:
model.fit(x_train.values, y_train, epochs = 50, batch_size = 64, verbose = 1)

In [27]:
model.evaluate(x_test, y_test)



[8.386947062478137, 0.5223880597014925]

In [18]:
from sklearn.metrics import confusion_matrix

In [19]:
y_prediction = model.predict(x_test)

In [20]:
y_prediction[8]

array([0.20643444, 0.09555611, 0.48700547, 0.03400508, 0.02761224,
       0.04988152, 0.01247051, 0.08703456], dtype=float32)

In [21]:
y_pred_int = np.argmax(y_prediction, axis = 1)

In [22]:
y_pred_int

array([7, 2, 7, 3, 7, 4, 6, 0, 2, 4, 2, 6, 0, 0, 0, 2, 0, 6, 7, 7, 0, 4,
       1, 7, 5, 4, 7, 2, 0, 0, 7, 7, 0, 7, 3, 3, 4, 2, 7, 2, 3, 7, 3, 6,
       0, 7, 0, 7, 2, 0, 7, 2, 7, 3, 3, 7, 6, 7, 0, 0, 0, 7, 4, 7, 0, 6,
       1, 6, 0, 7, 7, 7, 2, 7, 7, 7, 7, 7, 7, 2, 7, 6, 2, 2, 0, 6, 0, 7,
       2, 7, 3, 7, 3, 3, 7, 7, 7, 3, 0, 0, 7, 2, 7, 2, 0, 7, 7, 0, 7, 0,
       2, 0, 0, 3, 0, 7, 2, 7, 7, 6, 3, 0, 0, 1, 2, 4, 7, 0, 2, 7, 0, 7,
       2, 3])

#### ROC Curve

In [44]:
def create_model():
    model = Sequential()
    model.add(Dense(32, activation = 'relu', kernel_regularizer = regularizers.l1_l2(0.01, 0.01)))
    model.add(Dense(128, activation = 'relu'))
    model.add(Dense(8, activation = 'softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return model

def train_predict(x_data, y, model_fn):
    predictions = []
    for x, label in x_data:
        #train_test_split with same random state outputs same split each time, apparently
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 0)
        model = model_fn()
        print('Training: ' + label)
        model.fit(x_train.values, y_train, epochs = 150, batch_size = 64, verbose = 0)
        print(model.evaluate(x_test, y_test))
        y_pred = model.predict(x_test)
        predictions.append([y, y_pred, label])
    return predictions

def roc_graph(data, title = 'ROC Curve'):
    for y_test, y_prediction, name in data:
        fpr_temp, tpr_temp, thresholds_temp = roc_curve(y_test, y_prediction)
        auc_temp = auc(fpr_temp, tpr_temp)
        plt.plot(fpr_temp, tpr_temp, label = name + ' (AUC: {0:0.2f})'.format(auc_temp))
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.title(title)
    plt.show

In [45]:
x_data = [
    [x_cnv_processed, 'CNV Only'],
    [x_rna_processed, 'RNA Only'],
    [x_lnc_processed, 'LNC Only'],
    [x_cr, 'CNV + RNA'],
    [x_cl, 'CNV + LNC'],
    [x_rl, 'RNA + LNC'],
    [x_all, 'CNV + RNA + LNC']
]

In [43]:
predictions = train_predict(x_data, y_encoded, create_model)

Training: CNV Only
[4.653413274394932, 0.4253731343283582]
Training: RNA Only
[6.633053252946085, 0.5671641791044776]
Training: LNC Only
[2.878255719569192, 0.6417910447761194]
Training: CNV + RNA
[6.614955183285386, 0.5373134332806316]
Training: CNV + LNC
[5.401017395418082, 0.6194029859642485]
Training: RNA + LNC
[6.256009735278229, 0.6044776128299201]
Training: CNV + RNA + LNC
[8.069274546495125, 0.5373134337254425]


In [35]:
from sklearn.metrics import roc_curve, auc

In [40]:
roc_graph(predictions)

ValueError: multilabel-indicator format is not supported

#### Cross Validation

In [86]:
from keras.optimizers import Adam

In [87]:
#adam = Adam(lr = 0.5)

In [49]:
#128, 32 gives 20% accuracy 10% std
#1024, 1024 gives 23.5% accuracy 7.6% std

def create_model():
    model = Sequential()
    model.add(Dense(32, activation = 'sigmoid', kernel_regularizer = regularizers.l1(0.01)))
    model.add(Dense(32, activation = 'relu'))
    model.add(Dense(8, activation = 'softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

estimator = KerasClassifier(build_fn = create_model, epochs = 100, batch_size = 64)

In [50]:
kfold = KFold(n_splits=5, shuffle = True)

In [51]:
results = cross_val_score(estimator, x_train.values, y_train, cv = kfold)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [53]:
print('Accuracy: {0}% ({1}%)'.format(results.mean()*100, results.std()*100))

Accuracy: 34.45071458974186% (5.615438121359801%)
