# C. Elegans DNA

Import libraries for later use

In [1]:
import numpy as np
import pandas as pd
%load_ext autoreload
%autoreload 2

Read the C.Elegens .csv file. We add our own headers - labels stands for whether there is a splice site or not and the DNA is a string representing the DNA

In [2]:
df = pd.read_csv('exercise_data/C_elegans_acc_seq.csv', header=None, names=['labels', 'DNA'])

### Doing the Test-Train-Split

In [3]:
from sklearn.model_selection import train_test_split

np.random.seed(28)
train, test = train_test_split(df, test_size=0.2)

# Make a copy of the raw dna sequences for later use with Shogun
train_raw = np.array(train)
test_raw = np.array(test)
X_train = train_raw[:,1]
y_train = train_raw[:,0]
X_test = test_raw[:,1]
y_test = test_raw[:,0]

In [4]:
# Check the label proportions are similar. stratify=True for splitting threw a weird exception
print(100*np.sum(df['labels']==1)/df.shape[0])
print(100*np.sum(train['labels']==1)/train.shape[0])
print(100*np.sum(test['labels']==1)/test.shape[0])

9.090909090909092
8.977272727272727
9.545454545454545


### Mapping DNA to a vector

We will map the DNA into a vector, by mapping each Character (A,T,C,G) into a one-hot vector and then concatenating all these vectors together. As we have a string of 82 Characters this gives us a final vector of length 328

In [5]:
import utility
train['DNA'] = train['DNA'].map(utility.map_dna_into_vector)
test['DNA'] = test['DNA'].map(utility.map_dna_into_vector)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


### Creating DataFrame for later Evaluation

In [6]:
f1_eval_df = pd.DataFrame(data=[], columns=['Name', 'AUROC', 'AUPRC', 'f1_cv', 'f1_test'])
auroc_eval_df = pd.DataFrame(data=[], columns=['Name', 'AUROC', 'AUPRC', 'f1_cv', 'f1_test'])

## Models

In [7]:
from sklearn.model_selection import GridSearchCV

In [8]:
def evaluate_model(model, params, df, train, test, metric):
    # Put Data into a readable Matrix format
    train_data = np.vstack(train['DNA'].values)
    test_data  = np.vstack(test['DNA'].values)
    
    # Create Instance of our Model
    m = model()
    
    # Search for the best params in our model and print the best score
    clf = GridSearchCV(m, params, scoring=metric, cv=5, n_jobs=-1)
    clf.fit(train_data, train['labels'].values)
    print(f"The best score was: {clf.best_score_}")
    
    # Train our best model on the whole train-dataset
    best_estimator = model(**clf.best_params_)
    best_estimator.fit(train_data, train['labels'].values)
    
    # Evaluate on the Test set
    pred_val = best_estimator.predict(test_data)
    true_val = test['labels'].values
    auroc, auprc, f1_test = utility.get_scores(true_val, pred_val)
    
    # Append to our Dataframe
    df = df.append({'Name': model.__name__, 'AUROC':auroc, 'AUPRC': auprc, 'f1_cv':clf.best_score_, 'f1_test':f1_test}, ignore_index=True)
    return (best_estimator, df)

### Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression

In [10]:
params = {
    'penalty': ['l1', 'l2'],
    'C': [1, 10, 100],
    #'class_weight': ['balanced']
}

In [11]:
lg_best_estimator, f1_eval_df = evaluate_model(LogisticRegression, params, f1_eval_df, train, test, 'f1')
_, auroc_eval_df = evaluate_model(LogisticRegression, params, auroc_eval_df, train, test, 'roc_auc')



The best score was: 0.7817207800759418
The best score was: 0.9813946282451627




### SVC

In [12]:
from sklearn.svm import SVC

In [13]:
params = {'kernel': ['linear', 'rbf', 'poly'],
          'C': [1, 10, 100],
         # 'class_weight': ['balanced'],
          'gamma': ['auto', 'scale']
         }

In [14]:
svc_best_estimator, f1_eval_df = evaluate_model(SVC, params, f1_eval_df, train, test, 'f1')
_, auroc_eval_df = evaluate_model(SVC, params, auroc_eval_df, train, test, 'roc_auc')

The best score was: 0.7314899728725189
The best score was: 0.9835096896668676


  'precision', 'predicted', average, warn_for)


### Random Forest

In [15]:
from sklearn.ensemble import RandomForestClassifier

In [16]:
params = {
    'n_estimators':[10, 100, 300],    
   # 'class_weight': ['balanced', 'balanced_subsample']
}

In [17]:
rfc_best_estimator, f1_eval_df = evaluate_model(RandomForestClassifier, params, f1_eval_df, train, test, 'f1')
_, auroc_eval_df = evaluate_model(RandomForestClassifier, params, auroc_eval_df, train, test, 'roc_auc')



The best score was: 0.2974020371617215
The best score was: 0.9795150774483892


The best score was: 0.9757707661004833


## SVM + String kernels (using SHOGUN)

In [22]:
import shogun as sg

features_train = sg.StringCharFeatures(list(X_train), sg.DNA)
labels_train = sg.BinaryLabels(y_train.astype(int))
features_test = sg.StringCharFeatures(list(X_test), sg.DNA)
labels_test = sg.BinaryLabels(y_test.astype(int))

### Weighted Degree Kernel

We perform optimization via grid search

In [18]:
#Root
#param_tree_root = sg.ModelSelectionParameters()

#Parameter C
#C = sg.ModelSelectionParameters("C")
#param_tree_root.append_child(C)
#C.build_values(1, 10, sg.R_LINEAR, 1, 2)
#C.set_values(1,2,3)

#kernel = sg.WeightedDegreeStringKernel(features_train, features_train, kernel_degree)
#svm = sg.LibSVM(C, kernel, labels_train)

#C.print_tree()

In [24]:
C = 2
kernel_degree = 3

kernel = sg.WeightedDegreeStringKernel(features_train, features_train, kernel_degree)
svm = sg.LibSVM(C, kernel, labels_train)

# Cross validation
stratified_split = sg.StratifiedCrossValidationSplitting(labels_train, 5)
metric = sg.F1Measure()
cross = sg.CrossValidation(svm, features_train, labels_train, stratified_split, metric)
# 25 runs and 95% confidence intervals
cross.set_num_runs(25)
cross.set_autolock(False)
result = cross.evaluate()
cv_score = sg.CrossValidationResult.obtain_from_generic(result).get_mean()
print("CV score", metric.get_name(), cv_score)

CV score F1Measure 0.8056770322238286


Train on whole train dataset to evaluate test performance

In [19]:
svm.train()
pred_val = svm.apply(features_test).get_labels()
auroc, auprc, f1_test = utility.get_scores(y_test.astype(int), pred_val)
    
# Append to our Dataframe
f1_eval_df = f1_eval_df.append({'Name': 'WDK_' + str(kernel_degree), 'AUROC':auroc, 'AUPRC': auprc, 'f1_cv':cv_score, 'f1_test':f1_test}, ignore_index=True)

### Fixed Degree String Kernel

In [27]:
C = 2
kernel_degree = 3

kernel = sg.FixedDegreeStringKernel(features_train, features_train, kernel_degree)
svm = sg.LibSVM(C, kernel, labels_train)

# Cross validation
stratified_split = sg.StratifiedCrossValidationSplitting(labels_train, 5)
metric = sg.F1Measure()
cross = sg.CrossValidation(svm, features_train, labels_train, stratified_split, metric)
# 25 runs and 95% confidence intervals
cross.set_num_runs(25)
cross.set_autolock(False)
result = cross.evaluate()
cv_score = sg.CrossValidationResult.obtain_from_generic(result).get_mean()
print("CV score", metric.get_name(), cv_score)

CV score F1Measure 0.7802076269985179


Train on whole train dataset to evaluate test performance

In [28]:
svm.train()
pred_val = svm.apply(features_test).get_labels()
auroc, auprc, f1_test = utility.get_scores(y_test.astype(int), pred_val)
    
# Append to our Dataframe
f1_eval_df = f1_eval_df.append({'Name': 'FDK_' + str(kernel_degree), 'AUROC':auroc, 'AUPRC': auprc, 'f1_cv':cv_score, 'f1_test':f1_test}, ignore_index=True)

### Oligo String Kernel

In [29]:
C = 2
kernel_degree = 3
kernel_width = 10

kernel = sg.OligoStringKernel(features_train, features_train, kernel_degree, kernel_width)
svm = sg.LibSVM(C, kernel, labels_train)

# Cross validation
stratified_split = sg.StratifiedCrossValidationSplitting(labels_train, 5)
metric = sg.F1Measure()
cross = sg.CrossValidation(svm, features_train, labels_train, stratified_split, metric)
# 25 runs and 95% confidence intervals
cross.set_num_runs(1)
cross.set_autolock(False)
result = cross.evaluate()
cv_score = sg.CrossValidationResult.obtain_from_generic(result).get_mean()
print("CV score", metric.get_name(), cv_score)

CV score F1Measure 0.7418407239375421


Train on whole train dataset to evaluate test performance

In [21]:
gpc_best_estimator, f1_eval_df = evaluate_model(GaussianProcessClassifier, params, f1_eval_df, train, test, 'f1')
_, auroc_eval_df = evaluate_model(GaussianProcessClassifier, params, auroc_eval_df, train, test, 'roc_auc')



The best score was: 0.7282852086438152




The best score was: 0.9757707661004833


## SVM + String kernels (using SHOGUN)

In [22]:
import shogun as sg

features_train = sg.StringCharFeatures(list(X_train), sg.DNA)
labels_train = sg.BinaryLabels(y_train.astype(int))
features_test = sg.StringCharFeatures(list(X_test), sg.DNA)
labels_test = sg.BinaryLabels(y_test.astype(int))

ModuleNotFoundError: No module named 'shogun'

### Weighted Degree Kernel

We perform optimization via grid search

In [23]:
#Root
#param_tree_root = sg.ModelSelectionParameters()

#Parameter C
#C = sg.ModelSelectionParameters("C")
#param_tree_root.append_child(C)
#C.build_values(1, 10, sg.R_LINEAR, 1, 2)
#C.set_values(1,2,3)

#kernel = sg.WeightedDegreeStringKernel(features_train, features_train, kernel_degree)
#svm = sg.LibSVM(C, kernel, labels_train)

#C.print_tree()

In [24]:
C = 2
kernel_degree = 3

kernel = sg.WeightedDegreeStringKernel(features_train, features_train, kernel_degree)
svm = sg.LibSVM(C, kernel, labels_train)

# Cross validation
stratified_split = sg.StratifiedCrossValidationSplitting(labels_train, 5)
metric = sg.F1Measure()
cross = sg.CrossValidation(svm, features_train, labels_train, stratified_split, metric)
# 25 runs and 95% confidence intervals
cross.set_num_runs(25)
cross.set_autolock(False)
result = cross.evaluate()
cv_score = sg.CrossValidationResult.obtain_from_generic(result).get_mean()
print("CV score", metric.get_name(), cv_score)

CV score F1Measure 0.8056770322238286


Train on whole train dataset to evaluate test performance

In [25]:
svm.train()
pred_val = svm.apply(features_test).get_labels()
auroc, auprc, f1_test = utility.get_scores(y_test.astype(int), pred_val)
    
# Append to our Dataframe
f1_eval_df = f1_eval_df.append({'Name': 'WDK_' + str(kernel_degree), 'AUROC':auroc, 'AUPRC': auprc, 'f1_cv':cv_score, 'f1_test':f1_test}, ignore_index=True)

### Fixed Degree String Kernel

In [27]:
C = 2
kernel_degree = 3

kernel = sg.FixedDegreeStringKernel(features_train, features_train, kernel_degree)
svm = sg.LibSVM(C, kernel, labels_train)

# Cross validation
stratified_split = sg.StratifiedCrossValidationSplitting(labels_train, 5)
metric = sg.F1Measure()
cross = sg.CrossValidation(svm, features_train, labels_train, stratified_split, metric)
# 25 runs and 95% confidence intervals
cross.set_num_runs(25)
cross.set_autolock(False)
result = cross.evaluate()
cv_score = sg.CrossValidationResult.obtain_from_generic(result).get_mean()
print("CV score", metric.get_name(), cv_score)

CV score F1Measure 0.7802076269985179


Train on whole train dataset to evaluate test performance

In [28]:
svm.train()
pred_val = svm.apply(features_test).get_labels()
auroc, auprc, f1_test = utility.get_scores(y_test.astype(int), pred_val)
    
# Append to our Dataframe
f1_eval_df = f1_eval_df.append({'Name': 'FDK_' + str(kernel_degree), 'AUROC':auroc, 'AUPRC': auprc, 'f1_cv':cv_score, 'f1_test':f1_test}, ignore_index=True)

### Oligo String Kernel

In [29]:
C = 2
kernel_degree = 3
kernel_width = 10

kernel = sg.OligoStringKernel(features_train, features_train, kernel_degree, kernel_width)
svm = sg.LibSVM(C, kernel, labels_train)

# Cross validation
stratified_split = sg.StratifiedCrossValidationSplitting(labels_train, 5)
metric = sg.F1Measure()
cross = sg.CrossValidation(svm, features_train, labels_train, stratified_split, metric)
# 25 runs and 95% confidence intervals
cross.set_num_runs(1)
cross.set_autolock(False)
result = cross.evaluate()
cv_score = sg.CrossValidationResult.obtain_from_generic(result).get_mean()
print("CV score", metric.get_name(), cv_score)

CV score F1Measure 0.7418407239375421


Train on whole train dataset to evaluate test performance

In [30]:
svm.train()
pred_val = svm.apply(features_test).get_labels()
auroc, auprc, f1_test = utility.get_scores(y_test.astype(int), pred_val)
    
# Append to our Dataframe
f1_eval_df = f1_eval_df.append({'Name': 'OSK_' + str(kernel_degree) + '_' + str(kernel_width),
                                'AUROC':auroc, 'AUPRC': auprc, 'f1_cv':cv_score, 'f1_test':f1_test}, ignore_index=True)

### Weighted Degree Position String Kernel

In [31]:
C = 2
kernel_degree = 1

kernel = sg.WeightedDegreePositionStringKernel(features_train, features_train, kernel_degree)
svm = sg.LibSVM(C, kernel, labels_train)

# Cross validation
stratified_split = sg.StratifiedCrossValidationSplitting(labels_train, 5)
metric = sg.F1Measure()
cross = sg.CrossValidation(svm, features_train, labels_train, stratified_split, metric)
# 25 runs and 95% confidence intervals
cross.set_num_runs(25)
cross.set_autolock(False)
result = cross.evaluate()
cv_score = sg.CrossValidationResult.obtain_from_generic(result).get_mean()
print("CV score", metric.get_name(), cv_score)

CV score F1Measure 0.7844177775424716


Train on whole train dataset to evaluate test performance

In [32]:
svm.train()
pred_val = svm.apply(features_test).get_labels()
auroc, auprc, f1_test = utility.get_scores(y_test.astype(int), pred_val)
    
# Append to our Dataframe
f1_eval_df = f1_eval_df.append({'Name': 'WDPSK_' + str(kernel_degree),
                                'AUROC':auroc, 'AUPRC': auprc, 'f1_cv':cv_score, 'f1_test':f1_test}, ignore_index=True)

### DL Model

In [23]:
import tensorflow as tf
from tensorflow.python.keras.layers import BatchNormalization,Conv1D,Input,Add,Dense,Flatten
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.optimizers import Adam

def f1(y_true, y_pred):
    y_true = tf.cast(y_true, "int32")
    y_pred = tf.cast(tf.round(y_pred), "int32") # implicit 0.5 threshold via tf.round
    y_correct = tf.cast(tf.equal(y_true,y_pred),"int32")
    sum_true = tf.reduce_sum(y_true, axis=1)
    sum_pred = tf.reduce_sum(y_pred, axis=1)
    sum_correct = tf.reduce_sum(y_correct, axis=1)
    precision = sum_correct / sum_pred
    recall = sum_correct / sum_true
    f_score = 2 * precision * recall / (1 * precision + recall)
    f_score = tf.where(tf.is_nan(f_score), tf.zeros_like(f_score), f_score)
    return tf.reduce_mean(f_score)

def add_RB(x):
    xout=BatchNormalization()(x)
    xout=Conv1D(filters=32,kernel_size=11,dilation_rate=1,padding='same',activation='relu')(x)
    xout=BatchNormalization()(xout)
    xout=Conv1D(filters=32,kernel_size=11,dilation_rate=1,padding='same',activation='relu')(xout)
    return xout

In [24]:
tf.reset_default_graph()
x=Input(shape=[328,1])

x1=Conv1D(filters=32,kernel_size=1,dilation_rate=1,padding='same',activation='relu')(x)

xrb=add_RB(x1)
#xrb=add_RB(xrb)
#xrb=add_RB(xrb)
#xrb=add_RB(xrb)

x2=Conv1D(filters=32,kernel_size=1,dilation_rate=1,padding='same',activation='relu')(xrb)
x3=Conv1D(filters=32,kernel_size=1,dilation_rate=1,padding='same',activation='relu')(x1)

xout=Conv1D(filters=1,kernel_size=1,dilation_rate=1,padding='same',activation='relu')(Add()([x2,x3]))
xout=Flatten()(xout)
xout=Dense(units=1,activation='sigmoid')(xout)

model=Model(x,xout)
model.compile(optimizer=Adam(),loss='binary_crossentropy',metrics=[f1])
class_wt={0:1,1:15}

train_data = np.vstack(train['DNA'].values)[:,:,None]
test_data  = np.vstack(test['DNA'].values)[:,:,None]

train_val=train['labels'].values
train_val[train_val==-1]=0
model.fit(x=train_data,y=train_val,batch_size=64,epochs=20,class_weight=class_wt)

pred_val=(model.predict(test_data)>0.5).astype(np.int)
true_val=test['labels']
true_val[true_val==-1]=0
dl_mtr=utility.get_scores(true_val,pred_val)
dl_f1=utility.f1_score(true_val,pred_val)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


## Evaluation

In [25]:
f1_eval_df = f1_eval_df.append({'Name': 'DL Model',
                                'AUROC':dl_mtr[0], 'AUPRC': dl_mtr[1], 'f1_cv':'Nill', 'f1_test':dl_f1}, ignore_index=True)

In [27]:
f1_eval_df

Unnamed: 0,Name,AUROC,AUPRC,f1_cv,f1_test
0,LogisticRegression,0.869646,0.78256,0.781721,0.771084
1,SVC,0.882807,0.805523,0.73149,0.795181
2,RandomForestClassifier,0.570172,0.540909,0.297402,0.244898
3,GaussianProcessClassifier,0.86839,0.773268,0.728285,0.761905
4,GaussianProcessClassifier,0.86839,0.773268,0.728285,0.761905
5,DL Model,0.911642,0.873339,Nill,0.864198


In [28]:
auroc_eval_df

Unnamed: 0,Name,AUROC,AUPRC,f1_cv,f1_test
0,LogisticRegression,0.869646,0.78256,0.981395,0.771084
1,SVC,0.5,0.547727,0.98351,0.0
2,RandomForestClassifier,0.595238,0.633874,0.979515,0.32
3,GaussianProcessClassifier,0.86839,0.773268,0.975771,0.761905
4,GaussianProcessClassifier,0.86839,0.773268,0.975771,0.761905
