# C. Elegans DNA

Import librarires for later use

In [1]:
import numpy as np
import sklearn
import pandas as pd
%load_ext autoreload
%autoreload 2

Read the C.Elegens .csv file. We add our own headers - labels stands for whether there is a splice site or not and the DNA is a string repressinting the DNA

In [2]:
df = pd.read_csv('exercise_data/C_elegans_acc_seq.csv', header=None, names=['labels', 'DNA'])

### Mapping DNA to a vector

We will map the DNA into a vector, by mapping each Character (A,T,C,G) into a one-hot vector and then concatonating all these vectors together. As we have a string of 82 Characters this gives us a final vector of length 248

In [3]:
import utility
df['DNA'] = df['DNA'].map(utility.map_dna_into_vector)

### Doing the Test-Train-Split

In [4]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.2)
train_data = np.vstack(train['DNA'].values)
test_data  = np.vstack(test['DNA'].values)

### Creating DataFrame for later Evaluation

In [5]:
eval_df = pd.DataFrame(data=[], columns=['Name', 'AUROC', 'AUPRC', 'f1_cv'])

## Models

In [6]:
from sklearn.model_selection import GridSearchCV

In [7]:
def evaluate_model(model, params, df, train, test):
    # Put Data into a readable Matrix format
    train_data = np.vstack(train['DNA'].values)
    test_data  = np.vstack(test['DNA'].values)
    
    # Create Instance of our Model
    m = model()
    
    # Search for the best params in our model and print the best score
    clf = GridSearchCV(m, params, scoring='f1', cv=5, n_jobs=-1)
    clf.fit(train_data, train['labels'].values)
    print(f"The best score was: {clf.best_score_}")
    
    # Train our best model on the whole train-dataset
    best_estimator = model(**clf.best_params_)
    best_estimotor = best_estimator.fit(train_data, train['labels'].values)
    
    # Evaluate on the Test set
    pred_val = best_estimator.predict(test_data)
    true_val = test['labels'].values
    auroc, auprc = utility.get_scores(true_val, pred_val)
    
    # Append to our Dataframe
    df = df.append({'Name': model.__name__, 'AUROC':auroc , 'AUPRC': auprc, 'f1_cv':clf.best_score_}, ignore_index=True)
    return (best_estimator, df)

### Logistic Regression

In [8]:
from sklearn.linear_model import LogisticRegression

In [9]:
params = {
    'penalty': ['l1', 'l2'],
    'C': [1, 10, 100],
    'class_weight': ['balanced']
}

In [10]:
lg_best_estimator, eval_df = evaluate_model(LogisticRegression, params, eval_df, train, test)

The best score was: 0.7714376328291473




### SVC

In [11]:
from sklearn.svm import SVC

In [12]:
params = {'kernel': ['linear', 'rbf', 'poly'],
          'C': [1, 10, 100],
          'class_weight': ['balanced'],
          'gamma': ['auto', 'scale']
         }

In [13]:
svc_best_estimator, eval_df = evaluate_model(SVC, params, eval_df, train, test)

The best score was: 0.837142344660541


### Random Forest

In [14]:
from sklearn.ensemble import RandomForestClassifier

In [15]:
params = {
    'n_estimators':[10, 100, 300],    
    'class_weight': ['balanced', 'balanced_subsample']
}

In [16]:
rfc_best_estimator, eval_df = evaluate_model(RandomForestClassifier, params, eval_df, train, test)

The best score was: 0.3918118466898955


###  Gaussian Process Classifer

In [17]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF, PairwiseKernel

In [18]:
params = {
    'kernel' : [RBF(), PairwiseKernel()]
}

In [20]:
gpc_best_estimator, eval_df = evaluate_model(GaussianProcessClassifier, params, eval_df, train, test)

The best score was: 0.7305223215790513


### DL Model

In [21]:
import tensorflow as tf
from tensorflow.python.keras.layers import BatchNormalization,Conv1D,Input,Add,Dense,Flatten
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.optimizers import Adam

In [22]:
def f1(y_true, y_pred):
    y_true = tf.cast(y_true, "int32")
    print(y_true)
    y_pred = tf.cast(tf.round(y_pred), "int32") # implicit 0.5 threshold via tf.round
    y_correct = tf.cast(tf.equal(y_true,y_pred),"int32")
    print(y_correct)
    sum_true = tf.reduce_sum(y_true, axis=1)
    sum_pred = tf.reduce_sum(y_pred, axis=1)
    sum_correct = tf.reduce_sum(y_correct, axis=1)
    precision = sum_correct / sum_pred
    recall = sum_correct / sum_true
    f_score = 2 * precision * recall / (1 * precision + recall)
    f_score = tf.where(tf.is_nan(f_score), tf.zeros_like(f_score), f_score)
    return tf.reduce_mean(f_score)

def add_RB(x):
    xout=BatchNormalization()(x)
    xout=Conv1D(filters=32,kernel_size=11,dilation_rate=1,padding='same',activation='relu')(x)
    xout=BatchNormalization()(xout)
    xout=Conv1D(filters=32,kernel_size=11,dilation_rate=1,padding='same',activation='relu')(xout)
    return xout



In [23]:
tf.reset_default_graph()
x=Input(shape=[328,1])

x1=Conv1D(filters=32,kernel_size=1,dilation_rate=1,padding='same',activation='relu')(x)
xrb=add_RB(x1)
xrb=add_RB(xrb)
xrb=add_RB(xrb)
xrb=add_RB(xrb)

x2=Conv1D(filters=32,kernel_size=1,dilation_rate=1,padding='same',activation='relu')(xrb)
x3=Conv1D(filters=32,kernel_size=1,dilation_rate=1,padding='same',activation='relu')(x1)

xout=Conv1D(filters=1,kernel_size=1,dilation_rate=1,padding='same',activation='relu')(Add()([x2,x3]))
xout=Flatten()(xout)
xout=Dense(units=1,activation='sigmoid')(xout)

model=Model(x,xout)
model.compile(optimizer=Adam(),loss='binary_crossentropy',metrics=[f1])
class_wt={0:1,1:15}

Tensor("metrics/f1/Cast:0", shape=(?, ?), dtype=int32)
Tensor("metrics/f1/Cast_2:0", shape=(?, ?), dtype=int32)


In [24]:
train_data = np.vstack(train['DNA'].values)[:,:,None]
test_data  = np.vstack(test['DNA'].values)[:,:,None]

train_val=train['labels'].values
train_val[train_val==-1]=0
model.fit(x=train_data,y=train_val,batch_size=64,epochs=20,class_weight=class_wt)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1a28f6c978>

In [25]:
pred_val=(model.predict(test_data)>0.5).astype(np.int)
true_val=test['labels']
true_val[true_val==-1]=0
dl_mtr=utility.get_scores(true_val,pred_val)
dl_f1=utility.f1_score(true_val,pred_val)

eval_df = eval_df.append({'Name': 'DL Model', 'AUROC':dl_mtr[0] , 'AUPRC': dl_mtr[1], 'f1_cv':dl_f1}, ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [26]:
print(dl_mtr)
print(dl_f1)
print(eval_df)

(0.835, 0.8177899686520376)
0.7826086956521738
                        Name    AUROC     AUPRC     f1_cv
0         LogisticRegression  0.95500  0.829125  0.771438
1                        SVC  0.96000  0.940268  0.837142
2     RandomForestClassifier  0.57375  0.542208  0.391812
3  GaussianProcessClassifier  0.84500  0.801136  0.730522
4  GaussianProcessClassifier  0.84500  0.801136  0.730522
5                   DL Model  0.83500  0.817790  0.782609


## EValuation

In [27]:
eval_df

Unnamed: 0,Name,AUROC,AUPRC,f1_cv
0,LogisticRegression,0.955,0.829125,0.771438
1,SVC,0.96,0.940268,0.837142
2,RandomForestClassifier,0.57375,0.542208,0.391812
3,GaussianProcessClassifier,0.845,0.801136,0.730522
4,GaussianProcessClassifier,0.845,0.801136,0.730522
5,DL Model,0.835,0.81779,0.782609
