# Human DNA

import libraries for later use

In [1]:
import numpy as np
import pandas as pd
import sklearn

In [2]:
df_train = pd.read_csv('exercise_data/human_dna_train_split.csv')
df_valid = pd.read_csv('exercise_data/human_dna_validation_split.csv')
df_test  = pd.read_csv('exercise_data/human_dna_test_split.csv')

In [3]:
print(f"Size of training_set: {df_train.shape}")
print(f"Shape of test set: {df_test.shape}")
print(f"Shape of validation set: {df_valid.shape}")

Size of training_set: (500000, 2)
Shape of test set: (33333, 2)
Shape of validation set: (33333, 2)


### UnderSampling

In [4]:
from imblearn.under_sampling import RandomUnderSampler, CondensedNearestNeighbour

In [5]:
def undersample_dataframe(df):
    X = np.vstack(df['sequences'].values)
    y = df['labels'].values
    undersampler = RandomUnderSampler()
    X_undersampled, y_undersampled = undersampler.fit_resample(X, y)
    undersampled_df = pd.DataFrame({'labels':y_undersampled, 'sequences': X_undersampled[:,0]})
    return undersampled_df

In [6]:
print(f"Size before undersampling {df_train.shape[0]}")
df_train = undersample_dataframe(df_train)
print(f"Size after undersampling {df_train.shape[0]}")

Size before undersampling 500000
Size after undersampling 2942


### Mapping DNA to a vector

We will map the DNA into a vector, by mapping each Character (A,T,C,G) into a one-hot vector and then concatonating all these vectors together. As we have a string of 398 Characters this gives us a final vector of length 1592. 

*This operation takes some time, please be patient*

In [7]:
import utility
df_train['sequences'] = df_train['sequences'].map(utility.map_dna_into_vector)
df_valid['sequences'] = df_valid['sequences'].map(utility.map_dna_into_vector)
df_test['sequences']  = df_test['sequences'].map(utility.map_dna_into_vector)

### Creating a DataFrame for later Evalution

In [8]:
eval_df = pd.DataFrame(data=[], columns=["Name", "AUROC", "AUPRC", "f1_score"])

## Models

In [9]:
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.metrics import f1_score

In [10]:
def evaluate_model(model, params, train, valid, test, eval_df):
    # Put Data into a usable Matrix Format
    train_data = np.vstack(train['sequences'].values)
    valid_data = np.vstack(valid['sequences'].values)
    test_data = np.vstack(valid['sequences'].values)
    
    combined_data = np.vstack([train_data, valid_data])
    combined_labels = np.hstack([train['labels'].values, valid['labels'].values])
    
    # Create Instance of the Model
    m = model()
    
    # Search for the best params in our model and print the best score
    grid = ParameterGrid(params)
    scores = []
    for grid_point in grid:
        m = model(**grid_point)
        m = m.fit(train_data, train['labels'].values)
        pred_val = m.predict(valid_data)
        score = f1_score(valid['labels'].values, pred_val)
        scores.append(score)
        
    best_score = np.max(scores)
    best_idx = np.argmax(scores)
    best_params = grid[best_idx]
    print(f"The best score was: {best_score}")
    
    
    # Train our best model on the whole train-dataset
    best_estimator = model(**best_params)
    best_estimotor = best_estimator.fit(combined_data, combined_labels)
    
    # Evaluate on the Test set
    pred_val = best_estimator.predict(test_data)
    true_val = test['labels'].values
    auroc, auprc = utility.get_scores(true_val, pred_val)
    
    # Append to our Dataframe
    eval_df = eval_df.append({'Name': model.__name__, 'AUROC':auroc , 'AUPRC': auprc, 'f1_cv':best_score}, ignore_index=True)
    return (best_estimator, eval_df)
    
    
    
    

### Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression

In [12]:
params = {
    'penalty': ['l1', 'l2'],
    'C': [1, 10, 100],
    'class_weight': ['balanced'],
}

In [30]:
lg_best_estimator, eval_df = evaluate_model(LogisticRegression, params, df_train, df_valid, df_test, eval_df)



The best score was: 0.023895253682487724




### SVC

In [31]:
from sklearn.svm import SVC

In [39]:
params = {'kernel': ['linear', 'rbf', 'poly'],
          'C': [1, 10],
          'class_weight': ['balanced'],
          'gamma': ['auto', 'scale'],
         }

In [40]:
lg_best_estimator, eval_df = evaluate_model(SVC, params, df_train, df_valid, df_test, eval_df)

The best score was: 0.032343909928352094


### Random Forest

In [41]:
from sklearn.ensemble import RandomForestClassifier

In [43]:
params = {
    'n_estimators':[10, 100, 300],    
    'class_weight': ['balanced', 'balanced_subsample']
}

In [44]:
lg_best_estimator, eval_df = evaluate_model(RandomForestClassifier, params, df_train, df_valid, df_test, eval_df)

The best score was: 0.03065752319483663


###  Gaussian Process Classifer

In [45]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF, PairwiseKernel

In [46]:
params = {
    'kernel' : [RBF(), PairwiseKernel()]
}

In [None]:
lg_best_estimator, eval_df = evaluate_model(GaussianProcessClassifier, params, df_train, df_valid, df_test, eval_df)

The best score was: 0.020757020757020753


### DL Model

In [11]:
import tensorflow as tf
from tensorflow.python.keras.layers import BatchNormalization,Conv1D,Input,Add,Dense,Flatten
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.optimizers import Adam


def f1(y_true, y_pred):
    y_true = tf.cast(y_true, "int32")
    y_pred = tf.cast(tf.round(y_pred), "int32") # implicit 0.5 threshold via tf.round
    y_correct = tf.cast(tf.equal(y_true,y_pred),"int32")
    sum_true = tf.reduce_sum(y_true, axis=1)
    sum_pred = tf.reduce_sum(y_pred, axis=1)
    sum_correct = tf.reduce_sum(y_correct, axis=1)
    precision = sum_correct / sum_pred
    recall = sum_correct / sum_true
    f_score = 2 * precision * recall / (1 * precision + recall)
    f_score = tf.where(tf.is_nan(f_score), tf.zeros_like(f_score), f_score)
    return tf.reduce_mean(f_score)

def add_RB(x):
    xout=BatchNormalization()(x)
    xout=Conv1D(filters=32,kernel_size=11,dilation_rate=1,padding='same',activation='relu')(x)
    xout=BatchNormalization()(xout)
    xout=Conv1D(filters=32,kernel_size=11,dilation_rate=1,padding='same',activation='relu')(xout)
    return xout


In [12]:
tf.reset_default_graph()
x=Input(shape=[1592,1])

x1=Conv1D(filters=32,kernel_size=1,dilation_rate=1,padding='same',activation='relu')(x)

xrb=add_RB(x1)
#xrb=add_RB(xrb)
#xrb=add_RB(xrb)
#xrb=add_RB(xrb)

x2=Conv1D(filters=32,kernel_size=1,dilation_rate=1,padding='same',activation='relu')(xrb)
x3=Conv1D(filters=32,kernel_size=1,dilation_rate=1,padding='same',activation='relu')(x1)

xout=Conv1D(filters=1,kernel_size=1,dilation_rate=1,padding='same',activation='relu')(Add()([x2,x3]))
xout=Flatten()(xout)
xout=Dense(units=1,activation='sigmoid')(xout)

model=Model(x,xout)
model.compile(optimizer=Adam(),loss='binary_crossentropy',metrics=[f1])
class_wt={0:1,1:1}

In [13]:
train_data = np.vstack(df_train['sequences'].values)[:,:,None]
test_data  = np.vstack(df_test['sequences'].values)[:,:,None]

train_val=df_train['labels'].values
train_val[train_val==-1]=0
model.fit(x=train_data,y=train_val,batch_size=64,epochs=20,class_weight=class_wt)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0xa1631fa90>

In [None]:
pred_val=(model.predict(test_data)>0.5).astype(np.int)
true_val=df_test['labels']
true_val[true_val==-1]=0

dl_mtr=utility.get_scores(true_val,pred_val)
print(dl_mtr)
dl_f1=utility.f1_score(true_val,pred_val)
print(dl_f1)
eval_df=eval_df.append({'Name':'DL_model','AUROC':dl_mtr[0],'AUPRC':dl_mtr[1],'f1_score':dl_f1},ignore_index=True)

(0.8361572201760743, 0.42637055366557663)
0.030807365439093484


In [33]:
(train_val==1).mean()

0.5

In [38]:
train_data.shape

(2942, 1592, 1)

## Evaluation

In [29]:
eval_df

Unnamed: 0,Name,AUROC,AUPRC,f1_score,F1
0,DL_model,0.829898,0.425823,,0.028694
1,DL_model,0.829898,0.425823,0.028694,
