# C. Elegans DNA

Import librarires for later use

In [457]:
import numpy as np
import sklearn
import pandas as pd

Read the C.Elegens .csv file. We add our own headers - labels stands for whether there is a splice site or not and the DNA is a string repressinting the DNA

In [458]:
df = pd.read_csv('project1_data/exercise_data/C_elegans_acc_seq.csv', header=None, names=['labels', 'DNA'])

### Mapping DNA to a vector

We will map the DNA into a vector, by mapping each Character (A,T,C,G) into a one-hot vector and then concatonating all these vectors together. As we have a string of 82 Characters this gives us a final vector of length 248

In [459]:
mapping = {'A': [1, 0, 0, 0], 
           'T': [0, 1, 0, 0],
           'C': [0, 0, 1, 0],
           'G': [0, 0, 0, 1]
          }

In [460]:
def map_dna_into_vector(string):
    vector = []
    for c in string:
        vector.append(mapping[c])
    vector = np.hstack(vector)
    return vector

In [461]:
df['DNA'] = df['DNA'].map(map_dna_into_vector)

### Doing the Test-Train-Split

In [462]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.2)
train_data = np.vstack(train['DNA'].values)
test_data  = np.vstack(test['DNA'].values)

### Creating DataFrame for later Evaluation

In [463]:
eval_df = pd.DataFrame(data=[], columns=['Name', 'AUROC', 'AUPRC', 'f1_cv'])

## Models

In [464]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, auc, precision_recall_curve, f1_score

In [465]:
def get_scores(true_val, pred_val):    
    fpr, tpr, thresholds_roc = roc_curve(true_val, pred_val)
    auroc = auc(fpr, tpr)
    
    precision, recall, thresholds_prc = precision_recall_curve(true_val, pred_val)
    auprc = auc(recall, precision)
    return (auroc, auprc)

In [466]:
def evaluate_model(model, params, df, train, test):
    # Put Data into a readable Matrix format
    train_data = np.vstack(train['DNA'].values)
    test_data  = np.vstack(test['DNA'].values)
    
    # Create Instance of our Model
    m = model()
    
    # Search for the best params in our model and print the best score
    clf = GridSearchCV(m, params, scoring='f1', cv=5, n_jobs=-1)
    clf.fit(train_data, train['labels'].values)
    print(f"The best score was: {clf.best_score_}")
    
    # Train our best model on the whole train-dataset
    best_estimator = model(**clf.best_params_)
    best_estimotor = best_estimator.fit(train_data, train['labels'].values)
    
    # Evaluate on the Test set
    pred_val = best_estimator.predict(test_data)
    true_val = test['labels'].values
    auroc, auprc = get_scores(true_val, pred_val)
    
    # Append to our Dataframe
    df = df.append({'Name': model.__name__, 'AUROC':auroc , 'AUPRC': auprc, 'f1_cv':clf.best_score_}, ignore_index=True)
    return (best_estimator, df)

### Logistic Regression

In [467]:
from sklearn.linear_model import LogisticRegression

In [468]:
params = {
    'penalty': ['l1', 'l2'],
    'C': [1, 10, 100],
    'class_weight': ['balanced']
}

In [469]:
lg_best_estimator, eval_df = evaluate_model(LogisticRegression, params, eval_df, train, test)

The best score was: 0.7588536443620176




### SVC

In [470]:
from sklearn.svm import SVC

In [471]:
params = {'kernel': ['linear', 'rbf', 'poly'],
          'C': [1, 10, 100],
          'class_weight': ['balanced'],
          'gamma': ['auto', 'scale']
         }

In [472]:
svc_best_estimator, eval_df = evaluate_model(SVC, params, eval_df, train, test)

The best score was: 0.8155899824016746


### Random Forest

In [473]:
from sklearn.ensemble import RandomForestClassifier

In [474]:
params = {
    'n_estimators':[10, 100, 300],    
    'class_weight': ['balanced', 'balanced_subsample']
}

In [475]:
rfc_best_estimator, eval_df = evaluate_model(RandomForestClassifier, params, eval_df, train, test)

The best score was: 0.3543837067311752


###  Gaussian Process Classifer

In [476]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF, PairwiseKernel

In [477]:
params = {
    'kernel' : [RBF(), PairwiseKernel()]
}

In [478]:
gpc_best_estimator, eval_df = evaluate_model(GaussianProcessClassifier, params, eval_df, train, test)

The best score was: 0.7111978043637333


## EValuation

In [479]:
eval_df

Unnamed: 0,Name,AUROC,AUPRC,f1_cv
0,LogisticRegression,0.896986,0.768583,0.758854
1,SVC,0.933787,0.862389,0.81559
2,RandomForestClassifier,0.557592,0.492914,0.354384
3,GaussianProcessClassifier,0.814909,0.716476,0.711198
