# C. Elegans DNA

Import libraries for later use

In [35]:
import numpy as np
import sklearn
import pandas as pd
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Read the C. elegans .csv file. We add our own headers: `labels` indicates whether there is a splice site or not, and `DNA` is a string representing the DNA sequence.

In [36]:
# The file has no header row, so the column names are supplied explicitly.
csv_path = 'exercise_data/C_elegans_acc_seq.csv'
column_names = ['labels', 'DNA']
df = pd.read_csv(csv_path, header=None, names=column_names)

### Mapping DNA to a vector

We map each DNA string to a vector by encoding each character (A, T, C, G) as a one-hot vector and then concatenating all these vectors together. As each string has 82 characters, this gives us a final vector of length 4 × 82 = 328.

In [37]:
import utility
# Replace every raw DNA string with the value returned by the utility encoder.
encoded_dna = df['DNA'].map(utility.map_dna_into_vector)
df['DNA'] = encoded_dna

### Doing the Test-Train-Split

In [38]:
from sklearn.model_selection import train_test_split
# Fix the seed so the split (and every score computed from it) is reproducible
# across kernel restarts, and stratify on the labels so that train and test
# keep the same label proportions as the full data set.
train, test = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['labels']
)
# Stack the per-row vectors into 2-D matrices (one row per sample).
train_data = np.vstack(train['DNA'].values)
test_data  = np.vstack(test['DNA'].values)

### Creating DataFrame for later Evaluation

In [39]:
# Empty results table; one row is added per evaluated model.
metric_columns = ['Name', 'AUROC', 'AUPRC', 'f1_cv']
eval_df = pd.DataFrame(data=[], columns=metric_columns)

## Models

In [40]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, auc, precision_recall_curve, f1_score

In [41]:
def get_scores(true_val, pred_val):
    """Return the areas under the ROC and precision-recall curves.

    Parameters
    ----------
    true_val : array-like
        Ground-truth labels, passed as the first argument to ``roc_curve``
        and ``precision_recall_curve``.
    pred_val : array-like
        Predicted values, passed as the second argument to both functions.

    Returns
    -------
    tuple
        ``(auroc, auprc)``, both computed with ``auc``.
    """
    # The thresholds returned by both curve functions are not needed here.
    false_pos_rate, true_pos_rate, _ = roc_curve(true_val, pred_val)
    roc_area = auc(false_pos_rate, true_pos_rate)

    prec, rec, _ = precision_recall_curve(true_val, pred_val)
    pr_area = auc(rec, prec)

    return (roc_area, pr_area)

In [42]:
def _positive_class_scores(estimator, data):
    """Return continuous scores for ranking samples, falling back to labels.

    Prefers ``decision_function``, then the second column of
    ``predict_proba``, and only uses hard ``predict`` labels when the
    estimator offers neither.
    """
    if hasattr(estimator, 'decision_function'):
        return estimator.decision_function(data)
    if hasattr(estimator, 'predict_proba'):
        # assumes a binary problem, where column 1 belongs to estimator.classes_[1]
        return estimator.predict_proba(data)[:, 1]
    return estimator.predict(data)


def evaluate_model(model, params, df, train, test):
    """Grid-search a model, score it on the test set and record the result.

    Parameters
    ----------
    model : type
        Estimator class (not an instance); it is instantiated with no
        arguments and handed to ``GridSearchCV``.
    params : dict or list of dict
        Parameter grid passed to ``GridSearchCV``.
    df : pandas.DataFrame
        Results table the new row is added to. It is not modified in place.
    train, test : pandas.DataFrame
        Frames with a ``'DNA'`` column (one vector per row) and a
        ``'labels'`` column.

    Returns
    -------
    tuple
        ``(best_estimator, df)``: the estimator refit on the whole training
        set with the best parameters, and the results table with one
        additional row (``Name``, ``AUROC``, ``AUPRC``, ``f1_cv``).
    """
    # Put the data into matrix form (one row per sample).
    train_data = np.vstack(train['DNA'].values)
    test_data = np.vstack(test['DNA'].values)
    train_labels = train['labels'].values

    # Search for the best parameters (5-fold CV, F1 score) and print the best score.
    clf = GridSearchCV(model(), params, scoring='f1', cv=5, n_jobs=-1)
    clf.fit(train_data, train_labels)
    print(f"The best score was: {clf.best_score_}")

    # GridSearchCV refits the best parameter setting on the whole training
    # set by default (refit=True), so there is no need to train it again.
    best_estimator = clf.best_estimator_

    # Evaluate on the test set. ROC / PR curves need continuous scores:
    # hard 0/1 predictions collapse each curve to a single threshold.
    pred_val = _positive_class_scores(best_estimator, test_data)
    true_val = test['labels'].values
    auroc, auprc = get_scores(true_val, pred_val)

    # Add the result row. DataFrame.append was removed in pandas 2.0, so
    # build a one-row frame and concatenate instead.
    row = pd.DataFrame([{
        'Name': model.__name__,
        'AUROC': auroc,
        'AUPRC': auprc,
        'f1_cv': clf.best_score_,
    }])
    if df.empty:
        # Keep the existing column order without concatenating an empty frame.
        df = row.reindex(columns=df.columns.union(row.columns, sort=False))
    else:
        df = pd.concat([df, row], ignore_index=True)
    return (best_estimator, df)

### Logistic Regression

In [43]:
from sklearn.linear_model import LogisticRegression

In [44]:
# Grid for LogisticRegression. The solver is pinned to 'liblinear' because it
# supports both the 'l1' and the 'l2' penalty; the default solver of current
# scikit-learn releases ('lbfgs') rejects 'l1', which would make half of the
# grid fail to fit.
params = {
    'penalty': ['l1', 'l2'],
    'C': [1, 10, 100],
    'class_weight': ['balanced'],
    'solver': ['liblinear'],
}

In [45]:
# Tune and score logistic regression; the results table gains one row.
lg_best_estimator, eval_df = evaluate_model(
    model=LogisticRegression,
    params=params,
    df=eval_df,
    train=train,
    test=test,
)

The best score was: 0.7571151846378499




### SVC

In [46]:
from sklearn.svm import SVC

In [47]:
# Grid for SVC: every combination of kernel, C and gamma is tried.
params = dict(
    kernel=['linear', 'rbf', 'poly'],
    C=[1, 10, 100],
    class_weight=['balanced'],
    gamma=['auto', 'scale'],
)

In [48]:
# Tune and score the support vector classifier; the results table gains one row.
svc_best_estimator, eval_df = evaluate_model(
    model=SVC,
    params=params,
    df=eval_df,
    train=train,
    test=test,
)

The best score was: 0.813190975221637


### Random Forest

In [49]:
from sklearn.ensemble import RandomForestClassifier

In [50]:
# Grid for RandomForestClassifier: forest size crossed with the class-weighting mode.
params = dict(
    n_estimators=[10, 100, 300],
    class_weight=['balanced', 'balanced_subsample'],
)

In [51]:
# Tune and score the random forest; the results table gains one row.
rfc_best_estimator, eval_df = evaluate_model(
    model=RandomForestClassifier,
    params=params,
    df=eval_df,
    train=train,
    test=test,
)

The best score was: 0.3406322646988788




### Gaussian Process Classifier

In [52]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF, PairwiseKernel

In [None]:
# Grid for GaussianProcessClassifier: only the kernel is varied, each with
# its default settings.
candidate_kernels = [RBF(), PairwiseKernel()]
params = dict(kernel=candidate_kernels)

In [None]:
# Tune and score the Gaussian process classifier; the results table gains one row.
gpc_best_estimator, eval_df = evaluate_model(
    model=GaussianProcessClassifier,
    params=params,
    df=eval_df,
    train=train,
    test=test,
)

## Evaluation

In [None]:
# Show the collected results: one row per model evaluated above.
eval_df