# Human DNA

Import the libraries used throughout the notebook.

In [1]:
import numpy as np
import pandas as pd
import sklearn

In [2]:
# Read the train / validation / test splits (in that order) from disk.
df_train, df_valid, df_test = map(
    pd.read_csv,
    (
        'exercise_data/human_dna_train_split.csv',
        'exercise_data/human_dna_validation_split.csv',
        'exercise_data/human_dna_test_split.csv',
    ),
)

In [3]:
# Report the (rows, columns) shape of every split, one per line.
print(
    f"Size of training_set: {df_train.shape}",
    f"Shape of test set: {df_test.shape}",
    f"Shape of validation set: {df_valid.shape}",
    sep="\n",
)

Size of training_set: (500000, 2)
Shape of test set: (33333, 2)
Shape of validation set: (33333, 2)


### UnderSampling

In [4]:
from imblearn.under_sampling import RandomUnderSampler, CondensedNearestNeighbour

In [5]:
def undersample_dataframe(df, random_state=None):
    """Return a randomly undersampled copy of ``df``.

    The rows are resampled with ``RandomUnderSampler`` (default sampling
    strategy), using ``df['labels']`` as the class labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'sequences'`` column and a ``'labels'`` column.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``RandomUnderSampler``. Pass an integer to make the
        selected subset reproducible across runs; the default ``None`` keeps
        the previous, unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns ``'labels'`` and ``'sequences'``; the
        original index is not preserved.
    """
    # The sampler works on a 2-D feature matrix, so every sequence becomes
    # its own single-column row.
    X = np.vstack(df['sequences'].values)
    y = df['labels'].values

    undersampler = RandomUnderSampler(random_state=random_state)
    X_undersampled, y_undersampled = undersampler.fit_resample(X, y)

    # Take the single feature column back out so 'sequences' is flat again.
    undersampled_df = pd.DataFrame({'labels': y_undersampled, 'sequences': X_undersampled[:, 0]})
    return undersampled_df

In [6]:
# Report the number of training rows before and after undersampling;
# the undersampled frame is what the models below are trained on.
print(f"Size before undersampling {df_train.shape[0]}")
df_train_undersampled = undersample_dataframe(df_train)
print(f"Size after undersampling {df_train_undersampled.shape[0]}")

Size before undersampling 500000
Size after undersampling 2942


### Mapping DNA to a vector

We map each DNA sequence to a vector by encoding every character (A, T, C, G) as a one-hot vector and then concatenating all of these vectors. As each sequence is a string of 398 characters, this gives a final vector of length 1592 (398 × 4).

*This operation takes some time, please be patient*

In [7]:
import utility
# Replace the raw DNA strings with their vector encoding in every split.
# NOTE: this overwrites the 'sequences' column, so the cell must only run once per kernel.
for frame in (df_train_undersampled, df_valid, df_test):
    frame['sequences'] = frame['sequences'].map(utility.map_dna_into_vector)

### Creating a DataFrame for later Evaluation

In [8]:
# Empty results table; evaluate_model adds one row per evaluated model.
eval_df = pd.DataFrame(
    data=[],
    columns=["Name", "AUROC", "AUPRC", "f1_cv", "f1_test"],
)

## Models

In [9]:
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.metrics import f1_score
from multiprocessing import Pool, cpu_count
import os

In [10]:
def evaluator(args):
    """Fit one parameter combination and return its F1 score on the validation data.

    ``args`` is a single ``(model, params, train, valid)`` tuple so the
    function can be handed directly to ``Pool.map``-style APIs:

    * ``model``  - estimator class, instantiated as ``model(**params)``
    * ``params`` - keyword arguments for that class
    * ``train`` / ``valid`` - frames with a ``'sequences'`` and a ``'labels'`` column
    """
    estimator_cls, kwargs, train_df, valid_df = args

    # Stack the per-row vectors into 2-D feature matrices
    features_train = np.vstack(train_df['sequences'].values)
    features_valid = np.vstack(valid_df['sequences'].values)

    # Train on the training frame only ...
    fitted = estimator_cls(**kwargs).fit(features_train, train_df['labels'].values)

    # ... and score the predictions for the validation frame
    predictions = fitted.predict(features_valid)
    return f1_score(valid_df['labels'].values, predictions)

In [18]:
def evaluate_model(model, params, train, valid, test, eval_df):
    """Grid-search ``model``, refit the best setting and score it on the test split.

    Every combination in ``params`` is fitted on ``train`` and scored with F1
    on ``valid`` (in parallel, via ``evaluator``). The best combination is
    then refitted on ``train`` + ``valid`` and evaluated on ``test``.

    Parameters
    ----------
    model : class
        Estimator class (not an instance). It is called as ``model(**kwargs)``
        and the result must provide ``fit`` and ``predict``.
    params : dict
        Parameter grid, passed to ``ParameterGrid``.
    train, valid, test : pandas.DataFrame
        Frames with a ``'sequences'`` column (one vector per row) and a
        ``'labels'`` column.
    eval_df : pandas.DataFrame
        Results table. It is not modified in place; an extended copy is returned.

    Returns
    -------
    tuple
        ``(best_estimator, eval_df)`` - the refitted estimator and the results
        table with one extra row (``Name``, ``AUROC``, ``AUPRC``, ``f1_cv``,
        ``f1_test``).
    """
    # Put Data into a usable Matrix Format
    train_data = np.vstack(train['sequences'].values)
    valid_data = np.vstack(valid['sequences'].values)
    # Must come from `test`: using `valid` here would score validation
    # predictions against the test labels.
    test_data = np.vstack(test['sequences'].values)

    combined_data = np.vstack([train_data, valid_data])
    combined_labels = np.hstack([train['labels'].values, valid['labels'].values])

    # Score every parameter combination on the validation split in parallel.
    # The context manager releases the worker processes even if a job raises.
    grid = ParameterGrid(params)
    jobs = [(model, candidate, train, valid) for candidate in grid]
    with Pool(cpu_count()) as pool:
        scores = pool.map(evaluator, jobs)

    # Get best parameters and scores
    best_index = int(np.argmax(scores))
    best_score = scores[best_index]
    best_params = grid[best_index]
    print(f"The best score was {best_score}")

    # Refit the best configuration on train + validation data
    best_estimator = model(**best_params)
    best_estimator.fit(combined_data, combined_labels)

    # Evaluate on the Test set
    pred_val = best_estimator.predict(test_data)
    true_val = test['labels'].values
    auroc, auprc, f1 = utility.get_scores(true_val, pred_val)

    # Append to our Dataframe (DataFrame.append was removed in pandas 2.0)
    row = {'Name': model.__name__, 'AUROC': auroc, 'AUPRC': auprc, 'f1_cv': best_score, "f1_test": f1}
    eval_df = pd.concat([eval_df, pd.DataFrame([row])], ignore_index=True)
    return (best_estimator, eval_df)

### Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression

In [22]:
# Search space for LogisticRegression: both penalties, three regularisation strengths
params = dict(
    penalty=['l1', 'l2'],
    C=[1, 10, 100],
    class_weight=['balanced'],
    solver=['liblinear'],
)

In [23]:
# Tune, refit and score LogisticRegression; the result row is added to eval_df
lg_best_estimator, eval_df = evaluate_model(
    model=LogisticRegression,
    params=params,
    train=df_train_undersampled,
    valid=df_valid,
    test=df_test,
    eval_df=eval_df,
)

The best score was 0.022753889911326753


### SVC

In [24]:
from sklearn.svm import SVC

In [25]:
# Search space for SVC: three kernels, two C values, both gamma heuristics
params = dict(
    kernel=['linear', 'rbf', 'poly'],
    C=[1, 10],
    class_weight=['balanced'],
    gamma=['auto', 'scale'],
)

In [26]:
# Tune, refit and score SVC; the result row is added to eval_df
svc_best_estimator, eval_df = evaluate_model(
    model=SVC,
    params=params,
    train=df_train_undersampled,
    valid=df_valid,
    test=df_test,
    eval_df=eval_df,
)

The best score was 0.035142348754448396


### Random Forest

In [30]:
from sklearn.ensemble import RandomForestClassifier

In [31]:
# Search space for RandomForestClassifier: forest size and class-weighting mode
params = dict(
    n_estimators=[10, 100, 300],
    class_weight=['balanced', 'balanced_subsample'],
)

In [32]:
# Tune, refit and score RandomForestClassifier; the result row is added to eval_df
rfc_best_estimator, eval_df = evaluate_model(
    model=RandomForestClassifier,
    params=params,
    train=df_train_undersampled,
    valid=df_valid,
    test=df_test,
    eval_df=eval_df,
)

The best score was 0.03183791606367583


### Gaussian Process Classifier

We skip this classifier, as its memory requirements crash the kernel even with ~64 GB of RAM.

## Evaluation

In [None]:
# Display the collected metrics: one row per model evaluated above
eval_df