# Human DNA

Import the libraries used throughout the notebook.

In [None]:
import numpy as np
import pandas as pd
import sklearn

In [None]:
# Read the pre-split train / validation / test partitions from CSV.
df_train, df_valid, df_test = map(
    pd.read_csv,
    (
        'exercise_data/human_dna_train_split.csv',
        'exercise_data/human_dna_validation_split.csv',
        'exercise_data/human_dna_test_split.csv',
    ),
)

In [None]:
# Report the (rows, columns) shape of every split.
print("Size of training_set:", df_train.shape)
print("Shape of test set:", df_test.shape)
print("Shape of validation set:", df_valid.shape)

### UnderSampling

In [None]:
from imblearn.under_sampling import RandomUnderSampler, CondensedNearestNeighbour

In [None]:
def undersample_dataframe(df, random_state=None):
    """Return a copy of ``df`` whose rows were randomly undersampled.

    The ``'sequences'`` and ``'labels'`` columns are passed through
    ``RandomUnderSampler`` and the surviving rows are returned as a new
    DataFrame.

    Args:
        df: DataFrame with a ``'sequences'`` column and a ``'labels'`` column.
        random_state: Seed forwarded to ``RandomUnderSampler`` so the drawn
            subset is reproducible. ``None`` (the default) keeps the previous
            behaviour of drawing a different subset on every call.

    Returns:
        A new DataFrame with the columns ``'labels'`` and ``'sequences'``
        holding only the rows kept by the sampler.
    """
    # The sampler works on a 2-D feature matrix, so the per-row values are
    # stacked into one. NOTE(review): assumes every 'sequences' value is a
    # scalar (e.g. the raw DNA string), giving an (n, 1) matrix - the
    # ``[:, 0]`` below relies on that.
    X = np.vstack(df['sequences'].values)
    y = df['labels'].values
    undersampler = RandomUnderSampler(random_state=random_state)
    X_undersampled, y_undersampled = undersampler.fit_resample(X, y)
    # Column 0 turns the single-column matrix back into a flat column.
    return pd.DataFrame({'labels': y_undersampled, 'sequences': X_undersampled[:, 0]})

In [None]:
# Compare the number of training rows before and after undersampling.
print("Size before undersampling", len(df_train))
df_train_undersampled = undersample_dataframe(df_train)
print("Size after undersampling", len(df_train_undersampled))

### Mapping DNA to a vector

We map each DNA sequence to a vector by mapping each character (A, T, C, G) to a one-hot vector and then concatenating all of these vectors. As each sequence is a string of 398 characters, this gives us a final vector of length 1592.

*This operation takes some time, please be patient*

In [None]:
import utility
# Replace the 'sequences' column of every split with its vector encoding.
for frame in (df_train_undersampled, df_valid, df_test):
    frame['sequences'] = frame['sequences'].map(utility.map_dna_into_vector)

## PCA

At first it seemed like a good idea to compress the vector of length 1592 down to length 200, which looked reasonable given the plot PCA produces. It did help somewhat with the training time, but the f1-score got much worse (<0.08), so we dropped the idea of using PCA to compress the training data.

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Fit a PCA with default settings on the encoded training vectors and plot
# the explained-variance ratio of each principal component.
pca = PCA().fit(np.vstack(df_train_undersampled['sequences'].values))

plt.plot(range(len(pca.explained_variance_)), pca.explained_variance_ratio_)
plt.xlabel('Index of Principal Component')
plt.ylabel('Explained variance ratio')
plt.show()

### Creating a DataFrame for later Evaluation

We create the dataframe `eval_df` for the final evaluation.

In [None]:
# Empty results table; every evaluated model later contributes one row.
eval_df = pd.DataFrame(
    columns=["Name", "AUROC", "AUPRC", "f1_cv", "f1_test"],
    data=[],
)

## Models

In [None]:
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.metrics import f1_score
from multiprocessing import Pool, cpu_count

Evaluating the models was rather slow, so we run the evaluation as parallel jobs: each worker process evaluates a model with one parameter combination using the `evaluator` function.

In [None]:
def evaluator(args):
    """Fit one parameter combination and return its F1 score on the validation data.

    Takes a single tuple so it can be used directly with ``Pool.map``.

    Args:
        args: ``(model, params, train, valid)`` where ``model`` is an
            estimator class, ``params`` its keyword arguments, and ``train``
            / ``valid`` are DataFrames with ``'sequences'`` and ``'labels'``
            columns.

    Returns:
        The ``f1_score`` of the fitted model's predictions on ``valid``.
    """
    model, params, train, valid = args

    # Stack the per-row vectors into feature matrices.
    X_train = np.vstack(train['sequences'].values)
    X_valid = np.vstack(valid['sequences'].values)

    # Train on the training split only.
    fitted = model(**params).fit(X_train, train['labels'].values)

    # Score the predictions against the validation labels.
    return f1_score(valid['labels'].values, fitted.predict(X_valid))

Here we evaluate each model with all the different parameters we specified. First we fit the model on the training data and score it on the validation data. Then we train the best of those models on the combined training and validation data to get an estimate of the f1-score on the test data.

*It's important to note that we evaluate all the models based on their performance on the validation-set.*

In [None]:
def evaluate_model(model, params, train, valid, test, eval_df):
    """Grid-search ``model`` on the validation split, then score it on the test split.

    Every parameter combination in ``params`` is fitted on ``train`` and
    scored with F1 on ``valid``, in parallel with one ``evaluator`` call per
    combination. The best combination is then refitted on the combined
    train + validation data and evaluated once on ``test``.

    Args:
        model: Estimator class (not an instance); built as ``model(**p)``.
        params: Parameter grid accepted by ``ParameterGrid``.
        train: DataFrame with ``'sequences'`` and ``'labels'`` columns used
            for fitting during the search.
        valid: DataFrame with the same columns used to rank the candidates.
        test: DataFrame with the same columns used for the final metrics.
        eval_df: Results table to extend; it is not modified in place.

    Returns:
        ``(best_estimator, eval_df)``: the refitted estimator and a new
        results table with one additional row for this model.
    """
    # Put the data into a usable matrix format.
    train_data = np.vstack(train['sequences'].values)
    valid_data = np.vstack(valid['sequences'].values)
    # Built from ``test`` (it was previously built from ``valid``, which
    # scored validation predictions against the test labels).
    test_data = np.vstack(test['sequences'].values)

    combined_data = np.vstack([train_data, valid_data])
    combined_labels = np.hstack([train['labels'].values, valid['labels'].values])

    # Materialise the grid so that scores[i] belongs to candidates[i].
    candidates = list(ParameterGrid(params))

    # Score all candidates in parallel. The context manager shuts the worker
    # processes down even when one of the candidates raises.
    with Pool(cpu_count()) as pool:
        scores = pool.map(
            evaluator,
            [(model, candidate, train, valid) for candidate in candidates],
        )

    # Get the best parameters and their validation score.
    best_score = np.max(scores)
    best_params = candidates[int(np.argmax(scores))]
    print(f"The best score was {best_score}")

    # Refit the best configuration on train + validation data.
    best_estimator = model(**best_params)
    best_estimator.fit(combined_data, combined_labels)

    # Evaluate on the test set.
    pred_val = best_estimator.predict(test_data)
    true_val = test['labels'].values
    auroc, auprc, f1 = utility.get_scores(true_val, pred_val)

    # Add the result as a new row. ``DataFrame.append`` no longer exists in
    # pandas 2.x, so the row is concatenated instead.
    row = {
        'Name': model.__name__,
        'AUROC': auroc,
        'AUPRC': auprc,
        'f1_cv': best_score,
        "f1_test": f1,
    }
    eval_df = pd.concat([eval_df, pd.DataFrame([row])], ignore_index=True)
    return (best_estimator, eval_df)

### Logistic Regression

We test logistic Regression with a few different parameters

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Parameter grid searched for LogisticRegression.
params = dict(
    penalty=['l1', 'l2'],
    C=[1, 10, 100],
    class_weight=['balanced'],
    solver=['liblinear'],
)

In [None]:
# Grid-search logistic regression and record its scores in eval_df.
lg_best_estimator, eval_df = evaluate_model(
    model=LogisticRegression,
    params=params,
    train=df_train_undersampled,
    valid=df_valid,
    test=df_test,
    eval_df=eval_df,
)

### SVC

We test the Support Vector Classifier with different parameters. Be careful, this task takes a long time and uses a lot of memory.

In [None]:
from sklearn.svm import SVC

In [None]:
# Parameter grid searched for SVC, given as two sub-grids. The linear kernel
# does not use `gamma`, so `gamma` is only varied for the kernels that do;
# crossing it with 'linear' would fit every linear model twice with
# identical results.
params = [
    {
        'kernel': ['linear'],
        'C': [1, 10],
        'class_weight': ['balanced'],
    },
    {
        'kernel': ['rbf', 'poly'],
        'C': [1, 10],
        'class_weight': ['balanced'],
        'gamma': ['auto', 'scale'],
    },
]

In [None]:
# Grid-search the support vector classifier and record its scores in eval_df.
svc_best_estimator, eval_df = evaluate_model(
    model=SVC,
    params=params,
    train=df_train_undersampled,
    valid=df_valid,
    test=df_test,
    eval_df=eval_df,
)

### Random Forest

We test the Random Forest classifier with a few different parameters.

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Parameter grid searched for RandomForestClassifier.
params = dict(
    n_estimators=[10, 100, 300],
    class_weight=['balanced', 'balanced_subsample'],
)

In [None]:
# Grid-search the random forest and record its scores in eval_df.
rfc_best_estimator, eval_df = evaluate_model(
    model=RandomForestClassifier,
    params=params,
    train=df_train_undersampled,
    valid=df_valid,
    test=df_test,
    eval_df=eval_df,
)

### Gaussian Process Classifier

We skip this classifier, as its memory requirements crash the kernel even with ~64 GB of RAM.

## Evaluation

In [None]:
# Show the results table holding the scores recorded for each evaluated model.
eval_df