# Human DNA

Import the libraries used throughout this notebook.

In [2]:
import numpy as np
import pandas as pd
import sklearn

In [None]:
# Load the train / validation / test splits, in that order, from CSV.
df_train, df_valid, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'exercise_data/human_dna_train_split.csv',
        'exercise_data/human_dna_validation_split.csv',
        'exercise_data/human_dna_test_split.csv',
    )
)

### Mapping DNA to a vector

We will map the DNA into a vector by mapping each character (A, T, C, G) to a one-hot vector and then concatenating all of these vectors together. As each sequence is a string of 398 characters, this gives us a final vector of length 398 × 4 = 1592.

*This operation takes some time, please be patient*

In [None]:
import utility
# Replace every raw DNA string in the 'sequences' column of each split with
# the value returned by utility.map_dna_into_vector.
for split_df in (df_train, df_valid, df_test):
    split_df['sequences'] = split_df['sequences'].map(utility.map_dna_into_vector)

### Creating a DataFrame for later Evaluation

In [None]:
# Empty results table; its columns name the metrics collected per model.
eval_df = pd.DataFrame(
    columns=["Name", "AUROC", "AUPRC", "f1_score"],
    data=[],
)

## Models

In [None]:
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.metrics import f1_score

In [None]:
def evaluate_model(model, params, train, valid, test, eval_df):
    """Tune ``model`` on the validation split and score it on the test split.

    Every combination in ``params`` is fitted on ``train`` and ranked by its
    F1 score on ``valid``. The best combination is then refitted on
    train + validation and evaluated on ``test``.

    Args:
        model: Estimator class (not an instance). It is called as
            ``model(**grid_point)`` and must provide ``fit`` and ``predict``.
        params: Parameter grid accepted by ``ParameterGrid``.
        train: DataFrame with a 'sequences' column (one vector per row) and a
            'labels' column.
        valid: DataFrame with the same two columns, used for model selection.
        test: DataFrame with the same two columns, used for the final scores.
        eval_df: Results table to extend. It is not modified in place.

    Returns:
        Tuple ``(best_estimator, eval_df)``: the refitted best model and a new
        results table with one extra row for this model.
    """
    # Stack the per-row vectors into 2-D feature matrices.
    train_data = np.vstack(train['sequences'].values)
    valid_data = np.vstack(valid['sequences'].values)
    test_data = np.vstack(test['sequences'].values)

    train_labels = train['labels'].values
    valid_labels = valid['labels'].values

    combined_data = np.vstack([train_data, valid_data])
    combined_labels = np.hstack([train_labels, valid_labels])

    # Score every grid point by its F1 on the validation split.
    grid = ParameterGrid(params)
    scores = []
    for grid_point in grid:
        candidate = model(**grid_point).fit(train_data, train_labels)
        pred_val = candidate.predict(valid_data)
        scores.append(f1_score(valid_labels, pred_val))

    best_idx = int(np.argmax(scores))
    best_score = scores[best_idx]
    best_params = grid[best_idx]
    print(f"The best score was: {best_score}")

    # Refit the winning configuration on train + validation data.
    best_estimator = model(**best_params)
    best_estimator.fit(combined_data, combined_labels)

    # Evaluate on the held-out test set.
    # NOTE(review): hard predictions from predict() are passed to
    # utility.get_scores -- confirm it does not expect probabilities.
    pred_test = best_estimator.predict(test_data)
    true_test = test['labels'].values
    auroc, auprc = utility.get_scores(true_test, pred_test)

    # DataFrame.append was removed in pandas 2.0, so build the row as its own
    # frame and concatenate. 'f1_score' holds the best validation F1.
    new_row = pd.DataFrame([{
        'Name': model.__name__,
        'AUROC': auroc,
        'AUPRC': auprc,
        'f1_score': best_score,
    }])
    eval_df = pd.concat([eval_df, new_row], ignore_index=True)
    return (best_estimator, eval_df)
    
    
    
    

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Hyper-parameter grid searched for LogisticRegression.
# 'saga' is used instead of 'sag': scikit-learn's 'sag' solver does not
# support the 'l1' penalty, so those grid points would raise at fit time.
params = {
    'penalty': ['l1', 'l2'],
    'C': [1, 10, 100],
    'class_weight': ['balanced'],
    'solver': ['saga'],
    'n_jobs': [-1],
}

In [None]:
# Tune and evaluate logistic regression; eval_df is rebound to the returned table.
lg_best_estimator, eval_df = evaluate_model(
    model=LogisticRegression,
    params=params,
    train=df_train,
    valid=df_valid,
    test=df_test,
    eval_df=eval_df,
)