### Imports

In [1]:
import shutil
import csv
import datetime
import numpy as np
import pandas as pd
from sklearn.metrics import recall_score
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from packages.metagenomics import sampling2, encoding2

from packages.LogisticRegression.MulticlassLogisticRegression import MulticlassLogisticRegression,MulticlassLogisticRegression2


In [2]:
def append_results_to_file(filename, fields=None, rows=None):
    """
    Append CSV records to ``filename``, creating the file if it does not exist.

    Parameters
    ----------
    filename : str or path-like
        Destination CSV file; it is opened in append mode.
    fields : sequence, optional
        A single record (typically the header row) written first.
        Skipped when falsy (None or empty).
    rows : iterable of sequences, optional
        Records written after ``fields``. Skipped when falsy.
    """
    # newline='' is what the csv module asks for: the writer emits its own
    # '\r\n' terminators, and without this flag platforms that translate '\n'
    # on write produce '\r\r\n', which reads back as blank rows.
    with open(filename, 'a', newline='') as f:
        writer = csv.writer(f)

        if fields:
            writer.writerow(fields)

        if rows:
            writer.writerows(rows)

In [3]:
def run_mlr_classification_recall(X_train, X_test, y_train, y_test, eta, epsilon):
    """
    Fit a MulticlassLogisticRegression2 model and score it on the test split.

    The model is constructed with the given ``eta`` and ``epsilon``, fitted on
    (X_train, y_train) and used to predict labels for X_test.

    Returns the weighted-average recall of those predictions against y_test
    (species level recall).
    """
    classifier = MulticlassLogisticRegression2(eta=eta, epsilon=epsilon)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return recall_score(y_test, predictions, average='weighted')
    

In [4]:
def build_fragments(seq_file, taxid_file, output_dir, sample_length, coverage, seed):
    """
    Regenerate the fragment data under ``output_dir`` from scratch.

    Any existing ``output_dir`` tree is removed first so that output from a
    previous run cannot linger; a missing directory is reported and otherwise
    ignored. All arguments are then forwarded, in order, to
    ``sampling2.generate_fragment_data``.
    """
    # Start from a clean slate: drop whatever a previous run left behind.
    try:
        shutil.rmtree(output_dir)
    except FileNotFoundError:
        print('Existing directory was not found. Process will generate a directory.')

    # Sample the fragments.
    sampling2.generate_fragment_data(
        seq_file,
        taxid_file,
        output_dir,
        sample_length,
        coverage,
        seed,
    )

In [39]:
def encode_fragments(output_dir, pattern, k, seed):
    """
    Load fragments, encode them, and split them into train and test sets.

    Fragments matching ``pattern`` are read from ``output_dir`` with
    ``sampling2.read_fragments`` and encoded with
    ``encoding2.encode_fragment_dataset(fragments, k)``. The labels are mapped
    to integers with a ``LabelEncoder``. The data is then split 67/33, and the
    split is redrawn until both halves contain every class.

    Parameters
    ----------
    output_dir : str
        Directory holding the fragment files.
    pattern : str
        File pattern forwarded to ``sampling2.read_fragments``.
    k : int
        Forwarded to the encoder (the k-mer setting of the grid search).
    seed : int or None
        Seeds the random generator that drives the splits, so a given seed
        always yields the same train/test partition. None gives an
        unseeded (non-reproducible) split.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Dense feature arrays and integer-encoded label arrays.

    Raises
    ------
    ValueError
        If there are no fragments, or if any class has fewer than two
        fragments (such a class can never appear in both halves).
    """
    # encode data
    fragments = sampling2.read_fragments(output_dir, pattern)
    X_enc, y = encoding2.encode_fragment_dataset(fragments, k)
    y_enc = preprocessing.LabelEncoder().fit_transform(y)

    # A class with a single fragment lands in exactly one half of any split,
    # so the retry loop below would never terminate. Fail fast instead.
    _, class_counts = np.unique(y_enc, return_counts=True)
    n_classes = len(class_counts)
    if n_classes == 0 or class_counts.min() < 2:
        raise ValueError(
            'Every class needs at least 2 fragments to appear in both the '
            'train and test splits; got class counts {}'.format(class_counts.tolist())
        )

    # Densify once: the matrix does not change between retries.
    X_dense = X_enc.toarray()

    # A single RandomState seeded from `seed` makes the run reproducible while
    # still drawing a different shuffle on every retry. Passing the integer
    # seed directly would replay the identical split on each iteration.
    rng = np.random.RandomState(seed)

    while True:
        X_train, X_test, y_train, y_test = train_test_split(
            X_dense, y_enc, test_size=0.33, random_state=rng
        )
        # accept the split only when both halves cover every class
        if len(np.unique(y_train)) == n_classes and len(np.unique(y_test)) == n_classes:
            break

    print(X_enc.shape)

    return X_train, X_test, y_train, y_test

In [6]:
def calc_number_combinations(list_sample_length,list_coverage,list_k,list_eta,list_epsilon):
    """
    Return the size of the parameter grid.

    The result is the product of the lengths of the five option lists, i.e.
    the number of (sample_length, coverage, k, eta, epsilon) combinations.
    It is 0 when any list is empty.
    """
    total = 1
    for options in (list_sample_length, list_coverage, list_k, list_eta, list_epsilon):
        total *= len(options)
    return total

In [7]:
def grid_search_multiclass_mlr(seq_file, 
                              taxid_file, 
                              output_dir, 
                              pattern, 
                              list_sample_length, 
                              list_coverage, 
                              list_k,
                              list_eta,
                              list_epsilon, 
                              seed,
                              grid_search_file,
                              fields,
                              experiment,
                              score_type):
    """
    Grid-search multiclass logistic regression and log every result to CSV.

    The search is staged so that expensive work is shared:
    fragments are rebuilt once per (sample_length, coverage) pair, encoded
    once per k, and a model is trained and scored once per (eta, epsilon).
    One CSV row is appended to ``grid_search_file`` per trained model, and a
    progress percentage is printed after each k is finished.

    Parameters
    ----------
    seq_file, taxid_file : str
        Input files forwarded to ``build_fragments``.
    output_dir : str
        Fragment directory; ``build_fragments`` deletes and regenerates it.
    pattern : str
        Fragment file pattern forwarded to ``encode_fragments``.
    list_sample_length, list_coverage, list_k, list_eta, list_epsilon : sequence
        Values to try for each parameter.
    seed : int
        Forwarded to ``build_fragments`` and ``encode_fragments``.
    grid_search_file : str
        CSV file that receives the header and the result rows (appended).
    fields : sequence of str
        Header row written once before the search starts.
    experiment : str
        Experiment identifier stored in every row.
    score_type : str
        Label describing the score, stored in every row.
    """
    # set up grid search results file
    append_results_to_file(grid_search_file, fields)

    # calculate number of combinations
    n_combinations = calc_number_combinations(list_sample_length,list_coverage,list_k,list_eta,list_epsilon)

    # An empty option list means there is nothing to train. Stop here rather
    # than rebuilding fragments for no result and then dividing by zero in
    # the progress report.
    if n_combinations == 0:
        return

    # process combinations
    count = 0
    for sample_length in list_sample_length:
        for coverage in list_coverage:

            # fragment combination
            build_fragments(seq_file, taxid_file, output_dir, sample_length, coverage, seed)
            for k in list_k:

                # kmer combination
                X_train, X_test, y_train, y_test = encode_fragments(output_dir, pattern, k, seed)
                for eta in list_eta:
                    for epsilon in list_epsilon:

                        # logistic regression combination
                        score = run_mlr_classification_recall(X_train, X_test, y_train, y_test, eta, epsilon)
                        count += 1

                        # output results to file (as a data row, not as a header)
                        row = [experiment, 'multiclass', 'Logistic Regression', X_train.shape, sample_length, coverage, k, eta, epsilon, score, score_type]
                        append_results_to_file(grid_search_file, rows=[row])

                print('Percent complete: {}'.format(count / n_combinations * 100)) # display progress
    

# Run Set 1 - MLR Toy
2000-length dataset

### Run 4.01

In [41]:
%%time

# parameters
seq_file = 'data/train_small-db_toy-2000.fasta'
taxid_file = 'data/train_small-db_toy-2000.taxid'
output_dir = 'data/sampling/sampling-toy-2000-mlr'
pattern = 'fragments*.npy'
seed = 42
date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
grid_search_file  = 'data/runs-2000/mlr-multi.{}.csv'.format(date_time)
fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','eta', 'epsilon', 'score','score type']
experiment = '4.01'
score_type = 'species_recall'

# combinations to try
list_sample_length = [100,200,400]
list_coverage = [0.2,1,10,100,200,400]
list_k = [1,2,4,6,8,10,12]
list_eta = [0.01]
list_epsilon = [0.01]


grid_search_multiclass_mlr(seq_file, 
                          taxid_file, 
                          output_dir, 
                          pattern, 
                          list_sample_length, 
                          list_coverage, 
                          list_k, 
                          list_eta,
                          list_epsilon,
                          seed,
                          grid_search_file,
                          fields,
                          experiment,
                          score_type)


(18, 398)
Percent complete: 0.7936507936507936
(18, 525)
Percent complete: 1.5873015873015872
(18, 432)
Percent complete: 2.380952380952381
(18, 287)
Percent complete: 3.1746031746031744
(18, 216)
Percent complete: 3.968253968253968
(18, 180)
Percent complete: 4.761904761904762
(18, 144)
Percent complete: 5.555555555555555
(85, 400)
Percent complete: 6.349206349206349
(85, 793)
Percent complete: 7.142857142857142
(85, 1722)
Percent complete: 7.936507936507936
(85, 1338)
Percent complete: 8.73015873015873
(85, 1018)
Percent complete: 9.523809523809524
(85, 850)
Percent complete: 10.317460317460316
(85, 680)
Percent complete: 11.11111111111111
(816, 400)
Percent complete: 11.904761904761903
(816, 800)
Percent complete: 12.698412698412698
(816, 5689)
Percent complete: 13.492063492063492
(816, 10714)
Percent complete: 14.285714285714285
(816, 9328)
Percent complete: 15.079365079365079
(816, 7885)
Percent complete: 15.873015873015872
(816, 6317)
Percent complete: 16.666666666666664
(8141, 4

KeyboardInterrupt: 