### Imports

In [20]:
from packages.metagenomics import sampling2, encoding2
from Bio import SeqIO
import numpy as np
from Bio.Seq import Seq
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
from sklearn.multiclass import OneVsRestClassifier
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import shutil
import csv
import datetime
from sklearn.linear_model import LinearRegression
import pandas as pd
from sklearn.metrics import recall_score

### Ideas for model improvement
- consider setting class weights to reflect unequal distribution of data
- try one-vs-all approach
- encode sequences differently depending on whether sequence is a promoter or gene vs CpG islands, etc. Search for known motifs rather than blind k-mer groups.

# Recall score
From the paper: "Performance is measured in terms of species-level recall. We first compute the prediction recall within each species, i.e. the proportion of fragments originating from this species that are correctly classified and consider the average recall observed across species."

In [21]:
# Toy check of the metric quoted above: recall is computed per class and the
# per-class values are then averaged with equal weight for every class.
y_true = [0, 1, 2, 0, 1, 2]
y_pred = [0, 2, 1, 0, 1, 1]

# average='macro' is the unweighted mean of per-class recall (here 1.0, 0.5, 0.0).
# average='weighted' would weight each class by its support, which collapses to
# plain accuracy and is not the "average recall observed across species".
recall_score(y_true, y_pred, average='macro')

0.5

# Grid Search 1 - 2000 size dataset (4 samples)

### Run Grid Search 1 - Multiclass Random Forest Classifier

In [22]:
def append_results_to_file(filename, fields=None, rows=None):
    """
    Append CSV records to ``filename`` (the file is created if it is missing).

    Parameters
    ----------
    filename : str
        Path of the CSV file; it is opened in append mode.
    fields : sequence, optional
        Written first, as a single row (typically the header).
    rows : iterable of sequences, optional
        Written after ``fields``, one CSV row per element.

    Falsy ``fields`` / ``rows`` (None or empty) are skipped.
    """
    # newline='' hands line-ending control to the csv module; without it every
    # record is followed by a blank line on platforms that translate '\n'.
    with open(filename, 'a', newline='') as f:
        writer = csv.writer(f)

        if fields:
            writer.writerow(fields)

        if rows:
            writer.writerows(rows)

In [23]:
def run_rf_classification(X_train, X_test, y_train,y_test, max_depth, n_estimators, seed):
    """
    Train a random forest on the training split and score it on the test split.

    The returned value is whatever ``RandomForestClassifier.score`` reports for
    (X_test, y_test), i.e. the accuracy of the predictions on the test split.
    ``seed`` is forwarded as the forest's ``random_state``.
    """
    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=seed,
    )
    # fit() returns the fitted estimator, so scoring can be chained directly.
    return forest.fit(X_train, y_train).score(X_test, y_test)
    

In [24]:
def run_rf_classification_recall(X_train, X_test, y_train,y_test, max_depth, n_estimators, seed):
    """
    Train a random forest and return its species-level recall on the test split.

    Species-level recall is the recall computed separately for every class
    (species) and then averaged with equal weight per class.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``fit``.
    X_test, y_test : held-out features and labels used for scoring.
    max_depth, n_estimators : forwarded to ``RandomForestClassifier``.
    seed : forwarded as ``random_state`` of the forest.

    Returns
    -------
    float
        Macro-averaged recall of the predictions on ``X_test``.
    """
    rf = RandomForestClassifier(max_depth=max_depth, n_estimators=n_estimators, random_state=seed)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    # 'macro' gives every species the same weight. The previous 'weighted'
    # average weights each class by its support, which is arithmetically the
    # same as plain accuracy and hides poor recall on rare species.
    return recall_score(y_test, y_pred, average='macro')
    

In [25]:
def build_fragments(seq_file, taxid_file, output_dir, sample_length, coverage, seed):
    """
    Regenerate the fragment data in ``output_dir`` from scratch.

    Any existing ``output_dir`` tree is removed first; a missing directory is
    only reported, not treated as an error. All arguments are then handed
    unchanged to ``sampling2.generate_fragment_data``.
    """
    # Start from a clean slate so output from a previous run cannot linger.
    try:
        shutil.rmtree(output_dir)
    except FileNotFoundError:
        print('Existing directory was not found. Process will generate a directory.')

    # Delegate the actual sampling to the project helper.
    sampling2.generate_fragment_data(
        seq_file,
        taxid_file,
        output_dir,
        sample_length,
        coverage,
        seed,
    )

In [26]:
def encode_fragments(output_dir, pattern, k, seed):
    """
    Load fragments, encode them, and split the result into train and test sets.

    Fragments are read with ``sampling2.read_fragments(output_dir, pattern)``
    and encoded with ``encoding2.encode_fragment_dataset(fragments, k)``
    (k is presumably the k-mer size -- confirm against encoding2). Labels are
    mapped to integers with a ``LabelEncoder``. 33% of the data is held out
    for testing; ``seed`` is the ``random_state`` of the split.

    Prints the shape of the full encoded feature matrix and returns
    ``(X_train, X_test, y_train, y_test)``.
    """
    fragment_data = sampling2.read_fragments(output_dir, pattern)
    features, labels = encoding2.encode_fragment_dataset(fragment_data, k)

    # Integer-encode the class labels.
    encoded_labels = preprocessing.LabelEncoder().fit_transform(labels)

    # Hold out a third of the examples for evaluation.
    train_X, test_X, train_y, test_y = train_test_split(
        features,
        encoded_labels,
        test_size=0.33,
        random_state=seed,
    )

    print(features.shape)

    return train_X, test_X, train_y, test_y

In [27]:
def calc_number_combinations(list_sample_length,list_coverage,list_k,list_max_depth,list_n_estimators):
    """
    Return the size of the full grid: the product of the lengths of the five
    option lists. An empty list makes the result 0.
    """
    total = 1
    for options in (list_sample_length, list_coverage, list_k, list_max_depth, list_n_estimators):
        total *= len(options)
    return total

In [28]:
def grid_search_multiclass_rf(seq_file, 
                              taxid_file, 
                              output_dir, 
                              pattern, 
                              list_sample_length, 
                              list_coverage, 
                              list_k,
                              list_max_depth, 
                              list_n_estimators, 
                              seed,
                              grid_search_file,
                              fields,
                              experiment,
                              score_type='species_recall'):
    """
    Exhaustive grid search of a multiclass random forest over sampling,
    encoding and classifier hyperparameters, appending one CSV row per run.

    Parameters
    ----------
    seq_file, taxid_file : inputs forwarded to ``build_fragments``.
    output_dir : directory the fragments are (re)generated in; it is wiped by
        ``build_fragments`` for every (sample_length, coverage) pair.
    pattern : pattern forwarded to ``encode_fragments`` to locate fragments.
    list_sample_length, list_coverage, list_k, list_max_depth,
    list_n_estimators : values to try for each hyperparameter.
    seed : forwarded to fragment generation, the train/test split and the forest.
    grid_search_file : CSV file the header and the results are appended to.
    fields : header row written once at the start.
    experiment : label stored in the first column of every result row.
    score_type : label stored in the last column of every result row. It only
        annotates the output; the score is always computed with
        ``run_rf_classification_recall``. Defaults to 'species_recall' so
        callers written before this parameter existed keep working.

    Returns
    -------
    None. Results are written to ``grid_search_file`` and progress is printed
    after each value of k has been processed.
    """
    # set up grid search results file
    append_results_to_file(grid_search_file, fields)

    # calculate number of combinations
    n_combinations = calc_number_combinations(list_sample_length, list_coverage, list_k, list_max_depth, list_n_estimators)

    # An empty option list means there is nothing to train. Bail out before the
    # expensive fragment generation and before the progress computation below
    # would divide by zero.
    if n_combinations == 0:
        print('No hyperparameter combinations to evaluate.')
        return

    # process combinations
    count = 0
    for sample_length in list_sample_length:
        for coverage in list_coverage:

            # fragment combination: fragments depend only on length and coverage
            build_fragments(seq_file, taxid_file, output_dir, sample_length, coverage, seed)
            for k in list_k:

                # kmer combination: the encoding is reused for every forest setting
                X_train, X_test, y_train, y_test = encode_fragments(output_dir, pattern, k, seed)
                for max_depth in list_max_depth:
                    for n_estimators in list_n_estimators:

                        # random forest combination
                        score = run_rf_classification_recall(X_train, X_test, y_train, y_test, max_depth, n_estimators, seed)
                        count += 1

                        # output results to file (as a data row, not as a header)
                        row = [experiment, 'multiclass', 'Random Forest', X_train.shape, sample_length, coverage, k, max_depth, n_estimators, score, score_type]
                        append_results_to_file(grid_search_file, rows=[row])

                print('Percent complete: {}'.format(count / n_combinations * 100)) # display progress
    

In [29]:
def calc_hyperparameter_relationship(filename):
    """
    Fit a linear regression of ``score`` on the numeric hyperparameter columns
    of a grid-search results file and return the regression coefficients.

    This gives a rough indication of how each hyperparameter moves the score.
    The features are not standardised, so coefficient magnitudes are not
    directly comparable between hyperparameters with different scales.

    Parameters
    ----------
    filename : str
        CSV file written by the grid search; it must contain a ``score`` column.

    Returns
    -------
    numpy.ndarray
        ``LinearRegression.coef_``: one coefficient per numeric feature column,
        in the column order of the file.
    """
    # read in grid search results
    df = pd.read_csv(filename)
    y = df['score']

    # Drop the target and the bookkeeping columns. errors='ignore' keeps older
    # result files (written before the 'experiment' column existed) readable.
    features = df.drop(columns=['experiment', 'score', 'category', 'classifier'], errors='ignore')

    # Newer files also carry text columns ('training shape', 'score type')
    # that a linear model cannot consume; keep only the numeric ones.
    X = features.select_dtypes(include='number')

    lr = LinearRegression()
    lr.fit(X, y)
    return lr.coef_

### Search 1.1

In [30]:
# %%time

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# grid_search_file  = 'data/gridsearch/rf-multi.' + datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S') + '.csv'
# fields = ['category','classifier','sample_length','coverage','k','max_depth', 'n_estimators', 'score']

# # combinations to try
# list_sample_length = [100, 200, 400]
# list_coverage = [0.1, 1, 2, 10]
# list_k = [i for i in range(1,10,2)]
# list_max_depth = [i for i in range(2,20,4)]
# list_n_estimators = [i for i in range(20,200,40)]


# grid_search_multiclass_rf(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_max_depth,
#                           list_n_estimators, 
#                           seed,
#                           grid_search_file,
#                           fields )




In [31]:
# gridsearch_file = 'data/gridsearch/rf-multi.2021.03.28.18.07.15.csv'
# calc_hyperparameter_relationship(gridsearch_file)
# # array([-1.44909382e-04,  1.67645287e-02, -9.17412031e-03,  1.18482770e-03, -1.44167915e-05])

### Search 1.2

In [32]:
# %%time
# 3h 37min 3s

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# grid_search_file  = 'data/gridsearch/rf-multi.' + datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S') + '.csv'
# fields = ['category','classifier','sample_length','coverage','k','max_depth', 'n_estimators', 'score']

# # combinations to try
# list_sample_length = [200, 400]
# list_coverage = [1,10,100,200]
# list_k = [i for i in range(1,20,2)]
# list_max_depth = [i for i in range(6,20,2)]
# list_n_estimators = [i for i in range(50,501,50)]


# grid_search_multiclass_rf(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_max_depth,
#                           list_n_estimators, 
#                           seed,
#                           grid_search_file,
#                           fields )




In [33]:
# gridsearch_file = 'data/gridsearch/rf-multi.2021.03.28.18.31.24.csv'
# calc_hyperparameter_relationship(gridsearch_file)
# # array([-1.79147764e-04,  8.02651943e-04, -1.15739138e-02,  2.46285125e-03, 4.98644276e-06])

### Search 1.3

In [34]:
# %%time
## 2h

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# grid_search_file  = 'data/gridsearch/rf-multi.{}.csv'.format(datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S') )
# fields = ['experiment','category','classifier','sample_length','coverage','k','max_depth', 'n_estimators', 'score']
# experiment = '1.3'

# # combinations to try
# list_sample_length = [100, 200, 400]
# list_coverage = [0.1,1,10,100,200]
# list_k = [1,2,4,8]
# list_max_depth = [i for i in range(20,51,10)]
# list_n_estimators = [50,100,200,400,800]


# grid_search_multiclass_rf(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_max_depth,
#                           list_n_estimators, 
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment)


### Search 1.4
Stopped early after realizing I didn't need to try such small intervals of max_depth.

In [35]:
# %%time

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# grid_search_file  = 'data/gridsearch/rf-multi.{}.csv'.format(datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S') )
# fields = ['experiment','category','classifier','sample_length','coverage','k','max_depth', 'n_estimators', 'score']
# experiment = '1.4'

# # combinations to try
# list_sample_length = [100]
# list_coverage = [200]
# list_k = [1]
# list_max_depth = [50]
# list_n_estimators = [50,100,200,400,800,1600,3200]


# grid_search_multiclass_rf(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_max_depth,
#                           list_n_estimators, 
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment)


### Search 1.5

In [36]:
# %%time
## 5.5 h

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# grid_search_file  = 'data/gridsearch/rf-multi.{}.csv'.format(datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S') )
# fields = ['experiment','category','classifier','sample_length','coverage','k','max_depth', 'n_estimators', 'score']
# experiment = '1.5'

# # combinations to try
# list_sample_length = [100, 200, 400]
# list_coverage = [200]
# list_k = [1,2,4,6]
# list_max_depth = [50, 100, 200]
# list_n_estimators = [50,100,200,400,800,1600,3200]


# grid_search_multiclass_rf(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_max_depth,
#                           list_n_estimators, 
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment)


### Search 1.6

In [37]:
# %%time
# #2.25 h

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# grid_search_file  = 'data/gridsearch/rf-multi.{}.csv'.format(datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S') )
# fields = ['experiment','category','classifier','sample_length','coverage','k','max_depth', 'n_estimators', 'score']
# experiment = '1.6'

# # combinations to try
# list_sample_length = [100]
# list_coverage = [200]
# list_k = [1,2,3]
# list_max_depth = [55, 65, 75, 85, 95]
# list_n_estimators = [400,500,600,700,800]


# grid_search_multiclass_rf(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_max_depth,
#                           list_n_estimators, 
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment)


In [38]:
# gridsearch_file = 'data/gridsearch/rf-multi-all.csv'
# calc_hyperparameter_relationship(gridsearch_file)
# # array([-1.95530967e-04,  1.27461350e-03, -1.10693995e-02,  1.45827089e-03, 4.77729253e-05])
# # x1000 =
# # -0.19  sample length
# #  1.27  coverage
# # -10    k
# #  1.45  max_depth
# #  0.047 n_estimators

### Search 1.7

In [39]:
# %%time
# #12 min

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# grid_search_file  = 'data/gridsearch/rf-multi.{}.csv'.format(datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S') )
# fields = ['experiment','category','classifier','sample_length','coverage','k','max_depth', 'n_estimators', 'score']
# experiment = '1.7'

# # combinations to try
# list_sample_length = [100]
# list_coverage = [200]
# list_k = [2]
# list_max_depth = [i for i in range(1,501, 50)]
# list_n_estimators = [500]


# grid_search_multiclass_rf(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_max_depth,
#                           list_n_estimators, 
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment)


### Search 1.8

In [40]:
# %%time
# # 9 min

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# grid_search_file  = 'data/gridsearch/rf-multi.{}.csv'.format(datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S') )
# fields = ['experiment','category','classifier','sample_length','coverage','k','max_depth', 'n_estimators', 'score']
# experiment = '1.8'

# # combinations to try
# list_sample_length = [100]
# list_coverage = [200]
# list_k = [2]
# list_max_depth = [i for i in range(1,101, 10)]
# list_n_estimators = [500]


# grid_search_multiclass_rf(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_max_depth,
#                           list_n_estimators, 
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment)


### Search 1.9

In [41]:
# %%time
# # 51 min

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# grid_search_file  = 'data/gridsearch/rf-multi.{}.csv'.format(datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S') )
# fields = ['experiment','category','classifier','sample_length','coverage','k','max_depth', 'n_estimators', 'score']
# experiment = '1.9'

# # combinations to try
# list_sample_length = [100]
# list_coverage = [200]
# list_k = [2]
# list_max_depth = [i for i in range(10,101, 2)]
# list_n_estimators = [500]


# grid_search_multiclass_rf(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_max_depth,
#                           list_n_estimators, 
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment)


### Search 1.10

In [42]:
# %%time
# # 16 min

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# grid_search_file  = 'data/gridsearch/rf-multi.{}.csv'.format(datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S') )
# fields = ['experiment','category','classifier','sample_length','coverage','k','max_depth', 'n_estimators', 'score']
# experiment = '1.10'

# # combinations to try
# list_sample_length = [100]
# list_coverage = [200]
# list_k = [2]
# list_max_depth = [72]
# list_n_estimators = [i for i in range(2,103,10)] + [i for i in range(100,1001,100)]


# grid_search_multiclass_rf(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_max_depth,
#                           list_n_estimators, 
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment)


### Search 1.11

In [43]:
# %%time
# # 12 min

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# grid_search_file  = 'data/gridsearch/rf-multi.{}.csv'.format(datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S') )
# fields = ['experiment','category','classifier','sample_length','coverage','k','max_depth', 'n_estimators', 'score']
# experiment = '1.11'

# # combinations to try
# list_sample_length = [100]
# list_coverage = [200]
# list_k = [2]
# list_max_depth = [72]
# list_n_estimators = [i for i in range(200,501,25)]


# grid_search_multiclass_rf(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_max_depth,
#                           list_n_estimators, 
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment)


### Search 1.12

In [44]:
# %%time
# # 1h


# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# grid_search_file  = 'data/gridsearch/rf-multi.{}.csv'.format(datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S') )
# fields = ['experiment','category','classifier','sample_length','coverage','k','max_depth', 'n_estimators', 'score']
# experiment = '1.12'

# # combinations to try
# list_sample_length = [100,200,400]
# list_coverage = [10,20,50,100,200,400]
# list_k = [1,2,4,6,8,10,12,14,16,18,20]
# list_max_depth = [72]
# list_n_estimators = [500]


# grid_search_multiclass_rf(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_max_depth,
#                           list_n_estimators, 
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment)


### Search 1.13
Searching over smaller max_depth and n_estimator space due to limitations as we scale up.

In [45]:
# %%time
# # 22 min


# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# grid_search_file  = 'data/gridsearch-2000/rf-multi.{}.csv'.format(datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S') )
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','max_depth', 'n_estimators', 'score']
# experiment = '1.13'

# # combinations to try
# list_sample_length = [100,200,400]
# list_coverage = [10,100,200,400]
# list_k = [1,2,4]
# list_max_depth = [i for i in range(5,30,5)]
# list_n_estimators = [i for i in range(20,50,5)]


# grid_search_multiclass_rf(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_max_depth,
#                           list_n_estimators, 
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment)


### Search 1.14
Switching to species-level recall and rechecking highest values.

In [46]:
# %%time


# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# grid_search_file  = 'data/gridsearch-2000/rf-multi.{}.csv'.format(datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S') )
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','max_depth', 'n_estimators', 'score','score type']
# experiment = '1.14'
# score_type = 'species_recall'

# # combinations to try
# list_sample_length = [100]
# list_coverage = [400]
# list_k = [1]
# list_max_depth = [15]
# list_n_estimators = [45]


# grid_search_multiclass_rf(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_max_depth,
#                           list_n_estimators, 
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment,
#                           score_type)


(32564, 400)
Percent complete: 100.0
CPU times: user 23.3 s, sys: 210 ms, total: 23.6 s
Wall time: 23.7 s


# Grid Search 2 - 10000 size dataset (198 examples)

### Search 2.01
Determine if method even works with new data.
Result: takes a long time to complete even a single run with 500 estimators and 72 max depth. May need to reduce one or both - reconsider how grid search 1 performs with higher coverage but lower estimators / depth.

In [25]:
# %%time
# # 47 min


# # parameters
# seq_file = 'data/train_small-db_toy-10000.fasta'
# taxid_file = 'data/train_small-db_toy-10000.taxid'
# output_dir = 'data/sampling/sampling-toy-10000'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/gridsearch-10000/rf-multi.{}.csv'.format(date_time )
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','max_depth', 'n_estimators', 'score']
# experiment = '2.01'

# # combinations to try
# list_sample_length = [100]
# list_coverage = [10]
# list_k = [1]
# list_max_depth = [72]
# list_n_estimators = [500]


# grid_search_multiclass_rf(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_max_depth,
#                           list_n_estimators, 
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment)


### Search 2.02
10x fewer estimators. Checking to see how the run time changes.

In [26]:
# %%time
# # 5 min


# # parameters
# seq_file = 'data/train_small-db_toy-10000.fasta'
# taxid_file = 'data/train_small-db_toy-10000.taxid'
# output_dir = 'data/sampling/sampling-toy-10000'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/gridsearch-10000/rf-multi.{}.csv'.format(date_time )
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','max_depth', 'n_estimators', 'score']
# experiment = '2.02'

# # combinations to try
# list_sample_length = [100]
# list_coverage = [10]
# list_k = [1]
# list_max_depth = [72]
# list_n_estimators = [50]


# grid_search_multiclass_rf(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_max_depth,
#                           list_n_estimators, 
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment)


### Search 2.03
10x less depth. Checking to see how time changes.

In [27]:
# %%time
# # 2 min

# # parameters
# seq_file = 'data/train_small-db_toy-10000.fasta'
# taxid_file = 'data/train_small-db_toy-10000.taxid'
# output_dir = 'data/sampling/sampling-toy-10000'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/gridsearch-10000/rf-multi.{}.csv'.format(date_time )
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','max_depth', 'n_estimators', 'score']
# experiment = '2.03'

# # combinations to try
# list_sample_length = [100]
# list_coverage = [10]
# list_k = [1]
# list_max_depth = [7]
# list_n_estimators = [500]


# grid_search_multiclass_rf(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_max_depth,
#                           list_n_estimators, 
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment)


### Search 2.04
Stopped early because we don't need to try k values higher than 4 because the accuracy is very low.

In [28]:
# %%time


# # parameters
# seq_file = 'data/train_small-db_toy-10000.fasta'
# taxid_file = 'data/train_small-db_toy-10000.taxid'
# output_dir = 'data/sampling/sampling-toy-10000'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/gridsearch-10000/rf-multi.{}.csv'.format(date_time )
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','max_depth', 'n_estimators', 'score']
# experiment = '2.04'

# # combinations to try
# list_sample_length = [100,200,400]
# list_coverage = [20,50,100,200,400]
# list_k = [1,2,4,6,8,10,12]
# list_max_depth = [72]
# list_n_estimators = [50]


# grid_search_multiclass_rf(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_max_depth,
#                           list_n_estimators, 
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment)


### Search 2.05
Stopped after first run at 400x sample coverage. Took at least 10 hours for that run alone, maybe longer.

In [29]:
# %%time


# # parameters
# seq_file = 'data/train_small-db_toy-10000.fasta'
# taxid_file = 'data/train_small-db_toy-10000.taxid'
# output_dir = 'data/sampling/sampling-toy-10000'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/gridsearch-10000/rf-multi.{}.csv'.format(date_time )
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','max_depth', 'n_estimators', 'score']
# experiment = '2.05'

# # combinations to try
# list_sample_length = [200,400]
# list_coverage = [10,50,100,200,400]
# list_k = [1,2,4]
# list_max_depth = [72]
# list_n_estimators = [50]


# grid_search_multiclass_rf(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_max_depth,
#                           list_n_estimators, 
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment)


### Search 2.06
Testing with small max_depth.

In [2]:
# %%time
# # 10 h 15 min

# # parameters
# seq_file = 'data/train_small-db_toy-10000.fasta'
# taxid_file = 'data/train_small-db_toy-10000.taxid'
# output_dir = 'data/sampling/sampling-toy-10000'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/gridsearch-10000/rf-multi.{}.csv'.format(date_time )
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','max_depth', 'n_estimators', 'score']
# experiment = '2.06'

# # combinations to try
# list_sample_length = [100,200,400]
# list_coverage = [10,100,200,400]
# list_k = [1,2,4]
# list_max_depth = [15]
# list_n_estimators = [45]


# grid_search_multiclass_rf(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_max_depth,
#                           list_n_estimators, 
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment)


# Real Runs - 10000 size dataset

### Run 3.01
Testing with small max_depth and n_estimators to get initial round of data for diagrams.

In [None]:
%%time
# Run 3.01: grid over fragment length, coverage and k on the 10000 dataset,
# with a single fixed random forest configuration (max_depth=15, n_estimators=50).

# parameters
seq_file = 'data/train_small-db_toy-10000.fasta'
taxid_file = 'data/train_small-db_toy-10000.taxid'
output_dir = 'data/sampling/sampling-toy-10000'
pattern = 'fragments*.npy'
# NOTE(review): every earlier search in this notebook used seed = 42. With None,
# the sampling, the train/test split and the forest are not reproducible
# between runs -- confirm this is intended for the "real" runs.
seed = None
# Timestamp in the file name gives each run its own results file.
date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
grid_search_file  = 'data/runs-10000/rf-multi.{}.csv'.format(date_time )
# Header row; its order must match the row built in grid_search_multiclass_rf.
fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','max_depth', 'n_estimators', 'score','score type']
experiment = '3.01'
score_type = 'species_recall'

# combinations to try
# 3 sample lengths x 5 coverages x 7 k values x 1 depth x 1 forest size = 105 runs
list_sample_length = [100,200,400]
list_coverage = [0.1,1,10,100,400]
list_k = [1,2,4,6,8,10,12]
list_max_depth = [15]
list_n_estimators = [50]


grid_search_multiclass_rf(seq_file, 
                          taxid_file, 
                          output_dir, 
                          pattern, 
                          list_sample_length, 
                          list_coverage, 
                          list_k, 
                          list_max_depth,
                          list_n_estimators, 
                          seed,
                          grid_search_file,
                          fields,
                          experiment,
                          score_type)


(1339, 400)
Percent complete: 0.9523809523809524
(1339, 800)
Percent complete: 1.9047619047619049
(1339, 6248)
Percent complete: 2.857142857142857
(1339, 16953)
Percent complete: 3.8095238095238098
(1339, 15609)
Percent complete: 4.761904761904762
(1339, 13287)
Percent complete: 5.714285714285714
(1339, 10647)
Percent complete: 6.666666666666667
(12432, 400)
Percent complete: 7.6190476190476195
(12432, 800)
Percent complete: 8.571428571428571
(12432, 6400)
Percent complete: 9.523809523809524
(12432, 56642)
Percent complete: 10.476190476190476
(12432, 121486)
Percent complete: 11.428571428571429
(12432, 117394)
Percent complete: 12.380952380952381
(12432, 95302)
Percent complete: 13.333333333333334
(123485, 400)
Percent complete: 14.285714285714285
(123485, 800)
Percent complete: 15.238095238095239
(123485, 6400)
Percent complete: 16.19047619047619
(123485, 65530)
Percent complete: 17.142857142857142
(123485, 509744)
Percent complete: 18.095238095238095
(123485, 853683)
Percent complete