### Imports

In [38]:
from packages.metagenomics import sampling2, encoding2
from Bio import SeqIO
import numpy as np
from Bio.Seq import Seq
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
from sklearn.multiclass import OneVsRestClassifier
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import shutil

### Load and Encode Toy Dataset

In [9]:
# Notebook configuration: input files, fragment output location and sampling settings.
seq_file = 'data/train_small-db_toy-2000.fasta'
taxid_file = 'data/train_small-db_toy-2000.taxid'
output_dir = 'data/sampling/sampling-toy-2000'
pattern = 'fragments*.npy'
sample_length = 200
coverage = 10
seed = 42

# Build and load the toy fragments so that `fragments_toy` exists on a fresh kernel.
# This is the same generate -> read sequence that run_rf_classification performs;
# the output directory is cleared first, as that function also does before sampling.
try:
    shutil.rmtree(output_dir)
except FileNotFoundError:
    pass  # first run: nothing to clear
sampling2.generate_fragment_data(seq_file, taxid_file, output_dir, sample_length, coverage, seed)
fragments_toy = sampling2.read_fragments(output_dir, pattern)

print(fragments_toy.shape)

(409, 201)


In [39]:
def run_rf_classification(seq_file, taxid_file, output_dir, sample_length, coverage, seed, pattern, k, max_depth, n_estimators):
    """Sample fragments, encode them, train a random forest and return its test accuracy.

    Args:
        seq_file: path forwarded to ``sampling2.generate_fragment_data``.
        taxid_file: path forwarded to ``sampling2.generate_fragment_data``.
        output_dir: directory the fragments are written to and read back from.
            Any existing directory at this path is deleted first.
        sample_length: forwarded to ``sampling2.generate_fragment_data``.
        coverage: forwarded to ``sampling2.generate_fragment_data``.
        seed: used for fragment generation, the train/test split and the forest.
        pattern: file pattern forwarded to ``sampling2.read_fragments``.
        k: forwarded to ``encoding2.encode_fragment_dataset``.
        max_depth: ``max_depth`` of the ``RandomForestClassifier``.
        n_estimators: ``n_estimators`` of the ``RandomForestClassifier``.

    Returns:
        The value of ``rf.score`` on the held-out 33% test split.
    """
    # Start from a clean output directory. A missing directory (fresh checkout,
    # or after the notebook's cleanup cell has run) is not an error.
    try:
        shutil.rmtree(output_dir)
    except FileNotFoundError:
        pass

    # build fragments
    sampling2.generate_fragment_data(seq_file, taxid_file, output_dir, sample_length, coverage, seed)

    # encode data
    fragments = sampling2.read_fragments(output_dir, pattern)
    X_enc, y = encoding2.encode_fragment_dataset(fragments, k)
    le = preprocessing.LabelEncoder()
    y_enc = le.fit_transform(y)

    # split data into test and training
    X_train, X_test, y_train, y_test = train_test_split(X_enc, y_enc, test_size=0.33, random_state=seed)

    # classifier
    rf = RandomForestClassifier(max_depth=max_depth, n_estimators=n_estimators, random_state=seed)
    rf.fit(X_train, y_train)
    score = rf.score(X_test, y_test)
    return score
    

In [36]:
def grid_search_multiclass_rf(list_sample_length, list_coverage, fragments, list_k, list_max_depth, list_n_estimators, seed,
                              seq_file=None, taxid_file=None, output_dir=None, pattern=None):
    """Run ``run_rf_classification`` for every combination of the given grid values.

    One ``|``-separated line is printed per combination, and every combination
    is also collected and returned.

    Args:
        list_sample_length: sample lengths to try; a single value is accepted too.
        list_coverage: coverages to try; a single value is accepted too.
        fragments: not used by this function (``run_rf_classification`` samples
            its own fragments for each combination); kept so existing calls
            keep working.
        list_k: ``k`` values to try.
        list_max_depth: ``max_depth`` values to try.
        list_n_estimators: ``n_estimators`` values to try.
        seed: seed forwarded to ``run_rf_classification``.
        seq_file, taxid_file, output_dir, pattern: forwarded to
            ``run_rf_classification``. When left as ``None`` the notebook-level
            variable of the same name is used.

    Returns:
        A list with one dict per combination, holding the keys
        ``sample_length``, ``coverage``, ``k``, ``max_depth``, ``n_estimators``
        and ``score``.
    """
    def _as_list(value):
        # Treat a bare scalar (e.g. sample_length=200) as a one-element grid axis.
        try:
            return list(value)
        except TypeError:
            return [value]

    # Anything not passed explicitly comes from the notebook's configuration cell.
    notebook_config = globals()
    if seq_file is None:
        seq_file = notebook_config['seq_file']
    if taxid_file is None:
        taxid_file = notebook_config['taxid_file']
    if output_dir is None:
        output_dir = notebook_config['output_dir']
    if pattern is None:
        pattern = notebook_config['pattern']

    # Materialise every axis once so inner axes can be iterated repeatedly.
    sample_lengths = _as_list(list_sample_length)
    coverages = _as_list(list_coverage)
    ks = _as_list(list_k)
    max_depths = _as_list(list_max_depth)
    estimator_counts = _as_list(list_n_estimators)

    results = []
    for sample_length in sample_lengths:
        for coverage in coverages:
            for k in ks:
                for max_depth in max_depths:
                    for n_estimators in estimator_counts:
                        score = run_rf_classification(seq_file, taxid_file, output_dir, sample_length, coverage,
                                                      seed, pattern, k, max_depth, n_estimators)
                        print('{}|{}|{}|{}|{}|{}|{}|{}'.format('multiclass', 'Random Forest', sample_length, coverage, k, max_depth, n_estimators, score ))
                        results.append({
                            'sample_length': sample_length,
                            'coverage': coverage,
                            'k': k,
                            'max_depth': max_depth,
                            'n_estimators': n_estimators,
                            'score': score,
                        })
    return results

    

In [37]:
# Hyper-parameter grid for the random forest search.
list_k = list(range(4, 10, 2))
list_max_depth = list(range(2, 20, 4))
list_n_estimators = list(range(20, 200, 40))

# The first two arguments are iterated over by the search, so the single
# configured sample length and coverage are wrapped in one-element lists.
grid_results = grid_search_multiclass_rf([sample_length], [coverage], fragments_toy, list_k, list_max_depth, list_n_estimators, seed)

multiclass|Random Forest|200|10|4|2|20|0.2
multiclass|Random Forest|200|10|4|2|60|0.2222222222222222
multiclass|Random Forest|200|10|4|2|100|0.18518518518518517
multiclass|Random Forest|200|10|4|2|140|0.18518518518518517
multiclass|Random Forest|200|10|4|2|180|0.18518518518518517
multiclass|Random Forest|200|10|4|6|20|0.28888888888888886
multiclass|Random Forest|200|10|4|6|60|0.25925925925925924
multiclass|Random Forest|200|10|4|6|100|0.2222222222222222
multiclass|Random Forest|200|10|4|6|140|0.21481481481481482
multiclass|Random Forest|200|10|4|6|180|0.2222222222222222
multiclass|Random Forest|200|10|4|10|20|0.2814814814814815
multiclass|Random Forest|200|10|4|10|60|0.28888888888888886
multiclass|Random Forest|200|10|4|10|100|0.2518518518518518
multiclass|Random Forest|200|10|4|10|140|0.2518518518518518
multiclass|Random Forest|200|10|4|10|180|0.2740740740740741
multiclass|Random Forest|200|10|4|14|20|0.28888888888888886
multiclass|Random Forest|200|10|4|14|60|0.3037037037037037
multi

In [None]:
# Remove the generated fragment directory configured in `output_dir`.
# Re-running this cell after the directory is already gone is not an error.
try:
    shutil.rmtree(output_dir)
except FileNotFoundError:
    pass