### Imports

In [2]:
from packages.metagenomics import sampling2, encoding2
from Bio import SeqIO
import numpy as np
from Bio.Seq import Seq
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
from sklearn.multiclass import OneVsRestClassifier
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import shutil
import csv
import datetime
from sklearn.linear_model import LinearRegression
import pandas as pd

### Ideas for model improvement
- consider setting class weights to reflect unequal distribution of data
- try different loss functions
- try one-vs-all approach
- encode sequences differently depending on whether the sequence is a promoter, a gene, a CpG island, etc.; search for known motifs rather than blind k-mer groups.

### Run Grid Search 1 - Multiclass Random Forest Classifier

In [3]:
def append_results_to_file(filename, fields=None, rows=None):
    """
    Appends CSV-formatted data to a file, creating the file if it does not exist.

    Args:
        filename: path of the CSV file to append to.
        fields: optional single row (typically the header). Written first,
            and only when truthy.
        rows: optional iterable of rows. Written after ``fields``, and only
            when truthy.
    """
    # newline='' hands control of line endings to the csv module. Without it,
    # platforms that translate newlines on write (e.g. Windows) end up with a
    # blank line after every row.
    with open(filename, 'a', newline='') as f:

        writer = csv.writer(f)

        if fields:
            writer.writerow(fields)

        if rows:
            writer.writerows(rows)

In [4]:
def run_rf_classification(X_train, X_test, y_train, y_test, max_depth, n_estimators, seed):
    """
    Trains a random forest classifier and evaluates it on the held-out data.

    Args:
        X_train, y_train: features and labels used to fit the classifier.
        X_test, y_test: features and labels used to score the classifier.
        max_depth: forwarded to ``RandomForestClassifier(max_depth=...)``.
        n_estimators: forwarded to ``RandomForestClassifier(n_estimators=...)``.
        seed: forwarded as ``random_state``.

    Returns:
        The value of the fitted classifier's ``score(X_test, y_test)``.
    """
    classifier = RandomForestClassifier(
        random_state=seed,
        n_estimators=n_estimators,
        max_depth=max_depth,
    )
    # fit() returns the fitted estimator, so scoring can be chained directly
    return classifier.fit(X_train, y_train).score(X_test, y_test)
    

In [5]:
def build_fragments(seq_file, taxid_file, output_dir, sample_length, coverage, seed):
    """
    Regenerates the fragment data in ``output_dir`` from scratch.

    Any existing ``output_dir`` tree is removed first; if it does not exist a
    notice is printed and generation proceeds. All six arguments are then
    forwarded, in order, to ``sampling2.generate_fragment_data``.
    """
    # start clean: drop whatever a previous run left behind
    try:
        shutil.rmtree(output_dir)
    except FileNotFoundError:
        print('Existing directory was not found.')

    # hand the actual sampling over to the project's sampling module
    generation_args = (seq_file, taxid_file, output_dir, sample_length, coverage, seed)
    sampling2.generate_fragment_data(*generation_args)

In [6]:
def encode_fragments(output_dir, pattern, k, seed):
    """
    Loads fragments, encodes them, and splits the result into train/test sets.

    Args:
        output_dir: directory passed to ``sampling2.read_fragments``.
        pattern: file pattern passed to ``sampling2.read_fragments``.
        k: passed to ``encoding2.encode_fragment_dataset``
            (presumably the k-mer size -- confirm against the encoding module).
        seed: ``random_state`` for the train/test split.

    Returns:
        ``X_train, X_test, y_train, y_test`` where 33% of the samples are held
        out for testing and the labels are integer-encoded.

    Side effects:
        Prints the shape of the full encoded feature matrix.
    """
    # load the fragments and turn them into features and raw labels
    fragment_data = sampling2.read_fragments(output_dir, pattern)
    features, raw_labels = encoding2.encode_fragment_dataset(fragment_data, k)

    # map the raw labels onto integer class codes
    labels = preprocessing.LabelEncoder().fit_transform(raw_labels)

    # hold out a third of the data for evaluation
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, random_state=seed, test_size=0.33
    )

    print(features.shape)

    return X_train, X_test, y_train, y_test

In [7]:
def calc_number_combinations(list_sample_length, list_coverage, list_k, list_max_depth, list_n_estimators):
    """
    Returns the size of the hyperparameter grid, i.e. the product of the
    lengths of the five candidate lists.
    """
    total = 1
    for candidates in (list_sample_length, list_coverage, list_k, list_max_depth, list_n_estimators):
        total *= len(candidates)
    return total

In [8]:
def grid_search_multiclass_rf(seq_file,
                              taxid_file,
                              output_dir,
                              pattern,
                              list_sample_length,
                              list_coverage,
                              list_k,
                              list_max_depth,
                              list_n_estimators,
                              seed,
                              grid_search_file,
                              fields,
                              experiment=None):
    """
    Runs an exhaustive grid search of a multiclass random forest and appends
    one CSV row per hyperparameter combination to ``grid_search_file``.

    Fragments are rebuilt once per (sample_length, coverage) pair and encoded
    once per k, so the expensive steps are shared by all forest settings
    nested below them.

    Args:
        seq_file, taxid_file, output_dir: forwarded to ``build_fragments``.
        pattern: fragment file pattern forwarded to ``encode_fragments``.
        list_sample_length, list_coverage, list_k, list_max_depth,
            list_n_estimators: candidate values for each hyperparameter.
        seed: seed forwarded to fragment generation, the train/test split and
            the classifier.
        grid_search_file: CSV file that results are appended to.
        fields: header row written once before any results.
        experiment: optional experiment label. When given it is written as the
            first column of every row; when ``None`` the column is omitted, so
            the rows line up with a header that has no 'experiment' field.
    """
    # set up grid search results file
    append_results_to_file(grid_search_file, fields)

    # calculate number of combinations
    n_combinations = calc_number_combinations(list_sample_length, list_coverage, list_k, list_max_depth, list_n_estimators)

    # process combinations
    count = 0
    for sample_length in list_sample_length:
        for coverage in list_coverage:

            # fragment combination
            build_fragments(seq_file, taxid_file, output_dir, sample_length, coverage, seed)
            for k in list_k:

                # kmer combination
                X_train, X_test, y_train, y_test = encode_fragments(output_dir, pattern, k, seed)
                for max_depth in list_max_depth:
                    for n_estimators in list_n_estimators:

                        # random forest combination
                        score = run_rf_classification(X_train, X_test, y_train, y_test, max_depth, n_estimators, seed)
                        count += 1

                        # output results to file
                        row = ['multiclass', 'Random Forest', sample_length, coverage, k, max_depth, n_estimators, score]
                        if experiment is not None:
                            row.insert(0, experiment)
                        # pass the result as a data row rather than through the header parameter
                        append_results_to_file(grid_search_file, rows=[row])

                # display progress; the grid is empty (nothing to report) when an
                # inner hyperparameter list has no candidates, so avoid dividing by zero
                if n_combinations:
                    print('Percent complete: {}'.format(count / n_combinations * 100))
    

In [9]:
def calc_hyperparameter_relationship(filename):
    """
    Fits a linear regression of the score on the hyperparameters and returns
    the regression coefficients.
    This should give some indicator of how hyperparameters are affecting the score.

    Args:
        filename: path of a grid search results CSV. It must contain a 'score'
            column; every column other than 'experiment', 'score', 'category'
            and 'classifier' is used as a predictor.

    Returns:
        The fitted ``LinearRegression().coef_`` array, one coefficient per
        predictor column in the order the columns appear in the file.

    Note:
        The predictors are not rescaled, so coefficient magnitudes depend on
        the units of each hyperparameter and are not directly comparable.
    """
    # read in grid search results
    df = pd.read_csv(filename)

    # errors='ignore' keeps this working for result files that were written
    # without an 'experiment' column
    X = df.drop(['experiment', 'score', 'category', 'classifier'], axis=1, errors='ignore')
    y = df['score']

    lr = LinearRegression()
    lr.fit(X, y)
    return lr.coef_

### Search 1.1

In [10]:
# %%time

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# grid_search_file  = 'data/gridsearch/rf-multi.' + datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S') + '.csv'
# fields = ['category','classifier','sample_length','coverage','k','max_depth', 'n_estimators', 'score']

# # combinations to try
# list_sample_length = [100, 200, 400]
# list_coverage = [0.1, 1, 2, 10]
# list_k = [i for i in range(1,10,2)]
# list_max_depth = [i for i in range(2,20,4)]
# list_n_estimators = [i for i in range(20,200,40)]


# grid_search_multiclass_rf(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_max_depth,
#                           list_n_estimators, 
#                           seed,
#                           grid_search_file,
#                           fields )




In [11]:
# Regress the score on the hyperparameters for a saved grid search results file.
gridsearch_file = 'data/gridsearch/rf-multi.2021.03.28.18.07.15.csv'
calc_hyperparameter_relationship(gridsearch_file)
# Recorded result: one coefficient per hyperparameter column, presumably in the
# order sample_length, coverage, k, max_depth, n_estimators -- TODO confirm
# against the CSV header.
# array([-1.44909382e-04,  1.67645287e-02, -9.17412031e-03,  1.18482770e-03, -1.44167915e-05])

array([-1.44909382e-04,  1.67645287e-02, -9.17412031e-03,  1.18482770e-03,
       -1.44167915e-05])

### Search 1.2

In [12]:
# %%time
# 3h 37min 3s

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# grid_search_file  = 'data/gridsearch/rf-multi.' + datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S') + '.csv'
# fields = ['category','classifier','sample_length','coverage','k','max_depth', 'n_estimators', 'score']

# # combinations to try
# list_sample_length = [200, 400]
# list_coverage = [1,10,100,200]
# list_k = [i for i in range(1,20,2)]
# list_max_depth = [i for i in range(6,20,2)]
# list_n_estimators = [i for i in range(50,501,50)]


# grid_search_multiclass_rf(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_max_depth,
#                           list_n_estimators, 
#                           seed,
#                           grid_search_file,
#                           fields )




In [13]:
# Regress the score on the hyperparameters for a saved grid search results file.
gridsearch_file = 'data/gridsearch/rf-multi.2021.03.28.18.31.24.csv'
calc_hyperparameter_relationship(gridsearch_file)
# Recorded result: one coefficient per hyperparameter column, presumably in the
# order sample_length, coverage, k, max_depth, n_estimators -- TODO confirm
# against the CSV header.
# array([-1.79147764e-04,  8.02651943e-04, -1.15739138e-02,  2.46285125e-03, 4.98644276e-06])

array([-1.79147764e-04,  8.02651943e-04, -1.15739138e-02,  2.46285124e-03,
        4.98644275e-06])

### Search 1.3

In [14]:
# %%time
## 2h

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# grid_search_file  = 'data/gridsearch/rf-multi.{}.csv'.format(datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S') )
# fields = ['experiment','category','classifier','sample_length','coverage','k','max_depth', 'n_estimators', 'score']
# experiment = '1.3'

# # combinations to try
# list_sample_length = [100, 200, 400]
# list_coverage = [0.1,1,10,100,200]
# list_k = [1,2,4,8]
# list_max_depth = [i for i in range(20,51,10)]
# list_n_estimators = [50,100,200,400,800]


# grid_search_multiclass_rf(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_max_depth,
#                           list_n_estimators, 
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment)


### Search 1.4
Stopped early after realizing that I didn't need to try such small intervals of max depth.

In [15]:
# %%time

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# grid_search_file  = 'data/gridsearch/rf-multi.{}.csv'.format(datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S') )
# fields = ['experiment','category','classifier','sample_length','coverage','k','max_depth', 'n_estimators', 'score']
# experiment = '1.4'

# # combinations to try
# list_sample_length = [100]
# list_coverage = [200]
# list_k = [1]
# list_max_depth = [50]
# list_n_estimators = [50,100,200,400,800,1600,3200]


# grid_search_multiclass_rf(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_max_depth,
#                           list_n_estimators, 
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment)


### Search 1.5

In [16]:
# %%time

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# grid_search_file  = 'data/gridsearch/rf-multi.{}.csv'.format(datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S') )
# fields = ['experiment','category','classifier','sample_length','coverage','k','max_depth', 'n_estimators', 'score']
# experiment = '1.5'

# # combinations to try
# list_sample_length = [100, 200, 400]
# list_coverage = [200]
# list_k = [1,2,4,6]
# list_max_depth = [50, 100, 200]
# list_n_estimators = [50,100,200,400,800,1600,3200]


# grid_search_multiclass_rf(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_max_depth,
#                           list_n_estimators, 
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment)


(16282, 400)
Percent complete: 8.333333333333332
(16282, 800)
Percent complete: 16.666666666666664
(16282, 6400)
Percent complete: 25.0
(16282, 43939)
Percent complete: 33.33333333333333
(8141, 800)
Percent complete: 41.66666666666667
(8141, 1600)
Percent complete: 50.0
(8141, 12799)
Percent complete: 58.333333333333336
(8141, 77930)
Percent complete: 66.66666666666666
(4071, 1600)
Percent complete: 75.0
(4071, 3200)
Percent complete: 83.33333333333334
(4071, 25514)
Percent complete: 91.66666666666666
(4071, 117598)
Percent complete: 100.0
CPU times: user 5h 5min, sys: 1min 46s, total: 5h 6min 46s
Wall time: 5h 8min 50s
