### Imports

In [1]:
import shutil
import csv
import datetime
import numpy as np
import pandas as pd
from sklearn.metrics import recall_score
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from packages.metagenomics import sampling2, encoding2
from sklearn.linear_model import LogisticRegression

from packages.LogisticRegression.MulticlassLogisticRegression import MulticlassLogisticRegression,MulticlassLogisticRegression2


In [2]:
def append_results_to_file(filename, fields=None, rows=None):
    """
    Append CSV records to ``filename`` (the file is created if it is missing).

    Parameters
    ----------
    filename : str or path-like
        Target CSV file; always opened in append mode.
    fields : sequence, optional
        One record (for example the header) written as a single row.
    rows : iterable of sequences, optional
        Further records, each written as its own row after ``fields``.

    Falsy ``fields`` / ``rows`` (``None`` or empty) are skipped, so calling
    with neither only creates/touches the file.
    """
    # newline='' is what the csv module requires: the writer emits its own
    # '\r\n' terminator, and platforms that translate '\n' on write would
    # otherwise produce '\r\r\n' (blank lines between rows).
    with open(filename, 'a', newline='') as f:
        writer = csv.writer(f)

        if fields:
            writer.writerow(fields)

        if rows:
            writer.writerows(rows)

In [3]:
def run_mlr_classification_recall(X_train, X_test, y_train, y_test, eta, epsilon):
    """
    Train a MulticlassLogisticRegression2 model and score it on the test split.

    The model is built with the given ``eta`` and ``epsilon``, fitted on
    (X_train, y_train), and used to predict X_test. The returned score is the
    weighted-average recall of those predictions against y_test (the notebook
    refers to this as species level recall).
    """
    classifier = MulticlassLogisticRegression2(eta=eta, epsilon=epsilon)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return recall_score(y_test, predictions, average='weighted')
    

In [4]:
def build_fragments(seq_file, taxid_file, output_dir, sample_length, coverage, seed):
    """
    Regenerate the fragment data for one (sample_length, coverage) setting.

    Any existing ``output_dir`` tree is removed first so stale fragments from
    a previous setting cannot be mixed in; a missing directory is reported
    and otherwise ignored. All arguments are then forwarded unchanged to
    ``sampling2.generate_fragment_data``.
    """
    # wipe the previous output, tolerating the first run where nothing exists
    try:
        shutil.rmtree(output_dir)
    except FileNotFoundError:
        print('Existing directory was not found. Process will generate a directory.')

    # sample fragments into output_dir
    sampling2.generate_fragment_data(
        seq_file,
        taxid_file,
        output_dir,
        sample_length,
        coverage,
        seed,
    )

In [5]:
def encode_fragments(output_dir, pattern, k, seed):
    """
    Encode the fragments in ``output_dir`` and split them into train/test sets.

    Fragments matching ``pattern`` are read with ``sampling2.read_fragments``
    and encoded with ``encoding2.encode_fragment_dataset(fragments, k)``; the
    labels are mapped to integers with a LabelEncoder. The encoded matrix is
    densified with ``.toarray()`` before splitting.

    The 67/33 split is redrawn until every class is present in both the
    training and the test set.

    Parameters
    ----------
    output_dir, pattern
        Location and filename pattern of the fragment files.
    k
        Passed through to the encoder.
    seed : int or None
        Seeds the random generator that drives the splits, so the same seed
        yields the same final split. ``None`` gives a non-reproducible split.

    Returns
    -------
    X_train, X_test, y_train, y_test

    Raises
    ------
    ValueError
        If some class has fewer than two fragments; such a class can never
        appear in both splits and the retry loop would never finish.
    """
    # encode data
    fragments = sampling2.read_fragments(output_dir, pattern)
    X_enc, y = encoding2.encode_fragment_dataset(fragments, k)
    y_enc = preprocessing.LabelEncoder().fit_transform(y)

    # calculate number of classes and make sure the retry loop can terminate
    _, class_counts = np.unique(y_enc, return_counts=True)
    n_classes = len(class_counts)
    if n_classes and class_counts.min() < 2:
        raise ValueError(
            'Every class needs at least 2 fragments to appear in both the '
            'train and test split; smallest class has {}.'.format(class_counts.min())
        )

    # densify once instead of on every retry
    X_dense = X_enc.toarray()

    # One generator shared across retries: each attempt draws a different
    # split, but the whole sequence is determined by `seed`. (Passing the int
    # seed directly would redraw the identical split on every retry.)
    rng = np.random.RandomState(seed)

    n_classes_train = 0
    n_classes_test = 0
    while n_classes_train < n_classes or n_classes_test < n_classes:

        # split data into test and training
        X_train, X_test, y_train, y_test = train_test_split(
            X_dense, y_enc, test_size=0.33, random_state=rng
        )
        n_classes_train = len(np.unique(y_train))
        n_classes_test = len(np.unique(y_test))

    print(X_enc.shape)

    return X_train, X_test, y_train, y_test

In [6]:
def calc_number_combinations(list_sample_length,list_coverage,list_k,list_eta,list_epsilon):
    """Return the grid size: the product of the lengths of the five lists."""
    total = 1
    for candidates in (list_sample_length, list_coverage, list_k, list_eta, list_epsilon):
        total = total * len(candidates)
    return total

In [7]:
def grid_search_multiclass_mlr(seq_file, 
                              taxid_file, 
                              output_dir, 
                              pattern, 
                              list_sample_length, 
                              list_coverage, 
                              list_k,
                              list_eta,
                              list_epsilon, 
                              seed,
                              grid_search_file,
                              fields,
                              experiment,
                              score_type):
    """
    Grid search over fragment, k-mer and MLR hyperparameter settings.

    For every (sample_length, coverage) pair the fragments are rebuilt; for
    every k they are re-encoded and re-split; for every (eta, epsilon) pair a
    model is trained and scored with ``run_mlr_classification_recall``. Each
    score is appended to ``grid_search_file`` as soon as it is available, so
    partial results survive an interrupted run.

    Parameters
    ----------
    seq_file, taxid_file, output_dir, seed
        Forwarded to ``build_fragments``.
    pattern
        Forwarded (with output_dir, k and seed) to ``encode_fragments``.
    list_sample_length, list_coverage, list_k, list_eta, list_epsilon
        Candidate values; the full Cartesian product is evaluated.
    grid_search_file
        CSV file the header and result rows are appended to.
    fields
        Header row written once before the search starts.
    experiment, score_type
        Labels copied verbatim into every result row.
    """
    # set up grid search results file
    append_results_to_file(grid_search_file, fields)

    # calculate number of combinations
    n_combinations = calc_number_combinations(list_sample_length,list_coverage,list_k,list_eta,list_epsilon)

    # process combinations
    count = 0
    for sample_length in list_sample_length:
        for coverage in list_coverage:

            # fragment combination
            build_fragments(seq_file, taxid_file, output_dir, sample_length, coverage, seed)
            for k in list_k:

                # kmer combination
                X_train, X_test, y_train, y_test = encode_fragments(output_dir, pattern, k, seed)
                for eta in list_eta:
                    for epsilon in list_epsilon:

                        # logistic regression hyperparameter combination
                        score = run_mlr_classification_recall(X_train, X_test, y_train, y_test, eta, epsilon)
                        count += 1

                        # output results to file (as a data row, not via the header argument)
                        row = [experiment, 'multiclass', 'Logistic Regression', X_train.shape, sample_length, coverage, k, eta, epsilon, score, score_type]
                        append_results_to_file(grid_search_file, rows=[row])

                # display progress; n_combinations is 0 when a hyperparameter
                # list is empty, in which case there is nothing to report
                if n_combinations:
                    print('Percent complete: {}'.format(count / n_combinations * 100))
    

# Run Set 1 - MLR Toy
Uses the toy-2000 dataset.

### Run 4.01
Stopped early because the runs were taking too long.

In [8]:
# %%time

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000-mlr'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/runs-2000/mlr-multi.{}.csv'.format(date_time)
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','eta', 'epsilon', 'score','score type']
# experiment = '4.01'
# score_type = 'species_recall'

# # combinations to try
# list_sample_length = [100]
# list_coverage = [0.2,1,10]
# list_k = [1,2,4,6,8,10,12]
# list_eta = [0.01]
# list_epsilon = [0.01]


# grid_search_multiclass_mlr(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_eta,
#                           list_epsilon,
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment,
#                           score_type)


### Run 4.02

In [9]:
# %%time

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000-mlr'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/runs-2000/mlr-multi.{}.csv'.format(date_time)
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','eta', 'epsilon', 'score','score type']
# experiment = '4.02'
# score_type = 'species_recall'

# # combinations to try
# list_sample_length = [200]
# list_coverage = [0.3,1,10]
# list_k = [1,2,4,6,8,10,12]
# list_eta = [0.01]
# list_epsilon = [0.01]


# grid_search_multiclass_mlr(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_eta,
#                           list_epsilon,
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment,
#                           score_type)


### Run 4.03

In [10]:
# %%time

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000-mlr'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/runs-2000/mlr-multi.{}.csv'.format(date_time)
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','eta', 'epsilon', 'score','score type']
# experiment = '4.03'
# score_type = 'species_recall'

# # combinations to try
# list_sample_length = [400]
# list_coverage = [0.5,1,10]
# list_k = [1,2,4,6,8,10,12]
# list_eta = [0.01]
# list_epsilon = [0.01]


# grid_search_multiclass_mlr(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_eta,
#                           list_epsilon,
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment,
#                           score_type)


### Run 4.04

In [11]:
# %%time

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000-mlr'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/runs-2000/mlr-multi.{}.csv'.format(date_time)
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','eta', 'epsilon', 'score','score type']
# experiment = '4.04'
# score_type = 'species_recall'

# # combinations to try
# list_sample_length = [100]
# list_coverage = [100]
# list_k = [1,2,4,6,8,10,12]
# list_eta = [0.01]
# list_epsilon = [0.01]


# grid_search_multiclass_mlr(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_eta,
#                           list_epsilon,
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment,
#                           score_type)


### Run 4.05

In [12]:
# %%time
##40 min

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000-mlr'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/runs-2000/mlr-multi.{}.csv'.format(date_time)
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','eta', 'epsilon', 'score','score type']
# experiment = '4.05'
# score_type = 'species_recall'

# # combinations to try
# list_sample_length = [100]
# list_coverage = [200]
# list_k = [1,2,4,6,8,10,12]
# list_eta = [0.01]
# list_epsilon = [0.01]


# grid_search_multiclass_mlr(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_eta,
#                           list_epsilon,
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment,
#                           score_type)


# Run Set 2 - sklearn with l1 penalty
Compare with sklearn implementation of MLR to see if performance is similar.

In [13]:
# def calc_number_combinations2(list_sample_length,list_coverage,list_k,list_multiclass,list_classweight):
#     n = len(list_sample_length) * len(list_coverage) * len(list_k) *len(list_multiclass) * len(list_classweight)
#     return n

In [14]:
def encode_fragments2(output_dir, pattern, k, seed):
    """
    Encode the fragments in ``output_dir`` and split them into train/test sets.

    Does not convert sparse matrix to numpy matrix: the encoded matrix is
    passed to ``train_test_split`` exactly as the encoder returned it.

    Fragments matching ``pattern`` are read with ``sampling2.read_fragments``
    and encoded with ``encoding2.encode_fragment_dataset(fragments, k)``; the
    labels are mapped to integers with a LabelEncoder. The 67/33 split is
    redrawn until every class is present in both the training and test set.

    Parameters
    ----------
    output_dir, pattern
        Location and filename pattern of the fragment files.
    k
        Passed through to the encoder.
    seed : int or None
        Seeds the random generator that drives the splits, so the same seed
        yields the same final split. ``None`` gives a non-reproducible split.

    Returns
    -------
    X_train, X_test, y_train, y_test

    Raises
    ------
    ValueError
        If some class has fewer than two fragments; such a class can never
        appear in both splits and the retry loop would never finish.
    """
    # encode data
    fragments = sampling2.read_fragments(output_dir, pattern)
    X_enc, y = encoding2.encode_fragment_dataset(fragments, k)
    y_enc = preprocessing.LabelEncoder().fit_transform(y)

    # calculate number of classes and make sure the retry loop can terminate
    _, class_counts = np.unique(y_enc, return_counts=True)
    n_classes = len(class_counts)
    if n_classes and class_counts.min() < 2:
        raise ValueError(
            'Every class needs at least 2 fragments to appear in both the '
            'train and test split; smallest class has {}.'.format(class_counts.min())
        )

    # One generator shared across retries: each attempt draws a different
    # split, but the whole sequence is determined by `seed`. (Passing the int
    # seed directly would redraw the identical split on every retry.)
    rng = np.random.RandomState(seed)

    n_classes_train = 0
    n_classes_test = 0
    while n_classes_train < n_classes or n_classes_test < n_classes:

        # split data into test and training
        X_train, X_test, y_train, y_test = train_test_split(
            X_enc, y_enc, test_size=0.33, random_state=rng
        )
        n_classes_train = len(np.unique(y_train))
        n_classes_test = len(np.unique(y_test))

    print(X_enc.shape)

    return X_train, X_test, y_train, y_test

In [15]:
# def run_lr_classification_recall(X_train, X_test, y_train, y_test, multiclass, classweight, seed ):
#     """
#     Score is species level recall. Uses sklearn version of logistic regression.
#     """
#     lr = LogisticRegression(random_state=seed, multi_class=multiclass, class_weight=classweight )
#     lr.fit(X_train,y_train)
#     y_pred = lr.predict(X_test)
#     score = recall_score(y_test, y_pred, average='weighted')
#     return score
    

In [16]:
# def grid_search_multiclass_lr(seq_file, 
#                               taxid_file, 
#                               output_dir, 
#                               pattern, 
#                               list_sample_length, 
#                               list_coverage, 
#                               list_k,
#                               list_multiclass,
#                               list_classweight,
#                               seed,
#                               grid_search_file,
#                               fields,
#                               experiment,
#                               score_type):
    
#     # set up grid search results file
#     append_results_to_file(grid_search_file, fields)
    
#     # calculate number of combinations
#     n_combinations = calc_number_combinations2(list_sample_length,list_coverage,list_k, list_multiclass, list_classweight)
    
#     # process combinations
#     count = 0
#     for sample_length in list_sample_length:
#         for coverage in list_coverage:
            
#             # fragment combination
#             build_fragments(seq_file, taxid_file, output_dir, sample_length, coverage, seed)
#             for k in list_k:
                
#                 # kmer combination
#                 X_train, X_test, y_train, y_test = encode_fragments2(output_dir, pattern,k,seed)
                
                
#                 for multiclass in list_multiclass:
#                     for classweight in list_classweight:

                        
#                         # random forest combination
#                         score = run_lr_classification_recall(X_train, X_test, y_train, y_test, multiclass, classweight, seed)
#                         count += 1

#                         # output results to file
#                         row = [experiment, 'multiclass', 'Logistic Regression (sklearn)', X_train.shape, sample_length, coverage, k, multiclass, classweight, score, score_type]
#                         append_results_to_file(grid_search_file, row)

#                 print('Percent complete: {}'.format(count / n_combinations * 100)) # display progress


### Run 5.01

In [17]:
# %%time

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/runs-2000/lr-multi.{}.csv'.format(date_time)
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','multiclass', 'class_weight', 'score','score type']
# experiment = '5.01'
# score_type = 'species_recall'

# # combinations to try
# list_sample_length = [100,200,400]
# list_coverage = [1,10,100,200,400]
# list_k = [1]
# list_multiclass = ['auto']
# list_classweight= [None]


# grid_search_multiclass_lr(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_multiclass,
#                           list_classweight,
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment,
#                           score_type)


### Run 5.02

In [18]:
# %%time

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/runs-2000/lr-multi.{}.csv'.format(date_time)
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','multiclass', 'class_weight', 'score','score type']
# experiment = '5.02'
# score_type = 'species_recall'

# # combinations to try
# list_sample_length = [100,200,400]
# list_coverage = [1,10,100,200,400]
# list_k = [2,4,6,8,10,12]
# list_multiclass = ['auto']
# list_classweight= [None]


# grid_search_multiclass_lr(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_multiclass,
#                           list_classweight,
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment,
#                           score_type)


# Run Set 3 - MLR with L2 penalty
- toy-2000 dataset
- See how performance changes with L2 penalty
- Also changed the gradient descent break point (maximum number of iterations) from 100,000 to 100.

In [19]:
def calc_number_combinations(*args):
    """
    Return the number of grid points spanned by the given candidate lists,
    i.e. the product of their lengths (1 when called with no arguments).
    """
    n_combinations = 1
    sizes = [len(candidates) for candidates in args]
    for size in sizes:
        n_combinations = n_combinations * size
    return n_combinations

In [20]:
def parameter_generator(list_sample_length,list_coverage,list_k):
    """
    Yield every (sample_length, coverage, k) combination.

    Sample length varies slowest and k fastest, so all k values for one
    (sample_length, coverage) pair are produced consecutively.
    """
    yield from (
        (sample_length, coverage, k)
        for sample_length in list_sample_length
        for coverage in list_coverage
        for k in list_k
    )

In [21]:
def hyperparameter_generator(list_eta,list_epsilon, list_penalty, list_l2_lambda,list_max_iter):
    """
    Yield every (eta, epsilon, penalty, l2_lambda, max_iter) combination.

    The lists are iterated in argument order: eta varies slowest and
    max_iter fastest.
    """
    yield from (
        (eta, epsilon, penalty, l2_lambda, max_iter)
        for eta in list_eta
        for epsilon in list_epsilon
        for penalty in list_penalty
        for l2_lambda in list_l2_lambda
        for max_iter in list_max_iter
    )

In [22]:
def run_mlr_classification_recall(X_train, X_test, y_train, y_test, eta, epsilon, penalty, l2_lambda, max_iter):
    """
    Train a MulticlassLogisticRegression2 model and score it on the test split.

    The five hyperparameters are passed to the model constructor by keyword.
    The model is fitted on (X_train, y_train) and the returned score is the
    weighted-average recall of its predictions for X_test against y_test
    (the notebook refers to this as species level recall).
    """
    hyperparameters = dict(
        eta=eta,
        epsilon=epsilon,
        penalty=penalty,
        l2_lambda=l2_lambda,
        max_iter=max_iter,
    )
    classifier = MulticlassLogisticRegression2(**hyperparameters)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return recall_score(y_test, predictions, average='weighted')
    

In [39]:
def grid_search_multiclass_mlr(seq_file, 
                              taxid_file, 
                              output_dir, 
                              pattern, 
                              list_sample_length, 
                              list_coverage, 
                              list_k,
                              list_eta,
                              list_epsilon, 
                              list_penalty,
                              list_l2_lambda,
                              list_max_iter,
                              seed,
                              grid_search_file,
                              fields,
                              experiment,
                              score_type):
    """
    Grid search over fragment, k-mer and penalised-MLR hyperparameter settings.

    Iterates over every (sample_length, coverage, k) combination. Fragments
    are rebuilt only when sample_length or coverage changes; the data is
    re-encoded and re-split for every k. For each split, every
    (eta, epsilon, penalty, l2_lambda, max_iter) combination is trained and
    scored with ``run_mlr_classification_recall``, and the result is appended
    to ``grid_search_file`` immediately so partial results survive an
    interrupted run.

    Parameters
    ----------
    seq_file, taxid_file, output_dir, seed
        Forwarded to ``build_fragments``.
    pattern
        Forwarded (with output_dir, k and seed) to ``encode_fragments``.
    list_sample_length, list_coverage, list_k
        Data-generation settings; the full Cartesian product is evaluated.
    list_eta, list_epsilon, list_penalty, list_l2_lambda, list_max_iter
        Model hyperparameters; the full Cartesian product is evaluated.
    grid_search_file
        CSV file the header and result rows are appended to.
    fields
        Header row written once before the search starts.
    experiment, score_type
        Labels copied verbatim into every result row.
    """
    # set up grid search results file
    append_results_to_file(grid_search_file, fields)

    # calculate number of combinations
    n_combinations = calc_number_combinations(list_sample_length,
                                              list_coverage,
                                              list_k,
                                              list_eta,
                                              list_epsilon,
                                              list_penalty,
                                              list_l2_lambda,
                                              list_max_iter)

    # process combinations
    count = 0
    # None cannot collide with a real setting, so the first iteration always
    # builds fragments
    previous_fragment_setting = None

    # parameter combinations
    for sample_length, coverage, k in parameter_generator(list_sample_length, list_coverage, list_k):
        print(sample_length, coverage, k)

        # fragments depend only on (sample_length, coverage): rebuild them
        # when that pair changes, not for every k
        if (sample_length, coverage) != previous_fragment_setting:
            build_fragments(seq_file, taxid_file, output_dir, sample_length, coverage, seed)
            previous_fragment_setting = (sample_length, coverage)

        # kmer from fragments
        X_train, X_test, y_train, y_test = encode_fragments(output_dir, pattern, k, seed)

        # hyperparameter combinations
        for eta, epsilon, penalty, l2_lambda, max_iter in hyperparameter_generator(list_eta, list_epsilon, list_penalty, list_l2_lambda, list_max_iter):
            print(eta, epsilon, penalty, l2_lambda, max_iter)

            # train and score model
            score = run_mlr_classification_recall(X_train, X_test, y_train, y_test, eta, epsilon, penalty, l2_lambda, max_iter)
            count += 1

            # output results to file (as a data row, not via the header argument)
            row = [experiment, 'multiclass', 'Logistic Regression', X_train.shape, sample_length, coverage, k, eta, epsilon, penalty, l2_lambda, max_iter, score, score_type]
            append_results_to_file(grid_search_file, rows=[row])

        # display progress; n_combinations is 0 when a hyperparameter list is
        # empty, in which case there is nothing to report
        if n_combinations:
            print('Percent complete: {}'.format(count / n_combinations * 100))
    

### Run 6.01

In [24]:
# %%time

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000-mlr'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/runs-2000/mlr-multi.{}.csv'.format(date_time)
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','eta', 'epsilon', 'l2_penalty','score','score type']
# experiment = '6.01'
# score_type = 'species_recall'

# # combinations to try
# list_sample_length = [100]
# list_coverage = [1,10,100,200,400]
# list_k = [1,2,4,6,8,10,12]
# list_eta = [0.01]
# list_epsilon = [0.01]
# list_l2_penalty = [1,10,100, 0.1]


# grid_search_multiclass_mlr_l2(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_eta,
#                           list_epsilon,
#                           list_l2_penalty,
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment,
#                           score_type)


### Run 6.02
Testing a range of values for lambda. Values close to zero (0, 0.1) appear to give the best scores, and scores drop further once lambda goes above 1.0.

In [44]:
# %%time

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/gridsearch-2000/mlr-multi.{}.csv'.format(date_time)
# fields = ['experiment',
#           'category',
#           'classifier',
#           'training shape',
#           'sample_length',
#           'coverage',
#           'k',
#           'eta', 
#           'epsilon', 
#           'penalty',
#           'l2_lambda',
#           'max_iter',
#           'score',
#           'score type']

# experiment = '6.02'
# score_type = 'species_recall'

# # combinations to try
# list_sample_length = [100]
# list_coverage = [200]
# list_k = [4]
# list_eta = [0.01]
# list_epsilon = [0.01]
# list_penalty = ['l2']
# list_l2_lambda = [0,0.1,0.5,1,3,6,10]
# list_max_iter = [100]


# grid_search_multiclass_mlr(seq_file, 
#                                 taxid_file, 
#                                 output_dir, 
#                                 pattern, 
#                                 list_sample_length, 
#                                 list_coverage, 
#                                 list_k, 
#                                 list_eta,
#                                 list_epsilon,
#                                 list_penalty,
#                                 list_l2_lambda,
#                                 list_max_iter,
#                                 seed,
#                                 grid_search_file,
#                                 fields,
#                                 experiment,
#                                 score_type)


# Run Set 4 - sklearn with l1 penalty
To compare against L2 results

In [45]:
def hyperparameter_generator_lr(list_penalty,list_multiclass,list_classweight):
    """
    Yield every (penalty, multiclass, classweight) combination.

    Penalty varies slowest and classweight fastest.
    """
    yield from (
        (penalty, multiclass, classweight)
        for penalty in list_penalty
        for multiclass in list_multiclass
        for classweight in list_classweight
    )

In [57]:
def run_lr_classification_recall(X_train, X_test, y_train, y_test, penalty,multiclass,classweight, seed):
    """
    Train scikit-learn's LogisticRegression and score it on the test split.

    Score is species level recall (weighted-average recall of the
    predictions for X_test against y_test).

    Sets solver to 'saga' for l1 penalty, because scikit-learn's default
    solver does not support l1 and rejects it at fit time. Uses the default
    solver for every other penalty.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        Train/test split to fit and evaluate on.
    penalty, multiclass, classweight
        Passed to LogisticRegression as ``penalty``, ``multi_class`` and
        ``class_weight``.
    seed
        Passed to LogisticRegression as ``random_state``.
    """
    lr_kwargs = dict(penalty=penalty,
                     multi_class=multiclass,
                     class_weight=classweight,
                     random_state=seed)
    if penalty == 'l1':
        # only override the solver where the default cannot handle the penalty
        lr_kwargs['solver'] = 'saga'

    lr = LogisticRegression(**lr_kwargs)
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    score = recall_score(y_test, y_pred, average='weighted')
    return score
    

In [53]:
def grid_search_multiclass_lr(seq_file, 
                          taxid_file, 
                          output_dir, 
                          pattern, 
                          list_sample_length, 
                          list_coverage, 
                          list_k, 
                          list_penalty,
                          list_multiclass,
                          list_classweight,
                          seed,
                          grid_search_file,
                          fields,
                          experiment,
                          score_type):
    """
    Grid search over fragment settings and sklearn LogisticRegression
    hyperparameters, appending one result row per fitted model to a CSV file.

    For every (sample_length, coverage, k) combination the fragments are
    (re)built when sample_length or coverage changes, the fragments are
    encoded into a train/test split, and one model is trained and scored for
    every (penalty, multiclass, classweight) combination.

    Parameters
    ----------
    seq_file, taxid_file, output_dir : forwarded to ``build_fragments``.
    pattern : forwarded (with ``output_dir``) to ``encode_fragments2``.
    list_sample_length, list_coverage, list_k : values iterated through
        ``parameter_generator``.
    list_penalty, list_multiclass, list_classweight : values iterated through
        ``hyperparameter_generator_lr``.
    seed : forwarded to fragment building, encoding and model training.
    grid_search_file : path of the CSV file that results are appended to.
    fields : header row written once before any results.
    experiment, score_type : written verbatim into every result row.

    Returns
    -------
    None. Results are written to ``grid_search_file`` and progress is printed.
    """
    # set up grid search results file
    append_results_to_file(grid_search_file, fields)

    # Total number of model fits. This is derived from this function's own
    # argument lists so that the progress figure does not depend on unrelated
    # notebook-level variables left over from other experiments.
    n_combinations = 1
    for values in (list_sample_length,
                   list_coverage,
                   list_k,
                   list_penalty,
                   list_multiclass,
                   list_classweight):
        n_combinations *= len(values)

    # process combinations
    count = 0
    sample_length_prev = -1
    coverage_prev = -1

    # parameter combinations
    for sample_length, coverage,k in parameter_generator(list_sample_length,list_coverage,list_k):
        print(sample_length, coverage,k)

        # Fragments depend only on sample_length and coverage, so they are
        # rebuilt only when one of those changes (not for every k).
        if sample_length != sample_length_prev or coverage != coverage_prev:

            # fragment combination
            build_fragments(seq_file, taxid_file, output_dir, sample_length, coverage, seed)

            # update previous values
            sample_length_prev = sample_length
            coverage_prev = coverage

        # kmer from fragments
        X_train, X_test, y_train, y_test = encode_fragments2(output_dir, pattern,k,seed)

        # hyperparameter combinations
        for penalty,multiclass,classweight in hyperparameter_generator_lr(list_penalty,list_multiclass,list_classweight):
            print(penalty,multiclass,classweight)

            # train and score model
            score = run_lr_classification_recall(X_train, X_test, y_train, y_test, penalty,multiclass,classweight, seed)
            count += 1

            # output results to file (a single data row, passed as rows rather
            # than through the header slot)
            row = [experiment, 'multiclass', 'Logistic Regression (sklearn)', X_train.shape, sample_length, coverage, k, penalty, multiclass, classweight, score, score_type]
            append_results_to_file(grid_search_file, rows=[row])

        # display progress; guard against an empty hyperparameter list, which
        # would otherwise divide by zero here
        percent_complete = count / n_combinations * 100 if n_combinations else 100.0
        print('Percent complete: {}'.format(percent_complete))
    

### Run 7.01 - l1 penalty with saga solver
The default solver cannot be used with the l1 penalty, so this run uses the `saga` solver instead.

In [56]:
# %%time
# # 40 min

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/gridsearch-2000/lr-multi.{}.csv'.format(date_time)
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','penalty','multi_class', 'class_weight', 'score','score type']
# experiment = '7.01'
# score_type = 'species_recall'

# # combinations to try
# list_sample_length = [100,200,400]
# list_coverage = [1,10,100,200,400]
# list_k = [1,2,4,6,8,10,12]
# list_penalty=['l1']
# list_multiclass = ['auto']
# list_classweight= [None]


# grid_search_multiclass_lr(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_penalty,
#                           list_multiclass,
#                           list_classweight,
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment,
#                           score_type)


100 1 1
(85, 400)
l1 auto None
Percent complete: 0.9523809523809524
100 1 2
(85, 793)
l1 auto None
Percent complete: 1.9047619047619049
100 1 4
(85, 1722)
l1 auto None
Percent complete: 2.857142857142857
100 1 6
(85, 1338)
l1 auto None
Percent complete: 3.8095238095238098
100 1 8
(85, 1018)
l1 auto None
Percent complete: 4.761904761904762
100 1 10
(85, 850)
l1 auto None
Percent complete: 5.714285714285714
100 1 12
(85, 680)
l1 auto None




Percent complete: 6.666666666666667
100 10 1
(816, 400)
l1 auto None




Percent complete: 7.6190476190476195
100 10 2
(816, 800)
l1 auto None




Percent complete: 8.571428571428571
100 10 4
(816, 5689)
l1 auto None




Percent complete: 9.523809523809524
100 10 6
(816, 10714)
l1 auto None




Percent complete: 10.476190476190476
100 10 8
(816, 9328)
l1 auto None
Percent complete: 11.428571428571429
100 10 10
(816, 7885)
l1 auto None
Percent complete: 12.380952380952381
100 10 12
(816, 6317)
l1 auto None
Percent complete: 13.333333333333334
100 100 1
(8141, 400)
l1 auto None
Percent complete: 14.285714285714285
100 100 2
(8141, 800)
l1 auto None




Percent complete: 15.238095238095239
100 100 4
(8141, 6399)
l1 auto None




Percent complete: 16.19047619047619
100 100 6
(8141, 38665)
l1 auto None




Percent complete: 17.142857142857142
100 100 8
(8141, 53604)
l1 auto None




Percent complete: 18.095238095238095
100 100 10
(8141, 48759)
l1 auto None




Percent complete: 19.047619047619047
100 100 12
(8141, 39395)
l1 auto None




Percent complete: 20.0
100 200 1
(16282, 400)
l1 auto None
Percent complete: 20.952380952380953
100 200 2
(16282, 800)
l1 auto None




Percent complete: 21.904761904761905
100 200 4
(16282, 6400)
l1 auto None




Percent complete: 22.857142857142858
100 200 6
(16282, 43939)
l1 auto None




Percent complete: 23.809523809523807
100 200 8
(16282, 68550)
l1 auto None




Percent complete: 24.761904761904763
100 200 10
(16282, 63875)
l1 auto None




Percent complete: 25.71428571428571
100 200 12
(16282, 51766)
l1 auto None




Percent complete: 26.666666666666668
100 400 1
(32564, 400)
l1 auto None
Percent complete: 27.61904761904762
100 400 2
(32564, 800)
l1 auto None




Percent complete: 28.57142857142857
100 400 4
(32564, 6400)
l1 auto None




Percent complete: 29.523809523809526
100 400 6
(32564, 46691)
l1 auto None




Percent complete: 30.476190476190478
100 400 8
(32564, 77733)
l1 auto None




Percent complete: 31.428571428571427
100 400 10
(32564, 73479)
l1 auto None




Percent complete: 32.38095238095238
100 400 12
(32564, 59669)
l1 auto None




Percent complete: 33.33333333333333
200 1 1
(44, 800)
l1 auto None
Percent complete: 34.285714285714285
200 1 2
(44, 1444)
l1 auto None
Percent complete: 35.23809523809524
200 1 4
(44, 1910)
l1 auto None
Percent complete: 36.19047619047619
200 1 6
(44, 1429)
l1 auto None
Percent complete: 37.142857142857146
200 1 8
(44, 1096)
l1 auto None
Percent complete: 38.095238095238095
200 1 10
(44, 878)
l1 auto None
Percent complete: 39.04761904761905
200 1 12
(44, 703)
l1 auto None
Percent complete: 40.0
200 10 1
(409, 800)
l1 auto None




Percent complete: 40.95238095238095
200 10 2
(409, 1600)
l1 auto None




Percent complete: 41.904761904761905
200 10 4
(409, 9082)
l1 auto None




Percent complete: 42.857142857142854
200 10 6
(409, 11985)
l1 auto None
Percent complete: 43.80952380952381
200 10 8
(409, 9852)
l1 auto None
Percent complete: 44.761904761904766
200 10 10




(409, 7945)
l1 auto None
Percent complete: 45.714285714285715
200 10 12
(409, 6361)
l1 auto None
Percent complete: 46.666666666666664
200 100 1
(4071, 800)
l1 auto None




Percent complete: 47.61904761904761
200 100 2
(4071, 1600)
l1 auto None




Percent complete: 48.57142857142857
200 100 4
(4071, 12772)
l1 auto None




Percent complete: 49.523809523809526
200 100 6
(4071, 60742)
l1 auto None




Percent complete: 50.476190476190474
200 100 8
(4071, 71903)
l1 auto None




Percent complete: 51.42857142857142
200 100 10
(4071, 60817)
l1 auto None




Percent complete: 52.38095238095239
200 100 12
(4071, 48991)
l1 auto None




Percent complete: 53.333333333333336
200 200 1
(8141, 800)
l1 auto None




Percent complete: 54.285714285714285
200 200 2
(8141, 1600)
l1 auto None




Percent complete: 55.23809523809524
200 200 4
(8141, 12799)
l1 auto None




Percent complete: 56.19047619047619
200 200 6
(8141, 77930)
l1 auto None




Percent complete: 57.14285714285714
200 200 8
(8141, 108026)
l1 auto None




Percent complete: 58.0952380952381
200 200 10
(8141, 94148)
l1 auto None




Percent complete: 59.04761904761905
200 200 12
(8141, 76066)
l1 auto None




Percent complete: 60.0
200 400 1
(16282, 800)
l1 auto None




Percent complete: 60.952380952380956
200 400 2
(16282, 1600)
l1 auto None




Percent complete: 61.904761904761905
200 400 4
(16282, 12800)
l1 auto None




Percent complete: 62.857142857142854
200 400 6
(16282, 88349)
l1 auto None




Percent complete: 63.8095238095238
200 400 8
(16282, 136646)
l1 auto None




Percent complete: 64.76190476190476
200 400 10
(16282, 121850)
l1 auto None




Percent complete: 65.71428571428571
200 400 12
(16282, 98704)
l1 auto None




Percent complete: 66.66666666666666
400 1 1
(23, 1592)
l1 auto None
Percent complete: 67.61904761904762
400 1 2
(23, 2327)
l1 auto None
Percent complete: 68.57142857142857
400 1 4
(23, 2146)
l1 auto None
Percent complete: 69.52380952380952
400 1 6
(23, 1511)
l1 auto None
Percent complete: 70.47619047619048
400 1 8
(23, 1150)
l1 auto None
Percent complete: 71.42857142857143
400 1 10
(23, 920)
l1 auto None
Percent complete: 72.38095238095238
400 1 12
(23, 759)
l1 auto None
Percent complete: 73.33333333333333
400 10 1
(205, 1600)
l1 auto None




Percent complete: 74.28571428571429
400 10 2
(205, 3199)
l1 auto None




Percent complete: 75.23809523809524
400 10 4
(205, 12558)
l1 auto None




Percent complete: 76.19047619047619
400 10 6
(205, 12586)
l1 auto None
Percent complete: 77.14285714285715
400 10 8
(205, 10003)
l1 auto None
Percent complete: 78.0952380952381
400 10 10
(205, 8033)
l1 auto None
Percent complete: 79.04761904761905
400 10 12




(205, 6631)
l1 auto None
Percent complete: 80.0
400 100 1




(2037, 1600)
l1 auto None




Percent complete: 80.95238095238095
400 100 2
(2037, 3200)
l1 auto None




Percent complete: 81.9047619047619
400 100 4
(2037, 25059)
l1 auto None




Percent complete: 82.85714285714286
400 100 6
(2037, 80853)
l1 auto None




Percent complete: 83.80952380952381
400 100 8
(2037, 81580)
l1 auto None




Percent complete: 84.76190476190476
400 100 10
(2037, 67552)
l1 auto None




Percent complete: 85.71428571428571
400 100 12
(2037, 55940)
l1 auto None




Percent complete: 86.66666666666667
400 200 1
(4071, 1600)
l1 auto None




Percent complete: 87.61904761904762
400 200 2
(4071, 3200)
l1 auto None




Percent complete: 88.57142857142857
400 200 4
(4071, 25514)
l1 auto None




Percent complete: 89.52380952380953
400 200 6
(4071, 117598)
l1 auto None




Percent complete: 90.47619047619048
400 200 8
(4071, 138154)
l1 auto None




Percent complete: 91.42857142857143
400 200 10
(4071, 116996)
l1 auto None




Percent complete: 92.38095238095238
400 200 12
(4071, 97172)
l1 auto None




Percent complete: 93.33333333333333
400 400 1
(8141, 1600)
l1 auto None




Percent complete: 94.28571428571428
400 400 2
(8141, 3200)
l1 auto None




Percent complete: 95.23809523809523
400 400 4
(8141, 25588)
l1 auto None




Percent complete: 96.19047619047619
400 400 6
(8141, 148288)
l1 auto None




Percent complete: 97.14285714285714
400 400 8
(8141, 200257)
l1 auto None




Percent complete: 98.09523809523809
400 400 10
(8141, 174176)
l1 auto None




Percent complete: 99.04761904761905
400 400 12
(8141, 145094)
l1 auto None
Percent complete: 100.0
CPU times: user 40min 34s, sys: 14 s, total: 40min 48s
Wall time: 41min 7s




### Run 7.02 - Balanced class weights
Set solver back to default.

In [60]:
# %%time
# 3 min

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/gridsearch-2000/lr-multi.{}.csv'.format(date_time)
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','penalty','multi_class', 'class_weight', 'score','score type']
# experiment = '7.02'
# score_type = 'species_recall'

# # combinations to try
# list_sample_length = [100,200,400]
# list_coverage = [1,10,100,200,400]
# list_k = [1,2,4,6,8,10,12]
# list_penalty=['l2']
# list_multiclass = ['auto']
# list_classweight= ['balanced']


# grid_search_multiclass_lr(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_penalty,
#                           list_multiclass,
#                           list_classweight,
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment,
#                           score_type)


100 1 1
(85, 400)
l2 auto balanced
Percent complete: 0.9523809523809524
100 1 2
(85, 793)
l2 auto balanced
Percent complete: 1.9047619047619049
100 1 4
(85, 1722)
l2 auto balanced
Percent complete: 2.857142857142857
100 1 6
(85, 1338)
l2 auto balanced
Percent complete: 3.8095238095238098
100 1 8
(85, 1018)
l2 auto balanced
Percent complete: 4.761904761904762
100 1 10
(85, 850)
l2 auto balanced
Percent complete: 5.714285714285714
100 1 12
(85, 680)
l2 auto balanced
Percent complete: 6.666666666666667
100 10 1
(816, 400)
l2 auto balanced
Percent complete: 7.6190476190476195
100 10 2
(816, 800)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 8.571428571428571
100 10 4
(816, 5689)
l2 auto balanced
Percent complete: 9.523809523809524
100 10 6
(816, 10714)
l2 auto balanced
Percent complete: 10.476190476190476
100 10 8
(816, 9328)
l2 auto balanced
Percent complete: 11.428571428571429
100 10 10
(816, 7885)
l2 auto balanced
Percent complete: 12.380952380952381
100 10 12
(816, 6317)
l2 auto balanced
Percent complete: 13.333333333333334
100 100 1
(8141, 400)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 14.285714285714285
100 100 2
(8141, 800)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 15.238095238095239
100 100 4
(8141, 6399)
l2 auto balanced
Percent complete: 16.19047619047619
100 100 6
(8141, 38665)
l2 auto balanced
Percent complete: 17.142857142857142
100 100 8
(8141, 53604)
l2 auto balanced
Percent complete: 18.095238095238095
100 100 10
(8141, 48759)
l2 auto balanced
Percent complete: 19.047619047619047
100 100 12
(8141, 39395)
l2 auto balanced
Percent complete: 20.0
100 200 1
(16282, 400)
l2 auto balanced
Percent complete: 20.952380952380953
100 200 2
(16282, 800)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 21.904761904761905
100 200 4
(16282, 6400)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 22.857142857142858
100 200 6
(16282, 43939)
l2 auto balanced
Percent complete: 23.809523809523807
100 200 8
(16282, 68550)
l2 auto balanced
Percent complete: 24.761904761904763
100 200 10
(16282, 63875)
l2 auto balanced
Percent complete: 25.71428571428571
100 200 12
(16282, 51766)
l2 auto balanced
Percent complete: 26.666666666666668
100 400 1
(32564, 400)
l2 auto balanced
Percent complete: 27.61904761904762
100 400 2
(32564, 800)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 28.57142857142857
100 400 4
(32564, 6400)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 29.523809523809526
100 400 6
(32564, 46691)
l2 auto balanced
Percent complete: 30.476190476190478
100 400 8
(32564, 77733)
l2 auto balanced
Percent complete: 31.428571428571427
100 400 10
(32564, 73479)
l2 auto balanced
Percent complete: 32.38095238095238
100 400 12
(32564, 59669)
l2 auto balanced
Percent complete: 33.33333333333333
200 1 1
(44, 800)
l2 auto balanced
Percent complete: 34.285714285714285
200 1 2
(44, 1444)
l2 auto balanced
Percent complete: 35.23809523809524
200 1 4
(44, 1910)
l2 auto balanced
Percent complete: 36.19047619047619
200 1 6
(44, 1429)
l2 auto balanced
Percent complete: 37.142857142857146
200 1 8
(44, 1096)
l2 auto balanced
Percent complete: 38.095238095238095
200 1 10
(44, 878)
l2 auto balanced
Percent complete: 39.04761904761905
200 1 12
(44, 703)
l2 auto balanced
Percent complete: 40.0
200 10 1
(409, 800)
l2 auto balanced
Percent complete: 40.95238095238095
200 10 2


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


(409, 1600)
l2 auto balanced
Percent complete: 41.904761904761905
200 10 4
(409, 9082)
l2 auto balanced
Percent complete: 42.857142857142854
200 10 6
(409, 11985)
l2 auto balanced
Percent complete: 43.80952380952381
200 10 8
(409, 9852)
l2 auto balanced
Percent complete: 44.761904761904766
200 10 10
(409, 7945)
l2 auto balanced
Percent complete: 45.714285714285715
200 10 12
(409, 6361)
l2 auto balanced
Percent complete: 46.666666666666664
200 100 1
(4071, 800)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 47.61904761904761
200 100 2
(4071, 1600)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 48.57142857142857
200 100 4
(4071, 12772)
l2 auto balanced
Percent complete: 49.523809523809526
200 100 6
(4071, 60742)
l2 auto balanced
Percent complete: 50.476190476190474
200 100 8
(4071, 71903)
l2 auto balanced
Percent complete: 51.42857142857142
200 100 10
(4071, 60817)
l2 auto balanced
Percent complete: 52.38095238095239
200 100 12
(4071, 48991)
l2 auto balanced
Percent complete: 53.333333333333336
200 200 1
(8141, 800)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 54.285714285714285
200 200 2
(8141, 1600)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 55.23809523809524
200 200 4
(8141, 12799)
l2 auto balanced
Percent complete: 56.19047619047619
200 200 6
(8141, 77930)
l2 auto balanced
Percent complete: 57.14285714285714
200 200 8
(8141, 108026)
l2 auto balanced
Percent complete: 58.0952380952381
200 200 10
(8141, 94148)
l2 auto balanced
Percent complete: 59.04761904761905
200 200 12
(8141, 76066)
l2 auto balanced
Percent complete: 60.0
200 400 1
(16282, 800)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 60.952380952380956
200 400 2
(16282, 1600)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 61.904761904761905
200 400 4
(16282, 12800)
l2 auto balanced
Percent complete: 62.857142857142854
200 400 6
(16282, 88349)
l2 auto balanced
Percent complete: 63.8095238095238
200 400 8
(16282, 136646)
l2 auto balanced
Percent complete: 64.76190476190476
200 400 10
(16282, 121850)
l2 auto balanced
Percent complete: 65.71428571428571
200 400 12
(16282, 98704)
l2 auto balanced
Percent complete: 66.66666666666666
400 1 1
(23, 1592)
l2 auto balanced
Percent complete: 67.61904761904762
400 1 2
(23, 2327)
l2 auto balanced
Percent complete: 68.57142857142857
400 1 4
(23, 2146)
l2 auto balanced
Percent complete: 69.52380952380952
400 1 6
(23, 1511)
l2 auto balanced
Percent complete: 70.47619047619048
400 1 8
(23, 1150)
l2 auto balanced
Percent complete: 71.42857142857143
400 1 10
(23, 920)
l2 auto balanced
Percent complete: 72.38095238095238
400 1 12
(23, 759)
l2 auto balanced
Percent complete: 73.33333333333333
400 10 1
(205, 1600)
l2 auto balanced
Percent complete: 74.285714

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 80.95238095238095
400 100 2
(2037, 3200)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 81.9047619047619
400 100 4
(2037, 25059)
l2 auto balanced
Percent complete: 82.85714285714286
400 100 6
(2037, 80853)
l2 auto balanced
Percent complete: 83.80952380952381
400 100 8
(2037, 81580)
l2 auto balanced
Percent complete: 84.76190476190476
400 100 10
(2037, 67552)
l2 auto balanced
Percent complete: 85.71428571428571
400 100 12
(2037, 55940)
l2 auto balanced
Percent complete: 86.66666666666667
400 200 1
(4071, 1600)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 87.61904761904762
400 200 2
(4071, 3200)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 88.57142857142857
400 200 4
(4071, 25514)
l2 auto balanced
Percent complete: 89.52380952380953
400 200 6
(4071, 117598)
l2 auto balanced
Percent complete: 90.47619047619048
400 200 8
(4071, 138154)
l2 auto balanced
Percent complete: 91.42857142857143
400 200 10
(4071, 116996)
l2 auto balanced
Percent complete: 92.38095238095238
400 200 12
(4071, 97172)
l2 auto balanced
Percent complete: 93.33333333333333
400 400 1
(8141, 1600)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 94.28571428571428
400 400 2
(8141, 3200)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 95.23809523809523
400 400 4
(8141, 25588)
l2 auto balanced
Percent complete: 96.19047619047619
400 400 6
(8141, 148288)
l2 auto balanced
Percent complete: 97.14285714285714
400 400 8
(8141, 200257)
l2 auto balanced
Percent complete: 98.09523809523809
400 400 10
(8141, 174176)
l2 auto balanced
Percent complete: 99.04761904761905
400 400 12
(8141, 145094)
l2 auto balanced
Percent complete: 100.0
CPU times: user 8min 17s, sys: 9.39 s, total: 8min 26s
Wall time: 3min 27s


### Run 7.03 - No penalty, no class weights
Changed solver back to default.

In [59]:
# %%time
# # 5 min

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/gridsearch-2000/lr-multi.{}.csv'.format(date_time)
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','penalty','multi_class', 'class_weight', 'score','score type']
# experiment = '7.03'
# score_type = 'species_recall'

# # combinations to try
# list_sample_length = [100,200,400]
# list_coverage = [1,10,100,200,400]
# list_k = [1,2,4,6,8,10,12]
# list_penalty=['none']
# list_multiclass = ['auto']
# list_classweight= [None]


# grid_search_multiclass_lr(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_penalty,
#                           list_multiclass,
#                           list_classweight,
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment,
#                           score_type)


100 1 1
(85, 400)
none auto None
Percent complete: 0.9523809523809524
100 1 2
(85, 793)
none auto None
Percent complete: 1.9047619047619049
100 1 4
(85, 1722)
none auto None
Percent complete: 2.857142857142857
100 1 6
(85, 1338)
none auto None
Percent complete: 3.8095238095238098
100 1 8
(85, 1018)
none auto None
Percent complete: 4.761904761904762
100 1 10
(85, 850)
none auto None
Percent complete: 5.714285714285714
100 1 12
(85, 680)
none auto None
Percent complete: 6.666666666666667
100 10 1
(816, 400)
none auto None
Percent complete: 7.6190476190476195
100 10 2
(816, 800)
none auto None
Percent complete: 8.571428571428571
100 10 4
(816, 5689)
none auto None
Percent complete: 9.523809523809524
100 10 6
(816, 10714)
none auto None
Percent complete: 10.476190476190476
100 10 8
(816, 9328)
none auto None
Percent complete: 11.428571428571429
100 10 10
(816, 7885)
none auto None
Percent complete: 12.380952380952381
100 10 12
(816, 6317)
none auto None
Percent complete: 13.333333333333334

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 14.285714285714285
100 100 2
(8141, 800)
none auto None


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 15.238095238095239
100 100 4
(8141, 6399)
none auto None
Percent complete: 16.19047619047619
100 100 6
(8141, 38665)
none auto None
Percent complete: 17.142857142857142
100 100 8
(8141, 53604)
none auto None
Percent complete: 18.095238095238095
100 100 10
(8141, 48759)
none auto None
Percent complete: 19.047619047619047
100 100 12
(8141, 39395)
none auto None
Percent complete: 20.0
100 200 1
(16282, 400)
none auto None
Percent complete: 20.952380952380953
100 200 2
(16282, 800)
none auto None


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 21.904761904761905
100 200 4
(16282, 6400)
none auto None
Percent complete: 22.857142857142858
100 200 6
(16282, 43939)
none auto None
Percent complete: 23.809523809523807
100 200 8
(16282, 68550)
none auto None
Percent complete: 24.761904761904763
100 200 10
(16282, 63875)
none auto None
Percent complete: 25.71428571428571
100 200 12
(16282, 51766)
none auto None
Percent complete: 26.666666666666668
100 400 1
(32564, 400)
none auto None
Percent complete: 27.61904761904762
100 400 2
(32564, 800)
none auto None


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 28.57142857142857
100 400 4
(32564, 6400)
none auto None
Percent complete: 29.523809523809526
100 400 6
(32564, 46691)
none auto None
Percent complete: 30.476190476190478
100 400 8
(32564, 77733)
none auto None
Percent complete: 31.428571428571427
100 400 10
(32564, 73479)
none auto None
Percent complete: 32.38095238095238
100 400 12
(32564, 59669)
none auto None
Percent complete: 33.33333333333333
200 1 1
(44, 800)
none auto None
Percent complete: 34.285714285714285
200 1 2
(44, 1444)
none auto None
Percent complete: 35.23809523809524
200 1 4
(44, 1910)
none auto None
Percent complete: 36.19047619047619
200 1 6
(44, 1429)
none auto None
Percent complete: 37.142857142857146
200 1 8
(44, 1096)
none auto None
Percent complete: 38.095238095238095
200 1 10
(44, 878)
none auto None
Percent complete: 39.04761904761905
200 1 12
(44, 703)
none auto None
Percent complete: 40.0
200 10 1
(409, 800)
none auto None
Percent complete: 40.95238095238095
200 10 2
(409, 1600)
none auto

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 54.285714285714285
200 200 2
(8141, 1600)
none auto None
Percent complete: 55.23809523809524
200 200 4
(8141, 12799)
none auto None
Percent complete: 56.19047619047619
200 200 6
(8141, 77930)
none auto None
Percent complete: 57.14285714285714
200 200 8
(8141, 108026)
none auto None
Percent complete: 58.0952380952381
200 200 10
(8141, 94148)
none auto None
Percent complete: 59.04761904761905
200 200 12
(8141, 76066)
none auto None
Percent complete: 60.0
200 400 1
(16282, 800)
none auto None


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 60.952380952380956
200 400 2
(16282, 1600)
none auto None
Percent complete: 61.904761904761905
200 400 4
(16282, 12800)
none auto None
Percent complete: 62.857142857142854
200 400 6
(16282, 88349)
none auto None
Percent complete: 63.8095238095238
200 400 8
(16282, 136646)
none auto None
Percent complete: 64.76190476190476
200 400 10
(16282, 121850)
none auto None
Percent complete: 65.71428571428571
200 400 12
(16282, 98704)
none auto None
Percent complete: 66.66666666666666
400 1 1
(23, 1592)
none auto None
Percent complete: 67.61904761904762
400 1 2
(23, 2327)
none auto None
Percent complete: 68.57142857142857
400 1 4
(23, 2146)
none auto None
Percent complete: 69.52380952380952
400 1 6
(23, 1511)
none auto None
Percent complete: 70.47619047619048
400 1 8
(23, 1150)
none auto None
Percent complete: 71.42857142857143
400 1 10
(23, 920)
none auto None
Percent complete: 72.38095238095238
400 1 12
(23, 759)
none auto None
Percent complete: 73.33333333333333
400 10 1
(205

### Run 7.04 - L2 penalty, ovr

In [None]:
%%time

# Run 7.04: multiclass logistic regression grid search, L2 penalty, one-vs-rest.

# experiment bookkeeping
experiment = '7.04'
score_type = 'species_recall'
seed = 42

# input data and the directory/pattern used for the sampled fragment files
seq_file = 'data/train_small-db_toy-2000.fasta'
taxid_file = 'data/train_small-db_toy-2000.taxid'
output_dir = 'data/sampling/sampling-toy-2000'
pattern = 'fragments*.npy'

# results file name carries the launch timestamp (second resolution)
date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
grid_search_file = 'data/gridsearch-2000/lr-multi.' + date_time + '.csv'

# column names handed to the grid search
# (presumably written as the CSV header -- confirm in grid_search_multiclass_lr)
fields = [
    'experiment',
    'category',
    'classifier',
    'training shape',
    'sample_length',
    'coverage',
    'k',
    'penalty',
    'multi_class',
    'class_weight',
    'score',
    'score type',
]

# search space: 3 sample lengths x 5 coverages x 7 k values = 105 combinations,
# each with the single penalty / multi_class / class_weight setting below
list_sample_length = [100, 200, 400]
list_coverage = [1, 10, 100, 200, 400]
list_k = [1, 2, 4, 6, 8, 10, 12]
list_penalty = ['l2']
list_multiclass = ['ovr']
list_classweight = [None]

# arguments are positional; keep this order in sync with the function signature
grid_search_multiclass_lr(
    seq_file, taxid_file, output_dir, pattern,
    list_sample_length, list_coverage, list_k,
    list_penalty, list_multiclass, list_classweight,
    seed, grid_search_file, fields, experiment, score_type,
)


100 1 1
(85, 400)
l2 ovr None
Percent complete: 0.9523809523809524
100 1 2
(85, 793)
l2 ovr None
Percent complete: 1.9047619047619049
100 1 4
(85, 1722)
l2 ovr None
Percent complete: 2.857142857142857
100 1 6
(85, 1338)
l2 ovr None
Percent complete: 3.8095238095238098
100 1 8
(85, 1018)
l2 ovr None
Percent complete: 4.761904761904762
100 1 10
(85, 850)
l2 ovr None
Percent complete: 5.714285714285714
100 1 12
(85, 680)
l2 ovr None
Percent complete: 6.666666666666667
100 10 1
(816, 400)
l2 ovr None
Percent complete: 7.6190476190476195
100 10 2
(816, 800)
l2 ovr None
Percent complete: 8.571428571428571
100 10 4
(816, 5689)
l2 ovr None
Percent complete: 9.523809523809524
100 10 6
(816, 10714)
l2 ovr None
Percent complete: 10.476190476190476
100 10 8
(816, 9328)
l2 ovr None
Percent complete: 11.428571428571429
100 10 10
(816, 7885)
l2 ovr None
Percent complete: 12.380952380952381
100 10 12
(816, 6317)
l2 ovr None
Percent complete: 13.333333333333334
100 100 1
(8141, 400)
l2 ovr None
Percent

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Percent complete: 47.61904761904761
200 100 2
(4071, 1600)
l2 ovr None
Percent complete: 48.57142857142857
200 100 4
(4071, 12772)
l2 ovr None
Percent complete: 49.523809523809526
200 100 6
(4071, 60742)
l2 ovr None
Percent complete: 50.476190476190474
200 100 8
(4071, 71903)
l2 ovr None
Percent complete: 51.42857142857142
200 100 10
(4071, 60817)
l2 ovr None
Percent complete: 52.38095238095239
200 100 12
(4071, 48991)
l2 ovr None
Percent complete: 53.333333333333336
200 200 1
(8141, 800)
l2 ovr None


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Percent complete: 54.285714285714285
200 200 2
(8141, 1600)
l2 ovr None


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Percent complete: 55.23809523809524
200 200 4
(8141, 12799)
l2 ovr None
Percent complete: 56.19047619047619
200 200 6
(8141, 77930)
l2 ovr None
Percent complete: 57.14285714285714
200 200 8
(8141, 108026)
l2 ovr None
Percent complete: 58.0952380952381
200 200 10
(8141, 94148)
l2 ovr None
Percent complete: 59.04761904761905
200 200 12
(8141, 76066)
l2 ovr None
Percent complete: 60.0
200 400 1
(16282, 800)
l2 ovr None


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 60.952380952380956
200 400 2
(16282, 1600)
l2 ovr None


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Percent complete: 61.904761904761905
200 400 4
(16282, 12800)
l2 ovr None
Percent complete: 62.857142857142854
200 400 6
(16282, 88349)
l2 ovr None
Percent complete: 63.8095238095238
200 400 8
(16282, 136646)
l2 ovr None
Percent complete: 64.76190476190476
200 400 10
(16282, 121850)
l2 ovr None
Percent complete: 65.71428571428571
200 400 12
(16282, 98704)
l2 ovr None
Percent complete: 66.66666666666666
400 1 1
(23, 1592)
l2 ovr None
Percent complete: 67.61904761904762
400 1 2
(23, 2327)
l2 ovr None
Percent complete: 68.57142857142857
400 1 4
(23, 2146)
l2 ovr None
Percent complete: 69.52380952380952
400 1 6
(23, 1511)
l2 ovr None
Percent complete: 70.47619047619048
400 1 8
(23, 1150)
l2 ovr None
Percent complete: 71.42857142857143
400 1 10
(23, 920)
l2 ovr None
Percent complete: 72.38095238095238
400 1 12
(23, 759)
l2 ovr None
Percent complete: 73.33333333333333
400 10 1
(205, 1600)
l2 ovr None
Percent complete: 74.28571428571429
400 10 2
(205, 3199)
l2 ovr None
Percent complete: 75.23

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Percent complete: 80.95238095238095
400 100 2
(2037, 3200)
l2 ovr None
Percent complete: 81.9047619047619
400 100 4
(2037, 25059)
l2 ovr None
Percent complete: 82.85714285714286
400 100 6
(2037, 80853)
l2 ovr None
Percent complete: 83.80952380952381
400 100 8
(2037, 81580)
l2 ovr None
Percent complete: 84.76190476190476
400 100 10
(2037, 67552)
l2 ovr None
Percent complete: 85.71428571428571
400 100 12
(2037, 55940)
l2 ovr None
Percent complete: 86.66666666666667
400 200 1
(4071, 1600)
l2 ovr None


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Percent complete: 87.61904761904762
400 200 2
(4071, 3200)
l2 ovr None
Percent complete: 88.57142857142857
400 200 4
(4071, 25514)
l2 ovr None
Percent complete: 89.52380952380953
400 200 6
(4071, 117598)
l2 ovr None
Percent complete: 90.47619047619048
400 200 8
(4071, 138154)
l2 ovr None
