### Imports

In [1]:
import shutil
import csv
import datetime
import numpy as np
import pandas as pd
from sklearn.metrics import recall_score
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from packages.metagenomics import sampling2, encoding2
from sklearn.linear_model import LogisticRegression

from packages.LogisticRegression.MulticlassLogisticRegression import MulticlassLogisticRegression,MulticlassLogisticRegression2


### Ideas for improvements
- try converting back to sparse matrix after adding augmented column?

In [2]:
def append_results_to_file(filename, fields=None, rows=None):
    """
    Append CSV data to ``filename``, creating the file if it does not exist.

    Parameters
    ----------
    filename : path of the CSV file to append to.
    fields : optional sequence written as one row (typically the header; callers
        in this notebook also pass a single result row here positionally).
    rows : optional iterable of sequences, each written as its own row.

    A falsy ``fields`` or ``rows`` (None or empty) is skipped.
    """
    # newline='' hands line-ending control to the csv module; without it,
    # platforms that translate newlines end up with a blank line after each row.
    with open(filename, 'a', newline='') as f:

        writer = csv.writer(f)

        if fields:
            writer.writerow(fields)

        if rows:
            writer.writerows(rows)

In [3]:
def run_mlr_classification_recall(X_train, X_test, y_train, y_test, eta, epsilon):
    """
    Train a MulticlassLogisticRegression2 model and score it on the test split.

    The model is built with the given ``eta`` and ``epsilon``, fitted on
    (X_train, y_train), and used to predict X_test. Returns the weighted-average
    recall of those predictions against y_test (recorded in this notebook as
    species level recall).
    """
    classifier = MulticlassLogisticRegression2(eta=eta, epsilon=epsilon)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return recall_score(y_test, predictions, average='weighted')
    

In [4]:
def build_fragments(seq_file, taxid_file, output_dir, sample_length, coverage, seed):
    """
    Regenerate the fragment data in ``output_dir`` from scratch.

    Any existing ``output_dir`` tree is removed first so stale fragments from a
    previous parameter combination cannot be picked up; a missing directory is
    reported and otherwise ignored. All arguments are then forwarded unchanged
    to ``sampling2.generate_fragment_data``.
    """
    # start from a clean slate
    try:
        shutil.rmtree(output_dir)
    except FileNotFoundError:
        # nothing to clean up -- presumably the sampler creates the directory
        print('Existing directory was not found. Process will generate a directory.')

    generator_args = (seq_file, taxid_file, output_dir, sample_length, coverage, seed)
    sampling2.generate_fragment_data(*generator_args)

In [5]:
def encode_fragments(output_dir, pattern, k, seed):
    """
    Read fragments, k-mer encode them and split them into dense train/test sets.

    Parameters
    ----------
    output_dir, pattern : passed to ``sampling2.read_fragments`` to locate the
        fragment files.
    k : k-mer size passed to ``encoding2.encode_fragment_dataset``.
    seed : seed for the train/test split, so the split is reproducible.

    Returns
    -------
    X_train, X_test, y_train, y_test : a 67/33 split in which every class
        appears in both the training and the test set. X is converted to a
        dense array via ``toarray()``; y is label-encoded.

    Raises
    ------
    ValueError : if there are no samples, or if any class has fewer than two
        samples (such a class can never be present in both splits).
    """

    # encode data
    fragments = sampling2.read_fragments(output_dir, pattern)
    X_enc, y = encoding2.encode_fragment_dataset(fragments, k)
    le = preprocessing.LabelEncoder()
    y_enc = le.fit_transform(y)

    # calculate number of classes and make sure the retry loop below can finish
    _, class_counts = np.unique(y_enc, return_counts=True)
    n_classes = len(class_counts)
    if n_classes == 0:
        raise ValueError('No fragments were found to encode.')
    if class_counts.min() < 2:
        raise ValueError('Every class needs at least 2 fragments to appear in both the train and test splits.')

    # densify once instead of on every retry
    X_dense = X_enc.toarray()

    # A seeded generator object (rather than a fixed integer) keeps the split
    # reproducible while still producing a different split on each retry.
    rng = np.random.RandomState(seed)

    n_classes_train = 0
    n_classes_test = 0
    while n_classes_train < n_classes or n_classes_test < n_classes:

        # split data into test and training; retry until both contain all classes
        X_train, X_test, y_train, y_test = train_test_split(X_dense, y_enc, test_size=0.33, random_state=rng)
        n_classes_train = len(np.unique(y_train))
        n_classes_test = len(np.unique(y_test))

    print(X_enc.shape)

    return X_train, X_test, y_train, y_test

In [6]:
def calc_number_combinations(list_sample_length,list_coverage,list_k,list_eta,list_epsilon):
    """
    Return the size of the full grid: the product of the five list lengths.
    """
    total = 1
    for values in (list_sample_length, list_coverage, list_k, list_eta, list_epsilon):
        total *= len(values)
    return total

In [7]:
def grid_search_multiclass_mlr(seq_file,
                              taxid_file,
                              output_dir,
                              pattern,
                              list_sample_length,
                              list_coverage,
                              list_k,
                              list_eta,
                              list_epsilon,
                              seed,
                              grid_search_file,
                              fields,
                              experiment,
                              score_type):
    """
    Grid search the multiclass logistic regression over sampling, k-mer and
    learning hyperparameters, appending one CSV row per trained model.

    For each (sample_length, coverage) pair the fragments are rebuilt; for each
    k they are re-encoded and split; for each (eta, epsilon) pair a model is
    trained and scored. ``fields`` is written to ``grid_search_file`` as the
    header row before any results. Progress is printed after each k.
    """
    # header row goes in first
    append_results_to_file(grid_search_file, fields)

    # size of the full grid, used only for progress reporting
    total_runs = calc_number_combinations(list_sample_length,list_coverage,list_k,list_eta,list_epsilon)

    completed = 0
    for sample_length in list_sample_length:
        for coverage in list_coverage:

            # new sampling settings -> regenerate the fragments on disk
            build_fragments(seq_file, taxid_file, output_dir, sample_length, coverage, seed)

            for k in list_k:

                # k-mer encode the current fragments and split them
                X_train, X_test, y_train, y_test = encode_fragments(output_dir, pattern, k, seed)

                for eta in list_eta:
                    for epsilon in list_epsilon:

                        # logistic regression hyperparameter combination
                        score = run_mlr_classification_recall(X_train, X_test, y_train, y_test, eta, epsilon)
                        completed += 1

                        # record the result
                        result_row = [
                            experiment,
                            'multiclass',
                            'Logistic Regression',
                            X_train.shape,
                            sample_length,
                            coverage,
                            k,
                            eta,
                            epsilon,
                            score,
                            score_type,
                        ]
                        append_results_to_file(grid_search_file, result_row)

                # display progress
                percent_done = completed / total_runs * 100
                print('Percent complete: {}'.format(percent_done))
    

# Run Set 1 - MLR Toy
2000 lengths dataset

### Run 4.01
Stopped early due to runs taking a long time.

In [8]:
# %%time

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000-mlr'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/runs-2000/mlr-multi.{}.csv'.format(date_time)
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','eta', 'epsilon', 'score','score type']
# experiment = '4.01'
# score_type = 'species_recall'

# # combinations to try
# list_sample_length = [100]
# list_coverage = [0.2,1,10]
# list_k = [1,2,4,6,8,10,12]
# list_eta = [0.01]
# list_epsilon = [0.01]


# grid_search_multiclass_mlr(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_eta,
#                           list_epsilon,
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment,
#                           score_type)


### Run 4.02

In [9]:
# %%time

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000-mlr'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/runs-2000/mlr-multi.{}.csv'.format(date_time)
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','eta', 'epsilon', 'score','score type']
# experiment = '4.02'
# score_type = 'species_recall'

# # combinations to try
# list_sample_length = [200]
# list_coverage = [0.3,1,10]
# list_k = [1,2,4,6,8,10,12]
# list_eta = [0.01]
# list_epsilon = [0.01]


# grid_search_multiclass_mlr(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_eta,
#                           list_epsilon,
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment,
#                           score_type)


### Run 4.03

In [10]:
# %%time

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000-mlr'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/runs-2000/mlr-multi.{}.csv'.format(date_time)
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','eta', 'epsilon', 'score','score type']
# experiment = '4.03'
# score_type = 'species_recall'

# # combinations to try
# list_sample_length = [400]
# list_coverage = [0.5,1,10]
# list_k = [1,2,4,6,8,10,12]
# list_eta = [0.01]
# list_epsilon = [0.01]


# grid_search_multiclass_mlr(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_eta,
#                           list_epsilon,
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment,
#                           score_type)


### Run 4.04

In [11]:
# %%time

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000-mlr'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/runs-2000/mlr-multi.{}.csv'.format(date_time)
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','eta', 'epsilon', 'score','score type']
# experiment = '4.04'
# score_type = 'species_recall'

# # combinations to try
# list_sample_length = [100]
# list_coverage = [100]
# list_k = [1,2,4,6,8,10,12]
# list_eta = [0.01]
# list_epsilon = [0.01]


# grid_search_multiclass_mlr(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_eta,
#                           list_epsilon,
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment,
#                           score_type)


### Run 4.05

In [12]:
# %%time
##40 min

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000-mlr'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/runs-2000/mlr-multi.{}.csv'.format(date_time)
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','eta', 'epsilon', 'score','score type']
# experiment = '4.05'
# score_type = 'species_recall'

# # combinations to try
# list_sample_length = [100]
# list_coverage = [200]
# list_k = [1,2,4,6,8,10,12]
# list_eta = [0.01]
# list_epsilon = [0.01]


# grid_search_multiclass_mlr(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_eta,
#                           list_epsilon,
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment,
#                           score_type)


# Run Set 2 - sklearn with l1 penalty
Compare with sklearn implementation of MLR to see if performance is similar.

In [13]:
# def calc_number_combinations2(list_sample_length,list_coverage,list_k,list_multiclass,list_classweight):
#     n = len(list_sample_length) * len(list_coverage) * len(list_k) *len(list_multiclass) * len(list_classweight)
#     return n

In [14]:
def encode_fragments2(output_dir, pattern, k, seed):
    """
    Read fragments, k-mer encode them and split them into train/test sets.

    Does not convert sparse matrix to numpy matrix: the encoded X is handed to
    ``train_test_split`` as returned by ``encoding2.encode_fragment_dataset``.

    Parameters
    ----------
    output_dir, pattern : passed to ``sampling2.read_fragments`` to locate the
        fragment files.
    k : k-mer size passed to ``encoding2.encode_fragment_dataset``.
    seed : seed for the train/test split, so the split is reproducible.

    Returns
    -------
    X_train, X_test, y_train, y_test : a 67/33 split in which every class
        appears in both the training and the test set; y is label-encoded.

    Raises
    ------
    ValueError : if there are no samples, or if any class has fewer than two
        samples (such a class can never be present in both splits).
    """

    # encode data
    fragments = sampling2.read_fragments(output_dir, pattern)
    X_enc, y = encoding2.encode_fragment_dataset(fragments, k)
    le = preprocessing.LabelEncoder()
    y_enc = le.fit_transform(y)

    # calculate number of classes and make sure the retry loop below can finish
    _, class_counts = np.unique(y_enc, return_counts=True)
    n_classes = len(class_counts)
    if n_classes == 0:
        raise ValueError('No fragments were found to encode.')
    if class_counts.min() < 2:
        raise ValueError('Every class needs at least 2 fragments to appear in both the train and test splits.')

    # A seeded generator object (rather than a fixed integer) keeps the split
    # reproducible while still producing a different split on each retry.
    rng = np.random.RandomState(seed)

    n_classes_train = 0
    n_classes_test = 0
    while n_classes_train < n_classes or n_classes_test < n_classes:

        # split data into test and training; retry until both contain all classes
        X_train, X_test, y_train, y_test = train_test_split(X_enc, y_enc, test_size=0.33, random_state=rng)
        n_classes_train = len(np.unique(y_train))
        n_classes_test = len(np.unique(y_test))

    print(X_enc.shape)

    return X_train, X_test, y_train, y_test

In [15]:
# def run_lr_classification_recall(X_train, X_test, y_train, y_test, multiclass, classweight, seed ):
#     """
#     Score is species level recall. Uses sklearn version of logistic regression.
#     """
#     lr = LogisticRegression(random_state=seed, multi_class=multiclass, class_weight=classweight )
#     lr.fit(X_train,y_train)
#     y_pred = lr.predict(X_test)
#     score = recall_score(y_test, y_pred, average='weighted')
#     return score
    

In [16]:
# def grid_search_multiclass_lr(seq_file, 
#                               taxid_file, 
#                               output_dir, 
#                               pattern, 
#                               list_sample_length, 
#                               list_coverage, 
#                               list_k,
#                               list_multiclass,
#                               list_classweight,
#                               seed,
#                               grid_search_file,
#                               fields,
#                               experiment,
#                               score_type):
    
#     # set up grid search results file
#     append_results_to_file(grid_search_file, fields)
    
#     # calculate number of combinations
#     n_combinations = calc_number_combinations2(list_sample_length,list_coverage,list_k, list_multiclass, list_classweight)
    
#     # process combinations
#     count = 0
#     for sample_length in list_sample_length:
#         for coverage in list_coverage:
            
#             # fragment combination
#             build_fragments(seq_file, taxid_file, output_dir, sample_length, coverage, seed)
#             for k in list_k:
                
#                 # kmer combination
#                 X_train, X_test, y_train, y_test = encode_fragments2(output_dir, pattern,k,seed)
                
                
#                 for multiclass in list_multiclass:
#                     for classweight in list_classweight:

                        
#                         # random forest combination
#                         score = run_lr_classification_recall(X_train, X_test, y_train, y_test, multiclass, classweight, seed)
#                         count += 1

#                         # output results to file
#                         row = [experiment, 'multiclass', 'Logistic Regression (sklearn)', X_train.shape, sample_length, coverage, k, multiclass, classweight, score, score_type]
#                         append_results_to_file(grid_search_file, row)

#                 print('Percent complete: {}'.format(count / n_combinations * 100)) # display progress


### Run 5.01

In [17]:
# %%time

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/runs-2000/lr-multi.{}.csv'.format(date_time)
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','multiclass', 'class_weight', 'score','score type']
# experiment = '5.01'
# score_type = 'species_recall'

# # combinations to try
# list_sample_length = [100,200,400]
# list_coverage = [1,10,100,200,400]
# list_k = [1]
# list_multiclass = ['auto']
# list_classweight= [None]


# grid_search_multiclass_lr(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_multiclass,
#                           list_classweight,
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment,
#                           score_type)


### Run 5.02

In [18]:
# %%time

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/runs-2000/lr-multi.{}.csv'.format(date_time)
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','multiclass', 'class_weight', 'score','score type']
# experiment = '5.02'
# score_type = 'species_recall'

# # combinations to try
# list_sample_length = [100,200,400]
# list_coverage = [1,10,100,200,400]
# list_k = [2,4,6,8,10,12]
# list_multiclass = ['auto']
# list_classweight= [None]


# grid_search_multiclass_lr(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_multiclass,
#                           list_classweight,
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment,
#                           score_type)


# Run Set 3 - MLR with L2 penalty
- toy-2000 dataset
- See how performance changes with L2 penalty
- Also lowered the gradient descent breakpoint (iteration cutoff) from 100,000 to 100.

In [19]:
def calc_number_combinations(*args):
    """
    Return the product of the lengths of all sequences passed in.

    With no arguments the result is 1; any empty sequence makes it 0.
    """
    sizes = [len(values) for values in args]
    product = 1
    while sizes:
        product *= sizes.pop()
    return product

In [20]:
def parameter_generator(list_sample_length,list_coverage,list_k):
    """
    Yield every (sample_length, coverage, k) triple.

    Iteration is nested in that order, so k varies fastest and sample_length
    slowest.
    """
    yield from (
        (sample_length, coverage, kmer_size)
        for sample_length in list_sample_length
        for coverage in list_coverage
        for kmer_size in list_k
    )

In [21]:
def hyperparameter_generator(list_eta,list_epsilon, list_penalty, list_l2_lambda,list_max_iter):
    """
    Yield every (eta, epsilon, penalty, l2_lambda, max_iter) combination.

    Iteration is nested in argument order, so max_iter varies fastest and eta
    slowest.
    """
    yield from (
        (eta, epsilon, penalty, l2_lambda, max_iter)
        for eta in list_eta
        for epsilon in list_epsilon
        for penalty in list_penalty
        for l2_lambda in list_l2_lambda
        for max_iter in list_max_iter
    )

In [22]:
def run_mlr_classification_recall(X_train, X_test, y_train, y_test, eta, epsilon, penalty, l2_lambda, max_iter):
    """
    Train a MulticlassLogisticRegression2 model and score it on the test split.

    The five hyperparameters are passed straight through to the model
    constructor. Returns the weighted-average recall of the predictions for
    X_test against y_test (recorded in this notebook as species level recall).
    """
    model_settings = {
        'eta': eta,
        'epsilon': epsilon,
        'penalty': penalty,
        'l2_lambda': l2_lambda,
        'max_iter': max_iter,
    }
    classifier = MulticlassLogisticRegression2(**model_settings)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return recall_score(y_test, predictions, average='weighted')
    

In [23]:
def grid_search_multiclass_mlr(seq_file,
                              taxid_file,
                              output_dir,
                              pattern,
                              list_sample_length,
                              list_coverage,
                              list_k,
                              list_eta,
                              list_epsilon,
                              list_penalty,
                              list_l2_lambda,
                              list_max_iter,
                              seed,
                              grid_search_file,
                              fields,
                              experiment,
                              score_type):
    """
    Grid search the multiclass logistic regression, including penalty settings,
    appending one CSV row per trained model to ``grid_search_file``.

    ``fields`` is written first as the header row. Fragments are rebuilt only
    when sample_length or coverage changes between consecutive combinations;
    the k-mer encoding and train/test split are redone for every
    (sample_length, coverage, k) triple. Each hyperparameter combination is
    then trained and scored on that split. Progress is printed after each
    triple.
    """
    # header row goes in first
    append_results_to_file(grid_search_file, fields)

    # size of the full grid, used only for progress reporting
    all_lists = (list_sample_length,
                 list_coverage,
                 list_k,
                 list_eta,
                 list_epsilon,
                 list_penalty,
                 list_l2_lambda,
                 list_max_iter)
    total_runs = calc_number_combinations(*all_lists)

    completed = 0

    # settings the fragments on disk were last built with; -1 is a sentinel
    # that is not expected to match a real setting, so the first pass builds
    built_sample_length = -1
    built_coverage = -1

    # sampling / encoding combinations
    for sample_length, coverage, k in parameter_generator(list_sample_length, list_coverage, list_k):
        print(sample_length, coverage,k)

        fragments_current = sample_length == built_sample_length and coverage == built_coverage
        if not fragments_current:

            # sampling settings changed -> regenerate the fragments on disk
            build_fragments(seq_file, taxid_file, output_dir, sample_length, coverage, seed)

            # remember what is on disk now
            built_sample_length = sample_length
            built_coverage = coverage

        # k-mer encode the current fragments and split them
        X_train, X_test, y_train, y_test = encode_fragments(output_dir, pattern, k, seed)

        # model hyperparameter combinations
        hyperparameters = hyperparameter_generator(list_eta, list_epsilon, list_penalty, list_l2_lambda, list_max_iter)
        for eta, epsilon, penalty, l2_lambda, max_iter in hyperparameters:
            print(eta, epsilon, penalty, l2_lambda, max_iter)

            # train and score model
            score = run_mlr_classification_recall(X_train, X_test, y_train, y_test,
                                                  eta, epsilon, penalty, l2_lambda, max_iter)
            completed += 1

            # record the result
            result_row = [
                experiment,
                'multiclass',
                'Logistic Regression',
                X_train.shape,
                sample_length,
                coverage,
                k,
                eta,
                epsilon,
                penalty,
                l2_lambda,
                max_iter,
                score,
                score_type,
            ]
            append_results_to_file(grid_search_file, result_row)

        # display progress
        percent_done = completed / total_runs * 100
        print('Percent complete: {}'.format(percent_done))
    

### Run 6.01

In [24]:
# %%time

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000-mlr'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/runs-2000/mlr-multi.{}.csv'.format(date_time)
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','eta', 'epsilon', 'l2_penalty','score','score type']
# experiment = '6.01'
# score_type = 'species_recall'

# # combinations to try
# list_sample_length = [100]
# list_coverage = [1,10,100,200,400]
# list_k = [1,2,4,6,8,10,12]
# list_eta = [0.01]
# list_epsilon = [0.01]
# list_l2_penalty = [1,10,100, 0.1]


# grid_search_multiclass_mlr_l2(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_eta,
#                           list_epsilon,
#                           list_l2_penalty,
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment,
#                           score_type)


### Run 6.02
Testing out a range of values for lambda. Values close to zero (0, 0.1) appear to give the best scores, with scores dropping off further once lambda goes above 1.0.

In [25]:
# %%time

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/gridsearch-2000/mlr-multi.{}.csv'.format(date_time)
# fields = ['experiment',
#           'category',
#           'classifier',
#           'training shape',
#           'sample_length',
#           'coverage',
#           'k',
#           'eta', 
#           'epsilon', 
#           'penalty',
#           'l2_lambda',
#           'max_iter',
#           'score',
#           'score type']

# experiment = '6.02'
# score_type = 'species_recall'

# # combinations to try
# list_sample_length = [100]
# list_coverage = [200]
# list_k = [4]
# list_eta = [0.01]
# list_epsilon = [0.01]
# list_penalty = ['l2']
# list_l2_lambda = [0,0.1,0.5,1,3,6,10]
# list_max_iter = [100]


# grid_search_multiclass_mlr(seq_file, 
#                                 taxid_file, 
#                                 output_dir, 
#                                 pattern, 
#                                 list_sample_length, 
#                                 list_coverage, 
#                                 list_k, 
#                                 list_eta,
#                                 list_epsilon,
#                                 list_penalty,
#                                 list_l2_lambda,
#                                 list_max_iter,
#                                 seed,
#                                 grid_search_file,
#                                 fields,
#                                 experiment,
#                                 score_type)


### Run 6.03
Finishing out data for plotting

In [26]:
# %%time
# 5.5 h

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/gridsearch-2000/mlr-multi.{}.csv'.format(date_time)
# fields = ['experiment',
#           'category',
#           'classifier',
#           'training shape',
#           'sample_length',
#           'coverage',
#           'k',
#           'eta', 
#           'epsilon', 
#           'penalty',
#           'l2_lambda',
#           'max_iter',
#           'score',
#           'score type']

# experiment = '6.03'
# score_type = 'species_recall'

# # combinations to try
# list_sample_length = [100]
# list_coverage = [400]
# list_k = [1,2,4,6,8,10,12]
# list_eta = [0.01]
# list_epsilon = [0.01]
# list_penalty = [None]
# list_l2_lambda = [0]
# list_max_iter = [100]


# grid_search_multiclass_mlr(seq_file, 
#                                 taxid_file, 
#                                 output_dir, 
#                                 pattern, 
#                                 list_sample_length, 
#                                 list_coverage, 
#                                 list_k, 
#                                 list_eta,
#                                 list_epsilon,
#                                 list_penalty,
#                                 list_l2_lambda,
#                                 list_max_iter,
#                                 seed,
#                                 grid_search_file,
#                                 fields,
#                                 experiment,
#                                 score_type)


### Run 6.04
Finishing out data for plotting.
Also trying a larger learning rate (eta = 0.02 instead of 0.01) to see if that speeds up processing.

In [27]:
# %%time
# # 4 h 32 min

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/gridsearch-2000/mlr-multi.{}.csv'.format(date_time)
# fields = ['experiment',
#           'category',
#           'classifier',
#           'training shape',
#           'sample_length',
#           'coverage',
#           'k',
#           'eta', 
#           'epsilon', 
#           'penalty',
#           'l2_lambda',
#           'max_iter',
#           'score',
#           'score type']

# experiment = '6.04'
# score_type = 'species_recall'

# # combinations to try
# list_sample_length = [200,400]
# list_coverage = [100,200,400]
# list_k = [1,2,4,6,8,10,12]
# list_eta = [0.02]
# list_epsilon = [0.01]
# list_penalty = [None]
# list_l2_lambda = [0]
# list_max_iter = [100]


# grid_search_multiclass_mlr(seq_file, 
#                                 taxid_file, 
#                                 output_dir, 
#                                 pattern, 
#                                 list_sample_length, 
#                                 list_coverage, 
#                                 list_k, 
#                                 list_eta,
#                                 list_epsilon,
#                                 list_penalty,
#                                 list_l2_lambda,
#                                 list_max_iter,
#                                 seed,
#                                 grid_search_file,
#                                 fields,
#                                 experiment,
#                                 score_type)


# Run Set 4 - sklearn with l1 penalty
To compare against L2 results

In [28]:
def hyperparameter_generator_lr(list_penalty,list_multiclass,list_classweight):
    """
    Yield every (penalty, multiclass, classweight) combination.

    Iteration is nested in argument order, so classweight varies fastest and
    penalty slowest.
    """
    yield from (
        (penalty, multiclass, classweight)
        for penalty in list_penalty
        for multiclass in list_multiclass
        for classweight in list_classweight
    )

In [29]:
def run_lr_classification_recall(X_train, X_test, y_train, y_test, penalty,multiclass,classweight, seed):
    """
    Train sklearn's LogisticRegression and score it on the test split.

    Score is species level recall (weighted-average recall of the predictions
    for X_test against y_test).

    Sets solver to 'saga' for l1 penalty. Uses default solver for any other
    penalty.

    Parameters
    ----------
    X_train, y_train : training data the model is fitted on.
    X_test, y_test : held-out data the model is scored on.
    penalty, multiclass, classweight : passed to LogisticRegression as
        ``penalty``, ``multi_class`` and ``class_weight``.
    seed : passed to LogisticRegression as ``random_state``.
    """
    # sklearn's default solver (lbfgs in current releases) rejects
    # penalty='l1', so pick 'saga' for that case only; leaving the argument out
    # otherwise keeps the library default untouched.
    solver_kwargs = {'solver': 'saga'} if penalty == 'l1' else {}

    lr = LogisticRegression(penalty=penalty,
                            multi_class=multiclass,
                            class_weight=classweight,
                            random_state=seed,
                            **solver_kwargs)
    lr.fit(X_train,y_train)
    y_pred = lr.predict(X_test)
    score = recall_score(y_test, y_pred, average='weighted')
    return score
    

In [30]:
def grid_search_multiclass_lr(seq_file,
                              taxid_file,
                              output_dir,
                              pattern,
                              list_sample_length,
                              list_coverage,
                              list_k,
                              list_penalty,
                              list_multiclass,
                              list_classweight,
                              seed,
                              grid_search_file,
                              fields,
                              experiment,
                              score_type):
    """
    Grid search sklearn LogisticRegression over fragment, k-mer and model settings.

    For every (sample_length, coverage, k) combination the fragments are
    rebuilt (only when sample_length or coverage changed since the previous
    combination), encoded into train/test data, and then one model is trained
    and scored for every (penalty, multiclass, classweight) combination. Each
    score is appended as one row to grid_search_file.

    Parameters
    ----------
    seq_file, taxid_file : input files handed to build_fragments.
    output_dir : directory the fragments are written to and read back from.
    pattern : file pattern handed to encode_fragments2.
    list_sample_length, list_coverage, list_k : data settings to combine.
    list_penalty, list_multiclass, list_classweight : model settings to combine.
    seed : seed handed to fragment building, encoding and the classifier.
    grid_search_file : CSV file that results are appended to.
    fields : header row for the results file.
    experiment, score_type : labels copied into every result row.
    """
    import os  # local import so this cell does not depend on a notebook-level import

    # set up grid search results file: write the header only for a new (or
    # empty) file, so several runs can share one CSV without a header row
    # reappearing in the middle of the data
    has_results = os.path.isfile(grid_search_file) and os.path.getsize(grid_search_file) > 0
    if not has_results:
        append_results_to_file(grid_search_file, fields)

    # calculate number of combinations
    n_combinations = calc_number_combinations(list_sample_length,
                                              list_coverage,
                                              list_k,
                                              list_penalty,
                                              list_multiclass,
                                              list_classweight)

    # process combinations
    count = 0
    sample_length_prev = -1
    coverage_prev = -1

    # parameter combinations
    for sample_length, coverage, k in parameter_generator(list_sample_length, list_coverage, list_k):
        print(sample_length, coverage,k)

        # fragments depend only on sample_length and coverage, so rebuild
        # them only when one of those changed
        if sample_length != sample_length_prev or coverage != coverage_prev:
            build_fragments(seq_file, taxid_file, output_dir, sample_length, coverage, seed)
            sample_length_prev = sample_length
            coverage_prev = coverage

        # kmer from fragments
        X_train, X_test, y_train, y_test = encode_fragments2(output_dir, pattern, k, seed)

        # hyperparameter combinations
        for penalty, multiclass, classweight in hyperparameter_generator_lr(list_penalty, list_multiclass, list_classweight):
            print(penalty,multiclass,classweight)

            # train and score model
            score = run_lr_classification_recall(X_train, X_test, y_train, y_test, penalty, multiclass, classweight, seed)
            count += 1

            # output results to file (as a data row, not through the header slot)
            row = [experiment, 'multiclass', 'Logistic Regression (sklearn)', X_train.shape, sample_length, coverage, k, penalty, multiclass, classweight, score, score_type]
            append_results_to_file(grid_search_file, rows=[row])

        # display progress; guard against a zero total so the report cannot
        # abort the search
        percent_complete = count / n_combinations * 100 if n_combinations else 100.0
        print('Percent complete: {}'.format(percent_complete))
    

### Run 7.01 - l1 penalty with saga solver
Can't use default solver with l1 penalty.

In [31]:
# %%time
# # 40 min

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/gridsearch-2000/lr-multi.{}.csv'.format(date_time)
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','penalty','multi_class', 'class_weight', 'score','score type']
# experiment = '7.01'
# score_type = 'species_recall'

# # combinations to try
# list_sample_length = [100,200,400]
# list_coverage = [1,10,100,200,400]
# list_k = [1,2,4,6,8,10,12]
# list_penalty=['l1']
# list_multiclass = ['auto']
# list_classweight= [None]


# grid_search_multiclass_lr(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_penalty,
#                           list_multiclass,
#                           list_classweight,
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment,
#                           score_type)


### Run 7.02 - Balanced class weights
Set solver back to default.

In [32]:
# %%time
# 3 min

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/gridsearch-2000/lr-multi.{}.csv'.format(date_time)
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','penalty','multi_class', 'class_weight', 'score','score type']
# experiment = '7.02'
# score_type = 'species_recall'

# # combinations to try
# list_sample_length = [100,200,400]
# list_coverage = [1,10,100,200,400]
# list_k = [1,2,4,6,8,10,12]
# list_penalty=['l2']
# list_multiclass = ['auto']
# list_classweight= ['balanced']


# grid_search_multiclass_lr(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_penalty,
#                           list_multiclass,
#                           list_classweight,
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment,
#                           score_type)


### Run 7.03 - No penalty, no class weights
Changed solver back to default.

In [33]:
# %%time
# # 5 min

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/gridsearch-2000/lr-multi.{}.csv'.format(date_time)
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','penalty','multi_class', 'class_weight', 'score','score type']
# experiment = '7.03'
# score_type = 'species_recall'

# # combinations to try
# list_sample_length = [100,200,400]
# list_coverage = [1,10,100,200,400]
# list_k = [1,2,4,6,8,10,12]
# list_penalty=['none']
# list_multiclass = ['auto']
# list_classweight= [None]


# grid_search_multiclass_lr(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_penalty,
#                           list_multiclass,
#                           list_classweight,
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment,
#                           score_type)


### Run 7.04 - L2 penalty, ovr

In [34]:
# %%time
# 3min

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/gridsearch-2000/lr-multi.{}.csv'.format(date_time)
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','penalty','multi_class', 'class_weight', 'score','score type']
# experiment = '7.04'
# score_type = 'species_recall'

# # combinations to try
# list_sample_length = [100,200,400]
# list_coverage = [1,10,100,200,400]
# list_k = [1,2,4,6,8,10,12]
# list_penalty=['l2']
# list_multiclass = ['ovr']
# list_classweight= [None]


# grid_search_multiclass_lr(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_penalty,
#                           list_multiclass,
#                           list_classweight,
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment,
#                           score_type)


# Run Set 9 - sklearn with l2 penalty and class weights on 7400 dataset

### Run 9.01

In [35]:
# %%time
# # 2.5 hours

# # parameters
# seq_file = 'data/train_small-db_toy-7400.fasta'
# taxid_file = 'data/train_small-db_toy-7400.taxid'
# output_dir = 'data/sampling/sampling-toy-7400'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/gridsearch-7400/results-lrpackage-l2-balanced.csv'
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','penalty','multi_class', 'class_weight', 'score','score type']
# experiment = '9.01'
# score_type = 'species_recall'

# # combinations to try
# list_sample_length = [100,200,400]
# list_coverage = [1,10,100,200,400]
# list_k = [1,2,4,6] # ,8,10,12
# list_penalty=['l2']
# list_multiclass = ['auto']
# list_classweight= ['balanced']


# grid_search_multiclass_lr(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_penalty,
#                           list_multiclass,
#                           list_classweight,
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment,
#                           score_type)


### Run 9.02

In [36]:
# %%time

# # parameters
# seq_file = 'data/train_small-db_toy-7400.fasta'
# taxid_file = 'data/train_small-db_toy-7400.taxid'
# output_dir = 'data/sampling/sampling-toy-7400'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/gridsearch-7400/results-lrpackage-l2-balanced.csv'
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','penalty','multi_class', 'class_weight', 'score','score type']
# experiment = '9.02'
# score_type = 'species_recall'

# # combinations to try
# list_sample_length = [100,200,400]
# list_coverage = [1,10,100,200,400]
# list_k = [8,10,12]
# list_penalty=['l2']
# list_multiclass = ['auto']
# list_classweight= ['balanced']


# grid_search_multiclass_lr(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_penalty,
#                           list_multiclass,
#                           list_classweight,
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment,
#                           score_type)


# Run Set 10 - sklearn with l2 penalty and class weights on 5000 dataset

### Run 10.01

In [None]:
%%time

# input data, fragment directory and results file for the toy-5000 dataset
seq_file = 'data/train_small-db_toy-5000.fasta'
taxid_file = 'data/train_small-db_toy-5000.taxid'
output_dir = 'data/sampling/sampling-toy-5000'
pattern = 'fragments*.npy'
seed = 42
date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
grid_search_file = 'data/gridsearch-5000/results-lrpackage-l2-balanced.csv'

# header row of the results file and the labels copied into every row
fields = [
    'experiment',
    'category',
    'classifier',
    'training shape',
    'sample_length',
    'coverage',
    'k',
    'penalty',
    'multi_class',
    'class_weight',
    'score',
    'score type',
]
experiment = '10.01'
score_type = 'species_recall'

# values to combine in the grid search
list_sample_length = [100, 200, 400]
list_coverage = [1, 10, 100, 200, 400]
list_k = [1, 2, 4, 6, 8, 10, 12]
list_penalty = ['l2']
list_multiclass = ['auto']
list_classweight = ['balanced']

# run the search; one row per trained model is appended to grid_search_file
grid_search_multiclass_lr(
    seq_file=seq_file,
    taxid_file=taxid_file,
    output_dir=output_dir,
    pattern=pattern,
    list_sample_length=list_sample_length,
    list_coverage=list_coverage,
    list_k=list_k,
    list_penalty=list_penalty,
    list_multiclass=list_multiclass,
    list_classweight=list_classweight,
    seed=seed,
    grid_search_file=grid_search_file,
    fields=fields,
    experiment=experiment,
    score_type=score_type,
)


100 1 1
Existing directory was not found. Process will generate a directory.
(2128, 400)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 0.9523809523809524
100 1 2
(2128, 800)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 1.9047619047619049
100 1 4
(2128, 6376)
l2 auto balanced
Percent complete: 2.857142857142857
100 1 6
(2128, 23815)
l2 auto balanced
Percent complete: 3.8095238095238098
100 1 8
(2128, 23478)
l2 auto balanced
Percent complete: 4.761904761904762
100 1 10
(2128, 20066)
l2 auto balanced
Percent complete: 5.714285714285714
100 1 12
(2128, 16089)
l2 auto balanced
Percent complete: 6.666666666666667
100 10 1
(21000, 400)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 7.6190476190476195
100 10 2
(21000, 800)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 8.571428571428571
100 10 4
(21000, 6400)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 9.523809523809524
100 10 6
(21000, 62800)
l2 auto balanced
Percent complete: 10.476190476190476
100 10 8
(21000, 179285)
l2 auto balanced
Percent complete: 11.428571428571429
100 10 10
(21000, 180939)
l2 auto balanced
Percent complete: 12.380952380952381
100 10 12
(21000, 147508)
l2 auto balanced
Percent complete: 13.333333333333334
100 100 1
(209680, 400)
l2 auto balanced
Percent complete: 14.285714285714285
100 100 2
(209680, 800)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 15.238095238095239
100 100 4
(209680, 6400)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 16.19047619047619
100 100 6
(209680, 65533)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 17.142857142857142
100 100 8
(209680, 568839)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 18.095238095238095
100 100 10
(209680, 994839)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 19.047619047619047
100 100 12
(209680, 877597)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 20.0
100 200 1
(419360, 400)
l2 auto balanced
Percent complete: 20.952380952380953
100 200 2
(419360, 800)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 21.904761904761905
100 200 4
(419360, 6400)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 22.857142857142858
100 200 6
(419360, 65536)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 23.809523809523807
100 200 8
(419360, 632206)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 24.761904761904763
100 200 10
(419360, 1283884)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 25.71428571428571
100 200 12
(419360, 1162085)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 26.666666666666668
100 400 1
(838720, 400)
l2 auto balanced
Percent complete: 27.61904761904762
100 400 2
(838720, 800)
l2 auto balanced
Percent complete: 28.57142857142857
100 400 4
(838720, 6400)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 29.523809523809526
100 400 6
(838720, 65536)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 30.476190476190478
100 400 8
(838720, 655218)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 31.428571428571427
100 400 10
(838720, 1420092)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 32.38095238095238
100 400 12
(838720, 1300882)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 33.33333333333333
200 1 1
(1084, 800)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 34.285714285714285
200 1 2
(1084, 1600)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 35.23809523809524
200 1 4
(1084, 12331)
l2 auto balanced
Percent complete: 36.19047619047619
200 1 6
(1084, 28756)
l2 auto balanced
Percent complete: 37.142857142857146
200 1 8
(1084, 25258)
l2 auto balanced
Percent complete: 38.095238095238095
200 1 10
(1084, 20477)
l2 auto balanced
Percent complete: 39.04761904761905
200 1 12
(1084, 16399)
l2 auto balanced
Percent complete: 40.0
200 10 1
(10513, 800)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 40.95238095238095
200 10 2
(10513, 1600)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 41.904761904761905
200 10 4
(10513, 12800)
l2 auto balanced
Percent complete: 42.857142857142854
200 10 6
(10513, 114383)
l2 auto balanced
Percent complete: 43.80952380952381
200 10 8
(10513, 214539)
l2 auto balanced
Percent complete: 44.761904761904766
200 10 10
(10513, 191379)
l2 auto balanced
Percent complete: 45.714285714285715
200 10 12
(10513, 154690)
l2 auto balanced
Percent complete: 46.666666666666664
200 100 1
(104859, 800)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 47.61904761904761
200 100 2
(104859, 1600)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 48.57142857142857
200 100 4
(104859, 12800)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 49.523809523809526
200 100 6
(104859, 135119)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 50.476190476190474
200 100 8
(104859, 962079)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 51.42857142857142
200 100 10
(104859, 1331848)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 52.38095238095239
200 100 12
(104859, 1139636)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 53.333333333333336
200 200 1
(209680, 800)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 54.285714285714285
200 200 2
(209680, 1600)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 55.23809523809524
200 200 4
(209680, 12800)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 56.19047619047619
200 200 6
(209680, 135164)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 57.14285714285714
200 200 8
(209680, 1178457)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 58.0952380952381
200 200 10
(209680, 1965475)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 59.04761904761905
200 200 12
(209680, 1731772)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 60.0
200 400 1
(419360, 800)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 60.952380952380956
200 400 2
(419360, 1600)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 61.904761904761905
200 400 4
(419360, 12800)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 62.857142857142854
200 400 6
(419360, 135168)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 63.8095238095238
200 400 8
(419360, 1308777)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 64.76190476190476
200 400 10
(419360, 2525133)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 65.71428571428571
200 400 12
(419360, 2280958)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 66.66666666666666
400 1 1
(559, 1600)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 67.61904761904762
400 1 2
(559, 3200)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 68.57142857142857
400 1 4
(559, 21489)
l2 auto balanced
Percent complete: 69.52380952380952
400 1 6
(559, 31979)
l2 auto balanced
Percent complete: 70.47619047619048
400 1 8
(559, 26269)
l2 auto balanced
Percent complete: 71.42857142857143
400 1 10
(559, 21156)
l2 auto balanced
Percent complete: 72.38095238095238
400 1 12
(559, 17461)
l2 auto balanced
Percent complete: 73.33333333333333
400 10 1
(5271, 1600)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 74.28571428571429
400 10 2
(5271, 3200)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 75.23809523809524
400 10 4
(5271, 25600)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 76.19047619047619
400 10 6
(5271, 174120)
l2 auto balanced
Percent complete: 77.14285714285715
400 10 8
(5271, 231164)
l2 auto balanced
Percent complete: 78.0952380952381
400 10 10
(5271, 196163)
l2 auto balanced
Percent complete: 79.04761904761905
400 10 12
(5271, 162704)
l2 auto balanced
Percent complete: 80.0
400 100 1
(52448, 1600)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 80.95238095238095
400 100 2
(52448, 3200)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 81.9047619047619
400 100 4
(52448, 25600)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 82.85714285714286
400 100 6
(52448, 269447)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 83.80952380952381
400 100 8
(52448, 1370439)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 84.76190476190476
400 100 10
(52448, 1577892)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 85.71428571428571
400 100 12
(52448, 1355720)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 86.66666666666667
400 200 1
(104859, 1600)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 87.61904761904762
400 200 2
(104859, 3200)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 88.57142857142857
400 200 4
(104859, 25600)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 89.52380952380953
400 200 6
(104859, 270241)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 90.47619047619048
400 200 8
(104859, 1906895)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 91.42857142857143
400 200 10
(104859, 2624438)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 92.38095238095238
400 200 12
(104859, 2313529)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 93.33333333333333
400 400 1
(209680, 1600)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 94.28571428571428
400 400 2
(209680, 3200)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 95.23809523809523
400 400 4
(209680, 25600)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 96.19047619047619
400 400 6
(209680, 270321)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 97.14285714285714
400 400 8
(209680, 2326775)
l2 auto balanced


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 98.09523809523809
400 400 10
(209680, 3827270)
l2 auto balanced
