### Imports

In [1]:
import shutil
import csv
import datetime
import numpy as np
import pandas as pd
from sklearn.metrics import recall_score
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from packages.metagenomics import sampling2, encoding2
from sklearn.linear_model import LogisticRegression

from packages.LogisticRegression.MulticlassLogisticRegression import MulticlassLogisticRegression,MulticlassLogisticRegression2


In [2]:
def append_results_to_file(filename, fields=None, rows=None):
    """
    Append CSV records to ``filename`` (the file is created if it is missing).

    Parameters
    ----------
    filename : path of the CSV file, opened in append mode.
    fields : optional sequence written as ONE row (typically the header).
        Skipped when None or empty.
    rows : optional iterable of sequences, each written as its own row.
        Skipped when None or empty.
    """
    # newline='' is what the csv module asks for: the writer emits its own
    # line terminator, and leaving newline translation on would turn it into
    # '\r\r\n' (blank rows) on platforms that translate '\n' when writing.
    with open(filename, 'a', newline='') as f:

        writer = csv.writer(f)

        if fields:
            writer.writerow(fields)

        if rows:
            writer.writerows(rows)

In [3]:
def run_mlr_classification_recall(X_train, X_test, y_train, y_test, eta, epsilon):
    """
    Train MulticlassLogisticRegression2 (built with ``eta`` and ``epsilon``)
    on the training split and return the weighted-average recall of its
    predictions on the test split.

    The original notes describe this score as species level recall.
    """
    classifier = MulticlassLogisticRegression2(eta=eta, epsilon=epsilon)
    classifier.fit(X_train, y_train)
    # recall is averaged over classes, weighted by each class's support
    return recall_score(y_test, classifier.predict(X_test), average='weighted')
    

In [4]:
def build_fragments(seq_file, taxid_file, output_dir, sample_length, coverage, seed):
    """
    Rebuild the fragment data in ``output_dir`` from scratch.

    Any existing ``output_dir`` tree is removed first, then
    ``sampling2.generate_fragment_data`` is called with the arguments
    exactly as received.
    """
    # wipe whatever an earlier run left behind; a missing directory is fine
    try:
        shutil.rmtree(output_dir)
    except FileNotFoundError:
        print('Existing directory was not found. Process will generate a directory.')

    # generate the new fragments
    sampling2.generate_fragment_data(
        seq_file,
        taxid_file,
        output_dir,
        sample_length,
        coverage,
        seed,
    )

In [18]:
def encode_fragments(output_dir, pattern, k, seed):
    """
    Read the fragments found in ``output_dir``, encode them, and return a
    67/33 train/test split in which every class is present on both sides.

    The encoded matrix is converted to a dense array (``toarray()``) before
    splitting.

    Parameters
    ----------
    output_dir, pattern : forwarded to ``sampling2.read_fragments``.
    k : forwarded to ``encoding2.encode_fragment_dataset`` (the callers treat
        it as the k-mer setting).
    seed : seeds the random generator that drives the splits, so the same
        seed yields the same split. ``None`` gives a non-deterministic split.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Dense feature arrays and label-encoded targets.

    Raises
    ------
    ValueError
        If there are no fragments, if some class has fewer than two
        fragments, or if a side of the split is smaller than the number of
        classes. In each of those cases no valid split exists.
    """

    # encode data
    fragments = sampling2.read_fragments(output_dir, pattern)
    X_enc, y = encoding2.encode_fragment_dataset(fragments, k)
    y_enc = preprocessing.LabelEncoder().fit_transform(y)

    # calculate number of classes and make sure a valid split can exist
    _, class_counts = np.unique(y_enc, return_counts=True)
    n_classes = len(class_counts)
    if n_classes == 0:
        raise ValueError('No fragments were found to encode.')
    if class_counts.min() < 2:
        raise ValueError(
            'Every class needs at least 2 fragments to appear in both the '
            'training and the test set.'
        )

    # densify once instead of on every retry
    X_dense = X_enc.toarray()

    # One generator shared by all retries: the result is reproducible for a
    # given seed, yet each retry still draws a different split.
    rng = np.random.RandomState(seed)

    while True:

        # split data into test and training
        X_train, X_test, y_train, y_test = train_test_split(
            X_dense, y_enc, test_size=0.33, random_state=rng
        )

        # split sizes are the same on every retry, so this can never recover
        if n_classes > min(len(y_train), len(y_test)):
            raise ValueError(
                'Too few fragments ({}) to place all {} classes in both the '
                'training and the test set.'.format(len(y_enc), n_classes)
            )

        n_classes_train = len(np.unique(y_train))
        n_classes_test = len(np.unique(y_test))
        if n_classes_train == n_classes and n_classes_test == n_classes:
            break

    print(X_enc.shape)

    return X_train, X_test, y_train, y_test

In [6]:
def calc_number_combinations(list_sample_length,list_coverage,list_k,list_eta,list_epsilon):
    """
    Return the number of grid points: the product of the lengths of the
    five parameter lists.
    """
    total = 1
    for options in (list_sample_length, list_coverage, list_k, list_eta, list_epsilon):
        total *= len(options)
    return total

In [7]:
def grid_search_multiclass_mlr(seq_file,
                              taxid_file,
                              output_dir,
                              pattern,
                              list_sample_length,
                              list_coverage,
                              list_k,
                              list_eta,
                              list_epsilon,
                              seed,
                              grid_search_file,
                              fields,
                              experiment,
                              score_type):
    """
    Grid search over fragment, encoding and classifier settings for the
    MulticlassLogisticRegression2 classifier, appending one CSV row per
    evaluated combination to ``grid_search_file``.

    Parameters
    ----------
    seq_file, taxid_file, output_dir : forwarded to ``build_fragments``.
    pattern : forwarded (with ``output_dir``) to ``encode_fragments``.
    list_sample_length, list_coverage : fragment settings; fragments are
        rebuilt once per (sample_length, coverage) pair.
    list_k : encoding settings; data is re-encoded and re-split once per k.
    list_eta, list_epsilon : classifier settings evaluated on each split.
    seed : forwarded to ``build_fragments`` and ``encode_fragments``.
    grid_search_file : CSV file the results are appended to.
    fields : header row written once before any result row.
    experiment, score_type : written verbatim into every result row.

    Progress is printed as a percentage after each value of k.
    """

    # set up grid search results file
    append_results_to_file(grid_search_file, fields)

    # calculate number of combinations
    n_combinations = calc_number_combinations(list_sample_length, list_coverage, list_k, list_eta, list_epsilon)

    # An empty parameter list means there is nothing to evaluate; bail out
    # here rather than dividing by zero in the progress report below.
    if n_combinations == 0:
        print('No parameter combinations to evaluate.')
        return

    # process combinations
    count = 0
    for sample_length in list_sample_length:
        for coverage in list_coverage:

            # fragment combination
            build_fragments(seq_file, taxid_file, output_dir, sample_length, coverage, seed)
            for k in list_k:

                # kmer combination
                X_train, X_test, y_train, y_test = encode_fragments(output_dir, pattern, k, seed)
                for eta in list_eta:
                    for epsilon in list_epsilon:

                        # logistic regression combination
                        score = run_mlr_classification_recall(X_train, X_test, y_train, y_test, eta, epsilon)
                        count += 1

                        # output results to file (as a data row, not via the header slot)
                        row = [experiment, 'multiclass', 'Logistic Regression', X_train.shape, sample_length, coverage, k, eta, epsilon, score, score_type]
                        append_results_to_file(grid_search_file, rows=[row])

                print('Percent complete: {}'.format(count / n_combinations * 100)) # display progress
    

# Run Set 1 - MLR Toy
2000 lengths dataset

### Run 4.01
Stopped early due to runs taking a long time.

In [8]:
# %%time

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000-mlr'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/runs-2000/mlr-multi.{}.csv'.format(date_time)
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','eta', 'epsilon', 'score','score type']
# experiment = '4.01'
# score_type = 'species_recall'

# # combinations to try
# list_sample_length = [100]
# list_coverage = [0.2,1,10]
# list_k = [1,2,4,6,8,10,12]
# list_eta = [0.01]
# list_epsilon = [0.01]


# grid_search_multiclass_mlr(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_eta,
#                           list_epsilon,
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment,
#                           score_type)


### Run 4.02

In [19]:
# %%time

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000-mlr'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/runs-2000/mlr-multi.{}.csv'.format(date_time)
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','eta', 'epsilon', 'score','score type']
# experiment = '4.02'
# score_type = 'species_recall'

# # combinations to try
# list_sample_length = [200]
# list_coverage = [0.3,1,10]
# list_k = [1,2,4,6,8,10,12]
# list_eta = [0.01]
# list_epsilon = [0.01]


# grid_search_multiclass_mlr(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_eta,
#                           list_epsilon,
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment,
#                           score_type)


### Run 4.03

In [20]:
# %%time

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000-mlr'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/runs-2000/mlr-multi.{}.csv'.format(date_time)
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','eta', 'epsilon', 'score','score type']
# experiment = '4.03'
# score_type = 'species_recall'

# # combinations to try
# list_sample_length = [400]
# list_coverage = [0.5,1,10]
# list_k = [1,2,4,6,8,10,12]
# list_eta = [0.01]
# list_epsilon = [0.01]


# grid_search_multiclass_mlr(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_eta,
#                           list_epsilon,
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment,
#                           score_type)


(13, 1547)
Percent complete: 4.761904761904762
(13, 1721)
Percent complete: 9.523809523809524
(13, 1244)
Percent complete: 14.285714285714285
(13, 855)
Percent complete: 19.047619047619047
(13, 650)
Percent complete: 23.809523809523807
(13, 520)
Percent complete: 28.57142857142857
(13, 429)
Percent complete: 33.33333333333333
(23, 1592)
Percent complete: 38.095238095238095
(23, 2327)
Percent complete: 42.857142857142854
(23, 2146)
Percent complete: 47.61904761904761
(23, 1511)
Percent complete: 52.38095238095239
(23, 1150)
Percent complete: 57.14285714285714
(23, 920)
Percent complete: 61.904761904761905
(23, 759)
Percent complete: 66.66666666666666
(205, 1600)
Percent complete: 71.42857142857143
(205, 3199)
Percent complete: 76.19047619047619
(205, 12558)
Percent complete: 80.95238095238095
(205, 12586)
Percent complete: 85.71428571428571
(205, 10003)
Percent complete: 90.47619047619048
(205, 8033)
Percent complete: 95.23809523809523
(205, 6631)
Percent complete: 100.0
CPU times: user

# Run Set 2 - sklearn
Compare against the scikit-learn implementation of logistic regression to see whether its performance is similar to that of the custom MLR.

In [27]:
def calc_number_combinations2(list_sample_length,list_coverage,list_k,list_multiclass,list_classweight):
    """
    Size of the sklearn grid search: the lengths of the five parameter
    lists multiplied together.
    """
    sizes = [
        len(list_sample_length),
        len(list_coverage),
        len(list_k),
        len(list_multiclass),
        len(list_classweight),
    ]
    n = 1
    for size in sizes:
        n = n * size
    return n

In [21]:
def encode_fragments2(output_dir, pattern, k, seed):
    """
    Read the fragments found in ``output_dir``, encode them, and return a
    67/33 train/test split in which every class is present on both sides.

    Does not convert sparse matrix to numpy matrix first: the encoded matrix
    is handed to ``train_test_split`` as returned by the encoder.

    Parameters
    ----------
    output_dir, pattern : forwarded to ``sampling2.read_fragments``.
    k : forwarded to ``encoding2.encode_fragment_dataset`` (the callers treat
        it as the k-mer setting).
    seed : seeds the random generator that drives the splits, so the same
        seed yields the same split. ``None`` gives a non-deterministic split.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Feature matrices and label-encoded targets.

    Raises
    ------
    ValueError
        If there are no fragments, if some class has fewer than two
        fragments, or if a side of the split is smaller than the number of
        classes. In each of those cases no valid split exists.
    """

    # encode data
    fragments = sampling2.read_fragments(output_dir, pattern)
    X_enc, y = encoding2.encode_fragment_dataset(fragments, k)
    y_enc = preprocessing.LabelEncoder().fit_transform(y)

    # calculate number of classes and make sure a valid split can exist
    _, class_counts = np.unique(y_enc, return_counts=True)
    n_classes = len(class_counts)
    if n_classes == 0:
        raise ValueError('No fragments were found to encode.')
    if class_counts.min() < 2:
        raise ValueError(
            'Every class needs at least 2 fragments to appear in both the '
            'training and the test set.'
        )

    # One generator shared by all retries: the result is reproducible for a
    # given seed, yet each retry still draws a different split.
    rng = np.random.RandomState(seed)

    while True:

        # split data into test and training
        X_train, X_test, y_train, y_test = train_test_split(
            X_enc, y_enc, test_size=0.33, random_state=rng
        )

        # split sizes are the same on every retry, so this can never recover
        if n_classes > min(len(y_train), len(y_test)):
            raise ValueError(
                'Too few fragments ({}) to place all {} classes in both the '
                'training and the test set.'.format(len(y_enc), n_classes)
            )

        n_classes_train = len(np.unique(y_train))
        n_classes_test = len(np.unique(y_test))
        if n_classes_train == n_classes and n_classes_test == n_classes:
            break

    print(X_enc.shape)

    return X_train, X_test, y_train, y_test

In [30]:
def run_lr_classification_recall(X_train, X_test, y_train, y_test, multiclass, classweight, seed, max_iter=100):
    """
    Score is species level recall. Uses sklearn version of logistic regression.

    Fits ``LogisticRegression`` on the training split and returns the
    weighted-average recall of its predictions on the test split.

    Parameters
    ----------
    X_train, X_test, y_train, y_test : the train/test split to fit and score on.
    multiclass : passed as ``multi_class``.
    classweight : passed as ``class_weight``.
    seed : passed as ``random_state``.
    max_iter : solver iteration cap. The default of 100 matches sklearn's own
        default, so existing calls behave as before; raise it when the solver
        reports that the iteration limit was reached.
    """
    # NOTE(review): newer scikit-learn releases appear to deprecate the
    # `multi_class` argument - confirm against the installed version.
    lr = LogisticRegression(
        random_state=seed,
        multi_class=multiclass,
        class_weight=classweight,
        max_iter=max_iter,
    )
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    score = recall_score(y_test, y_pred, average='weighted')
    return score
    

In [31]:
def grid_search_multiclass_lr(seq_file,
                              taxid_file,
                              output_dir,
                              pattern,
                              list_sample_length,
                              list_coverage,
                              list_k,
                              list_multiclass,
                              list_classweight,
                              seed,
                              grid_search_file,
                              fields,
                              experiment,
                              score_type):
    """
    Grid search over fragment, encoding and classifier settings for the
    sklearn logistic regression classifier, appending one CSV row per
    evaluated combination to ``grid_search_file``.

    Parameters
    ----------
    seq_file, taxid_file, output_dir : forwarded to ``build_fragments``.
    pattern : forwarded (with ``output_dir``) to ``encode_fragments2``.
    list_sample_length, list_coverage : fragment settings; fragments are
        rebuilt once per (sample_length, coverage) pair.
    list_k : encoding settings; data is re-encoded and re-split once per k.
    list_multiclass, list_classweight : classifier settings evaluated on
        each split.
    seed : forwarded to ``build_fragments``, ``encode_fragments2`` and the
        classifier run.
    grid_search_file : CSV file the results are appended to.
    fields : header row written once before any result row.
    experiment, score_type : written verbatim into every result row.

    Progress is printed as a percentage after each value of k.
    """

    # set up grid search results file
    append_results_to_file(grid_search_file, fields)

    # calculate number of combinations
    n_combinations = calc_number_combinations2(list_sample_length, list_coverage, list_k, list_multiclass, list_classweight)

    # An empty parameter list means there is nothing to evaluate; bail out
    # here rather than dividing by zero in the progress report below.
    if n_combinations == 0:
        print('No parameter combinations to evaluate.')
        return

    # process combinations
    count = 0
    for sample_length in list_sample_length:
        for coverage in list_coverage:

            # fragment combination
            build_fragments(seq_file, taxid_file, output_dir, sample_length, coverage, seed)
            for k in list_k:

                # kmer combination
                X_train, X_test, y_train, y_test = encode_fragments2(output_dir, pattern, k, seed)

                for multiclass in list_multiclass:
                    for classweight in list_classweight:

                        # logistic regression (sklearn) combination
                        score = run_lr_classification_recall(X_train, X_test, y_train, y_test, multiclass, classweight, seed)
                        count += 1

                        # output results to file (as a data row, not via the header slot)
                        row = [experiment, 'multiclass', 'Logistic Regression (sklearn)', X_train.shape, sample_length, coverage, k, multiclass, classweight, score, score_type]
                        append_results_to_file(grid_search_file, rows=[row])

                print('Percent complete: {}'.format(count / n_combinations * 100)) # display progress


### Run 5.01

In [33]:
# %%time

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/runs-2000/lr-multi.{}.csv'.format(date_time)
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','multiclass', 'class_weight', 'score','score type']
# experiment = '5.01'
# score_type = 'species_recall'

# # combinations to try
# list_sample_length = [100,200,400]
# list_coverage = [1,10,100,200,400]
# list_k = [1]
# list_multiclass = ['auto']
# list_classweight= [None]


# grid_search_multiclass_lr(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_multiclass,
#                           list_classweight,
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment,
#                           score_type)


(85, 400)
Percent complete: 6.666666666666667
(816, 400)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 13.333333333333334
(8141, 400)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 20.0
(16282, 400)
Percent complete: 26.666666666666668
(32564, 400)
Percent complete: 33.33333333333333
(44, 800)
Percent complete: 40.0
(409, 800)
Percent complete: 46.666666666666664


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


(4071, 800)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 53.333333333333336
(8141, 800)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 60.0
(16282, 800)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 66.66666666666666
(23, 1592)
Percent complete: 73.33333333333333
(205, 1600)
Percent complete: 80.0
(2037, 1600)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 86.66666666666667
(4071, 1600)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 93.33333333333333
(8141, 1600)
Percent complete: 100.0
CPU times: user 57.2 s, sys: 740 ms, total: 57.9 s
Wall time: 42 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### Run 5.02

In [34]:
# %%time

# # parameters
# seq_file = 'data/train_small-db_toy-2000.fasta'
# taxid_file = 'data/train_small-db_toy-2000.taxid'
# output_dir = 'data/sampling/sampling-toy-2000'
# pattern = 'fragments*.npy'
# seed = 42
# date_time = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
# grid_search_file  = 'data/runs-2000/lr-multi.{}.csv'.format(date_time)
# fields = ['experiment','category','classifier','training shape','sample_length','coverage','k','multiclass', 'class_weight', 'score','score type']
# experiment = '5.02'
# score_type = 'species_recall'

# # combinations to try
# list_sample_length = [100,200,400]
# list_coverage = [1,10,100,200,400]
# list_k = [2,4,6,8,10,12]
# list_multiclass = ['auto']
# list_classweight= [None]


# grid_search_multiclass_lr(seq_file, 
#                           taxid_file, 
#                           output_dir, 
#                           pattern, 
#                           list_sample_length, 
#                           list_coverage, 
#                           list_k, 
#                           list_multiclass,
#                           list_classweight,
#                           seed,
#                           grid_search_file,
#                           fields,
#                           experiment,
#                           score_type)


(85, 793)
Percent complete: 1.1111111111111112
(85, 1722)
Percent complete: 2.2222222222222223
(85, 1338)
Percent complete: 3.3333333333333335
(85, 1018)
Percent complete: 4.444444444444445
(85, 850)
Percent complete: 5.555555555555555
(85, 680)
Percent complete: 6.666666666666667
(816, 800)
Percent complete: 7.777777777777778
(816, 5689)
Percent complete: 8.88888888888889
(816, 10714)
Percent complete: 10.0
(816, 9328)
Percent complete: 11.11111111111111
(816, 7885)
Percent complete: 12.222222222222221
(816, 6317)
Percent complete: 13.333333333333334
(8141, 800)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 14.444444444444443
(8141, 6399)
Percent complete: 15.555555555555555
(8141, 38665)
Percent complete: 16.666666666666664
(8141, 53604)
Percent complete: 17.77777777777778
(8141, 48759)
Percent complete: 18.88888888888889
(8141, 39395)
Percent complete: 20.0
(16282, 800)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 21.11111111111111
(16282, 6400)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 22.22222222222222
(16282, 43939)
Percent complete: 23.333333333333332
(16282, 68550)
Percent complete: 24.444444444444443
(16282, 63875)
Percent complete: 25.555555555555554
(16282, 51766)
Percent complete: 26.666666666666668
(32564, 800)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 27.77777777777778
(32564, 6400)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 28.888888888888886
(32564, 46691)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 30.0
(32564, 77733)
Percent complete: 31.11111111111111
(32564, 73479)
Percent complete: 32.22222222222222
(32564, 59669)
Percent complete: 33.33333333333333
(44, 1444)
Percent complete: 34.44444444444444
(44, 1910)
Percent complete: 35.55555555555556
(44, 1429)
Percent complete: 36.666666666666664
(44, 1096)
Percent complete: 37.77777777777778
(44, 878)
Percent complete: 38.88888888888889
(44, 703)
Percent complete: 40.0
(409, 1600)
Percent complete: 41.11111111111111
(409, 9082)
Percent complete: 42.22222222222222
(409, 11985)
Percent complete: 43.333333333333336
(409, 9852)
Percent complete: 44.44444444444444
(409, 7945)
Percent complete: 45.55555555555556
(409, 6361)
Percent complete: 46.666666666666664
(4071, 1600)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 47.77777777777778
(4071, 12772)
Percent complete: 48.888888888888886
(4071, 60742)
Percent complete: 50.0
(4071, 71903)
Percent complete: 51.11111111111111
(4071, 60817)
Percent complete: 52.22222222222223
(4071, 48991)
Percent complete: 53.333333333333336
(8141, 1600)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 54.44444444444444
(8141, 12799)
Percent complete: 55.55555555555556
(8141, 77930)
Percent complete: 56.666666666666664
(8141, 108026)
Percent complete: 57.77777777777777
(8141, 94148)
Percent complete: 58.88888888888889
(8141, 76066)
Percent complete: 60.0
(16282, 1600)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 61.111111111111114
(16282, 12800)
Percent complete: 62.22222222222222
(16282, 88349)
Percent complete: 63.33333333333333
(16282, 136646)
Percent complete: 64.44444444444444
(16282, 121850)
Percent complete: 65.55555555555556
(16282, 98704)
Percent complete: 66.66666666666666
(23, 2327)
Percent complete: 67.77777777777779
(23, 2146)
Percent complete: 68.88888888888889
(23, 1511)
Percent complete: 70.0
(23, 1150)
Percent complete: 71.11111111111111
(23, 920)
Percent complete: 72.22222222222221
(23, 759)
Percent complete: 73.33333333333333
(205, 3199)
Percent complete: 74.44444444444444
(205, 12558)
Percent complete: 75.55555555555556
(205, 12586)
Percent complete: 76.66666666666667
(205, 10003)
Percent complete: 77.77777777777779
(205, 8033)
Percent complete: 78.88888888888889
(205, 6631)
Percent complete: 80.0
(2037, 3200)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 81.11111111111111
(2037, 25059)
Percent complete: 82.22222222222221
(2037, 80853)
Percent complete: 83.33333333333334
(2037, 81580)
Percent complete: 84.44444444444444
(2037, 67552)
Percent complete: 85.55555555555556
(2037, 55940)
Percent complete: 86.66666666666667
(4071, 3200)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 87.77777777777777
(4071, 25514)
Percent complete: 88.88888888888889
(4071, 117598)
Percent complete: 90.0
(4071, 138154)
Percent complete: 91.11111111111111
(4071, 116996)
Percent complete: 92.22222222222223
(4071, 97172)
Percent complete: 93.33333333333333
(8141, 3200)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Percent complete: 94.44444444444444
(8141, 25588)
Percent complete: 95.55555555555556
(8141, 148288)
Percent complete: 96.66666666666667
(8141, 200257)
Percent complete: 97.77777777777777
(8141, 174176)
Percent complete: 98.88888888888889
(8141, 145094)
Percent complete: 100.0
CPU times: user 8min, sys: 7.09 s, total: 8min 7s
Wall time: 2min 59s
