# Hyperparameter search

Very good performance on binary classifiers, but I wonder if I can improve the performance of the multiclass classifier?

In [1]:
cancer_type = ''
classifier_type = 'SVM' # can be LR (logistic regression) or SVM (support vector machine)


In [2]:
# adding this path so we can import get_train_and_test
import sys
path = '../' # needs to be in top folder (where get_train_and_test.py is)
sys.path.append(path)
from get_train_and_test import get_train_and_test
root_path = path

seed = 42 # using a seed for splitting up the train and test data 
# for some reason this didn't work for LR, so the results don't use a seed.
m_values_train, m_values_test, diagnoses_train, diagnoses_test = get_train_and_test(cancer_type, use_small=False, root_path = root_path, model_path = root_path + '/simple_methods/', model_type = classifier_type + '_' + cancer_type, seed = seed)
    

Getting values and diagnoses from: 
/Tank/methylation-patterns-code/methylation-patterns-izzy/data_preprocessing/dataset/pandas/m_values/TCGA-all.csv
/Tank/methylation-patterns-code/methylation-patterns-izzy/data_preprocessing/dataset/pandas/diagnoses/TCGA-all.csv
m_value and diagnoses shapes:
(276016, 6224)
(6224,)
m values train, m values test, diagnoses train, diagnoses test shapes:
(4668, 276016) (1556, 276016) (4668,) (1556,)


In [3]:
m_values_train

matrix([[-1.21669116, -1.51058577,  3.340167  , ...,  0.15871411,
         -4.94125638,  3.65870998],
        [-2.17623572,  0.7018979 ,  3.45276462, ...,  0.08917521,
         -0.04220124,  3.86710338],
        [-0.11018184, -2.92325689,  3.35058589, ..., -3.27780004,
          0.01476019,  4.03479652],
        ...,
        [-1.42362442, -2.00625456,  2.15450508, ..., -3.82074194,
         -0.18886663,  3.27053655],
        [-0.61854086, -0.69444665,  3.05009639, ..., -4.44277087,
         -0.25476051,  4.11387971],
        [-3.4226674 , -2.64021861,  3.20787357, ...,  0.0439369 ,
         -0.12555141,  0.9169289 ]])

In [4]:
from scipy.stats import randint, uniform

# random search
if classifier_type == 'LR':
    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression()
    
    
    to_try = {'penalty': ['l2'],
             'solver': ['saga'], # did try: 'newton-cg', 'sag', 'saga', 'lbfgs'],
             'max_iter': randint(100, 500)}

elif classifier_type == 'SVM':
    from sklearn.svm import SVC
    model = SVC()

#     to_try = {'kernel': ['rbf', 'sigmoid'],
#              'tol': [1e-3, 1e-4, 1e-5],
#              'gamma': ['auto', 'scale'],
#              'probability': [True]}

to_try = {'kernel': ['rbf'], # sigmoid resulted in very low acc I think
         'tol': [1e-3, 1e-4], # 1e-5 takes too long
         'gamma': ['auto', 'scale'],
         'probability': [True]}
    
    
from sklearn.model_selection import RandomizedSearchCV
random_search = RandomizedSearchCV(model, to_try, n_iter=3, scoring='f1_micro', cv=5, verbose=5)
random_search.fit(m_values_train, diagnoses_train)
classifier_type

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] tol=0.001, probability=True, kernel=rbf, gamma=auto .............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  tol=0.001, probability=True, kernel=rbf, gamma=auto, score=0.953, total=291.4min
[CV] tol=0.001, probability=True, kernel=rbf, gamma=auto .............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 291.4min remaining:    0.0s


[CV]  tol=0.001, probability=True, kernel=rbf, gamma=auto, score=0.950, total=284.6min
[CV] tol=0.001, probability=True, kernel=rbf, gamma=auto .............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 575.9min remaining:    0.0s


[CV]  tol=0.001, probability=True, kernel=rbf, gamma=auto, score=0.950, total=287.8min
[CV] tol=0.001, probability=True, kernel=rbf, gamma=auto .............


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 863.8min remaining:    0.0s


[CV]  tol=0.001, probability=True, kernel=rbf, gamma=auto, score=0.960, total=286.5min
[CV] tol=0.001, probability=True, kernel=rbf, gamma=auto .............


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 1150.3min remaining:    0.0s


[CV]  tol=0.001, probability=True, kernel=rbf, gamma=auto, score=0.952, total=285.4min
[CV] tol=0.0001, probability=True, kernel=rbf, gamma=auto ............
[CV]  tol=0.0001, probability=True, kernel=rbf, gamma=auto, score=0.953, total=295.2min
[CV] tol=0.0001, probability=True, kernel=rbf, gamma=auto ............
[CV]  tol=0.0001, probability=True, kernel=rbf, gamma=auto, score=0.950, total=287.1min
[CV] tol=0.0001, probability=True, kernel=rbf, gamma=auto ............
[CV]  tol=0.0001, probability=True, kernel=rbf, gamma=auto, score=0.950, total=291.2min
[CV] tol=0.0001, probability=True, kernel=rbf, gamma=auto ............
[CV]  tol=0.0001, probability=True, kernel=rbf, gamma=auto, score=0.960, total=289.2min
[CV] tol=0.0001, probability=True, kernel=rbf, gamma=auto ............
[CV]  tol=0.0001, probability=True, kernel=rbf, gamma=auto, score=0.952, total=286.9min
[CV] tol=0.0001, probability=True, kernel=rbf, gamma=scale ...........
[CV]  tol=0.0001, probability=True, kernel=rbf,

[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed: 3835.0min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=SVC(C=1.0, cache_size=200, class_weight=None,
                                 coef0=0.0, decision_function_shape='ovr',
                                 degree=3, gamma='auto_deprecated',
                                 kernel='rbf', max_iter=-1, probability=False,
                                 random_state=None, shrinking=True, tol=0.001,
                                 verbose=False),
                   iid='warn', n_iter=3, n_jobs=None,
                   param_distributions={'gamma': ['auto', 'scale'],
                                        'kernel': ['rbf'],
                                        'probability': [True],
                                        'tol': [0.001, 0.0001]},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False, scoring='f1_micro', verbose=5)

'SVM'

In [None]:

random_search_results = random_search.cv_results_
random_search_best_estimator = random_search.best_estimator_

# showing parameters and scores
scores = random_search.cv_results_['mean_test_score']
params = random_search.cv_results_['params']
import pandas as pd
pd_results = pd.DataFrame(params)
pd_results["score"] = scores
pd_results.sort_values(by="score", ascending = False)

print(pd_results)

# saving best model
params = random_search.best_params_
import json
params = json.dumps(params) # get string rep of dictionary
import joblib # joblib is apparently more efficient than pickle functions for model saving (see https://scikit-learn.org/stable/modules/model_persistence.html)
joblib.dump(random_search_best_estimator, 'saved_models/'+classifier_type+'_model_multiclass_best_params_'+params+'.pkl')

