<a href="https://colab.research.google.com/github/sp8rks/MaterialsInformatics/blob/main/worked_examples/hyperparameter_opt/materials_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Grid vs. Random Search Hyperparameter Optimization

## Setup

### Installation

In [1]:
!pip install matbench
!pip install CBFV



### Imports

In [2]:
import pandas as pd
from matbench.bench import MatbenchBenchmark
from CBFV.composition import generate_features

### Data

In [3]:
mb = MatbenchBenchmark(subset=["matbench_expt_is_metal"])
task = list(mb.tasks)[0]
task.load()
fold0 = task.folds[0]
train_inputs, train_outputs = task.get_train_and_val_data(fold0)
test_inputs, test_outputs = task.get_test_data(fold0, include_target=True)
print(train_inputs[0:2], train_outputs[0:2])
print(train_outputs.shape, test_outputs.shape)
        

2022-02-09 13:07:40 INFO     Initialized benchmark 'matbench_v0.1' with 1 tasks: 
['matbench_expt_is_metal']
2022-02-09 13:07:40 INFO     Loading dataset 'matbench_expt_is_metal'...
Fetching matbench_expt_is_metal.json.gz from https://ml.materialsproject.org/projects/matbench_expt_is_metal.json.gz to C:\Users\taylo\miniconda3\envs\my_pymatgen\lib\site-packages\matminer\datasets\matbench_expt_is_metal.json.gz


Fetching https://ml.materialsproject.org/projects/matbench_expt_is_metal.json.gz in MB: 0.034816MB [00:00, 19.99MB/s]  

2022-02-09 13:07:41 INFO     Dataset 'matbench_expt_is_metal loaded.
mbid
mb-expt-is-metal-0001      Ag(AuS)2
mb-expt-is-metal-0002    Ag(W3Br7)2
Name: composition, dtype: object mbid
mb-expt-is-metal-0001    True
mb-expt-is-metal-0002    True
Name: is_metal, dtype: bool
(3936,) (985,)





In [4]:
train_inputs.describe()

count         3936
unique        3936
top       Ag(AuS)2
freq             1
Name: composition, dtype: object

In [5]:
train_outputs.describe()

count      3936
unique        2
top       False
freq       1976
Name: is_metal, dtype: object

In [6]:
train_df = pd.DataFrame({"formula": train_inputs, "target": train_outputs})
test_df = pd.DataFrame({"formula": test_inputs, "target": test_outputs})
train_df

Unnamed: 0_level_0,formula,target
mbid,Unnamed: 1_level_1,Unnamed: 2_level_1
mb-expt-is-metal-0001,Ag(AuS)2,True
mb-expt-is-metal-0002,Ag(W3Br7)2,True
mb-expt-is-metal-0003,Ag0.5Ge1Pb1.75S4,False
mb-expt-is-metal-0005,Ag2BBr,True
mb-expt-is-metal-0006,Ag2BiO3,True
...,...,...
mb-expt-is-metal-4916,ZrSiTe,True
mb-expt-is-metal-4917,ZrTaN3,False
mb-expt-is-metal-4918,ZrTe,True
mb-expt-is-metal-4920,ZrTiF6,True


In [7]:
X_train, y_train, _, _ = generate_features(train_df)
X_train

Processing Input Data: 100%|█████████████████████████████████████████████████████| 3936/3936 [00:01<00:00, 3128.50it/s]


	Featurizing Compositions...


Assigning Features...: 100%|█████████████████████████████████████████████████████| 3936/3936 [00:02<00:00, 1854.98it/s]


	Creating Pandas Objects...


Unnamed: 0,avg_Atomic_Number,avg_Atomic_Weight,avg_Period,avg_group,avg_families,avg_Metal,avg_Nonmetal,avg_Metalliod,avg_Mendeleev_Number,avg_l_quantum_number,...,mode_polarizability(A^3),mode_Melting_point_(K),mode_Boiling_Point_(K),mode_Density_(g/mL),mode_specific_heat_(J/g_K)_,mode_heat_of_fusion_(kJ/mol)_,mode_heat_of_vaporization_(kJ/mol)_,mode_thermal_conductivity_(W/(m_K))_,mode_heat_atomization(kJ/mol),mode_Cohesive_energy
0,47.400000,113.186656,4.600000,13.000000,5.200000,0.600000,0.400000,0.000000,74.600000,1.200000,...,2.900,385.95,717.85,2.07000,0.128,1.71750,9.8000,0.26900,279.0,2.85
1,46.714286,110.931629,4.619048,13.571429,6.666667,0.333333,0.666667,0.000000,81.000000,1.238095,...,3.100,265.95,331.95,3.12000,0.473,5.28600,15.4380,0.12200,112.0,1.22
2,36.275862,85.159738,4.000000,14.896552,6.172414,0.310345,0.551724,0.137931,83.482759,0.931034,...,2.900,385.95,717.85,2.07000,0.710,1.71750,9.8000,0.26900,279.0,2.85
3,33.500000,76.612850,4.000000,13.000000,5.500000,0.500000,0.250000,0.250000,74.250000,0.500000,...,7.900,1235.15,2485.15,10.50000,0.235,11.30000,250.5800,429.00000,284.0,2.95
4,33.500000,78.785828,3.666667,14.166667,5.666667,0.500000,0.500000,0.000000,79.500000,0.666667,...,0.793,54.75,90.15,0.00143,0.920,0.22259,3.4099,0.02674,249.0,2.62
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3931,35.333333,82.303167,4.333333,11.333333,5.333333,0.333333,0.666667,0.000000,70.666667,1.333333,...,5.400,722.65,1262.95,2.33000,0.200,16.90000,52.5500,2.35000,197.0,2.19
3932,26.800000,62.838424,3.400000,10.800000,5.800000,0.400000,0.600000,0.000000,67.600000,1.400000,...,1.100,63.25,77.35,0.00125,1.040,0.36040,2.7928,0.02598,473.0,4.92
3933,46.000000,109.412000,5.000000,10.000000,5.000000,0.500000,0.500000,0.000000,67.000000,1.500000,...,5.500,722.65,1262.95,6.24000,0.200,16.90000,52.5500,2.35000,197.0,2.19
3934,14.500000,31.636802,2.625000,13.750000,7.000000,0.250000,0.750000,0.000000,80.625000,1.250000,...,0.634,53.35,85.05,0.00170,0.820,0.25520,3.2698,0.02790,79.0,0.84


In [8]:
X_test, y_test, _, _ = generate_features(test_df)
X_test

Processing Input Data: 100%|███████████████████████████████████████████████████████| 985/985 [00:00<00:00, 2955.72it/s]


	Featurizing Compositions...


Assigning Features...: 100%|███████████████████████████████████████████████████████| 985/985 [00:00<00:00, 1760.15it/s]


	Creating Pandas Objects...


Unnamed: 0,avg_Atomic_Number,avg_Atomic_Weight,avg_Period,avg_group,avg_families,avg_Metal,avg_Nonmetal,avg_Metalliod,avg_Mendeleev_Number,avg_l_quantum_number,...,mode_polarizability(A^3),mode_Melting_point_(K),mode_Boiling_Point_(K),mode_Density_(g/mL),mode_specific_heat_(J/g_K)_,mode_heat_of_fusion_(kJ/mol)_,mode_heat_of_vaporization_(kJ/mol)_,mode_thermal_conductivity_(W/(m_K))_,mode_heat_atomization(kJ/mol),mode_Cohesive_energy
0,46.206897,111.032290,4.551724,14.896552,6.172414,0.310345,0.551724,0.137931,84.034483,0.931034,...,3.800,490.15,958.15,4.79000,0.320,6.69400,37.7000,0.52000,227.0,2.46
1,25.750000,57.815474,3.375000,14.000000,5.750000,0.375000,0.500000,0.125000,78.375000,0.625000,...,0.793,54.75,90.15,0.00143,0.920,0.22259,3.4099,0.02674,249.0,2.62
2,46.750000,107.506150,5.000000,10.750000,4.000000,1.000000,0.000000,0.000000,64.250000,0.500000,...,7.900,1235.15,2485.15,10.50000,0.235,11.30000,250.5800,429.00000,284.0,2.95
3,27.125000,61.084025,3.500000,13.125000,5.500000,0.500000,0.500000,0.000000,74.875000,0.750000,...,0.793,54.75,90.15,0.00143,0.920,0.22259,3.4099,0.02674,249.0,2.62
4,42.454545,97.547122,4.636364,13.000000,5.272727,0.636364,0.363636,0.000000,74.818182,0.363636,...,7.900,1235.15,2485.15,10.50000,0.235,11.30000,250.5800,429.00000,284.0,2.95
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
980,24.000000,51.785333,3.666667,12.000000,6.000000,0.333333,0.666667,0.000000,73.333333,1.333333,...,2.900,385.95,717.85,2.07000,0.710,1.71750,9.8000,0.26900,279.0,2.85
981,45.000000,104.684667,5.000000,9.000000,4.666667,0.666667,0.333333,0.000000,61.666667,1.666667,...,6.600,904.15,2223.15,6.51000,0.210,16.90000,77.1400,22.70000,262.0,2.75
982,37.000000,85.092000,4.500000,10.000000,5.500000,0.500000,0.500000,0.000000,66.500000,1.500000,...,3.800,490.15,958.15,4.79000,0.270,6.69400,37.7000,0.52000,227.0,2.46
983,22.666667,49.131667,3.666667,10.666667,5.333333,0.333333,0.666667,0.000000,66.666667,1.333333,...,5.400,1683.15,2628.15,2.33000,0.710,50.55000,384.2200,148.00000,452.0,4.63


## Train

We can do hyperparameter tuning in different ways. Two common ways are grid search (less efficient) and random search (more efficient). Below are examples taken/modified from the website https://www.geeksforgeeks.org/hyperparameter-tuning/


In [13]:
#Grid search first using logistic regression classifier model
#Necessary imports
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import numpy as np

# Creating the hyperparameter grid
c_space = np.logspace(-5, 8, 15)
param_grid = {'C': c_space}
  
# Instantiating logistic regression classifier
logreg = LogisticRegression()
  
# Instantiating the GridSearchCV object
logreg_grid = GridSearchCV(logreg, param_grid, cv = 5)
  
logreg_grid.fit(X_train, y_train)
  
# Print the tuned parameters and score
print("Grid tuned Logistic Regression Parameters: {}".format(logreg_grid.best_params_)) 
print("Best score is {}".format(logreg_grid.best_score_))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Tuned Logistic Regression Parameters: {'C': 1e-05}
Best score is 0.8180864169660538


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
#Now we can try random search with logistic regression
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
  
# Creating the hyperparameter grid 
param_dist = {"C": randint(-5,15)}
  
# Instantiating Decision Tree classifier
logreg = LogisticRegression()
  
# Instantiating RandomizedSearchCV object
logreg_random = RandomizedSearchCV(logreg, param_dist, cv = 5)
  
logreg_random.fit(X_train, y_train)
  
# Print the tuned parameters and score
print("Random tuned Logistic Regression Parameters: {}".format(logreg_random.best_params_))
print("Best score is {}".format(logreg_random.best_score_))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Tuned Decision Tree Parameters: {'C': 7}
Best score is 0.8201165513193456


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


We can do the same grid vs random search with another model, like a decision tree classifier

In [16]:
#grid search for decision tree hyperparameters
  
# Creating the hyperparameter grid 
param_grid = {"max_depth": range(1,10),
              "max_features": range(1,10),
              "min_samples_leaf": range(1,10),
              "criterion": ["gini", "entropy"]}

# Instantiating Decision Tree classifier
tree = DecisionTreeClassifier()
  
# Instantiating GridSearchCV object
tree_grid = GridSearchCV(tree, param_grid, cv = 5)
  
tree_grid.fit(X_train, y_train)
  
# Print the tuned parameters and score
print("Grid tuned Decision Tree Parameters: {}".format(tree_grid.best_params_))
print("Best score is {}".format(tree_grid.best_score_))


Grid tuned Decision Tree Parameters: {'criterion': 'gini', 'max_depth': 6, 'max_features': 9, 'min_samples_leaf': 1}
Best score is 0.8414673082256721


In [19]:
#random search for decision tree hyperparameters
  
# Creating the hyperparameter grid 
param_dist = {"max_depth": randint(1,10),
              "max_features": randint(1,10),
              "min_samples_leaf": randint(1,10),
              "criterion": ["gini", "entropy"]}

# Instantiating Decision Tree classifier
tree = DecisionTreeClassifier()
  
# Instantiating RandomizedSearchCV object
tree_random = RandomizedSearchCV(tree, param_dist, cv = 5)
  
tree_random.fit(X_train, y_train)
  
# Print the tuned parameters and score
print("Random tuned Decision Tree Parameters: {}".format(tree_random.best_params_))
print("Best score is {}".format(tree_random.best_score_))


Random tuned Decision Tree Parameters: {'criterion': 'gini', 'max_depth': 7, 'max_features': 5, 'min_samples_leaf': 1}
Best score is 0.8246947542231309


## Test