# HEAPML Project
## Random Forest
This notebook outlines the steps needed to train the RF model for the HEAPML project.

In [2]:
### GENERAL IMPORTS ###
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

### PYMATGEN/MATMINER IMPORTS ###
from matminer.featurizers import composition as cf
from matminer.featurizers.base import MultipleFeaturizer

### SKLEARN IMPORTS ###
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score, train_test_split, RepeatedStratifiedKFold
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score, precision_score, recall_score

### SCIKIT-OPTIMIZE (SKOPT) IMPORTS ###
from skopt.space import Real, Integer
from skopt.utils import use_named_args
from skopt import gp_minimize
from skopt.plots import plot_convergence

### 1. Import Featurized Data

In [3]:
# The featurizer is instantiated only to obtain its feature label list;
# the featurized values themselves are read from the CSV below.
feature_calculators = MultipleFeaturizer([
    cf.Stoichiometry(),
    cf.ElementProperty.from_preset('magpie'),
])
feature_labels = feature_calculators.feature_labels()

# Load the previously featurized alloy table.
alloys = pd.read_csv('./data/featurized_alloys.csv')

display(alloys)

Unnamed: 0,formula,phase,composition_obj,0-norm,2-norm,3-norm,5-norm,7-norm,10-norm,MagpieData minimum Number,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
0,AgAlCoCrCuNi,3,Ag1 Al1 Co1 Cr1 Cu1 Ni1,6,0.408248,0.302853,0.238495,0.215285,0.199372,13.0,...,1.548471,0.357311,0.476415,0.0,194.0,229.0,35.0,220.500000,8.833333,194.0
1,AgCoCrFeMnNi,1,Ag1 Co1 Cr1 Fe1 Mn1 Ni1,6,0.408248,0.302853,0.238495,0.215285,0.199372,24.0,...,2.110663,0.709140,0.746951,0.0,194.0,229.0,35.0,219.833333,9.555556,194.0
2,Al0.02CoCrFeMnNi,1,Al0.02 Co1 Cr1 Fe1 Mn1 Ni1,6,0.445450,0.340633,0.274847,0.250697,0.233988,13.0,...,2.110663,0.847577,0.782462,0.0,194.0,229.0,35.0,218.824701,10.617292,194.0
3,Al0.03CoCrFeMnNi,1,Al0.03 Co1 Cr1 Fe1 Mn1 Ni1,6,0.444586,0.339956,0.274300,0.250199,0.233523,13.0,...,2.110663,0.845892,0.782246,0.0,194.0,229.0,35.0,218.836978,10.605947,194.0
4,Al0.04CoCrFeMnNi,1,Al0.04 Co1 Cr1 Fe1 Mn1 Ni1,6,0.443735,0.339282,0.273756,0.249702,0.233059,13.0,...,2.110663,0.844214,0.782026,0.0,194.0,229.0,35.0,218.849206,10.594608,194.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1068,Zr2.0TiHfVNb2.0,1,Zr2 Ti1 Hf1 V1 Nb2,5,0.473804,0.381200,0.331220,0.315980,0.306266,22.0,...,0.000023,0.000003,0.000006,0.0,194.0,229.0,35.0,209.000000,17.142857,194.0
1069,ZrTiHfCuNiFe,1,Zr1 Ti1 Hf1 Cu1 Ni1 Fe1,6,0.408248,0.302853,0.238495,0.215285,0.199372,22.0,...,2.110663,0.451013,0.601344,0.0,194.0,229.0,35.0,210.166667,16.166667,194.0
1070,ZrTiHfNb0.5Mo0.5,1,Zr1 Ti1 Hf1 Nb0.5 Mo0.5,5,0.467707,0.370312,0.312720,0.292700,0.279049,22.0,...,0.000023,0.000006,0.000008,0.0,194.0,229.0,35.0,202.750000,13.125000,194.0
1071,ZrTiHfNb0.5Ta0.5,1,Zr1 Ti1 Hf1 Nb0.5 Ta0.5,5,0.467707,0.370312,0.312720,0.292700,0.279049,22.0,...,0.000023,0.000006,0.000008,0.0,194.0,229.0,35.0,202.750000,13.125000,194.0


### 2. Generate Dataset
*Note: the `formula`, `phase` and `composition_obj` columns are excluded from $X$ (`phase` is used as the target $y$; the other two are text columns, not numeric features).*

In [4]:
# Every column except the target and the two text columns is a model input.
non_feature_cols = {'formula', 'phase', 'composition_obj'}
x_cols = [col for col in alloys.columns if col not in non_feature_cols]

# Feature matrix and target vector (the target is the 'phase' column).
X = alloys.loc[:, x_cols]
y = alloys['phase'].to_numpy()

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

display(X)

Unnamed: 0,0-norm,2-norm,3-norm,5-norm,7-norm,10-norm,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
0,6,0.408248,0.302853,0.238495,0.215285,0.199372,13.0,47.0,34.0,28.000000,...,1.548471,0.357311,0.476415,0.0,194.0,229.0,35.0,220.500000,8.833333,194.0
1,6,0.408248,0.302853,0.238495,0.215285,0.199372,24.0,47.0,23.0,29.500000,...,2.110663,0.709140,0.746951,0.0,194.0,229.0,35.0,219.833333,9.555556,194.0
2,6,0.445450,0.340633,0.274847,0.250697,0.233988,13.0,28.0,15.0,25.948207,...,2.110663,0.847577,0.782462,0.0,194.0,229.0,35.0,218.824701,10.617292,194.0
3,6,0.444586,0.339956,0.274300,0.250199,0.233523,13.0,28.0,15.0,25.922465,...,2.110663,0.845892,0.782246,0.0,194.0,229.0,35.0,218.836978,10.605947,194.0
4,6,0.443735,0.339282,0.273756,0.249702,0.233059,13.0,28.0,15.0,25.896825,...,2.110663,0.844214,0.782026,0.0,194.0,229.0,35.0,218.849206,10.594608,194.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1068,5,0.473804,0.381200,0.331220,0.315980,0.306266,22.0,72.0,50.0,39.857143,...,0.000023,0.000003,0.000006,0.0,194.0,229.0,35.0,209.000000,17.142857,194.0
1069,6,0.408248,0.302853,0.238495,0.215285,0.199372,22.0,72.0,50.0,36.166667,...,2.110663,0.451013,0.601344,0.0,194.0,229.0,35.0,210.166667,16.166667,194.0
1070,5,0.467707,0.370312,0.312720,0.292700,0.279049,22.0,72.0,50.0,43.875000,...,0.000023,0.000006,0.000008,0.0,194.0,229.0,35.0,202.750000,13.125000,194.0
1071,5,0.467707,0.370312,0.312720,0.292700,0.279049,22.0,73.0,51.0,47.750000,...,0.000023,0.000006,0.000008,0.0,194.0,229.0,35.0,202.750000,13.125000,194.0


### 3. Train Model

In [None]:
# Baseline forest: only the seed and the parallelism are set explicitly,
# every other hyperparameter is left at its library default.
rf = RandomForestClassifier(n_jobs=-1, random_state=0)

# Fit on the training split (kept as the cell's last expression so the
# fitted estimator is still shown as the cell output).
rf.fit(X=X_train, y=y_train)

### 4. Evaluate Model

In [6]:
# Predict on the held-out split and report the three headline metrics.
y_pred = rf.predict(X_test)

# (format string, score) pairs; precision and recall are macro-averaged over classes.
test_scores = (
    ('Accuracy: %.6f', accuracy_score(y_test, y_pred)),
    ('Precision: %.6f', precision_score(y_test, y_pred, average='macro')),
    ('Recall: %.6f', recall_score(y_test, y_pred, average='macro')),
)
for template, score in test_scores:
    print(template % score)

Accuracy: 0.681690
Precision: 0.648348
Recall: 0.628710


### 5. Feature Selection

In [7]:
# Rank features using permutation importance.
#
# The result is bound to its own name. Assigning it back to
# `permutation_importance` would shadow the function imported from
# sklearn.inspection, so running this cell a second time in the same kernel
# would fail with "object is not callable".
#
# NOTE(review): the importances are computed on the training split, and the
# displayed mean scores are all of order 1e-3 or negative. Consider scoring
# on a separate validation split instead - TODO confirm intent.
perm_result = permutation_importance(rf, X_train, y_train, n_repeats=20, random_state=0, n_jobs=-1)

# Pair each score with the column it was actually computed for. Taking the
# labels from X_train keeps names and scores aligned even if the featurizer's
# label order ever differs from the column order of the CSV.
p_i = sorted(zip(X_train.columns, perm_result.importances_mean), key=lambda pair: pair[1], reverse=True)
p_i = pd.DataFrame(p_i, columns=['Label', 'Mean Score'])

display(p_i)

Unnamed: 0,Label,Mean Score
0,MagpieData avg_dev CovalentRadius,0.000975
1,MagpieData mean NdValence,0.000836
2,MagpieData mean Electronegativity,0.000627
3,MagpieData mean NsUnfilled,0.000418
4,0-norm,0.000348
...,...,...
133,MagpieData mean Row,-0.000279
134,MagpieData mean NdUnfilled,-0.000279
135,MagpieData range GSvolume_pa,-0.000279
136,MagpieData range NUnfilled,-0.000348


### 6. Regenerate Dataset

In [8]:
# Number of top-ranked features (by mean permutation importance) to keep.
feature_count = 26

# Labels of the best-scoring features, in ranking order.
feature_selection = p_i['Label'].head(feature_count).values

# Keep the selected columns in their original dataframe order.
selected_labels = set(feature_selection)
x_cols = [col for col in alloys.columns if col in selected_labels]

# Rebuild the feature matrix and target from the reduced column set.
X = alloys.loc[:, x_cols]
y = alloys['phase'].to_numpy()

# Same split parameters as before, so the same rows land in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

display(X)

# List the retained features, best first.
for label in feature_selection:
    print(label)

Unnamed: 0,0-norm,MagpieData avg_dev Number,MagpieData mean MendeleevNumber,MagpieData avg_dev AtomicWeight,MagpieData maximum MeltingT,MagpieData range MeltingT,MagpieData mean Column,MagpieData avg_dev Column,MagpieData avg_dev Row,MagpieData range CovalentRadius,...,MagpieData avg_dev NpValence,MagpieData mean NdValence,MagpieData mean NsUnfilled,MagpieData avg_dev NpUnfilled,MagpieData avg_dev NdUnfilled,MagpieData mean NUnfilled,MagpieData mean GSvolume_pa,MagpieData avg_dev GSvolume_pa,MagpieData mean GSmagmom,MagpieData avg_dev SpaceGroupNumber
0,6,6.666667,61.666667,16.247130,2180.0,1246.53,10.000000,1.666667,0.333333,24.0,...,0.277778,6.666667,0.500000,1.388889,1.666667,3.000000,12.605833,2.532778,0.357311,8.833333
1,6,5.833333,56.666667,14.385292,2180.0,945.07,8.500000,1.500000,0.277778,21.0,...,0.000000,6.833333,0.333333,0.000000,1.500000,3.500000,11.550431,1.593190,0.709140,9.555556
2,6,1.257123,55.071713,2.269390,2180.0,1246.53,8.019920,1.219028,0.007936,18.0,...,0.007936,6.175299,0.199203,0.039682,1.053952,4.003984,10.617965,0.319247,0.847577,10.617292
3,6,1.285330,55.107356,2.310700,2180.0,1246.53,8.029821,1.228415,0.011857,18.0,...,0.011857,6.163022,0.198807,0.059286,1.060832,4.005964,10.629620,0.332514,0.845892,10.605947
4,6,1.313303,55.142857,2.351666,2180.0,1246.53,8.039683,1.237717,0.015747,18.0,...,0.015747,6.150794,0.198413,0.078735,1.067649,4.007937,10.641227,0.345673,0.844214,10.594608
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1068,5,9.918367,45.142857,25.038430,2750.0,809.00,4.428571,0.489796,0.489796,22.0,...,0.000000,2.714286,0.285714,0.000000,0.816327,7.571429,19.235714,3.109388,0.000003,17.142857
1069,6,13.222222,52.000000,34.830733,2506.0,1148.23,6.833333,2.833333,0.666667,51.0,...,0.000000,5.000000,0.166667,0.000000,3.000000,5.166667,15.700833,4.994167,0.451013,16.166667
1070,5,14.062500,45.125000,37.743226,2896.0,955.00,4.375000,0.562500,0.500000,21.0,...,0.000000,2.625000,0.250000,0.000000,0.937500,7.625000,19.755000,2.942500,0.000006,13.125000
1071,5,18.437500,44.875000,49.261696,3290.0,1349.00,4.250000,0.375000,0.656250,15.0,...,0.000000,2.375000,0.125000,0.000000,0.562500,7.750000,20.058750,2.638750,0.000006,13.125000


MagpieData avg_dev CovalentRadius
MagpieData mean NdValence
MagpieData mean Electronegativity
MagpieData mean NsUnfilled
0-norm
MagpieData avg_dev NdUnfilled
MagpieData avg_dev SpaceGroupNumber
MagpieData mean GSvolume_pa
MagpieData avg_dev Electronegativity
MagpieData avg_dev GSvolume_pa
MagpieData mean GSmagmom
MagpieData avg_dev Number
MagpieData mean MendeleevNumber
MagpieData avg_dev AtomicWeight
MagpieData maximum MeltingT
MagpieData avg_dev Column
MagpieData range CovalentRadius
MagpieData mode CovalentRadius
MagpieData mean NsValence
MagpieData avg_dev NpValence
MagpieData avg_dev NpUnfilled
MagpieData mean NUnfilled
MagpieData range MeltingT
MagpieData mean Column
MagpieData avg_dev Row
MagpieData mean CovalentRadius


### 7. Retrain Model

In [None]:
# Fresh forest with library-default hyperparameters (only the seed and the
# parallelism are set), to be fit on the reduced feature set.
rf = RandomForestClassifier(n_jobs=-1, random_state=0)

# Fit on the training split; the fitted estimator is the cell output.
rf.fit(X=X_train, y=y_train)

### 8. Re-evaluate Model

In [10]:
# Score the refit model on the held-out split.
y_pred = rf.predict(X_test)

# (format string, score) pairs; precision and recall are macro-averaged over classes.
test_scores = (
    ('Accuracy: %.6f', accuracy_score(y_test, y_pred)),
    ('Precision: %.6f', precision_score(y_test, y_pred, average='macro')),
    ('Recall: %.6f', recall_score(y_test, y_pred, average='macro')),
)
for template, score in test_scores:
    print(template % score)

Accuracy: 0.721127
Precision: 0.689247
Recall: 0.664312


### 9. Tune Hyperparameters

In [10]:
# Search space for the hyperparameter optimisation. Each dimension is named
# after the RandomForestClassifier parameter it is applied to in `objective`.

# The ceiling for max_features follows the current training matrix. The
# dataset was reduced to the selected features before this cell, so the
# previous hard-coded 138 (apparently the pre-selection feature count) was
# larger than the number of columns actually available.
n_features = X_train.shape[1]

space = [Integer(10, 10**5, 'log-uniform', name='n_estimators'),
         Integer(1, 10**5, 'log-uniform', name='max_depth'),
         Integer(2, 10**3, 'uniform', name='min_samples_split'),
         Integer(1, 10**3, 'uniform', name='min_samples_leaf'),
         Real(10**-7, 0.5, 'log-uniform', name='min_weight_fraction_leaf'),
         Integer(1, n_features, 'uniform', name='max_features'),
         Integer(2, 10**5, 'log-uniform', name='max_leaf_nodes'),
         Real(10**-5, 10**-1, 'log-uniform', name='ccp_alpha'),
         # NOTE(review): 138 also looks copied from the old feature count, but
         # max_samples counts training rows drawn per tree, not columns.
         # Left unchanged - TODO confirm the intended upper bound.
         Integer(1, 138, 'uniform', name='max_samples')]

In [11]:
@use_named_args(space)
def objective(**params):
    """Return the cross-validated error rate of a forest built from `params`.

    Parameters
    ----------
    **params
        RandomForestClassifier keyword arguments, applied with `set_params`.
        The names are expected to match the dimensions declared in `space`.

    Returns
    -------
    float
        One minus the mean accuracy over all folds, so that a minimiser
        searching for the smallest value maximises accuracy.
    """
    # 10 stratified folds repeated 3 times; seeded so every call sees the same splits.
    splitter = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)
    model = RandomForestClassifier(random_state=0, n_jobs=-1).set_params(**params)
    fold_scores = cross_val_score(model, X_train, y_train, cv=splitter, scoring='accuracy')
    return 1.0 - np.mean(fold_scores)

class tqdm_skopt:
    """Callable wrapper around a tqdm progress bar.

    Every call advances the bar by one step, which lets an instance be used
    as a per-iteration callback.
    """

    def __init__(self, **kwargs):
        """Create the underlying bar; all keyword arguments go straight to tqdm."""
        self._bar = tqdm(**kwargs)

    def __call__(self, res):
        """Advance the bar by one step; `res` is accepted but not used."""
        self._bar.update()

In [None]:
# Total number of objective evaluations for the optimiser.
n_calls = 100

# Progress bar sized to the evaluation budget; it advances each time the
# optimiser invokes it as a callback.
progress = tqdm_skopt(total=n_calls, desc="Progress")

result = gp_minimize(
    objective,
    space,
    n_calls=n_calls,
    random_state=0,
    callback=[progress],
)

# Plot the convergence trace and save the figure to disk.
plot_convergence(result)
plt.savefig('plot.png')

# Best parameter vector found, and the accuracy it corresponds to
# (the objective returns 1 - accuracy).
best_params, best_error = result.x, result.fun
print(best_params)
print(1-best_error)

### 10. Retrain Model

In [None]:
# Define the model with explicitly chosen (non-default) hyperparameters.
# NOTE(review): these values are hard-coded, presumably copied from an earlier
# tuning run - confirm they match the printed `result.x` before relying on them.
# n_jobs=-1 is set for consistency with the other forests in this notebook;
# it only affects parallelism.
rf = RandomForestClassifier(n_estimators=1000,
                            max_depth=1000,
                            min_samples_split=2,
                            min_samples_leaf=1,
                            min_weight_fraction_leaf=1e-05,
                            max_features=17,
                            max_leaf_nodes=681,
                            ccp_alpha=0.0017758739729781885,
                            max_samples=138,
                            random_state=0,
                            n_jobs=-1)

# Train model
rf.fit(X_train, y_train)

# Evaluate on the held-out split; precision and recall are macro-averaged.
y_pred = rf.predict(X_test)

print('Accuracy: %.6f' % accuracy_score(y_test, y_pred))
print('Precision: %.6f' % precision_score(y_test, y_pred, average='macro'))
print('Recall: %.6f' % recall_score(y_test, y_pred, average='macro'))