# HEAPML Project
## Gradient Boosted Decision Tree
This notebook outlines the steps needed to train the GBDT model for the HEAPML project.

In [1]:
### GENERAL IMPORTS ###
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

### PYMATGEN/MATMINER IMPORTS ###
from matminer.featurizers import composition as cf
from matminer.featurizers.base import MultipleFeaturizer

### SKLEARN IMPORTS ###
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold, cross_val_score, train_test_split, RepeatedStratifiedKFold
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score, precision_score, recall_score

### SCIKIT-OPTIMIZE IMPORTS ###
from skopt.space import Real, Integer
from skopt.utils import use_named_args
from skopt import gp_minimize
from skopt.plots import plot_convergence

### 1. Import Featurized Data

In [2]:
# Build the stoichiometry + Magpie featurizer pipeline; it is used here only
# to obtain the list of feature names it produces.
feature_calculators = MultipleFeaturizer(
    [
        cf.Stoichiometry(),
        cf.ElementProperty.from_preset('magpie'),
    ]
)
feature_labels = feature_calculators.feature_labels()

# Load the already-featurized alloy table from disk.
alloys = pd.read_csv('./data/featurized_alloys.csv')

display(alloys)

Unnamed: 0,formula,phase,composition_obj,0-norm,2-norm,3-norm,5-norm,7-norm,10-norm,MagpieData minimum Number,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
0,AgAlCoCrCuNi,3,Ag1 Al1 Co1 Cr1 Cu1 Ni1,6,0.408248,0.302853,0.238495,0.215285,0.199372,13.0,...,1.548471,0.357311,0.476415,0.0,194.0,229.0,35.0,220.500000,8.833333,194.0
1,AgCoCrFeMnNi,1,Ag1 Co1 Cr1 Fe1 Mn1 Ni1,6,0.408248,0.302853,0.238495,0.215285,0.199372,24.0,...,2.110663,0.709140,0.746951,0.0,194.0,229.0,35.0,219.833333,9.555556,194.0
2,Al0.02CoCrFeMnNi,1,Al0.02 Co1 Cr1 Fe1 Mn1 Ni1,6,0.445450,0.340633,0.274847,0.250697,0.233988,13.0,...,2.110663,0.847577,0.782462,0.0,194.0,229.0,35.0,218.824701,10.617292,194.0
3,Al0.03CoCrFeMnNi,1,Al0.03 Co1 Cr1 Fe1 Mn1 Ni1,6,0.444586,0.339956,0.274300,0.250199,0.233523,13.0,...,2.110663,0.845892,0.782246,0.0,194.0,229.0,35.0,218.836978,10.605947,194.0
4,Al0.04CoCrFeMnNi,1,Al0.04 Co1 Cr1 Fe1 Mn1 Ni1,6,0.443735,0.339282,0.273756,0.249702,0.233059,13.0,...,2.110663,0.844214,0.782026,0.0,194.0,229.0,35.0,218.849206,10.594608,194.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1068,Zr2.0TiHfVNb2.0,1,Zr2 Ti1 Hf1 V1 Nb2,5,0.473804,0.381200,0.331220,0.315980,0.306266,22.0,...,0.000023,0.000003,0.000006,0.0,194.0,229.0,35.0,209.000000,17.142857,194.0
1069,ZrTiHfCuNiFe,1,Zr1 Ti1 Hf1 Cu1 Ni1 Fe1,6,0.408248,0.302853,0.238495,0.215285,0.199372,22.0,...,2.110663,0.451013,0.601344,0.0,194.0,229.0,35.0,210.166667,16.166667,194.0
1070,ZrTiHfNb0.5Mo0.5,1,Zr1 Ti1 Hf1 Nb0.5 Mo0.5,5,0.467707,0.370312,0.312720,0.292700,0.279049,22.0,...,0.000023,0.000006,0.000008,0.0,194.0,229.0,35.0,202.750000,13.125000,194.0
1071,ZrTiHfNb0.5Ta0.5,1,Zr1 Ti1 Hf1 Nb0.5 Ta0.5,5,0.467707,0.370312,0.312720,0.292700,0.279049,22.0,...,0.000023,0.000006,0.000008,0.0,194.0,229.0,35.0,202.750000,13.125000,194.0


### 2. Generate Dataset
*Note: the formula, phase and composition_obj columns are removed from $X$*

In [3]:
# Columns that identify or label an alloy rather than describe it; everything
# else in the table is used as a model input.
non_feature_cols = {'formula', 'phase', 'composition_obj'}
x_cols = [col for col in alloys.columns if col not in non_feature_cols]

# Target: the phase label. Inputs: every remaining column.
y = alloys['phase'].values
X = alloys.loc[:, x_cols]

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

display(X)

Unnamed: 0,0-norm,2-norm,3-norm,5-norm,7-norm,10-norm,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
0,6,0.408248,0.302853,0.238495,0.215285,0.199372,13.0,47.0,34.0,28.000000,...,1.548471,0.357311,0.476415,0.0,194.0,229.0,35.0,220.500000,8.833333,194.0
1,6,0.408248,0.302853,0.238495,0.215285,0.199372,24.0,47.0,23.0,29.500000,...,2.110663,0.709140,0.746951,0.0,194.0,229.0,35.0,219.833333,9.555556,194.0
2,6,0.445450,0.340633,0.274847,0.250697,0.233988,13.0,28.0,15.0,25.948207,...,2.110663,0.847577,0.782462,0.0,194.0,229.0,35.0,218.824701,10.617292,194.0
3,6,0.444586,0.339956,0.274300,0.250199,0.233523,13.0,28.0,15.0,25.922465,...,2.110663,0.845892,0.782246,0.0,194.0,229.0,35.0,218.836978,10.605947,194.0
4,6,0.443735,0.339282,0.273756,0.249702,0.233059,13.0,28.0,15.0,25.896825,...,2.110663,0.844214,0.782026,0.0,194.0,229.0,35.0,218.849206,10.594608,194.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1068,5,0.473804,0.381200,0.331220,0.315980,0.306266,22.0,72.0,50.0,39.857143,...,0.000023,0.000003,0.000006,0.0,194.0,229.0,35.0,209.000000,17.142857,194.0
1069,6,0.408248,0.302853,0.238495,0.215285,0.199372,22.0,72.0,50.0,36.166667,...,2.110663,0.451013,0.601344,0.0,194.0,229.0,35.0,210.166667,16.166667,194.0
1070,5,0.467707,0.370312,0.312720,0.292700,0.279049,22.0,72.0,50.0,43.875000,...,0.000023,0.000006,0.000008,0.0,194.0,229.0,35.0,202.750000,13.125000,194.0
1071,5,0.467707,0.370312,0.312720,0.292700,0.279049,22.0,73.0,51.0,47.750000,...,0.000023,0.000006,0.000008,0.0,194.0,229.0,35.0,202.750000,13.125000,194.0


### 3. Train Model

In [None]:
# Baseline classifier: library-default hyperparameters, with a fixed seed so
# repeated runs build the same ensemble.
gbdt = GradientBoostingClassifier(random_state=0)

# Fit on the training split (left as the final expression so the fitted
# estimator is echoed as the cell output).
gbdt.fit(X=X_train, y=y_train)

### 4. Evaluate Model

In [6]:
# Predict phases for the held-out rows.
y_pred = gbdt.predict(X_test)

# Accuracy plus macro-averaged precision/recall (each class weighted equally).
macro_scores = (
    ('Accuracy: %.6f', accuracy_score(y_test, y_pred)),
    ('Precision: %.6f', precision_score(y_test, y_pred, average='macro')),
    ('Recall: %.6f', recall_score(y_test, y_pred, average='macro')),
)
for template, score in macro_scores:
    print(template % score)

Accuracy: 0.656338
Precision: 0.613606
Recall: 0.594534


### 5. Feature Selection

In [7]:
# Rank features by permutation importance of the fitted model, using 20
# shuffles per feature and a fixed seed for repeatability.
# NOTE: the scores are measured on the training split (X_train, y_train).
#
# The result is bound to its own name. Assigning it to `permutation_importance`
# would overwrite the imported function, so re-running this cell (or calling
# the function again later) would fail with "object is not callable".
perm_result = permutation_importance(gbdt, X_train, y_train, n_repeats=20, random_state=0, n_jobs=-1)

# Pair every score with the column it was computed for. Taking the names from
# X_train itself guarantees labels and scores line up one-to-one; zipping with a
# separately built label list would silently truncate or shift on any mismatch.
ranked_pairs = sorted(
    zip(X_train.columns, perm_result.importances_mean),
    key=lambda pair: pair[1],
    reverse=True,
)

# Most important feature first.
p_i = pd.DataFrame(ranked_pairs, columns=['Label', 'Mean Score'])

display(p_i)

Unnamed: 0,Label,Mean Score
0,MagpieData avg_dev Column,0.035724
1,MagpieData avg_dev GSvolume_pa,0.028134
2,MagpieData avg_dev CovalentRadius,0.028064
3,MagpieData range GSvolume_pa,0.021379
4,MagpieData mean NValence,0.017618
...,...,...
133,MagpieData mean NsValence,-0.000975
134,MagpieData avg_dev AtomicWeight,-0.001184
135,MagpieData minimum MeltingT,-0.002298
136,MagpieData mean CovalentRadius,-0.002577


### 6. Regenerate Dataset

In [20]:
# Number of top-ranked features to keep.
feature_count = 26

# Names of the highest-scoring features, in ranking order.
feature_selection = p_i['Label'].head(feature_count).values

# Keep only the selected columns, preserving the column order of `alloys`.
selected_names = set(feature_selection)
x_cols = [col for col in alloys.columns if col in selected_names]

# Same target as before; inputs restricted to the selected features.
y = alloys['phase'].values
X = alloys.loc[:, x_cols]

# Same split settings as the first split (33% held out, seed 0).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

display(X)

# List the selected feature names, most important first.
for feature in feature_selection:
    print(feature)

Unnamed: 0,0-norm,2-norm,3-norm,5-norm,7-norm,MagpieData mean MendeleevNumber,MagpieData mean AtomicWeight,MagpieData range MeltingT,MagpieData mean MeltingT,MagpieData avg_dev MeltingT,...,MagpieData mean NValence,MagpieData avg_dev NValence,MagpieData avg_dev NpUnfilled,MagpieData avg_dev NUnfilled,MagpieData range GSvolume_pa,MagpieData mean GSvolume_pa,MagpieData avg_dev GSvolume_pa,MagpieData avg_dev GSmagmom,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber
0,6,0.408248,0.302853,0.238495,0.215285,61.666667,61.336406,1246.53,1533.695000,358.305000,...,8.333333,2.555556,1.388889,1.666667,6.235,12.605833,2.532778,0.476415,220.500000,8.833333
1,6,0.408248,0.302853,0.238495,0.215285,56.666667,64.712323,945.07,1706.821667,219.904444,...,8.500000,1.500000,0.000000,1.500000,6.085,11.550431,1.593190,0.746951,219.833333,9.555556
2,6,0.445450,0.340633,0.274847,0.250697,55.071713,55.965213,1246.53,1797.742908,157.575372,...,7.980080,1.219028,0.039682,1.199981,6.235,10.617965,0.319247,0.782462,218.824701,10.617292
3,6,0.444586,0.339956,0.274300,0.250199,55.107356,55.907592,1246.53,1796.024672,158.628491,...,7.970179,1.228415,0.059286,1.199957,6.235,10.629620,0.332514,0.782246,218.836978,10.605947
4,6,0.443735,0.339282,0.273756,0.249702,55.142857,55.850199,1246.53,1794.313254,159.672021,...,7.960317,1.237717,0.078735,1.199924,6.235,10.641227,0.345673,0.782026,218.849206,10.594608
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1068,5,0.473804,0.381200,0.331220,0.315980,45.142857,92.222751,809.00,2340.857143,280.979592,...,6.428571,3.306122,0.000000,0.489796,10.185,19.235714,3.109388,0.000006,209.000000,17.142857
1069,6,0.408248,0.302853,0.238495,0.215285,52.000000,82.610900,1148.23,1911.961667,279.705000,...,9.166667,3.833333,0.000000,2.833333,12.875,15.700833,4.994167,0.601344,210.166667,16.166667
1070,5,0.467707,0.370312,0.312720,0.292700,45.125000,103.003547,955.00,2349.500000,315.000000,...,7.875000,5.062500,0.000000,0.562500,7.505,19.755000,2.942500,0.000008,202.750000,13.125000
1071,5,0.467707,0.370312,0.312720,0.292700,44.875000,113.627032,1349.00,2398.750000,364.250000,...,9.500000,6.625000,0.000000,0.375000,6.505,20.058750,2.638750,0.000008,202.750000,13.125000


MagpieData avg_dev Column
MagpieData avg_dev GSvolume_pa
MagpieData avg_dev CovalentRadius
MagpieData range GSvolume_pa
MagpieData mean NValence
MagpieData mean SpaceGroupNumber
MagpieData mean AtomicWeight
5-norm
MagpieData avg_dev MeltingT
MagpieData mean NdValence
MagpieData mean Column
MagpieData avg_dev GSmagmom
7-norm
MagpieData mean Electronegativity
MagpieData avg_dev NValence
MagpieData mean MendeleevNumber
MagpieData mean MeltingT
MagpieData range NValence
3-norm
MagpieData avg_dev NUnfilled
MagpieData mean GSvolume_pa
0-norm
MagpieData avg_dev SpaceGroupNumber
2-norm
MagpieData range MeltingT
MagpieData avg_dev NpUnfilled


### 7. Retrain Model

In [None]:
# Fresh classifier with library-default hyperparameters and a fixed seed,
# replacing the previously fitted `gbdt`.
gbdt = GradientBoostingClassifier(random_state=0)

# Fit on the current training split (left as the final expression so the
# fitted estimator is echoed as the cell output).
gbdt.fit(X=X_train, y=y_train)

### 8. Re-evaluate Model

In [22]:
# Predict phases for the held-out rows with the retrained model.
y_pred = gbdt.predict(X_test)

# Accuracy plus macro-averaged precision/recall (each class weighted equally).
macro_scores = (
    ('Accuracy: %.6f', accuracy_score(y_test, y_pred)),
    ('Precision: %.6f', precision_score(y_test, y_pred, average='macro')),
    ('Recall: %.6f', recall_score(y_test, y_pred, average='macro')),
)
for template, score in macro_scores:
    print(template % score)

Accuracy: 0.673239
Precision: 0.637702
Recall: 0.601744
