# HEAPML Project
## Gradient Boosted Decision Tree
This notebook outlines the steps needed to train the GBDT model for the HEAPML project.

In [24]:
import numpy as np
import pandas as pd

### PYMATGEN/MATMINER IMPORTS ###
from matminer.featurizers import composition as cf
from matminer.featurizers.base import MultipleFeaturizer

### SKLEARN IMPORTS ###
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.inspection import permutation_importance

### 1. Import Featurized Data

In [25]:
# Rebuild the featurizer stack (stoichiometry + Magpie element-property preset)
# so that the labels it generates are available to later cells.
feature_calculators = MultipleFeaturizer(
    [
        cf.Stoichiometry(),
        cf.ElementProperty.from_preset('magpie'),
    ]
)
feature_labels = feature_calculators.feature_labels()

# Load the pre-featurized alloy table from disk
alloys = pd.read_csv('../data/featurized_alloys.csv')

display(alloys)

Unnamed: 0,formula,phase,composition_obj,0-norm,2-norm,3-norm,5-norm,7-norm,10-norm,MagpieData minimum Number,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
0,AgAlCoCrCuNi,3,Ag1 Al1 Co1 Cr1 Cu1 Ni1,6,0.408248,0.302853,0.238495,0.215285,0.199372,13.0,...,1.548471,0.357311,0.476415,0.0,194.0,229.0,35.0,220.500000,8.833333,194.0
1,AgCoCrFeMnNi,1,Ag1 Co1 Cr1 Fe1 Mn1 Ni1,6,0.408248,0.302853,0.238495,0.215285,0.199372,24.0,...,2.110663,0.709140,0.746951,0.0,194.0,229.0,35.0,219.833333,9.555556,194.0
2,Al0.02CoCrFeMnNi,1,Al0.02 Co1 Cr1 Fe1 Mn1 Ni1,6,0.445450,0.340633,0.274847,0.250697,0.233988,13.0,...,2.110663,0.847577,0.782462,0.0,194.0,229.0,35.0,218.824701,10.617292,194.0
3,Al0.03CoCrFeMnNi,1,Al0.03 Co1 Cr1 Fe1 Mn1 Ni1,6,0.444586,0.339956,0.274300,0.250199,0.233523,13.0,...,2.110663,0.845892,0.782246,0.0,194.0,229.0,35.0,218.836978,10.605947,194.0
4,Al0.04CoCrFeMnNi,1,Al0.04 Co1 Cr1 Fe1 Mn1 Ni1,6,0.443735,0.339282,0.273756,0.249702,0.233059,13.0,...,2.110663,0.844214,0.782026,0.0,194.0,229.0,35.0,218.849206,10.594608,194.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1068,Zr2.0TiHfVNb2.0,1,Zr2 Ti1 Hf1 V1 Nb2,5,0.473804,0.381200,0.331220,0.315980,0.306266,22.0,...,0.000023,0.000003,0.000006,0.0,194.0,229.0,35.0,209.000000,17.142857,194.0
1069,ZrTiHfCuNiFe,1,Zr1 Ti1 Hf1 Cu1 Ni1 Fe1,6,0.408248,0.302853,0.238495,0.215285,0.199372,22.0,...,2.110663,0.451013,0.601344,0.0,194.0,229.0,35.0,210.166667,16.166667,194.0
1070,ZrTiHfNb0.5Mo0.5,1,Zr1 Ti1 Hf1 Nb0.5 Mo0.5,5,0.467707,0.370312,0.312720,0.292700,0.279049,22.0,...,0.000023,0.000006,0.000008,0.0,194.0,229.0,35.0,202.750000,13.125000,194.0
1071,ZrTiHfNb0.5Ta0.5,1,Zr1 Ti1 Hf1 Nb0.5 Ta0.5,5,0.467707,0.370312,0.312720,0.292700,0.279049,22.0,...,0.000023,0.000006,0.000008,0.0,194.0,229.0,35.0,202.750000,13.125000,194.0


### 2. Create Dataset

In [26]:
# Every column except the identifier/target columns is used as a model input
non_feature_cols = ('formula', 'phase', 'composition_obj')
x_cols = [name for name in alloys.columns if name not in non_feature_cols]

# Feature matrix and target vector (the 'phase' column)
X = alloys[x_cols]
y = alloys['phase'].values

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

display(X)

Unnamed: 0,0-norm,2-norm,3-norm,5-norm,7-norm,10-norm,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
0,6,0.408248,0.302853,0.238495,0.215285,0.199372,13.0,47.0,34.0,28.000000,...,1.548471,0.357311,0.476415,0.0,194.0,229.0,35.0,220.500000,8.833333,194.0
1,6,0.408248,0.302853,0.238495,0.215285,0.199372,24.0,47.0,23.0,29.500000,...,2.110663,0.709140,0.746951,0.0,194.0,229.0,35.0,219.833333,9.555556,194.0
2,6,0.445450,0.340633,0.274847,0.250697,0.233988,13.0,28.0,15.0,25.948207,...,2.110663,0.847577,0.782462,0.0,194.0,229.0,35.0,218.824701,10.617292,194.0
3,6,0.444586,0.339956,0.274300,0.250199,0.233523,13.0,28.0,15.0,25.922465,...,2.110663,0.845892,0.782246,0.0,194.0,229.0,35.0,218.836978,10.605947,194.0
4,6,0.443735,0.339282,0.273756,0.249702,0.233059,13.0,28.0,15.0,25.896825,...,2.110663,0.844214,0.782026,0.0,194.0,229.0,35.0,218.849206,10.594608,194.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1068,5,0.473804,0.381200,0.331220,0.315980,0.306266,22.0,72.0,50.0,39.857143,...,0.000023,0.000003,0.000006,0.0,194.0,229.0,35.0,209.000000,17.142857,194.0
1069,6,0.408248,0.302853,0.238495,0.215285,0.199372,22.0,72.0,50.0,36.166667,...,2.110663,0.451013,0.601344,0.0,194.0,229.0,35.0,210.166667,16.166667,194.0
1070,5,0.467707,0.370312,0.312720,0.292700,0.279049,22.0,72.0,50.0,43.875000,...,0.000023,0.000006,0.000008,0.0,194.0,229.0,35.0,202.750000,13.125000,194.0
1071,5,0.467707,0.370312,0.312720,0.292700,0.279049,22.0,73.0,51.0,47.750000,...,0.000023,0.000006,0.000008,0.0,194.0,229.0,35.0,202.750000,13.125000,194.0


### 3. Train Model

In [27]:
# Gradient-boosted classifier with library-default hyperparameters;
# only the random seed is pinned so repeated runs give the same model.
gbdt = GradientBoostingClassifier(random_state=0)

# Fit on the training split only
gbdt.fit(X=X_train, y=y_train)

### 4. Evaluate Model

In [28]:
# Report train/test accuracy of the fitted model and an RMSE from 10-fold CV.

# Shuffled 10-fold splitter with a fixed seed so the folds are repeatable
crossvalidation = KFold(n_splits=10, shuffle=True, random_state=0)

# Mean accuracy of the already-fitted model on each side of the train/test split
train_acc_score = gbdt.score(X_train, y_train)
test_acc_score = gbdt.score(X_test, y_test)

# Cross-validate on the full dataset; the scorer returns negated MSE per fold,
# so take the magnitude of the mean before the square root to obtain an RMSE.
# NOTE(review): RMSE treats the integer phase labels as ordinal -- confirm that
# is intended; scoring='accuracy' may be the more meaningful CV metric here.
cv_neg_mse = cross_val_score(gbdt, X, y, scoring='neg_mean_squared_error', cv=crossvalidation, n_jobs=5)
cv_score = np.sqrt(abs(np.mean(cv_neg_mse)))

# Labels name the actual model (GBDT) and metric (RMSE) being reported
print('GBDT Train Accuracy Score: ' + str(train_acc_score))
print('GBDT Test Accuracy Score: ' + str(test_acc_score))
print('CV RMSE: ' + str(cv_score))

RF Train Accuracy Score: 0.9512534818941504
RF Test Accuracy Score: 0.6563380281690141
CV Score: 0.8290387783150772


### 5. Feature Selection

In [29]:
# Rank features using permutation importance on the held-out split.
# The result gets its own name: rebinding `permutation_importance` would shadow
# the imported function and make this cell fail the second time it is run.
# NOTE(review): the ranking is computed on the test split, so test scores of any
# model built from this ranking are optimistic -- consider a separate validation split.
perm_result = permutation_importance(gbdt, X_test, y_test, n_repeats=30, random_state=0, n_jobs=2)

# Pair each mean score with the column it was computed for. Labels are taken from
# X_test itself so they cannot drift from the scored columns (zip would silently
# truncate or misalign if an external label list differed in length or order).
p_i = sorted(zip(X_test.columns, perm_result.importances_mean), key=lambda x: x[1], reverse=True)
p_i = pd.DataFrame(p_i, columns=['Label', 'Mean Score'])

display(p_i)

Unnamed: 0,Label,Mean Score
0,MagpieData avg_dev Column,0.009484
1,MagpieData avg_dev CovalentRadius,0.008545
2,MagpieData range GSvolume_pa,0.007887
3,MagpieData avg_dev GSmagmom,0.006103
4,MagpieData mean NValence,0.005258
...,...,...
133,MagpieData mean Electronegativity,-0.010986
134,MagpieData mean Column,-0.011549
135,MagpieData mean NpValence,-0.011831
136,MagpieData avg_dev NsValence,-0.013052


### 6. Create New Dataset

In [42]:
# Number of top-ranked features to keep
feature_count = 26

# Labels of the highest-scoring rows of the permutation-importance table
feature_selection = p_i['Label'].iloc[:feature_count].values

# Keep only the selected columns, in the order they appear in the alloys table
x_cols = [name for name in alloys.columns if name in feature_selection]

# Reduced feature matrix and the unchanged target vector
X = alloys[x_cols]
y = alloys['phase'].values

# Same 33% hold-out and seed as the first split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

display(X)

Unnamed: 0,7-norm,MagpieData minimum Number,MagpieData maximum Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData mode Number,MagpieData minimum MendeleevNumber,MagpieData maximum MendeleevNumber,MagpieData mode MendeleevNumber,MagpieData minimum MeltingT,...,MagpieData range NValence,MagpieData mean NValence,MagpieData range NdUnfilled,MagpieData avg_dev NUnfilled,MagpieData maximum GSvolume_pa,MagpieData range GSvolume_pa,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mean SpaceGroupNumber,MagpieData mode SpaceGroupNumber
0,0.215285,13.0,47.0,28.000000,6.666667,13.0,49.0,73.0,49.0,933.47,...,8.0,8.333333,5.0,1.666667,16.480,6.235,0.357311,0.476415,220.500000,194.0
1,0.215285,24.0,47.0,29.500000,5.833333,24.0,49.0,65.0,49.0,1234.93,...,5.0,8.500000,5.0,1.500000,16.330,6.085,0.709140,0.746951,219.833333,194.0
2,0.250697,13.0,28.0,25.948207,1.257123,24.0,49.0,73.0,49.0,933.47,...,7.0,7.980080,5.0,1.199981,16.480,6.235,0.847577,0.782462,218.824701,194.0
3,0.250199,13.0,28.0,25.922465,1.285330,24.0,49.0,73.0,49.0,933.47,...,7.0,7.970179,5.0,1.199957,16.480,6.235,0.845892,0.782246,218.836978,194.0
4,0.249702,13.0,28.0,25.896825,1.313303,24.0,49.0,73.0,49.0,933.47,...,7.0,7.960317,5.0,1.199924,16.480,6.235,0.844214,0.782026,218.849206,194.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1068,0.315980,22.0,72.0,39.857143,9.918367,40.0,43.0,47.0,44.0,1941.00,...,14.0,6.428571,2.0,0.489796,23.195,10.185,0.000003,0.000006,209.000000,194.0
1069,0.215285,22.0,72.0,36.166667,13.222222,22.0,43.0,64.0,43.0,1357.77,...,14.0,9.166667,8.0,2.833333,23.195,12.875,0.451013,0.601344,210.166667,194.0
1070,0.292700,22.0,72.0,43.875000,14.062500,22.0,43.0,50.0,43.0,1941.00,...,14.0,7.875000,3.0,0.562500,23.195,7.505,0.000006,0.000008,202.750000,194.0
1071,0.292700,22.0,73.0,47.750000,18.437500,22.0,43.0,48.0,43.0,1941.00,...,15.0,9.500000,2.0,0.375000,23.195,6.505,0.000006,0.000008,202.750000,194.0


### 7. Retrain Model

In [43]:
# Fresh gradient-boosted classifier with library-default hyperparameters;
# only the random seed is pinned so repeated runs give the same model.
gbdt = GradientBoostingClassifier(random_state=0)

# Fit on the training split of the reduced feature set
gbdt.fit(X=X_train, y=y_train)

### 8. Evaluate New Model

In [44]:
# Report train/test accuracy of the retrained model and an RMSE from 10-fold CV.

# Shuffled 10-fold splitter with a fixed seed so the folds are repeatable
crossvalidation = KFold(n_splits=10, shuffle=True, random_state=0)

# Mean accuracy of the already-fitted model on each side of the train/test split
train_acc_score = gbdt.score(X_train, y_train)
test_acc_score = gbdt.score(X_test, y_test)

# Cross-validate on the full dataset; the scorer returns negated MSE per fold,
# so take the magnitude of the mean before the square root to obtain an RMSE.
# NOTE(review): RMSE treats the integer phase labels as ordinal -- confirm that
# is intended; scoring='accuracy' may be the more meaningful CV metric here.
cv_neg_mse = cross_val_score(gbdt, X, y, scoring='neg_mean_squared_error', cv=crossvalidation, n_jobs=5)
cv_score = np.sqrt(abs(np.mean(cv_neg_mse)))

# Labels name the actual model (GBDT) and metric (RMSE) being reported
print('GBDT Train Accuracy Score: ' + str(train_acc_score))
print('GBDT Test Accuracy Score: ' + str(test_acc_score))
print('CV RMSE: ' + str(cv_score))

RF Train Accuracy Score: 0.9484679665738162
RF Test Accuracy Score: 0.6957746478873239
CV Score: 0.8312434854404739
