# HEAPML Project
## Random Forest
This notebook outlines the steps needed to train the Random Forest (RF) model.

In [36]:
### GENERAL IMPORTS ###
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### PYMATGEN/MATMINER IMPORTS ###
from matminer.featurizers import composition as cf
from matminer.featurizers.base import MultipleFeaturizer

### SKLEARN IMPORTS ###
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.inspection import permutation_importance

### 1. Import Featurized Data

In [17]:
# Load the featurized alloy table from disk.
alloys = pd.read_csv('../data/featurized_alloys.csv')

# Rebuild the featurizer stack (stoichiometry + Magpie element properties)
# solely to obtain the list of feature labels it produces.
feature_calculators = MultipleFeaturizer([
    cf.Stoichiometry(),
    cf.ElementProperty.from_preset('magpie'),
])
feature_labels = feature_calculators.feature_labels()

# Preview the first rows of the table.
print(alloys.head())

            formula  phase             composition_obj  0-norm    2-norm  \
0      AgAlCoCrCuNi      3     Ag1 Al1 Co1 Cr1 Cu1 Ni1       6  0.408248   
1      AgCoCrFeMnNi      1     Ag1 Co1 Cr1 Fe1 Mn1 Ni1       6  0.408248   
2  Al0.02CoCrFeMnNi      1  Al0.02 Co1 Cr1 Fe1 Mn1 Ni1       6  0.445450   
3  Al0.03CoCrFeMnNi      1  Al0.03 Co1 Cr1 Fe1 Mn1 Ni1       6  0.444586   
4  Al0.04CoCrFeMnNi      1  Al0.04 Co1 Cr1 Fe1 Mn1 Ni1       6  0.443735   

     3-norm    5-norm    7-norm   10-norm  MagpieData minimum Number  ...  \
0  0.302853  0.238495  0.215285  0.199372                       13.0  ...   
1  0.302853  0.238495  0.215285  0.199372                       24.0  ...   
2  0.340633  0.274847  0.250697  0.233988                       13.0  ...   
3  0.339956  0.274300  0.250199  0.233523                       13.0  ...   
4  0.339282  0.273756  0.249702  0.233059                       13.0  ...   

   MagpieData range GSmagmom  MagpieData mean GSmagmom  \
0                   1.

### 2. Create Dataset

In [18]:
# Every column except the identifier and target columns is a model input
non_feature_cols = ('formula', 'phase', 'composition_obj')
x_cols = list(filter(lambda name: name not in non_feature_cols, alloys.columns))

# Feature matrix and target vector (the phase label)
X = alloys.loc[:, x_cols]
y = alloys['phase'].to_numpy()

print(X.head())

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

   0-norm    2-norm    3-norm    5-norm    7-norm   10-norm  \
0       6  0.408248  0.302853  0.238495  0.215285  0.199372   
1       6  0.408248  0.302853  0.238495  0.215285  0.199372   
2       6  0.445450  0.340633  0.274847  0.250697  0.233988   
3       6  0.444586  0.339956  0.274300  0.250199  0.233523   
4       6  0.443735  0.339282  0.273756  0.249702  0.233059   

   MagpieData minimum Number  MagpieData maximum Number  \
0                       13.0                       47.0   
1                       24.0                       47.0   
2                       13.0                       28.0   
3                       13.0                       28.0   
4                       13.0                       28.0   

   MagpieData range Number  MagpieData mean Number  ...  \
0                     34.0               28.000000  ...   
1                     23.0               29.500000  ...   
2                     15.0               25.948207  ...   
3                     15.0    

### 3. Train Model

In [34]:
# Define the model with default hyperparameters. The seed is pinned (the same
# random_state=0 used for the train/test split) so that repeated runs build the
# same forest and therefore report the same scores and feature rankings.
rf = RandomForestClassifier(random_state=0)

# Train the model on the training split
rf.fit(X_train, y_train)

### 4. Evaluate Model

In [27]:
# Report the test-split accuracy and a 10-fold cross-validated RMSE.

# 10-fold splitter: rows are shuffled with a fixed seed before folding
crossvalidation = KFold(n_splits=10, shuffle=True, random_state=0)

# Score of the already-fitted model on the test split
acc_score = rf.score(X_test, y_test)

# Per-fold negated MSE over the full dataset; the sign is dropped and the
# fold average is square-rooted to give an RMSE.
# NOTE(review): this is an RMSE over the integer phase labels of a classifier,
# so its value depends on how the phases are numbered -- confirm this is
# intended (scoring='accuracy' would be the usual choice for classification).
fold_neg_mse = cross_val_score(
    rf,
    X,
    y,
    scoring='neg_mean_squared_error',
    cv=crossvalidation,
    n_jobs=5,
)
cv_score = np.sqrt(abs(fold_neg_mse.mean()))

print(f'Accuracy Score: {acc_score!s}')
print(f'CV Score: {cv_score!s}')

Accuracy Score: 0.7314814814814815
CV Score: 0.8210201423071629


### 5. Feature Selection

In [61]:
# Rank features using the forest's built-in feature importance
feature_importance = rf.feature_importances_

# Label each score with the columns the model was actually fitted on instead of
# the featurizer's label list. zip() silently truncates on a length mismatch, so
# if `rf` has been refit on a reduced feature set the featurizer labels would be
# paired with the wrong scores. Prefer the names recorded on the fitted
# estimator (when the installed scikit-learn provides them) and fall back to
# the training frame's columns.
model_features = list(getattr(rf, 'feature_names_in_', X_train.columns))
if len(model_features) != len(feature_importance):
    raise ValueError(
        'Feature labels (%d) do not match the fitted model (%d importances); '
        're-run the dataset and training cells in order.'
        % (len(model_features), len(feature_importance))
    )

# Sort (label, score) pairs from most to least important
f_i = sorted(zip(model_features, feature_importance), key=lambda pair: pair[1], reverse=True)
f_i = pd.DataFrame(f_i, columns=['Label', 'Score'])

print(f_i)

                              Label     Score
0         MagpieData avg_dev Column  0.024598
1       MagpieData mean GSvolume_pa  0.022925
2      MagpieData range GSvolume_pa  0.022510
3    MagpieData avg_dev GSvolume_pa  0.021393
4          MagpieData mean NValence  0.021091
..                              ...       ...
133      MagpieData mean NfUnfilled  0.000000
134   MagpieData avg_dev NfUnfilled  0.000000
135      MagpieData mode NfUnfilled  0.000000
136    MagpieData minimum GSbandgap  0.000000
137     MagpieData minimum GSmagmom  0.000000

[138 rows x 2 columns]


In [51]:
# Rank features using permutation importance on the test split.
# The result gets its own name: assigning it to `permutation_importance` would
# shadow the imported function, so re-running this cell would raise a TypeError
# because the name would no longer refer to a callable.
# The seed is pinned so the ranking is repeatable, like the other seeded steps.
perm_result = permutation_importance(rf, X_test, y_test, n_repeats=15, random_state=0)

# Label each score with the columns of the frame that was permuted, so labels
# and scores stay aligned even if the feature set differs from the
# featurizer's label list. Sort from most to least important.
p_i = sorted(zip(X_test.columns, perm_result.importances_mean), key=lambda pair: pair[1], reverse=True)
p_i = pd.DataFrame(p_i, columns=['Label', 'Mean Score'])

print(p_i)

                                   Label  Mean Score
0              MagpieData mean NdValence    0.019136
1              MagpieData mean NsValence    0.017284
2           MagpieData avg_dev NsValence    0.016667
3                 MagpieData mean Number    0.016667
4             MagpieData mean NpUnfilled    0.015432
..                                   ...         ...
133          MagpieData minimum GSmagmom    0.000000
134          MagpieData maximum GSmagmom    0.000000
135            MagpieData range GSmagmom    0.000000
136  MagpieData maximum SpaceGroupNumber    0.000000
137        MagpieData range AtomicWeight   -0.008642

[138 rows x 2 columns]


### 6. Create New Dataset

In [65]:
# Number of top-ranked features (from the importance table `f_i`) to keep
feature_count = 10

feature_selection = f_i['Label'].iloc[:feature_count].to_numpy()

# Keep only the selected columns, preserving their order in the dataframe
selected_mask = alloys.columns.isin(feature_selection)
x_cols = alloys.columns[selected_mask].tolist()

# Reduced feature matrix and the same phase target
X = alloys.loc[:, x_cols]
y = alloys['phase'].to_numpy()

# Same 33% hold-out and seed as the full-feature split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

### 7. Retrain Model

In [66]:
# Define the model with default hyperparameters. The seed is pinned (the same
# random_state=0 used for the train/test split) so that repeated runs build the
# same forest and the reduced-feature scores are reproducible.
rf = RandomForestClassifier(random_state=0)

# Train the model on the reduced-feature training split
rf.fit(X_train, y_train)

### 8. Evaluate New Model

In [67]:
# Report the test-split accuracy and a 10-fold cross-validated RMSE for the
# retrained model.

# 10-fold splitter: rows are shuffled with a fixed seed before folding
crossvalidation = KFold(n_splits=10, shuffle=True, random_state=0)

# Score of the already-fitted model on the test split
acc_score = rf.score(X_test, y_test)

# Per-fold negated MSE over the full dataset; the sign is dropped and the
# fold average is square-rooted to give an RMSE.
# NOTE(review): this is an RMSE over the integer phase labels of a classifier,
# so its value depends on how the phases are numbered -- confirm this is
# intended (scoring='accuracy' would be the usual choice for classification).
fold_neg_mse = cross_val_score(
    rf,
    X,
    y,
    scoring='neg_mean_squared_error',
    cv=crossvalidation,
    n_jobs=5,
)
cv_score = np.sqrt(abs(fold_neg_mse.mean()))

print(f'Accuracy Score: {acc_score!s}')
print(f'CV Score: {cv_score!s}')

Accuracy Score: 0.7014084507042253
CV Score: 0.8501165599229172
