# HEAPML Project
## Gradient Boosted Decision Tree

In [None]:
### GENERAL IMPORTS ###
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

### PYMATGEN/MATMINER IMPORTS ###
from matminer.featurizers import composition as cf
from matminer.featurizers.base import MultipleFeaturizer

### SKLEARN IMPORTS ###
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold, cross_val_score, train_test_split, RepeatedStratifiedKFold
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

### SCIKIT-OPTIMIZE (SKOPT) IMPORTS ###
from skopt.space import Real, Integer
from skopt.utils import use_named_args
from skopt import gp_minimize
from skopt.plots import plot_convergence

### 1. Import Featurized Data

In [None]:
# Build the featurizer pipeline (stoichiometry + Magpie element properties).
# Nothing is featurized in this cell; the pipeline is only queried for its
# feature labels.
stoichiometry_featurizer = cf.Stoichiometry()
magpie_featurizer = cf.ElementProperty.from_preset('magpie')
feature_calculators = MultipleFeaturizer([stoichiometry_featurizer, magpie_featurizer])
feature_labels = feature_calculators.feature_labels()

# Load the pre-computed feature table from disk and preview it.
# NOTE(review): presumably this CSV was produced with the same featurizer
# pipeline as above - confirm the column names/order match `feature_labels`.
alloys = pd.read_csv('./data/featurized_alloys.csv')
display(alloys)

### 2. Generate Dataset
*Note: the formula, phase and composition_obj columns are removed from $X$*

In [None]:
# Columns that must not be part of the feature matrix X.
non_feature_cols = {'formula', 'phase', 'composition_obj'}
x_cols = [name for name in alloys.columns if name not in non_feature_cols]

# Feature matrix and target vector (the 'phase' column is the label).
X = alloys[x_cols]
y = alloys['phase'].values

# 67/33 train/test split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=0,
)

display(X)

### 3. Train Model

In [None]:
# Baseline model: gradient-boosted trees with library-default
# hyperparameters; only the seed is pinned.
gbdt_params = {'random_state': 0}
gbdt = GradientBoostingClassifier(**gbdt_params)

# Fit on the training split; the call stays the last expression so the
# notebook still echoes the fitted estimator.
gbdt.fit(X=X_train, y=y_train)

### 4. Evaluate Model

In [None]:
# Predict on the test split and report macro-averaged metrics.
y_pred = gbdt.predict(X_test)

# (format string, metric function) pairs, printed in this order.
metric_reports = (
    ('Precision: %.6f', precision_score),
    ('Recall: %.6f', recall_score),
    ('F1: %.6f', f1_score),
)
for report_format, metric_fn in metric_reports:
    print(report_format % metric_fn(y_test, y_pred, average='macro'))

### 5. Feature Selection

In [None]:
# Rank features by permutation importance: the mean drop in accuracy over 20
# shuffles of each column, computed on the training split.
#
# The result is bound to its own name. Assigning it to
# `permutation_importance` would shadow the imported sklearn function and make
# this cell fail with a TypeError the second time it is run.
perm_result = permutation_importance(
    gbdt,
    X_train,
    y_train,
    n_repeats=20,
    random_state=0,
    scoring='accuracy',
    n_jobs=-1,
)

# `importances_mean` has one entry per column of the frame that was permuted,
# in column order, so the labels are taken from that same frame. This keeps
# every score paired with the right feature even if the CSV columns differ
# from the featurizer's label list.
importance_labels = list(X_train.columns)

# Sort descending by mean importance; `sorted` is stable, so tied features
# keep their original column order.
p_i = sorted(zip(importance_labels, perm_result.importances_mean), key=lambda x: x[1], reverse=True)
p_i = pd.DataFrame(p_i, columns=['Label', 'Mean Score'])

display(p_i)

In [None]:
# Sweep over the number of top-ranked features kept: for each count, retrain
# the classifier on the reduced feature set and record macro-averaged
# F1 / precision / recall on the test split. Index i of each list holds the
# score for feature_count == i + 1.
# NOTE(review): scoring candidate feature counts on the test split means the
# final test metrics are no longer fully independent of the selection -
# consider a validation split or cross-validation.
f1_scores = []
precision_scores = []
recall_scores = []

# The target does not depend on the selected features, so extract it once.
y = alloys['phase'].values

# Iterate over every ranked feature rather than a hard-coded upper bound, so
# the sweep stays correct if the number of features changes.
for feature_count in range(1, len(p_i) + 1):
    feature_selection = p_i['Label'].head(feature_count).values

    # Set membership is O(1); columns keep the order they have in `alloys`.
    selected_labels = set(feature_selection)
    x_cols = [c for c in alloys.columns if c in selected_labels]

    X = alloys[x_cols]

    # Same split parameters as the baseline model, so scores are comparable.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0, stratify=y)

    gbdt = GradientBoostingClassifier(random_state=0)

    gbdt.fit(X_train, y_train)

    y_pred = gbdt.predict(X_test)

    f1_scores.append(f1_score(y_test, y_pred, average='macro'))
    precision_scores.append(precision_score(y_test, y_pred, average='macro'))
    recall_scores.append(recall_score(y_test, y_pred, average='macro'))

print(f1_scores)
print(precision_scores)
print(recall_scores)

### 6. Regenerate Dataset

In [None]:
# Number of top-ranked features to keep for the final model.
# NOTE(review): presumably chosen from the scores printed by the feature-count
# sweep - confirm.
feature_count = 18

# Labels of the `feature_count` most important features, in ranking order.
feature_selection = p_i['Label'].head(feature_count).values

# Set membership is O(1); columns keep the order they have in `alloys`.
selected_labels = set(feature_selection)
x_cols = [c for c in alloys.columns if c in selected_labels]

y = alloys['phase'].values
X = alloys[x_cols]

# Stratify on the label, as every other split in this notebook does. Without
# it the final model would be trained and scored on a different partition from
# the one used to rank the features and pick `feature_count`.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0, stratify=y)

display(X)
for feature in feature_selection:
    print(feature)

### 7. Retrain Model

In [None]:
# Retrain on the current X_train / y_train with the same configuration as the
# baseline: library-default hyperparameters and a pinned seed.
gbdt_params = {'random_state': 0}
gbdt = GradientBoostingClassifier(**gbdt_params)

# The call stays the last expression so the notebook still echoes the fitted
# estimator.
gbdt.fit(X=X_train, y=y_train)

### 8. Re-evaluate Model

In [None]:
# Predict on the test split with the retrained model and report
# macro-averaged metrics.
y_pred = gbdt.predict(X_test)

# (format string, metric function) pairs, printed in this order.
metric_reports = (
    ('Precision: %.6f', precision_score),
    ('Recall: %.6f', recall_score),
    ('F1: %.6f', f1_score),
)
for report_format, metric_fn in metric_reports:
    print(report_format % metric_fn(y_test, y_pred, average='macro'))