In [114]:
import json
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

seed = 42

df = pd.read_csv("mushrooms.csv")

X, y = df.drop("class", axis=1), df["class"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=seed
)

y_train = y_train.astype('category').cat.codes
y_test = y_test.astype('category').cat.codes

ohe = OneHotEncoder(sparse=True)
X_train_enc = ohe.fit_transform(X_train)
X_test_enc = ohe.transform(X_test)

# X_train_enc = pd.DataFrame(X_train_enc, columns = ohe.get_feature_names())
# X_test_enc = pd.DataFrame(X_test_enc, columns = ohe.get_feature_names())

rf = RandomForestClassifier(max_depth=10, random_state=seed)
rf.fit(X_train_enc, y_train)


RandomForestClassifier(max_depth=10, random_state=42)

In [115]:
rf.score(X_test_enc, y_test)

1.0

In [116]:
rf.predict(X_test_enc)

array([0, 1, 1, ..., 1, 0, 0], dtype=int8)

In [117]:
rf.feature_importances_

array([2.52757840e-03, 1.90054071e-04, 7.36426433e-04, 5.19659222e-04,
       2.98789865e-04, 9.17279890e-04, 5.13075473e-03, 8.49187623e-05,
       6.83468840e-03, 1.70264394e-03, 3.35265783e-03, 6.32481745e-04,
       1.04016607e-03, 6.03314594e-04, 7.08374754e-04, 1.26849916e-03,
       2.49128282e-04, 2.49645026e-04, 3.08951478e-03, 2.91121950e-03,
       2.56678627e-02, 2.90199198e-02, 5.68571633e-03, 9.84317421e-03,
       7.11312274e-02, 3.36736653e-03, 8.51647011e-04, 1.48402968e-01,
       1.53875783e-02, 5.37913875e-03, 4.32696635e-03, 1.92624611e-04,
       1.63224010e-04, 1.29100303e-02, 1.19161394e-02, 6.05771256e-02,
       6.97410112e-02, 3.88025147e-02, 5.70809468e-04, 6.38661705e-04,
       3.23471497e-03, 4.33207506e-04, 8.20754758e-04, 0.00000000e+00,
       2.06333699e-04, 1.41161986e-03, 8.05804848e-04, 1.12236263e-03,
       2.21691184e-04, 6.60823431e-03, 9.20012586e-03, 8.66375608e-03,
       1.18010743e-02, 6.13295136e-03, 1.13209376e-02, 2.47981776e-03,
      

In [118]:
import seaborn as sns
import matplotlib.pyplot as plt
# Calculate feature importance in random forest
importances = rf.feature_importances_
labels = ohe.get_feature_names()
feature_df = pd.DataFrame(
    list(zip(labels, importances)), columns=["feature", "importance"]
)
feature_df = feature_df.sort_values(by="importance", ascending=False)

# image formatting
axis_fs = 18  # fontsize
title_fs = 22  # fontsize
sns.set(style="whitegrid", rc={'figure.figsize':(10, 28)})
ax = sns.barplot(x="importance", y="feature", data=feature_df)
ax.set_xlabel("Importance", fontsize=axis_fs)
ax.set_ylabel("Feature", fontsize=axis_fs)  # ylabel
ax.set_title("Random forest\nfeature importance", fontsize=title_fs)

plt.tight_layout()
plt.savefig("feature_importance.png", dpi=120)
plt.close()

In [123]:
y_test.values

array([0, 1, 1, ..., 1, 0, 0], dtype=int8)

In [120]:
predictions = rf.predict(X_test_enc)
predictions 

array([0, 1, 1, ..., 1, 0, 0], dtype=int8)

In [125]:
from sklearn import metrics
precision, recall, prc_thresholds = metrics.precision_recall_curve(y_test.values, predictions)
fpr, tpr, roc_thresholds = metrics.roc_curve(y_test, predictions)

avg_prec = metrics.average_precision_score(y_test, predictions)
roc_auc = metrics.roc_auc_score(y_test, predictions)

In [126]:
precision

array([1., 1.])

In [127]:
recall

array([1., 0.])

In [128]:
prc_thresholds

array([1], dtype=int8)

In [156]:
import json
import pickle
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

seed = 42

df = pd.read_csv("mushrooms.csv")

X, y = df.drop("class", axis=1), df["class"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=seed
)

y_train = y_train.astype("category").cat.codes
y_test = y_test.astype("category").cat.codes


ohe = OneHotEncoder(sparse=True)
X_train_enc = ohe.fit_transform(X_train)
X_test_enc = ohe.transform(X_test)

# X_train_enc = pd.DataFrame(X_train_enc, columns = ohe.get_feature_names())
# X_test_enc = pd.DataFrame(X_test_enc, columns = ohe.get_feature_names())

rf = RandomForestClassifier(max_depth=100, random_state=seed)
rf.fit(X_train_enc, y_train)

predictions = rf.predict(X_test_enc)

train_score = rf.score(X_train_enc, y_train) * 100
test_score = rf.score(X_test_enc, y_test) * 100


precision, recall, prc_thresholds = metrics.precision_recall_curve(
    y_test, predictions
)
fpr, tpr, roc_thresholds = metrics.roc_curve(y_test, predictions)

avg_prec = metrics.average_precision_score(y_test, predictions)
roc_auc = metrics.roc_auc_score(y_test, predictions)

# Calculate feature importance in random forest
importances = rf.feature_importances_
labels = ohe.get_feature_names()
feature_df = pd.DataFrame(
    list(zip(labels, importances)), columns=["feature", "importance"]
)
feature_df = feature_df.sort_values(by="importance", ascending=False)

# image formatting
axis_fs = 18  # fontsize
title_fs = 22  # fontsize
sns.set(style="whitegrid", rc={"figure.figsize": (10, 28)})
ax = sns.barplot(x="importance", y="feature", data=feature_df)
ax.set_xlabel("Importance", fontsize=axis_fs)
ax.set_ylabel("Feature", fontsize=axis_fs)  # ylabel
ax.set_title("Random forest\nfeature importance", fontsize=title_fs)

plt.tight_layout()
plt.savefig("feature_importance.png", dpi=120)
plt.close()

nth_point = math.ceil(len(prc_thresholds) / 1000)
prc_points = list(zip(precision, recall, prc_thresholds))[::nth_point]
with open("prc.json", "w") as fd:
    json.dump(
        {
            "prc": [
                {"precision": float(p), "recall": float(r), "threshold": float(t)}
                for p, r, t in prc_points
            ]
        },
        fd,
        indent=4,
    )

with open("roc.json", "w") as fd:
    json.dump(
        {
            "roc": [
                {"fpr": float(fp), "tpr": float(tp), "threshold": float(t)}
                for fp, tp, t in zip(fpr, tpr, roc_thresholds)
            ]
        },
        fd,
        indent=4,
    )

with open("encoder.pkl", "wb") as f:
    pickle.dump(ohe, f)

with open("rf.pkl", "wb") as m:
    pickle.dump(rf, m)

with open("scores.json", "w") as fd:
    json.dump({"accuracy_score": test_score}, fd)

with open("metrics.txt", "w") as outfile:
    outfile.write("Training accuracy: %2.1f%%\n" % train_score)
    outfile.write("Test accuracy: %2.1f%%\n" % test_score)

In [154]:
with open('prc.json', "w") as fd:
    json.dump(
        {
            "prc": [
                {"precision": float(p), "recall": float(r), "threshold": float(t)}
                for p, r, t in prc_points
            ]
        },
        fd,
        indent=4,
    )
