<a href="https://colab.research.google.com/github/stephenfrein/tree_models/blob/master/Random_Forests.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# same setup from our last example: load the cars data, clean and encode it,
# then split it into training and test sets

# modules we will need
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import metrics
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
# export_text lives in the public sklearn.tree namespace; the private
# sklearn.tree.export module was removed in scikit-learn 0.24
from sklearn.tree import export_text

# get raw data and put in data frame
url = "https://drive.google.com/uc?export=download&id=1Okb4RuShkQ0-dxMj0xeBWnrmq1uIsuOw"
cars_raw = pd.read_csv(url)
# NOTE: this is not the last expression of the cell, so the preview is not rendered
cars_raw.head(n=10)

# work on a separate frame so cars_raw stays untouched, and drop rows with
# null values BEFORE encoding: get_dummies ignores NaN by default (it yields an
# all-zero dummy row), so a dropna() afterwards would miss nulls in those columns
cars_clean = cars_raw.dropna()

# one-hot encode character variables
cars_clean = pd.get_dummies(
    cars_clean,
    columns=["purchase_cost", "maint_cost", "trunk_size", "safety_rating"],
)

# predictor variables - all but column called acceptability
# (keyword form: pandas 2.x no longer accepts the axis as a positional argument)
X = cars_clean.drop(columns="acceptability")
# target variable
y = cars_clean["acceptability"]

# split into training (70%) and test (30%) sets with seed value for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

In [None]:
## random forest approach
from sklearn.ensemble import RandomForestClassifier
# create new classifier - 500 trees with max depth of 3 per tree;
# the forest is seeded (like the train/test split) so that the accuracy and
# the other metrics computed from y_pred are reproducible across runs
rf = RandomForestClassifier(n_estimators=500, max_depth=3, random_state=123)
# train the model
rf.fit(X_train, y_train)
# predict the response for test dataset
y_pred = rf.predict(X_test)
# how many were classified correctly
print("Random Forest Accuracy:", metrics.accuracy_score(y_test, y_pred))



In [None]:
# precision - of the cars we predicted as acceptable ("acc"), the share that really were
precision = metrics.precision_score(y_test, y_pred, pos_label="acc")
print("Precision:", precision)

In [None]:
# recall - of the cars that really were acceptable ("acc"), the share we predicted as such
recall = metrics.recall_score(y_test, y_pred, pos_label="acc")
print("Recall:", recall)

In [None]:
# F1 is the harmonic mean of precision and recall - an "average" that is
# pulled toward the lower of the two numbers
f1 = metrics.f1_score(y_test, y_pred, pos_label="acc")
print("F1 Score:", f1)

In [None]:
# which features are most important?
# one row per predictor column, holding the importance reported by the fitted forest
importance_by_feature = pd.DataFrame(
    rf.feature_importances_,
    index=X_train.columns,
    columns=["importance"],
)
# rank from most to least important
feature_importances = importance_by_feature.sort_values(by="importance", ascending=False)
print(feature_importances)

# Exercise #3

Rebuild your IMDB model as a random forest and see if you can improve its performance.

What were the most important variables in your model?

In [None]:
# enjoy!
