# Random Forest 

In [1]:
from sklearn.datasets import load_iris

# Load the iris dataset; later cells read its `data` and `target` fields.
iris = load_iris()

# The two commented-out lines below are equivalent ways of wrapping the features
# in a DataFrame (attribute access vs. key access on the loaded object).
# Neither is needed by this notebook.
# NOTE(review): both reference `pd`, which is not imported in the visible cells.
# X = pd.DataFrame(iris.data, columns=iris.feature_names)
# X = pd.DataFrame(iris["data"], columns=iris["feature_names"])

In [2]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing. Stratifying on the labels keeps the
# class proportions the same in both splits; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    stratify=iris.target,
    test_size=0.3,
    random_state=1,
)

---
### Hyperparameters
- **criterion**: measures the quality of the split (gini, entropy for the information gain)
    - Gini impurity is defined as one minus the sum of the squared class probabilities: $G = 1 - \sum_k p_k^2$ (0 for a pure node)
    - information gain is defined as the decrease in entropy
    - each tree in the random forest chooses, at every node, the split that maximizes the impurity decrease (which is the information gain when the criterion is entropy)
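- **max_depth**: maximum depth of each tree (default: `None`, meaning nodes are expanded until all leaves are pure or contain fewer than `min_samples_split` samples)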
- **min_samples_split**: minimum number of samples required to split an internal node
- **random_state**: random seed controlling the bootstrap sampling of the data and the random subsets of features considered at each split (fix it for reproducible results)
- **n_estimators**: number of decision tree base estimators
- **max_features**: maximum number of features to consider when looking for the best split
---

In [3]:
from sklearn.ensemble import RandomForestClassifier

# Configure the ensemble: a small forest of 5 trees split on Gini impurity,
# with a fixed seed for reproducibility and two parallel jobs.
forest = RandomForestClassifier(
    n_estimators=5,
    criterion="gini",
    n_jobs=2,
    random_state=1,
)

# Train on the training split; the fitted estimator is the cell's output.
forest.fit(X_train, y_train)

In [4]:
from sklearn.metrics import accuracy_score

# Predict labels for the held-out split, then report the fraction that match
# the true labels.
y_pred = forest.predict(X_test)
forest_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", forest_accuracy)

Accuracy:  0.9777777777777777
