# Ensemble Learning and Random Forest 

- Ensemble --> groip of predictors are called Ensemble 
     - That is why this is called ensemble learning 
     - Ensemble learning algorithms are called the "Ensemble method" 

In [1]:
import numpy as np
import os

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC 

from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split


log_clf = LogisticRegression(solver="lbfgs", random_state=42)
rnd_clf=RandomForestClassifier(n_estimators=100 ,random_state=42)
svm_clf=SVC(gamma="scale",random_state=42)

voting_clf =VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard') 
X, y = make_moons(n_samples=5000, noise=0.2, random_state=42) 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

voting_clf.fit(X_train,y_train) 



In [3]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, voting_clf): 
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test) 
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.883
RandomForestClassifier 0.974
SVC 0.975
VotingClassifier 0.975


## Soft voting 
    - If all the clsifiers you choose can take the class probabilities (i.e. they have the method prdict_proba()) 
    - Then you can tell Sciit-Learn to predict the class with the highest class probability, averaged over all the individual claisfiers 

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC 

from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split


log_clf = LogisticRegression(solver="lbfgs", random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
svm_clf = SVC(gamma="scale", probability=True, random_state=42) 


voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft') 


X, y = make_moons(n_samples=5000, noise=0.3, random_state=42) 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [3]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, voting_clf): 
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test) 
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.842
RandomForestClassifier 0.902
SVC 0.909
VotingClassifier 0.905


In [4]:
from sklearn.ensemble import BaggingClassifier 
from sklearn.tree import DecisionTreeClassifier 

# if bootstrap - Fals --> you activate the pasting methid
# n_jobs determines the number of CPU cores that are to be used in the processing, setting to -1 will use all the avialbel CPU cores 

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500, 
    max_samples=100, bootstrap=True, n_jobs=-1)
bag_clf.fit(X_train, y_train) 
y_pred = bag_clf.predict(X_test)
print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

VotingClassifier 0.909


In [6]:
tree_clf=DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train , y_train)
y_pred_tree=tree_clf.predict(X_test)
print(accuracy_score(y_test,y_pred_tree))

0.865


# Random Forest

- Random forest is an ensemble of Decision Trees, generally trained via the bagging method ( or sometimes pasting) 
- Typically setting the max_samples =  size of the traing set 

In [8]:
from sklearn.ensemble import RandomForestClassifier
rdn_clf=RandomForestClassifier(n_estimators=500,max_leaf_nodes=16, n_jobs=1)
rdn_clf.fit(X_train ,y_train)

y_pred_rf=rnd_clf.predict(X_test)

In [9]:
print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

VotingClassifier 0.909


In [12]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(splitter="random", max_leaf_nodes=16), 
    n_estimators=500, max_samples=1, bootstrap=True, n_jobs=-1)