<a href="https://colab.research.google.com/github/rodrigofardin/Reconhecimento-de-Padroes/blob/main/Ensembles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import numpy as np
import random
import warnings
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from scipy import stats
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.base import BaseEstimator, ClassifierMixin
from collections import Counter
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier

In [17]:
# Download OpenML dataset 1504. as_frame=True asks for pandas objects; the
# next cell relies on that when it calls .map on request.target.
request = fetch_openml(data_id=1504, as_frame=True)

In [22]:
# Feature matrix as a plain ndarray; the labels stay a pandas Series.
X = np.array(request.data)
# The class labels are expected to be the strings "1" and "2"; turn them into ints.
y = request.target.map({"1": 1, "2": 2})
# Series.map yields NaN for every label that is missing from the dict and
# does so silently, so check here instead of failing inside a later fit().
assert y.notna().all(), "target contains labels other than '1' and '2'"
assert X.shape[0] == y.shape[0], "X and y must have the same number of rows"
X.shape, y.shape

((1941, 33), (1941,))

In [24]:
# Hold-out split with a fixed seed; the test fraction is train_test_split's default.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)
# Shapes of the four partitions, in the order they were unpacked.
tuple(parte.shape for parte in (X_train, X_test, y_train, y_test))

((1455, 33), (486, 33), (1455,), (486,))

In [25]:
# Committee of three different classifiers, each with its default settings.
modelo = VotingClassifier(
    estimators=[
        ('knn', KNeighborsClassifier()),
        ('gnb', GaussianNB()),
        ('per', Perceptron()),
    ],
).fit(X_train, y_train)
vote_pred = modelo.predict(X_test)
# Element-wise hit mask followed by the fraction of hits (accuracy).
vote_hits = y_test == vote_pred
vote_hits, vote_hits.mean()

(1605    False
 1502     True
 70       True
 976      True
 1052     True
         ...  
 987      True
 745      True
 567      True
 198      True
 15       True
 Name: Class, Length: 486, dtype: bool, 0.6440329218106996)

In [26]:
# Bagging of 100 random-splitter decision trees, each seeing 15% of the features.
modelo = BaggingClassifier(
    DecisionTreeClassifier(splitter='random'),
    n_estimators=100,
    max_features=0.15,
    random_state=42,
).fit(X_train, y_train)
bag_pred = modelo.predict(X_test)
# Element-wise hit mask followed by the fraction of hits (accuracy).
bag_hits = y_test == bag_pred
bag_hits, bag_hits.mean()

(1605    False
 1502     True
 70       True
 976      True
 1052     True
         ...  
 987      True
 745      True
 567      True
 198      True
 15       True
 Name: Class, Length: 486, dtype: bool, 0.8148148148148148)

In [27]:
# Random forest with default hyper-parameters and a fixed seed.
modelo = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rfc_pred = modelo.predict(X_test)
# Element-wise hit mask followed by the fraction of hits (accuracy).
rfc_hits = y_test == rfc_pred
rfc_hits, rfc_hits.mean()

(1605    True
 1502    True
 70      True
 976     True
 1052    True
         ... 
 987     True
 745     True
 567     True
 198     True
 15      True
 Name: Class, Length: 486, dtype: bool, 0.9917695473251029)

In [28]:
# Extremely randomized trees with default hyper-parameters and a fixed seed.
modelo = ExtraTreesClassifier(random_state=42).fit(X_train, y_train)
etc_pred = modelo.predict(X_test)
# Element-wise hit mask followed by the fraction of hits (accuracy).
etc_hits = y_test == etc_pred
etc_hits, etc_hits.mean()

(1605    True
 1502    True
 70      True
 976     True
 1052    True
         ... 
 987     True
 745     True
 567     True
 198     True
 15      True
 Name: Class, Length: 486, dtype: bool, 1.0)

In [29]:
# Seed Python's global `random` module: caracteristicaValor draws its split
# feature and threshold from it, so BaggingClassifier's random_state alone
# does not make the Arvore ensemble below repeatable.
random.seed(42)

def maisFrequente(y):
  """Return the most frequent value found in ``y``.

  ``y`` may be any array-like (ndarray of any shape, list, tuple, pandas
  Series); it is flattened before counting. When several values share the
  highest count, the one encountered first wins. Raises IndexError when
  ``y`` is empty.
  """
  # np.asarray gives every array-like a ``.flat`` iterator; calling ``.flat``
  # directly on ``y`` only works when ``y`` already is an ndarray.
  return Counter(np.asarray(y).flat).most_common(1)[0][0]

def caracteristicaValor(X):
  """Draw a random split candidate for the 2-D array ``X``.

  Returns ``(feat, valor)``: a column index chosen uniformly among the
  columns of ``X`` and a threshold drawn between that column's minimum and
  maximum. Both draws come from the global ``random`` module.
  """
  ultima_coluna = X.shape[1] - 1
  # The column is drawn before the threshold; keeping this order keeps
  # seeded runs identical.
  feat = random.randint(0, ultima_coluna)
  coluna = X[:, feat]
  minimo, maximo = np.min(coluna), np.max(coluna)
  amplitude = maximo - minimo
  valor = random.random() * amplitude + minimo
  return feat, valor

# Mixin listed before BaseEstimator: this is the inheritance order the
# scikit-learn developer guide asks for.
class Arvore(ClassifierMixin, BaseEstimator):
  """Decision tree whose splits are chosen completely at random.

  Every node asks ``caracteristicaValor`` for a random column and a random
  threshold. Rows above the threshold go to the ``maiores`` subtree, the
  others to the ``menores`` subtree. A node becomes a leaf when the drawn
  threshold leaves one of the two sides empty (for instance when the chosen
  column is constant in that node); the leaf answers with the most frequent
  label it received.

  Predictions are written into an int64 array, so labels must be integers.
  """

  def fit(self, X, y):
    """Grow the tree on ``X`` (2-D array-like) and ``y`` (1-D array-like).

    Returns ``self`` so that calls can be chained, as scikit-learn
    estimators are expected to do.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    # Forget any previous fit: a leftover ``resposta`` would make predict()
    # treat a node that now has children as a leaf.
    for atributo in ("resposta", "maiores", "menores"):
      vars(self).pop(atributo, None)
    self.caracteristica, self.valor = caracteristicaValor(X)
    maiores = X[:,self.caracteristica] > self.valor
    # Split only when both sides receive at least one row.
    if maiores.any() and not maiores.all():
      # The "maiores" branch is grown first; the order fixes the sequence of
      # random draws.
      self.maiores = Arvore().fit(X[maiores,:], y[maiores])
      self.menores = Arvore().fit(X[~maiores,:], y[~maiores])
    else:
      self.resposta = maisFrequente(y)
    return self

  def predict(self, X):
    """Return an int64 array with one predicted label per row of ``X``."""
    X = np.asarray(X)
    y = np.empty((X.shape[0]), dtype=np.int64)
    if hasattr(self, "resposta"):
      # Leaf: the same answer for every row.
      y[:] = self.resposta
    else:
      # Internal node: route each row to the matching subtree.
      maiores = X[:,self.caracteristica] > self.valor
      y[maiores] = self.maiores.predict(X[maiores,:])
      y[~maiores] = self.menores.predict(X[~maiores,:])
    return y

# Bagging of 200 hand-written random trees, each seeing 10% of the features.
modelo = BaggingClassifier(
    Arvore(),
    n_estimators=200,
    max_features=0.1,
    random_state=42,
)
modelo.fit(X_train, y_train)
bag_pred = modelo.predict(X_test)
# Element-wise hit mask followed by the fraction of hits (accuracy).
bag_hits = y_test == bag_pred
bag_hits, bag_hits.mean()

(1605    False
 1502    False
 70       True
 976      True
 1052     True
         ...  
 987      True
 745      True
 567      True
 198      True
 15       True
 Name: Class, Length: 486, dtype: bool, 0.6419753086419753)

In [30]:
# AdaBoost over random-splitter trees limited to depth 25.
modelo = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=25, splitter='random'),
    learning_rate=0.15,
    random_state=42,
).fit(X_train, y_train)
abc_pred = modelo.predict(X_test)
# Element-wise hit mask followed by the fraction of hits (accuracy).
abc_hits = y_test == abc_pred
abc_hits, abc_hits.mean()

(1605    True
 1502    True
 70      True
 976     True
 1052    True
         ... 
 987     True
 745     True
 567     True
 198     True
 15      True
 Name: Class, Length: 486, dtype: bool, 1.0)

In [32]:
# NOTE: this filter is global; it silences every warning for the rest of the
# kernel session, not only the ones raised by this cell.
warnings.filterwarnings('ignore')

# Committee reused as one of the stacked base learners (and by the grid
# search cell further down).
voting = VotingClassifier([
    ('knn', KNeighborsClassifier()),
    ('gnb', GaussianNB()),
    ('per', Perceptron())
])

# The two forests are seeded like the other cells of this notebook so that a
# re-run reproduces the same score.
modelo = StackingClassifier([
    ('voting', voting),
    ('extrat', ExtraTreesClassifier(random_state=42)),
    ('ranfor', RandomForestClassifier(random_state=42))
], cv=3, passthrough=True)

modelo.fit(X_train, y_train)
stack_pred = modelo.predict(X_test)
# Element-wise hit mask followed by the fraction of hits (accuracy).
stack_hits = stack_pred == y_test
stack_hits, sum(stack_hits)/len(stack_hits)

(1605    False
 1502    False
 70       True
 976      True
 1052     True
         ...  
 987      True
 745      True
 567      True
 198      True
 15       True
 Name: Class, Length: 486, dtype: bool, 0.6481481481481481)

In [41]:
# Stack three random forests that differ only in their seed (42, 43, 44).
modelo = StackingClassifier(
    estimators=[
        (f'randomforest{semente}', RandomForestClassifier(random_state=semente))
        for semente in (42, 43, 44)
    ],
    cv=3,
    passthrough=True,
).fit(X_train, y_train)
sc_pr = modelo.predict(X_test)
# Element-wise hit mask followed by the fraction of hits (accuracy).
schits = y_test == sc_pr
schits, schits.mean()

(1605    False
 1502    False
 70       True
 976      True
 1052     True
         ...  
 987      True
 745      True
 567      True
 198      True
 15       True
 Name: Class, Length: 486, dtype: bool, 0.6481481481481481)

In [42]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the members of the `voting` committee.
# Each var_smoothing value is listed once: a repeated entry only adds
# identical candidates (and their cross-validation fits) without changing
# which candidate is selected.
parametros = {'knn__n_neighbors': [3, 5, 7, 11, 13],
              'gnb__var_smoothing': [1e-8, 1e-9],
              }

modelo = GridSearchCV(voting, parametros, cv=3)

modelo.fit(X_train, y_train)
# The result names are the ones used by the stacking cell; they are kept so
# that the variables defined by this cell stay the same.
stack_pred = modelo.predict(X_test)
stack_hits = stack_pred == y_test
stack_hits, sum(stack_hits)/len(stack_hits)

(1605    False
 1502     True
 70       True
 976      True
 1052     True
         ...  
 987      True
 745      True
 567      True
 198      True
 15       True
 Name: Class, Length: 486, dtype: bool, 0.6419753086419753)