<a href="https://colab.research.google.com/github/rodrigofer89/Machine-Learning/blob/main/Ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Aula 24 - Métodos de ensemble

In [15]:
# imports
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import confusion_matrix

from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [16]:
# Load the churn dataset from the CSV file and preview ten randomly drawn rows
df = pd.read_csv(filepath_or_buffer='Churn_Modelling.csv')
df.sample(n=10)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
6366,6367,15677324,Botts,683,Germany,Male,73,9,124730.26,1,1,1,51999.5,0
8265,8266,15618562,Woodward,618,Germany,Female,40,0,140306.38,1,1,0,160618.61,1
6077,6078,15738588,Nebechi,660,Germany,Female,37,2,133200.09,1,0,0,71433.88,0
8365,8366,15659949,Chiu,850,France,Male,31,1,96399.31,2,1,0,106534.15,0
7668,7669,15611794,Galloway,526,Germany,Male,61,6,133845.28,2,1,1,45180.8,0
2876,2877,15585284,Thomson,604,Spain,Female,35,7,147285.52,1,1,1,57807.05,0
5248,5249,15700366,Burton,669,France,Male,39,3,119452.03,1,1,1,171575.54,0
2884,2885,15776047,Nicholls,620,France,Female,29,3,0.0,2,0,1,153392.28,0
8369,8370,15632882,Konovalova,684,Germany,Male,37,1,126817.13,2,1,1,29995.83,1
6635,6636,15678374,Colombo,666,France,Female,59,5,0.0,2,1,1,185123.09,0


In [17]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [18]:
# Build the feature matrix / target vector, encode the categorical columns,
# and split into training and test sets.
# Positional columns 3..12 (CreditScore .. EstimatedSalary) are the features;
# column 13 (Exited) is the target.
X = df.iloc[:, 3:13].values
y = df.iloc[:, 13].values

# Geography (feature column 1) -> integer codes
le = LabelEncoder()
X[:, 1] = le.fit_transform(X[:, 1])

# Gender (feature column 2) -> integer codes.
# Uses its own encoder so `le` stays fitted on Geography and `le_sex` on Gender
# (previously `le_sex` was created but `le` was re-fitted here instead).
le_sex = LabelEncoder()
X[:, 2] = le_sex.fit_transform(X[:, 2])

# One-hot encode Geography; every other column is passed through unchanged
ct = ColumnTransformer([('Geography', OneHotEncoder(), [1])],
                       remainder='passthrough')

X = ct.fit_transform(X)
# Drop the first column of the transformed matrix
# (presumably one of the Geography dummies, to avoid a redundant column — confirm)
X = X[:, 1:]

# Hold out 15% of the rows for testing, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.15,
                                                    random_state=0,
                                                    stratify=y)

In [30]:
# Standardize the features: learn the scaling statistics on the training set
# only, then apply that same transformation to both splits
sc = StandardScaler()
sc.fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [31]:
# Train the four base classifiers and compare their confusion matrices
# on the held-out test set.
model1 = LogisticRegression()
# random_state pins how the tree breaks ties between equally good splits,
# so re-running the notebook yields the same tree
model2 = DecisionTreeClassifier(max_depth=2, random_state=0)
model3 = SVC()
model4 = KNeighborsClassifier(n_neighbors=5)

base_models = (model1, model2, model3, model4)

# Fit every model on the same training data
for clf in base_models:
    clf.fit(X_train, y_train)

# Predictions on the test set, one array per model (same order as base_models)
y_pred1, y_pred2, y_pred3, y_pred4 = (clf.predict(X_test) for clf in base_models)

# Confusion matrix of each model against the true test labels
cm_model1, cm_model2, cm_model3, cm_model4 = (
    confusion_matrix(y_test, y_hat)
    for y_hat in (y_pred1, y_pred2, y_pred3, y_pred4)
)

# Show the four matrices in model order
display(cm_model1, cm_model2, cm_model3, cm_model4)


array([[1147,   47],
       [ 245,   61]])

array([[1104,   90],
       [ 181,  125]])

array([[1172,   22],
       [ 206,  100]])

array([[1131,   63],
       [ 203,  103]])

In [32]:
# 10-fold cross-validation of each base model on the training set
kfold = KFold(n_splits=10)
result1, result2, result3, result4 = (
    cross_val_score(clf, X_train, y_train, cv=kfold)
    for clf in (model1, model2, model3, model4)
)

# Report the mean score across folds for each model
for label, scores in (('ACC Regressão Logistica: ', result1),
                      ('ACC    àvore de Decisão: ', result2),
                      ('ACC                 SVM: ', result3),
                      ('ACC                 KNN: ', result4)):
    print(label, scores.mean())

ACC Regressão Logistica:  0.8110588235294116
ACC    àvore de Decisão:  0.8311764705882352
ACC                 SVM:  0.8558823529411764
ACC                 KNN:  0.8294117647058823


In [33]:
# Ensemble scheme: combine the four base models in a voting classifier
# (voting mode is left at the library default)
estimators = list(zip(('regressão', 'árvore', 'svm', 'knn'),
                      (model1, model2, model3, model4)))

# Fit the ensemble on the training data, then predict the test set
ensemble = VotingClassifier(estimators).fit(X_train, y_train)
y_pred = ensemble.predict(X_test)

In [34]:
# Cross-validate the ensemble on the training set with the same fold splitter
# used for the base models, and report the mean score
result_ensemble = cross_val_score(estimator=ensemble,
                                  X=X_train,
                                  y=y_train,
                                  cv=kfold)
print('ACC Ensemble: ', result_ensemble.mean())

ACC Ensemble:  0.8425882352941176


In [35]:
# Confusion matrix of the ensemble's test-set predictions against the true labels
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[1181,   13],
       [ 234,   72]])