# Ensemble

### Classificar se uma pessoa vai fechar a conta no banco
Dataset: https://www.kaggle.com/shrutimechlearn/churn-modelling

In [None]:
# Importar as bibliotecas

import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

- Precision (Positive Predictive Value)
- Recall (True Positive Rate)
- F1: the harmonic mean of precision and recall
- Support: the number of occurrences of each label in `y_true`

Where tp, fp and fn are the counts of true positives, false positives and false negatives:

- Precision (Positive Predictive Value) = tp / (tp + fp)
- Recall (True Positive Rate) = tp / (tp + fn)


In [None]:

# Load the churn dataset from a CSV file (relative path) and preview the first rows.
df = pd.read_csv('Churn_Modelling.csv')
df.head()

In [None]:
# Summary of the raw frame as printed by DataFrame.info().
df.info()


In [None]:
# Drop the row counter, customer id and surname columns.
# errors='ignore' keeps this cell re-runnable: on a second run the columns
# are already gone and a plain drop would raise KeyError.
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'], errors='ignore')
df.head()

In [None]:
# Descriptive statistics, transposed so each column of df becomes a row.
df.describe().T

In [None]:
# Count and proportion of missing values (NA) per column.
na_count = df.isnull().sum()
na_proportion = na_count / len(df)
pd.DataFrame(
    zip(na_count, na_proportion),
    columns=['Count', 'Proportion'],
    index=df.columns,
)


In [None]:
# Target variable "Exited": percentage of rows in each class (how many churned).
exited_counts = df['Exited'].value_counts()
exited_counts.div(df.shape[0]).mul(100)

In [None]:
# Histogram of Age, titled so the figure stands alone (same pattern as the Age boxplot cell).
sb.histplot(data=df, x="Age")
plt.title('Distribuição de Age')
plt.show()

In [None]:
# Age variable: box plot of Age split by the Exited target.
ax = sb.boxplot(data=df, x='Exited', y='Age')
ax.set_title('Distribuição de Age por Churn')
plt.show()

In [None]:
# Histogram of CreditScore, titled so the figure stands alone.
sb.histplot(data=df, x="CreditScore")
plt.title('Distribuição de CreditScore')
plt.show()

In [None]:
# Box plot of CreditScore split by the Exited target, titled like the Age boxplot.
sb.boxplot(x='Exited', y='CreditScore', data=df, palette='hls')
plt.title('Distribuição de CreditScore por Churn')
plt.show()

In [None]:
# Histogram of Tenure, titled so the figure stands alone.
sb.histplot(data=df, x="Tenure")
plt.title('Distribuição de Tenure')
plt.show()

In [None]:
# Box plot of Tenure split by the Exited target, titled like the Age boxplot.
sb.boxplot(x='Exited', y='Tenure', data=df, palette='hls')
plt.title('Distribuição de Tenure por Churn')
plt.show()

In [None]:
# Comparison between Tenure and the target: share of each Tenure value
# within each Exited class, drawn as stacked bars.

Tenure_cat = pd.crosstab(index=df['Exited'], columns=df['Tenure'])
row_totals = Tenure_cat.sum(axis=1).astype(float)
Tenure_cat.div(row_totals, axis=0).plot.bar(stacked=True, figsize=(6, 6));

In [None]:
# Histogram of Balance, titled so the figure stands alone.
sb.histplot(data=df, x="Balance")
plt.title('Distribuição de Balance')
plt.show()

In [None]:
# Box plot of Balance split by the Exited target, titled like the Age boxplot.
sb.boxplot(x='Exited', y='Balance', data=df, palette='hls')
plt.title('Distribuição de Balance por Churn')
plt.show()

In [None]:
# Box plot of NumOfProducts split by the Exited target, titled like the Age boxplot.
sb.boxplot(x='Exited', y='NumOfProducts', data=df, palette='hls')
plt.title('Distribuição de NumOfProducts por Churn')
plt.show()

In [None]:
# Comparison between NumOfProducts and the target: share of each product
# count within each Exited class, drawn as stacked bars.

NumOfProducts_cat = pd.crosstab(index=df['Exited'], columns=df['NumOfProducts'])
row_totals = NumOfProducts_cat.sum(axis=1).astype(float)
NumOfProducts_cat.div(row_totals, axis=0).plot.bar(stacked=True, figsize=(6, 6));

In [None]:
# 

In [None]:
# Histogram of EstimatedSalary, titled so the figure stands alone.
sb.histplot(data=df, x="EstimatedSalary")
plt.title('Distribuição de EstimatedSalary')
plt.show()

In [None]:
# Box plot of EstimatedSalary split by the Exited target, titled like the Age boxplot.
sb.boxplot(x='Exited', y='EstimatedSalary', data=df, palette='hls')
plt.title('Distribuição de EstimatedSalary por Churn')
plt.show()

In [None]:
# Comparison between Gender and the target: share of each gender within
# each Exited class, drawn as stacked bars.

Gender_cat = pd.crosstab(index=df['Exited'], columns=df['Gender'])
row_totals = Gender_cat.sum(axis=1).astype(float)
Gender_cat.div(row_totals, axis=0).plot.bar(stacked=True, figsize=(6, 6));

In [None]:
# Comparison between Geography and the target: share of each geography
# within each Exited class, drawn as stacked bars.

Geography_cat = pd.crosstab(index=df['Exited'], columns=df['Geography'])
row_totals = Geography_cat.sum(axis=1).astype(float)
Geography_cat.div(row_totals, axis=0).plot.bar(stacked=True, figsize=(6, 6));

In [None]:
# Comparison between IsActiveMember and the target: share of each flag value
# within each Exited class, drawn as stacked bars.

IsActiveMember_cat = pd.crosstab(index=df['Exited'], columns=df['IsActiveMember'])
row_totals = IsActiveMember_cat.sum(axis=1).astype(float)
IsActiveMember_cat.div(row_totals, axis=0).plot.bar(stacked=True, figsize=(6, 6));

In [None]:
# Comparison between HasCrCard and the target: share of each flag value
# within each Exited class, drawn as stacked bars.

HasCrCard_cat = pd.crosstab(index=df['Exited'], columns=df['HasCrCard'])
row_totals = HasCrCard_cat.sum(axis=1).astype(float)
HasCrCard_cat.div(row_totals, axis=0).plot.bar(stacked=True, figsize=(6, 6));

In [None]:
# Re-check the frame summary after the identifier columns were dropped.
df.info()

In [None]:
# One option (left disabled): cast Balance and EstimatedSalary to int.

# df['Balance'] = df['Balance'].astype(int)
# df['EstimatedSalary'] = df['EstimatedSalary'].astype(int)

df.info()

In [None]:
# Encode the categorical variables as dummy columns, dropping the first
# level of each. dtype=int keeps the dummies numeric whatever default dummy
# dtype the installed pandas version uses.

baseline_df = pd.get_dummies(df, drop_first=True, dtype=int)

# Preview the first rows instead of printing the whole frame as plain text.
baseline_df.head()

In [None]:
# Summary of the encoded frame as printed by DataFrame.info().
baseline_df.info()

In [None]:
# usando diferentes classificadores 
# 1º: Separar o conjunto em variavel resposta e variaveis de treinamento


In [None]:
# Feature matrix: every column of baseline_df except the target.
# NOTE(review): Index.difference sorts its result by default, so the column
# order of X may differ from baseline_df — confirm if the order matters.
feature_cols = baseline_df.columns.difference(['Exited'])
X = baseline_df[feature_cols].values
print(X[0])

# Target vector.
y = baseline_df['Exited'].values
print(y)

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into train and test: 30% held out, fixed seed.
features = baseline_df.drop('Exited', axis=1)
target = baseline_df['Exited']
X_treino, X_teste, y_treino, y_teste = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=17,
)


In [None]:
# Descriptive statistics of the training features, one row per feature.
X_treino.describe().T

In [None]:
# Shapes of the training features and the training target.
X_treino.shape, y_treino.shape

In [None]:
# Feature scaling, applied to the independent variables only.

from sklearn.preprocessing import MinMaxScaler

cols = X_treino.columns

MinMax = MinMaxScaler()
# Fit the scaler on the training split only and reuse the learned min/max on
# the test split. Re-fitting on the test split would scale the two splits
# differently and leak test statistics into preprocessing.
# Both splits stay DataFrames with the original column labels (a flat Index,
# not columns=[cols], which builds a MultiIndex) and their original row index.
X_treino = pd.DataFrame(MinMax.fit_transform(X_treino), columns=cols, index=X_treino.index)
X_teste = pd.DataFrame(MinMax.transform(X_teste), columns=cols, index=X_teste.index)
X_treino.head()


## Voting Ensemble

In [None]:
# Documentação de referência: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

# Documentação de referência: https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html

# Documentação de referência: https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.CategoricalNB.html


from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import CategoricalNB
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB

# Documentação de referência: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html

from sklearn.ensemble import VotingClassifier



In [None]:
# Base learners for the voting ensemble. The tree and the linear SVM are
# seeded so repeated runs give the same models, matching the seed used by
# the other estimators in this notebook.
knn = KNeighborsClassifier(n_neighbors=5)
decision_tree = DecisionTreeClassifier(max_depth=4, criterion='gini', min_samples_split=100, random_state=17)
nbCat = CategoricalNB()  # created but not included in the ensemble below
svm = LinearSVC(max_iter=5000, random_state=17)
nbGau = GaussianNB()

# No `voting` argument is passed, so the classifier's default voting mode applies.
voting_clf = VotingClassifier(
    estimators=[('dt', decision_tree), ('knn', knn), ('nb', nbGau), ('svm', svm)],
    n_jobs=-1)

voting_clf.fit(X_treino, y_treino)


# voting : {'hard', 'soft'}, default='hard'
# If 'hard', uses predicted class labels for majority rule voting.
# Else if 'soft', predicts the class label based on the argmax of the sums of the predicted probabilities,
# which is recommended for an ensemble of well-calibrated classifiers.

In [None]:
from sklearn.metrics import accuracy_score

# Fit each base model and the voting ensemble on the training split, then
# print its accuracy on the test split.
modelos = [decision_tree, knn, nbGau, svm, voting_clf]
for clf in modelos:
    y_predito = clf.fit(X_treino, y_treino).predict(X_teste)
    acuracia = accuracy_score(y_teste, y_predito)
    nome = type(clf).__name__
    print("Classificador {} tem acurácia de {}".format(nome, acuracia))

In [None]:
# Feature importances of the fitted decision tree, one "name:value" line per feature.

nomes = X_treino.columns
importancias = decision_tree.feature_importances_
for feature, importancia in zip(nomes, importancias):
    linha = "{}:{}".format(feature, importancia)
    print(linha)

## Bagging (Bootstrap aggregation)

In [None]:
# Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html
from sklearn.ensemble import BaggingClassifier

# 500 randomly-split trees, each fitted on 300 sampled rows. Bagging draws
# random samples, so it is seeded to make the results reproducible across
# runs (same seed as the other models in this notebook).
bagging = BaggingClassifier(
    DecisionTreeClassifier(splitter='random'),
    n_estimators=500,
    max_samples=300,
    random_state=17,
)
bagging.fit(X_treino, y_treino)

In [None]:
# Predict the test split with the bagging ensemble and show its accuracy.
y_predito = bagging.predict(X_teste)
accuracy_score(y_true=y_teste, y_pred=y_predito)

In [None]:
from sklearn.metrics import confusion_matrix


# Confusion matrix of the current predictions.
# The result gets its own name: assigning it to `confusion_matrix` would
# overwrite the imported function and make this cell fail when re-run.

matriz_confusao = confusion_matrix(y_teste, y_predito)
matriz_confusao


In [None]:
from sklearn.metrics import classification_report

# Text report of the classification metrics for the current predictions.
relatorio = classification_report(y_teste, y_predito)
print(relatorio)

# Resultado do classification_report:

## Random Forests

In [None]:
# Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
from sklearn.ensemble import RandomForestClassifier

# Forest of 500 shallow trees, seeded and trained on all available cores.
random_forest = RandomForestClassifier(
    n_estimators=500,
    max_depth=4,
    min_samples_split=100,
    n_jobs=-1,
    random_state=17,
)
random_forest.fit(X_treino, y_treino)

In [None]:
# Predict the test split with the random forest and show its accuracy.
y_predito = random_forest.predict(X_teste)
accuracy_score(y_true=y_teste, y_pred=y_predito)

In [None]:
# Print each feature label with its random-forest importance as a percentage.
nomes = X_treino.columns
for caracteristica, score in zip(nomes, random_forest.feature_importances_):
    mensagem = 'caracteristica {} e Importância {}'.format(caracteristica, score*100)
    print(mensagem)

In [None]:
from sklearn.metrics import confusion_matrix


# Confusion matrix of the current predictions.
# The result gets its own name: assigning it to `confusion_matrix` would
# overwrite the imported function and make this cell fail when re-run.

matriz_confusao = confusion_matrix(y_teste, y_predito)
matriz_confusao

In [None]:
from sklearn.metrics import classification_report

# Text report of the classification metrics for the current predictions.
relatorio = classification_report(y_teste, y_predito)
print(relatorio)

# Resultado do classification_report:

## Boosting : AdaBoost 

In [None]:
# Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost over depth-5 decision trees, seeded.
arvore_base = DecisionTreeClassifier(max_depth=5)
ada_boost = AdaBoostClassifier(
    arvore_base,
    n_estimators=500,
    learning_rate=0.7,
    random_state=17,
)
ada_boost.fit(X_treino, y_treino)

`n_estimators`: int, default=50
The maximum number of estimators at which boosting is terminated. In case of a perfect fit, the learning procedure is stopped early. Values must be in the range [1, inf).


`learning_rate`: float, default=1.0
Weight applied to each classifier at each boosting iteration. A higher learning rate increases the contribution of each classifier. There is a trade-off between the `learning_rate` and `n_estimators` parameters. Values must be in the range (0.0, inf).

In [None]:
# Predict the test split with AdaBoost and show its accuracy.
y_predito = ada_boost.predict(X_teste)
accuracy_score(y_true=y_teste, y_pred=y_predito)

In [None]:
from sklearn.metrics import confusion_matrix


# Confusion matrix of the current predictions.
# The result gets its own name: assigning it to `confusion_matrix` would
# overwrite the imported function and make this cell fail when re-run.

matriz_confusao = confusion_matrix(y_teste, y_predito)
matriz_confusao

In [None]:
from sklearn.metrics import classification_report

# Text report of the classification metrics for the current predictions.
relatorio = classification_report(y_teste, y_predito)
print(relatorio)

# Resultado do classification_report:

## Gradient Boosting

In [None]:
# Documentação https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
from sklearn.ensemble import GradientBoostingClassifier



In [None]:
# Gradient boosting with default hyper-parameters, seeded like the other
# models in this notebook so repeated runs give the same fitted model.
gradient_boost = GradientBoostingClassifier(random_state=17)

gradient_boost.fit(X_treino, y_treino)

In [None]:
# Predict the test split with gradient boosting and show its accuracy.
y_predito = gradient_boost.predict(X_teste)
accuracy_score(y_true=y_teste, y_pred=y_predito)

In [None]:
from sklearn.metrics import confusion_matrix


# Confusion matrix of the current predictions.
# The result gets its own name: assigning it to `confusion_matrix` would
# overwrite the imported function and make this cell fail when re-run.

matriz_confusao = confusion_matrix(y_teste, y_predito)
matriz_confusao

In [None]:
from sklearn.metrics import classification_report

# Text report of the classification metrics for the current predictions.
relatorio = classification_report(y_teste, y_predito)
print(relatorio)

# Resultado do classification_report:

In [None]:
# Print each feature label with its gradient-boosting importance as a percentage.
nomes = X_treino.columns
for caracteristica, score in zip(nomes, gradient_boost.feature_importances_):
    mensagem = 'caracteristica {} e Importância {}'.format(caracteristica, score*100)
    print(mensagem)



## XGBoost (Extreme Gradient Boosting)

In [None]:
# Documentação sobre XGBoost https://xgboost.readthedocs.io/en/stable/index.html

!pip install  xgboost

In [None]:
import warnings
# Silence every warning from here on ('ignore' action with no category filter).
warnings.simplefilter('ignore')
# Import XGBoost
import xgboost
# XGBoost Classifier
from xgboost import XGBClassifier

In [None]:
# Display the training features that are passed to XGBoost below.
X_treino

In [None]:

# XGBoost classifier: 500 trees of depth 5 with a small learning rate,
# trained on all available cores.
xgb = XGBClassifier(
    n_estimators=500,
    max_depth=5,
    learning_rate=0.01,
    n_jobs=-1,
)
bst = xgb.fit(X_treino, y_treino)



In [None]:
# Predict the test split with XGBoost and show its accuracy.
y_predito = xgb.predict(X_teste)
accuracy_score(y_true=y_teste, y_pred=y_predito)

In [None]:
from sklearn.metrics import confusion_matrix


# Matriz de Confusão

confusion_matrix = confusion_matrix(y_teste, y_predito)
confusion_matrix

In [None]:

from sklearn.metrics import ConfusionMatrixDisplay
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix, display_labels=['0', '1'])
disp.plot();



In [None]:
from sklearn.metrics import classification_report

# Text report of the classification metrics for the current predictions.
relatorio = classification_report(y_teste, y_predito)
print(relatorio)

# Resultado do classification_report:

In [None]:
# Display the predictions produced by the most recent predict() call.
y_predito

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy of the XGBoost model on the training split;
# print the mean over the folds.
scores_dt = cross_val_score(
    estimator=xgb,
    X=X_treino,
    y=y_treino,
    cv=10,
    scoring='accuracy',
)
print(scores_dt.mean())

In [None]:
from xgboost import plot_importance
    
# Plot feature importance of the fitted XGBoost model
# (trailing semicolon suppresses the returned object's repr).
plot_importance(xgb);

In [None]:
 # fazendo de outra forma:

In [None]:
# Importance scores (importance_type='weight') from the underlying booster,
# split into parallel lists of feature names and score values.
booster = xgb.get_booster()
feature_important = booster.get_score(importance_type='weight')
keys = [*feature_important.keys()]
values = [*feature_important.values()]



In [None]:
# One-column frame of scores indexed by feature name, sorted in descending order.
data = pd.DataFrame(values, index=keys, columns=["score"])
data = data.sort_values("score", ascending=False)
# Horizontal bar chart of the top 40 features.
data.nlargest(40, "score").plot.barh(figsize=(20, 10))