In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training set and report how the binary target is distributed.
df_train = pd.read_csv('../input/porto-seguro-safe-driver-prediction/train.csv')

target_count = df_train['target'].value_counts()
n_class_0, n_class_1 = target_count[0], target_count[1]
print('Class 0:', n_class_0)
print('Class 1:', n_class_1)
print('Proportion:', round(n_class_0 / n_class_1, 2), ': 1')

# Trailing ';' suppresses the Axes repr in the cell output.
target_count.plot(kind='bar', title='Count (target)');

## Vamos olhar para o comportamento dos dados

In [None]:
# Preview the first rows of the training frame.
df_train.head()

>> não nos fala muito, mas temos um id na primeira coluna que podemos dropar e o target que precisamos dropar das features

**<h2 id="t2" style="margin-bottom: 18px">Paradoxo da Acurácia</h2>**

Um dos maiores erros que data scientists inexperientes cometem quando lidam com datasets desbalanceados é confiar em uma métrica simples como <code>accuracy_score</code>. Apesar de um score elevado nessa métrica, vamos provar como ela pode ser enganosa.


In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report

# Select the feature columns by dropping 'id' and 'target' by name, so the selection
# does not depend on those two columns being the first two in the CSV.
labels = df_train.columns.drop(['id', 'target'])

X = df_train[labels]
y = df_train['target']

# 70/30 split, stratified on y so both partitions keep the original class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:

# Baseline: default XGBoost classifier trained on the unmodified training split.
model = XGBClassifier().fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

### Vamos piorar o nosso modelo
Se esse modelo estiver certo com uma acurácia de 96%, podemos piorá-lo ao treinar e testar com apenas uma feature 

In [None]:
# Same default classifier, but trained and scored on the single column 'ps_calc_01'.
single_feature = ['ps_calc_01']
model = XGBClassifier().fit(X_train[single_feature], y_train)
y_pred = model.predict(X_test[single_feature])

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

<h2 id="t3" style="margin-bottom: 18px">Matriz de Confusão</h2>

Uma forma interessante de avaliar os resultados é através da matriz de confusão que mostra os valores preditos e esperados ou reais.  
Na primeira linha, a primeira coluna indica quantos "Classe 0" foram preditos corretamente (como "Classe 0") 
Na primeira linha, segunda coluna os erroneamente classificados como "Classe 1".
Na segunda linha, a primeira coluna nos mostra quantos "Classe 1" foram preditos erroneamente (como "Classe 0") 
Na segunda linha, segunda coluna, os corretamente classificados como "Classe 1".

A diagonal descendente nos mostra as predições corretas. Modelos que acertam mais possuem concentração de valores maiores na primeira linha, primeira coluna e na segunda linha, segunda coluna.

In [None]:
from sklearn.metrics import confusion_matrix
from matplotlib import pyplot as plt

# Confusion matrix for the current y_pred against the held-out labels.
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print('Confusion matrix:\n', conf_mat)

labels = ['Class 0', 'Class 1']
fig, ax = plt.subplots()
cax = ax.matshow(conf_mat, cmap=plt.cm.Blues)
fig.colorbar(cax)
# Pin one tick per matrix cell before labelling. Padding the label list with ''
# only lines up when matplotlib happens to place an extra tick before the first cell.
ax.set_xticks(range(len(labels)))
ax.set_yticks(range(len(labels)))
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
ax.set_xlabel('Predicted')
ax.set_ylabel('Expected')
plt.show()

Não é o nosso caso! Assim como no exemplo de fraudes, o modelo não previu nenhuma observação como "Classe 1"

<h1 id="t5">Resampling</h1>

In [None]:
# Build the manual resampling pools from the training split only, so the rows held
# out in X_test never reach a model that is later scored on them.
df_train_split = df_train.loc[X_train.index]

# Look the class sizes up by label instead of unpacking value_counts(), whose order
# follows frequency rather than class label.
class_counts = df_train_split['target'].value_counts()
len_class_0 = int(class_counts[0])
len_class_1 = int(class_counts[1])

# Split the training rows by class.
df_class_0 = df_train_split[df_train_split['target'] == 0]
df_class_1 = df_train_split[df_train_split['target'] == 1]

<h2 id="t5">Random undersampling</h2>
método raiz: vai gerar os mesmos resultados que o imblearn (imbalanced-learn), biblioteca compatível com o scikit-learn.

In [None]:
# Shrink class 0 to the size of class 1 by sampling rows without replacement,
# then stack the sampled class-0 rows on top of all class-1 rows.
df_class_0_under = df_class_0.sample(n=len_class_1, random_state=42)
df_test_under = pd.concat([df_class_0_under, df_class_1], axis=0)

under_counts = df_test_under['target'].value_counts()
print('Random under-sampling:')
print(under_counts)

under_counts.plot(kind='bar', title='Count (target)')
plt.show()

**Vamos modelar e testar**

In [None]:
# Separate features and target of the under-sampled frame. The feature columns are
# chosen by dropping 'id' and 'target' by name rather than by position.
labels = df_train.columns.drop(['id', 'target'])
X_tun = df_test_under[labels]
y_tun = df_test_under['target']

In [None]:
# Train on the manually under-sampled data and score on the held-out split.
model = XGBClassifier().fit(X_tun, y_tun)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

In [None]:
from sklearn.metrics import confusion_matrix
from matplotlib import pyplot as plt

# Confusion matrix for the current y_pred against the held-out labels.
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print('Confusion matrix:\n', conf_mat)

labels = ['Class 0', 'Class 1']
fig, ax = plt.subplots()
cax = ax.matshow(conf_mat, cmap=plt.cm.Blues)
fig.colorbar(cax)
# Pin one tick per matrix cell before labelling. Padding the label list with ''
# only lines up when matplotlib happens to place an extra tick before the first cell.
ax.set_xticks(range(len(labels)))
ax.set_yticks(range(len(labels)))
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
ax.set_xlabel('Predicted')
ax.set_ylabel('Expected')
plt.show()

<h2 id="t5">Random oversampling</h2>
método raiz: vai gerar resultados muito parecidos aos do imblearn (imbalanced-learn), biblioteca compatível com o scikit-learn.

In [None]:
# Grow class 1 to the size of class 0. Sampling with replacement is required
# because more rows are requested than class 1 contains.
df_class_1_over = df_class_1.sample(n=len_class_0, replace=True, random_state=42)
df_test_over = pd.concat([df_class_0, df_class_1_over], axis=0)

over_counts = df_test_over['target'].value_counts()
print('Random over-sampling:')
print(over_counts)

# Trailing ';' suppresses the Axes repr in the cell output.
over_counts.plot(kind='bar', title='Count (target)');

In [None]:
# Separate features and target of the over-sampled frame. The feature columns are
# chosen by dropping 'id' and 'target' by name rather than by position.
labels = df_train.columns.drop(['id', 'target'])
X_tov = df_test_over[labels]
y_tov = df_test_over['target']

In [None]:
# Train on the manually over-sampled data and score on the held-out split.
model = XGBClassifier().fit(X_tov, y_tov)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

In [None]:
# Confusion matrix for the current y_pred against the held-out labels.
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print('Confusion matrix:\n', conf_mat)

labels = ['Class 0', 'Class 1']
fig, ax = plt.subplots()
cax = ax.matshow(conf_mat, cmap=plt.cm.Blues)
fig.colorbar(cax)
# Pin one tick per matrix cell before labelling. Padding the label list with ''
# only lines up when matplotlib happens to place an extra tick before the first cell.
ax.set_xticks(range(len(labels)))
ax.set_yticks(range(len(labels)))
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
ax.set_xlabel('Predicted')
ax.set_ylabel('Expected')
plt.show()

In [None]:
# F1 of the positive class for the current predictions on the held-out split.
score = f1_score(y_true=y_test, y_pred=y_pred)
print("F1_score: %.2f" % score)

<h2 id="t5">Usando o Imblearn do Scikit Learn</h2>
Essa biblioteca possui o mesmo random que fizemos na mão e outros métodos mais elaborados de oversampling e undersampling.

In [None]:
# Import imbalace technique algorithims
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from collections import Counter # counter takes values returns value_counts dictionary

## Random Undersampling

In [None]:
# Resample the training split only. Resampling the full X, y would put rows of
# X_test into the training data of a model that is then scored on X_test.
print('Original dataset shape %s' % Counter(y_train))

rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X_train, y_train)

print('Resampled dataset shape %s' % Counter(y_rus))

In [None]:
# Train on the RandomUnderSampler output and score on the held-out split.
model = XGBClassifier().fit(X_rus, y_rus)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

In [None]:
from sklearn.metrics import confusion_matrix
from matplotlib import pyplot as plt

# Confusion matrix for the current y_pred against the held-out labels.
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print('Confusion matrix:\n', conf_mat)

labels = ['Class 0', 'Class 1']
fig, ax = plt.subplots()
cax = ax.matshow(conf_mat, cmap=plt.cm.Blues)
fig.colorbar(cax)
# Pin one tick per matrix cell before labelling. Padding the label list with ''
# only lines up when matplotlib happens to place an extra tick before the first cell.
ax.set_xticks(range(len(labels)))
ax.set_yticks(range(len(labels)))
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
ax.set_xlabel('Predicted')
ax.set_ylabel('Expected')
plt.show()

## Random Oversampling

In [None]:
# Resample the training split only. Resampling the full X, y would put rows of
# X_test into the training data of a model that is then scored on X_test.
print('Original dataset shape %s' % Counter(y_train))
ros = RandomOverSampler(random_state=42)
# fit_resample is the current sampler API; fit_sample was removed from imbalanced-learn.
X_ros, y_ros = ros.fit_resample(X_train, y_train)

print(X_ros.shape[0] - X_train.shape[0], 'new random picked points')

print('Resampled dataset shape %s' % Counter(y_ros))

In [None]:
# Train on the RandomOverSampler output and score on the held-out split.
model = XGBClassifier().fit(X_ros, y_ros)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

In [None]:
# Confusion matrix for the current y_pred against the held-out labels.
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print('Confusion matrix:\n', conf_mat)

labels = ['Class 0', 'Class 1']
fig, ax = plt.subplots()
cax = ax.matshow(conf_mat, cmap=plt.cm.Blues)
fig.colorbar(cax)
# Pin one tick per matrix cell before labelling. Padding the label list with ''
# only lines up when matplotlib happens to place an extra tick before the first cell.
ax.set_xticks(range(len(labels)))
ax.set_yticks(range(len(labels)))
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
ax.set_xlabel('Predicted')
ax.set_ylabel('Expected')
plt.show()

In [None]:
# F1 of the positive class for the current predictions on the held-out split.
score = f1_score(y_true=y_test, y_pred=y_pred)
print("F1_score: %.2f" % score)

## <a id='smote'>Synthetic Minority OverSampling Technique (SMOTE)</a>

SMOTE (Synthetic Minority Oversampling TEchnique) consiste em sintetizar elementos da classe minoritária com base nos elementos que já existem. Funciona de forma randômica, selecionando aleatoriamente observações da classe minoritária e computando pontos através de um KNN. Os pontos sintéticos são adicionados entre os pontos escolhidos e seus vizinhos.

 ![](https://raw.githubusercontent.com/rafjaa/machine_learning_fecib/master/src/static/img/smote.png)

In [None]:
# Resample the training split only. Resampling the full X, y would put rows of
# X_test (and synthetic points built from them) into the training data of a model
# that is then scored on X_test.
print('Original dataset shape %s' % Counter(y_train))
# Seeded like the other samplers in this notebook so the synthetic points are reproducible.
smote = SMOTE(sampling_strategy='minority', random_state=42)
# fit_resample is the current sampler API; fit_sample was removed from imbalanced-learn.
X_sm, y_sm = smote.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_sm))

In [None]:
# Train on the SMOTE output and score on the held-out split.
model = XGBClassifier().fit(X_sm, y_sm)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

In [None]:
# Confusion matrix for the current y_pred against the held-out labels.
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print('Confusion matrix:\n', conf_mat)

labels = ['Class 0', 'Class 1']
fig, ax = plt.subplots()
cax = ax.matshow(conf_mat, cmap=plt.cm.Blues)
fig.colorbar(cax)
# Pin one tick per matrix cell before labelling. Padding the label list with ''
# only lines up when matplotlib happens to place an extra tick before the first cell.
ax.set_xticks(range(len(labels)))
ax.set_yticks(range(len(labels)))
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
ax.set_xlabel('Predicted')
ax.set_ylabel('Expected')
plt.show()

## <a id='adasyn'>Adaptive Synthetic Sampling Method for Imbalanced Data (ADASYN)</a>

ADASYN (Adaptive Synthetic) também é um algoritmo que gera dados sintéticos. Sua maior vantagem é que ele tenta aprender prioritariamente com os dados mais difíceis de aprender da classe minoritária. Sua principal vantagem pode virar uma fraqueza se os dados da classe minoritária forem muito esparsos.


In [None]:
# Resample the training split only. Resampling the full X, y would put rows of
# X_test (and synthetic points built from them) into the training data of a model
# that is then scored on X_test.
print('Original dataset shape %s' % Counter(y_train))

adasyn = ADASYN(random_state=42)
X_ada, y_ada = adasyn.fit_resample(X_train, y_train)

print('Resampled dataset shape %s' % Counter(y_ada))

In [None]:
# Train on the ADASYN output and score on the held-out split.
model = XGBClassifier().fit(X_ada, y_ada)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

In [None]:
# Confusion matrix for the current y_pred against the held-out labels.
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print('Confusion matrix:\n', conf_mat)

labels = ['Class 0', 'Class 1']
fig, ax = plt.subplots()
cax = ax.matshow(conf_mat, cmap=plt.cm.Blues)
fig.colorbar(cax)
# Pin one tick per matrix cell before labelling. Padding the label list with ''
# only lines up when matplotlib happens to place an extra tick before the first cell.
ax.set_xticks(range(len(labels)))
ax.set_yticks(range(len(labels)))
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
ax.set_xlabel('Predicted')
ax.set_ylabel('Expected')
plt.show()

<h2 id="t8" style="margin-bottom: 18px">Under-sampling: Tomek links</h2>
Tomek links são pares de instâncias próximas, mas de classes opostas. Essa técnica de undersampling remove as observações da classe majoritária, aumentando a fronteira entre as duas classes e, dessa forma, facilitando o processo de classificação.

![](https://raw.githubusercontent.com/rafjaa/machine_learning_fecib/master/src/static/img/tomek.png?v=2)

In [None]:
from imblearn.under_sampling import TomekLinks

# Resample the training split only. Resampling the full X, y would put rows of
# X_test into the training data of a model that is then scored on X_test.
print('Original dataset shape %s' % Counter(y_train))

tl = TomekLinks(sampling_strategy='majority')
# fit_resample is the current sampler API; fit_sample was removed from imbalanced-learn.
X_tl, y_tl = tl.fit_resample(X_train, y_train)

# sample_indices_ holds the positions of the rows that were KEPT, so the removed
# rows are the positions missing from it.
removed_idx = np.setdiff1d(np.arange(len(X_train)), tl.sample_indices_)
print('Removed indexes:', removed_idx)
print('Resampled dataset shape %s' % Counter(y_tl))


In [None]:
# Train on the Tomek-links output and score on the held-out split.
model = XGBClassifier().fit(X_tl, y_tl)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

In [None]:
# Confusion matrix for the current y_pred against the held-out labels.
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print('Confusion matrix:\n', conf_mat)

labels = ['Class 0', 'Class 1']
fig, ax = plt.subplots()
cax = ax.matshow(conf_mat, cmap=plt.cm.Blues)
fig.colorbar(cax)
# Pin one tick per matrix cell before labelling. Padding the label list with ''
# only lines up when matplotlib happens to place an extra tick before the first cell.
ax.set_xticks(range(len(labels)))
ax.set_yticks(range(len(labels)))
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
ax.set_xlabel('Predicted')
ax.set_ylabel('Expected')
plt.show()