### Dependencies

In [None]:
#from google.colab import files
from IPython.display import clear_output
import os
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.layers as layers
import tensorflow.keras.callbacks as callbacks
import itertools
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

# Seed passed to the sampling and shuffling calls further down so they are repeatable.
RANDOM_STATE = 56
# Upper bound on the number of training epochs given to model.fit.
EPOCHS = 16
# Mini-batch size given to model.fit.
BATCH_SIZE = 512
# Symmetric bound: standardized features are clipped to the range [-CLIP, CLIP].
CLIP = 16.

##### Kaggle setup.

In [None]:
#files.upload()
#!pip install -q kaggle
#!mkdir -p ~/.kaggle
#!cp kaggle.json ~/.kaggle/
#!ls ~/.kaggle
#!chmod 600 /root/.kaggle/kaggle.json
#!kaggle kernels list — user `franckepeixoto` — sort-by dateRun
#!kaggle competitions download -c porto-seguro-safe-driver-prediction
#!unzip -q  train.csv.zip -d .
#!unzip -q  test.csv.zip -d .
#!rm -d sample_data
#!rm *.zip
#!rm ./kaggle*
# Clear this cell's output, then list the competition files in the input directory.
clear_output()
!ls ../input/porto-seguro-safe-driver-prediction/

In [None]:
# Load the competition's train and test tables; the first row of each CSV is the header.
train_csv = "../input/porto-seguro-safe-driver-prediction/train.csv"
test_csv = "../input/porto-seguro-safe-driver-prediction/test.csv"
train, test = (pd.read_csv(csv_path, header=0) for csv_path in (train_csv, test_csv))

### Data

In [None]:
# Summary statistics of the training table's columns.
train.describe()

In [None]:
# Column dtypes, non-null counts and memory usage of the training table.
train.info()

In [None]:
def _fill_missing_with_median(frame, missing=-1):
  """Return a copy of `frame` in which `missing` is replaced by per-column medians.

  Missing observations are encoded with the sentinel value `missing` (-1 here).
  The median of each column is taken over the observed values only: including
  the sentinel would bias it, and for a column that is mostly missing the
  median would be the sentinel itself, turning the replacement into a no-op.
  Columns that contain no sentinel (or nothing but the sentinel) are left
  untouched, so their dtype is preserved.
  """
  filled = frame.copy()
  for col in filled.columns:
    is_missing = filled[col] == missing
    if not is_missing.any() or is_missing.all():
      continue
    observed_median = filled.loc[~is_missing, col].median()
    # Assign the result back explicitly: a chained `frame[col].replace(..., inplace=True)`
    # acts on a temporary object and is not guaranteed to update `frame`
    # (it never does when pandas Copy-on-Write is enabled).
    filled[col] = filled[col].mask(is_missing, observed_median)
  return filled

train = _fill_missing_with_median(train)
test = _fill_missing_with_median(test)

### Correlation Matrix (heatmap)

In [None]:
def heat_map(data):
  """Draw an annotated heatmap of the Pearson correlations between the columns of `data`.

  The "id" column is excluded (a KeyError is raised when it is absent) and
  `data` itself is not modified. Coefficients below 0.01, which includes every
  negative one, are turned into NaN before plotting. Annotations are formatted
  with one decimal place.
  """
  pearson = (
      data.drop(columns="id")
          .reset_index(drop=True)
          .corr(method='pearson')
  )
  visible = pearson.where(pearson >= .01)
  plt.figure(figsize=(20,25))
  sns.heatmap(visible, annot=True, cmap="PuBuGn", fmt='.1f', linewidths=1)
# Correlation heatmap of the full training table.
heat_map(train)

In [None]:
# Remove every column whose name contains "calc_" from both tables and print the removed names.
columns = [name for name in train.columns if 'calc_' in name]
train = train.drop(columns=columns)
test = test.drop(columns=columns)
print(columns)

In [None]:
# Redraw the correlation heatmap now that the "calc_" columns have been dropped.
heat_map(train)

#### Cat vs Target

In [None]:
# For each column whose name contains "cat", draw count plots split by target value.
# `columns` is kept as a notebook-level name because the encoding cell reuses it.
columns = [name for name in train.columns if "cat" in name]
for cat_col in columns:
    grid = sns.catplot(data=train, x=cat_col, hue="target", col="target", kind="count")
    grid.add_legend()

In [None]:
# One-hot encode the categorical features listed in `columns`.
#
# Encoding each frame in a single get_dummies call (instead of one concat per
# feature) does two things:
#   * every indicator column is named "<feature>_<value>", so indicators from
#     different features no longer share the same bare value as a column label;
#   * column labels stay all-strings, which scikit-learn requires when it
#     validates feature names.
# The encoded columns are still appended after the untouched ones, in the order
# given by `columns`.
train = pd.get_dummies(train, columns=columns)

# The test table must expose exactly the training features, in the same order.
# Indicators missing from test are added as all-zero columns; indicators that
# exist only in test are dropped, since the model never saw them.
feature_columns = [name for name in train.columns if name != "target"]
test = pd.get_dummies(test, columns=columns).reindex(columns=feature_columns, fill_value=0)

clear_output()

### Normalize data

In [None]:
# Work on a copy of the training table and separate the rows by target value.
xtrain = train.copy()
has_claim = xtrain.target == 1
has_no_claim = xtrain.target == 0
claim, not_claim = xtrain[has_claim], xtrain[has_no_claim]

# Report how many rows fall into each class.
print("Total claims:\t\t{0}".format(len(claim)))
print("Total no claims:\t{0}".format(len(not_claim)))

### Amostra aleatória de pedido de indenização
Percentual da amostra: **5%** (`frac=0.05`)   

In [None]:
# Draw a reproducible random sample of 5% of the claim rows (frac=.05) and preview two of them.
# NOTE(review): claim_sample is not referenced by any later cell visible in this notebook.
claim_sample = claim.sample(random_state=RANDOM_STATE, frac=.05)
claim_sample.head(2) 

In [None]:
# Recombine both classes, shuffle the rows reproducibly and discard the "id" column.
data_shuffle = shuffle(pd.concat([not_claim, claim]), random_state=RANDOM_STATE)
data_shuffle = data_shuffle.drop('id', axis=1)
data_shuffle.head(2)

### Split / Normalize
Ao treinar um modelo com colunas numéricas, o processo de normalização é indispensável. 

Esse procedimento ajuda a prevenir a explosão de gradientes (*exploding gradients*) durante o treinamento.

In [None]:
# The first 80% of the shuffled rows are used for fitting, the remaining 20% are held out.
split_data = int(len(data_shuffle) * .8)

# Features and labels of the fitting part.
train_set = data_shuffle.iloc[:split_data].drop(columns="target")
train_labels = data_shuffle["target"].iloc[:split_data]

# Features and labels of the held-out part.
test_set = data_shuffle.iloc[split_data:].drop(columns="target")
test_labels = data_shuffle["target"].iloc[split_data:]

# Print the shape of every piece as a sanity check.
for label, part in (("test", test_set), ("test_labels", test_labels),
                    ("train_set", train_set), ("train_labels", train_labels)):
    print(label, part.shape)

In [None]:
# Standardize the features. The scaler is fitted on the training split only and
# the same fitted scaler is then applied to the held-out split.
scaler = StandardScaler()
train_set = scaler.fit_transform(train_set) # fit the scaler on the training data only.
test_set = scaler.transform(test_set)

#### Usando clip()
Garante que todos os valores fiquem dentro de um intervalo específico.

Isso é útil caso tenhamos alguns outliers após a normalização.

In [None]:
# Bound every standardized value to [-CLIP, CLIP], then display both array shapes.
lower, upper = -CLIP, CLIP
train_set, test_set = (np.clip(part, lower, upper) for part in (train_set, test_set))
train_set.shape,test_set.shape

### Class weights
Vamos preparar o terreno para o tensorflow atual.

A variável **weights** permite especificar o peso de cada classe, levando em conta sua frequência.

In [None]:
# Class weights for the imbalanced target, inversely proportional to class frequency.
#
# The counts come from the training labels only, because the weights are applied
# to training samples. Each weight is scaled by total / 2 so that the weighted
# loss keeps roughly the same magnitude as the unweighted one; bare 1 / count
# weights would shrink the loss, and its gradients, by about the dataset size.
class_counts = train_labels.value_counts()
total = class_counts.sum()
weight_not_claim = float(total / (2.0 * class_counts[0]))
weight_claim = float(total / (2.0 * class_counts[1]))
weights ={ 0:weight_not_claim, 1:weight_claim}

### Train and Evaluate

As métricas adicionadas abaixo serão cruciais para entendermos o desempenho de nosso modelo.

In [None]:
# Metrics handed to model.compile: the four confusion-matrix counts plus
# precision, recall and AUC. The names are the keys used in the training history.
metrics = [
    keras.metrics.TruePositives(name="truePositive"),
    keras.metrics.TrueNegatives(name="trueNegative"),
    keras.metrics.FalsePositives(name="falsePositive"),
    keras.metrics.FalseNegatives(name="falseNegative"),
    keras.metrics.Precision(name="precision"),
    keras.metrics.Recall(name="recall"),
    keras.metrics.AUC(name="auc"),
]

In [None]:
# Both hidden layers are as wide as the number of input features.
units = int(train_set.shape[1])
input_shape = (train_set.shape[-1],)

# Two ReLU hidden layers, each followed by 40% dropout, and a single sigmoid
# output unit that yields a value in [0, 1].
model = keras.Sequential(
    [
        layers.Dense(units, activation=keras.activations.relu, input_shape=input_shape),
        layers.Dropout(0.4),
        layers.Dense(int(units), activation=keras.activations.relu),
        layers.Dropout(0.4),
        layers.Dense(1, activation=keras.activations.sigmoid),
    ],
    name="porto_seguro",
)

# Binary cross-entropy loss optimized with Adam; `metrics` is tracked alongside the loss.
model.compile(
    loss=keras.losses.BinaryCrossentropy(),
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    metrics=metrics,
)

model.summary()

In [None]:
# Stop once validation AUC has not improved for 5 epochs and restore the
# weights of the best epoch (mode='max': higher AUC is better).
early_stopping = callbacks.EarlyStopping(
    monitor='val_auc',
    mode='max',
    patience=5,
    restore_best_weights=True,
    verbose=1,
)


In [None]:
# Train with class weighting and early stopping, using the held-out split as validation data.
fit_results = model.fit(
    x=train_set,
    y=train_labels,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    class_weight=weights,
    validation_data=(test_set, test_labels),
    callbacks=[early_stopping],
)


### Visualize / Metrics 
Verificação de desempenho de nosso modelo.

In [None]:

# Plot the training and validation curves of loss, AUC, precision and recall,
# one panel per metric in a 2x2 grid.
mpl.rcParams['figure.figsize'] = (12, 6)
names = ['loss', 'auc', 'precision', 'recall']
history = fit_results.history
epoch = fit_results.epoch

# Use explicit axes rather than the implicit pyplot state, and a descriptive loop
# variable: binding `_` at notebook level shadows IPython's last-output variable.
fig, axes = plt.subplots(2, 2)
for ax, metric_name in zip(axes.flat, names):
  ax.plot(epoch, history[metric_name], label='Train')
  ax.plot(epoch, history['val_' + metric_name], linestyle="--", label="Val")
  ax.set_xlabel("Epoch")
  ax.set_ylabel(metric_name.capitalize())
  ax.legend()
# Keep the axis labels of neighbouring panels from overlapping.
fig.tight_layout()





In [None]:
# Shapes of the held-out feature array and of its labels.
test_set.shape,test_labels.shape

### Porto Seguro’s Safe Driver Prediction | Action
*Objetivo:*  prever pedido de indenização de apólice.
*prefixo* 
* **bin**  Binário.
* **cat**  Categórico.

**Colunas com valores -1** = Indicam falta  de observação.

**target** Informa se  teve pedido ou não.  (1 ou 0)

*   https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/data






In [None]:
# Score the competition test table with the trained model.

# Move the "id" column out of the feature table. The guard keeps the cell
# re-runnable: after the first run "id" is already gone and `submit` already exists.
if "id" in test.columns:
    submit = test.pop("id")

# Reuse the scaler fitted on the training split. Calling fit_transform here would
# re-estimate the mean and variance on the test data, so the model would receive
# features scaled differently from the ones it was trained on.
test_set = scaler.transform(test)
test_set = np.clip(test_set, CLIP*-1, CLIP)
predict = model.predict(test_set)

In [None]:
# Keep the predicted claim probabilities instead of rounding them to 0/1.
# The competition metric (Normalized Gini) scores how well the rows are ranked,
# and hard labels throw that ranking away. `predict` has one column (the sigmoid
# output), so flattening it gives one probability per test row.
predictions = predict.ravel()

test.shape,len(predictions)

In [None]:
# Build the two-column submission table (id, target), write it to disk without
# the index and print a summary of the result.
submission = pd.DataFrame({"id" : submit}).assign(target=predictions)
submission.to_csv("submission.csv", index=False)

#!kaggle competitions submit -c porto-seguro-safe-driver-prediction -f submission.csv -m " tentativa "
#files.download('submission.csv') 
#!rm  submission.csv
submission.info()

In [None]:
# Preview the first rows of the submission table.
submission.head()