# Projeto da Disciplina 

Este notebook contém um script base para o projeto da disciplina IF702 Redes Neurais.

### Importando dependencias

In [1]:
import numpy as np
import pandas as pd

from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import roc_auc_score, average_precision_score
import matplotlib
matplotlib.use('nbagg')
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score
from imblearn.over_sampling import SMOTE
from itertools import groupby

  from ._conv import register_converters as _register_converters
Using Theano backend.


## Preparação de Dados

Inicialmente precisamos carregar os dados e eliminar exemplos repetidos, fazemos isso usando o Pandas.
Em seguida exibimos uma pequena amostra do head do dataset e seus dados estatísticos.

In [2]:
data_set = pd.read_csv('data/TRN',sep='\t')
data_set.drop_duplicates(inplace=True)  # Remove exemplos repetidos

In [3]:
data_set.head(5)

Unnamed: 0,INDEX,UF_1,UF_2,UF_3,UF_4,UF_5,UF_6,UF_7,IDADE,SEXO_1,...,CEP4_7,CEP4_8,CEP4_9,CEP4_10,CEP4_11,CEP4_12,CEP4_13,CEP4_14,IND_BOM_1_1,IND_BOM_1_2
0,0,1,1,1,0,0,0,0,0.135098,1,...,0,0,1,1,0,1,1,1,0,1
1,1,1,0,1,0,0,1,0,0.273504,1,...,0,1,0,1,1,0,0,0,1,0
2,2,1,0,1,0,0,1,0,0.28191,0,...,1,1,0,0,0,0,1,0,1,0
3,3,1,1,1,0,0,0,0,0.225741,0,...,1,1,0,1,1,0,1,0,1,0
4,4,1,1,0,0,0,1,0,0.480403,0,...,1,1,1,0,0,1,0,1,1,0


In [4]:
data_set.describe()

Unnamed: 0,INDEX,UF_1,UF_2,UF_3,UF_4,UF_5,UF_6,UF_7,IDADE,SEXO_1,...,CEP4_7,CEP4_8,CEP4_9,CEP4_10,CEP4_11,CEP4_12,CEP4_13,CEP4_14,IND_BOM_1_1,IND_BOM_1_2
count,389196.0,389196.0,389196.0,389196.0,389196.0,389196.0,389196.0,389196.0,389196.0,389196.0,...,389196.0,389196.0,389196.0,389196.0,389196.0,389196.0,389196.0,389196.0,389196.0,389196.0
mean,194597.5,0.889274,0.691952,0.476552,0.296195,0.241179,0.218011,0.186836,0.4552049,0.521514,...,0.423378,0.41754,0.425708,0.45982,0.440842,0.436896,0.433709,0.440339,0.655449,0.344551
std,112351.35202,0.313793,0.461687,0.499451,0.456579,0.427799,0.412895,0.389781,0.2537459,0.499538,...,0.494095,0.493154,0.494451,0.498384,0.496489,0.496002,0.495587,0.496428,0.475222,0.475222
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.506237e-16,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,97298.75,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2507866,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,194597.5,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.4375241,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,291896.25,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.6578835,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,389195.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Separando as features e classes do data set em arrays

In [18]:
# Também convertemos os dados para arrays ao invés de DataFrames
X = data_set.iloc[:, :-2].values
y = data_set.iloc[:, -1].values

## Divisão dos dados

Iremos dividir os dados dependentes (X=features) e os independentes (Y=classe) nos conjuntos de treinamento, teste e validação.

In [19]:
## Treino: 50%, Validação: 25%, Teste: 25%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/4, 
                                                    random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=1/3, 
                                                  random_state=42, stratify=y_train)

In [20]:
y_train_copy=y_train.copy()
y_train_copy.sort()
[len(list(group)) for key, group in groupby(y_train_copy)]

[127549, 67049]

Como podemos reparar, há presença de uma classe minoritária e uma majoritária. Iremos usar o SMOTE para deixá-las com mesmo tamanho.

In [23]:
sm = SMOTE(random_state=12, ratio = 1.0)
x_res, y_res = sm.fit_sample(X_train, y_train)



In [24]:
y_res_copy=y_res.copy()
y_res_copy.sort()
y_res_copy
[len(list(group)) for key, group in groupby(y_res_copy)]

[127549, 127549]

# Configuração das Redes e Treinamentos

## MPL

#### EXEMPLO DE TESTE - Será atualizado em breve

In [25]:
# Número de features do nosso data set.# Númer 
input_dim = X_train.shape[1]

# Aqui criamos o esboço da rede.
classifier = Sequential()

# Agora adicionamos a primeira camada escondida contendo 16 neurônios e função de ativação
# tangente hiperbólica. Por ser a primeira camada adicionada à rede, precisamos especificar
# a dimensão de entrada (número de features do data set).
classifier.add(Dense(16, activation='tanh', input_dim=input_dim))

# Em seguida adicionamos a camada de saída. Como nosso problema é binário só precisamos de
# 1 neurônio com função de ativação sigmoidal. A partir da segunda camada adicionada keras já
# consegue inferir o número de neurônios de entrada (16) e nós não precisamos mais especificar.
classifier.add(Dense(1, activation='sigmoid'))

# Por fim compilamos o modelo especificando um otimizador, a função de custo, e opcionalmente
# métricas para serem observadas durante treinamento.
classifier.compile(optimizer='adam', loss='mean_squared_error')

In [27]:
# Para treinar a rede passamos o conjunto de treinamento e especificamos o tamanho do mini-batch,# Para t 
# o número máximo de épocas, e opcionalmente callbacks. No seguinte exemplo utilizamos early
# stopping para interromper o treinamento caso a performance não melhore em um conjunto de validação.
history = classifier.fit(X_train, y_train, batch_size=5000, epochs=1, 
                         callbacks=[EarlyStopping(patience=3)], validation_data=(X_val, y_val))

Train on 194598 samples, validate on 97299 samples
Epoch 1/1
