# Predicción de casos de Dengue preservando la privacidad de los datos
Demo en la construcción de un modelo para la predicción de casos de dengue con el objetivo de preservar la privacidad de los datos utilizados.

In [25]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [46]:
import numpy as np
import pandas as pd
import copy
import phe

from lightdlf.cpu.core import Tensor
from lightdlf.cpu.layers import Linear, Relu, Sigmoid, Tanh, MSELoss, Sequential
from lightdlf.cpu.optimizers import SGD

np.random.seed(123)

In [24]:
def rmse (pred, y):
    rse_sum = 0
    for i in range(len(pred)):
        rse = np.sqrt((pred[i] - y[i]) * (pred[i] - y[i]))
        rse_sum += rse
    return rse_sum/len(pred)

In [3]:
df = pd.read_csv('datasets/dengue/asu_dengue_dataset.csv')
df.head()

Unnamed: 0,cantidad(-1),cantidad(-2),cantidad(-3),cantidad(-4),cantidad(-5),cantidad(-6),cantidad(-7),cantidad(-8),cantidad(-9),cantidad(-10),...,temperatura_min_media(-2),temperatura_min_media(-3),temperatura_min_media(-4),temperatura_min_media(-5),temperatura_min_media(-6),temperatura_min_media(-7),temperatura_min_media(-8),temperatura_min_media(-9),temperatura_min_media(-10),temperatura_min_media(-11)
0,47,50,19,15,12,8,11,0,3,2,...,20.714,24.286,22.857,24.143,21.143,20.714,21.0,20.571,21.714,18.714
1,29,47,50,19,15,12,8,11,0,3,...,19.429,20.714,24.286,22.857,24.143,21.143,20.714,21.0,20.571,21.714
2,27,29,47,50,19,15,12,8,11,0,...,21.286,19.429,20.714,24.286,22.857,24.143,21.143,20.714,21.0,20.571
3,30,27,29,47,50,19,15,12,8,11,...,21.143,21.286,19.429,20.714,24.286,22.857,24.143,21.143,20.714,21.0
4,60,30,27,29,47,50,19,15,12,8,...,17.429,21.143,21.286,19.429,20.714,24.286,22.857,24.143,21.143,20.714


In [4]:
# for column in df.columns:
#     print(column)

In [5]:
df_reduced = df[['cantidad',
                'cantidad(-1)',
                'temperatura_max_media(-1)',
                'temperatura_max_media(-2)',
                'temperatura_max_media(-3)',
                'temperatura_max_media(-4)',
                'temperatura_max_media(-5)',
                'temperatura_max_media(-6)',
                'temperatura_max_media(-7)',
                'temperatura_max_media(-8)',
                'temperatura_max_media(-9)',
                'temperatura_max_media(-10)',
                'temperatura_max_media(-11)',
                'lluvia_mm(-1)',
                'lluvia_mm(-2)',
                'humedad_min_media_porc(-1)',
                'humedad_min_media_porc(-2)',
                'humedad_min_media_porc(-3)',
                'humedad_min_media_porc(-4)',
                'humedad_min_media_porc(-5)',
                'humedad_min_media_porc(-6)',
                'humedad_min_media_porc(-7)',
                'humedad_min_media_porc(-8)',
                'humedad_min_media_porc(-9)',
                'humedad_min_media_porc(-10)',
                'humedad_min_media_porc(-11)']]

In [6]:
df_reduced.dtypes

cantidad                         int64
cantidad(-1)                     int64
temperatura_max_media(-1)      float64
temperatura_max_media(-2)      float64
temperatura_max_media(-3)      float64
temperatura_max_media(-4)      float64
temperatura_max_media(-5)      float64
temperatura_max_media(-6)      float64
temperatura_max_media(-7)      float64
temperatura_max_media(-8)      float64
temperatura_max_media(-9)      float64
temperatura_max_media(-10)     float64
temperatura_max_media(-11)     float64
lluvia_mm(-1)                  float64
lluvia_mm(-2)                  float64
humedad_min_media_porc(-1)     float64
humedad_min_media_porc(-2)     float64
humedad_min_media_porc(-3)     float64
humedad_min_media_porc(-4)     float64
humedad_min_media_porc(-5)     float64
humedad_min_media_porc(-6)     float64
humedad_min_media_porc(-7)     float64
humedad_min_media_porc(-8)     float64
humedad_min_media_porc(-9)     float64
humedad_min_media_porc(-10)    float64
humedad_min_media_porc(-1

In [7]:
df_reduced.head()

Unnamed: 0,cantidad,cantidad(-1),temperatura_max_media(-1),temperatura_max_media(-2),temperatura_max_media(-3),temperatura_max_media(-4),temperatura_max_media(-5),temperatura_max_media(-6),temperatura_max_media(-7),temperatura_max_media(-8),...,humedad_min_media_porc(-2),humedad_min_media_porc(-3),humedad_min_media_porc(-4),humedad_min_media_porc(-5),humedad_min_media_porc(-6),humedad_min_media_porc(-7),humedad_min_media_porc(-8),humedad_min_media_porc(-9),humedad_min_media_porc(-10),humedad_min_media_porc(-11)
0,29,47,33.286,31.857,36.0,31.857,34.429,31.143,33.0,33.857,...,36.0,34.429,48.429,43.0,46.857,48.571,40.429,39.571,38.857,20.714
1,27,29,34.857,33.286,31.857,36.0,31.857,34.429,31.143,33.0,...,31.857,36.0,34.429,48.429,43.0,46.857,48.571,40.429,39.571,38.857
2,30,27,35.571,34.857,33.286,31.857,36.0,31.857,34.429,31.143,...,34.143,31.857,36.0,34.429,48.429,43.0,46.857,48.571,40.429,39.571
3,60,30,31.0,35.571,34.857,33.286,31.857,36.0,31.857,34.429,...,33.286,34.143,31.857,36.0,34.429,48.429,43.0,46.857,48.571,40.429
4,79,60,34.429,31.0,35.571,34.857,33.286,31.857,36.0,31.857,...,32.429,33.286,34.143,31.857,36.0,34.429,48.429,43.0,46.857,48.571


In [8]:
df_reduced.describe()

Unnamed: 0,cantidad,cantidad(-1),temperatura_max_media(-1),temperatura_max_media(-2),temperatura_max_media(-3),temperatura_max_media(-4),temperatura_max_media(-5),temperatura_max_media(-6),temperatura_max_media(-7),temperatura_max_media(-8),...,humedad_min_media_porc(-2),humedad_min_media_porc(-3),humedad_min_media_porc(-4),humedad_min_media_porc(-5),humedad_min_media_porc(-6),humedad_min_media_porc(-7),humedad_min_media_porc(-8),humedad_min_media_porc(-9),humedad_min_media_porc(-10),humedad_min_media_porc(-11)
count,328.0,328.0,328.0,328.0,328.0,328.0,328.0,328.0,328.0,328.0,...,328.0,328.0,328.0,328.0,328.0,328.0,328.0,328.0,328.0,328.0
mean,111.030488,110.945122,28.538134,28.558604,28.608692,28.623064,28.648326,28.675765,28.695366,28.727595,...,42.243988,42.214372,42.209146,42.182143,42.117247,42.0885,42.038851,42.026655,41.99878,41.912979
std,247.165693,247.18301,4.833589,4.833383,4.82508,4.82772,4.836375,4.824871,4.829351,4.828414,...,11.584123,11.591668,11.588468,11.576231,11.48959,11.461491,11.433179,11.433673,11.43021,11.484073
min,0.0,0.0,16.143,16.143,16.143,16.143,16.143,16.143,16.143,16.143,...,15.429,15.429,15.429,15.429,15.429,15.429,15.429,15.429,15.429,15.429
25%,2.75,2.75,25.429,25.429,25.5355,25.5355,25.5355,25.571,25.571,25.67825,...,34.10725,34.10725,34.10725,34.10725,34.10725,34.10725,34.10725,34.10725,34.10725,34.0
50%,18.5,18.5,29.3575,29.429,29.5,29.571,29.571,29.571,29.571,29.571,...,42.2145,42.0715,42.0715,42.0715,42.0715,42.0715,41.9285,41.857,41.7855,41.714
75%,76.5,76.5,32.429,32.429,32.429,32.429,32.429,32.429,32.429,32.4645,...,48.857,48.857,48.74975,48.60675,48.4645,48.4645,48.32175,48.32175,48.32175,48.286
max,1691.0,1691.0,39.714,39.714,39.714,39.714,39.714,39.714,39.714,39.714,...,78.857,78.857,78.857,78.857,78.857,78.857,78.857,78.857,78.857,78.857


In [9]:
max_values = df_reduced.max()
min_values = df_reduced.min()

# Normalización del dataset
df_normalizado = (df_reduced - df_reduced.min())/(df_reduced.max() - df_reduced.min())

In [10]:
df_normalizado.head()

Unnamed: 0,cantidad,cantidad(-1),temperatura_max_media(-1),temperatura_max_media(-2),temperatura_max_media(-3),temperatura_max_media(-4),temperatura_max_media(-5),temperatura_max_media(-6),temperatura_max_media(-7),temperatura_max_media(-8),...,humedad_min_media_porc(-2),humedad_min_media_porc(-3),humedad_min_media_porc(-4),humedad_min_media_porc(-5),humedad_min_media_porc(-6),humedad_min_media_porc(-7),humedad_min_media_porc(-8),humedad_min_media_porc(-9),humedad_min_media_porc(-10),humedad_min_media_porc(-11)
0,0.01715,0.027794,0.727292,0.666667,0.842433,0.666667,0.775784,0.636375,0.715158,0.751517,...,0.32432,0.299552,0.520275,0.434682,0.495491,0.522514,0.394148,0.380621,0.369364,0.083323
1,0.015967,0.01715,0.793942,0.727292,0.666667,0.842433,0.666667,0.775784,0.636375,0.715158,...,0.259002,0.32432,0.299552,0.520275,0.434682,0.495491,0.522514,0.394148,0.380621,0.369364
2,0.017741,0.015967,0.824233,0.793942,0.727292,0.666667,0.842433,0.666667,0.775784,0.636375,...,0.295043,0.259002,0.32432,0.299552,0.520275,0.434682,0.495491,0.522514,0.394148,0.380621
3,0.035482,0.017741,0.630308,0.824233,0.793942,0.727292,0.666667,0.842433,0.666667,0.775784,...,0.281532,0.295043,0.259002,0.32432,0.299552,0.520275,0.434682,0.495491,0.522514,0.394148
4,0.046718,0.035482,0.775784,0.630308,0.824233,0.793942,0.727292,0.666667,0.842433,0.666667,...,0.26802,0.281532,0.295043,0.259002,0.32432,0.299552,0.520275,0.434682,0.495491,0.522514


In [11]:
Y = df_normalizado[['cantidad']].to_numpy()
X = df_normalizado.drop(['cantidad'], axis=1).to_numpy()
Y[0], X[0]

(array([0.01714962]),
 array([0.0277942 , 0.72729201, 0.66666667, 0.8424335 , 0.66666667,
        0.7757838 , 0.63637521, 0.71515846, 0.75151669, 0.64244198,
        0.76971703, 0.72729201, 0.        , 0.11750881, 0.25900233,
        0.32432049, 0.29955225, 0.52027496, 0.43468184, 0.49549095,
        0.52251372, 0.3941477 , 0.38062055, 0.36936369, 0.08332282]))

In [12]:
len(X[0])

25

In [13]:
bunch_size = int(len(Y)/4)
bunch_size

82

In [14]:
x_train = X[0:len(Y)-bunch_size]
x_test = X[-bunch_size:]

y_train = Y[0:len(Y)-bunch_size]
y_test = Y[-bunch_size:]

In [15]:
len(y_train), len(y_test)

(246, 82)

Definicion del Modelo

In [16]:
np.random.seed(0)

data = Tensor(x_train, autograd=True)
target = Tensor(y_train, autograd=True)

model = Sequential([Linear(25,4), Relu(), Linear(4,3), Relu(), Linear(3,1), Sigmoid()])
criterion = MSELoss()
# optim = SGD(parameters=model.get_parameters(), alpha=0.01)
optim = SGD(parameters=model.get_parameters(), alpha=0.01)

# 500
for i in range(500):
    # Predecir
    pred = model.forward(data)
    
    # Comparar
    loss = criterion.forward(pred, target)
    
    # Aprender
    loss.backward(Tensor(np.ones_like(loss.data)))
    optim.step()
    if (i%100 == 0):
        print(loss)

[56.50307108]
[2.191908]
[1.25671119]
[0.9974294]
[0.8324166]


In [17]:
test_data = Tensor(x_test)
test_target = Tensor(y_test)

In [18]:
pred = model.forward(test_data)

pred_list = [x[0] for x in pred.data]
test_target_list = [x[0] for x in test_target.data]

In [19]:
comparison = pd.DataFrame({'actual':test_target_list, 'predicted':pred_list})
comparison.head()

Unnamed: 0,actual,predicted
0,0.014193,0.012301
1,0.010645,0.010676
2,0.011827,0.014646
3,0.062093,0.012226
4,0.021881,0.020853


In [20]:
denormalized_pred_list = [(x[0] * (max_values['cantidad'] - min_values['cantidad'])) + min_values['cantidad'] for x in pred.data]
denormalized_test_target_list = [(x[0] * (max_values['cantidad'] - min_values['cantidad'])) + min_values['cantidad'] for x in test_target.data]
denormalized_comparison = pd.DataFrame({'actual':denormalized_test_target_list, 'predicted':denormalized_pred_list})

In [21]:
denormalized_comparison.head()

Unnamed: 0,actual,predicted
0,24.0,20.801821
1,18.0,18.05336
2,20.0,24.767106
3,105.0,20.674051
4,37.0,35.26243


In [23]:
print('RMSE:',rmse(pred_list, test_target_list))

RMSE: 0.013776108411507624


# Modelo Federado

In [41]:
np.random.seed(0)

data = Tensor(x_train, autograd=True)
target = Tensor(y_train, autograd=True)

layers = [Linear(25,4), Relu(), Linear(4,3), Relu(), Linear(3,1), Sigmoid()]
model = Sequential(layers)

def train(model, data, target, iterations=5):
    criterion = MSELoss()
    optim = SGD(parameters=model.get_parameters(), alpha=0.01)

    for i in range(iterations):
        # Predecir
        pred = model.forward(data)

        # Comparar
        loss = criterion.forward(pred, target)

        # Aprender
        loss.backward(Tensor(np.ones_like(loss.data)))
        optim.step()
        if (i%100 == 0):
            sys.stdout.write("\r\tLoss:" + str(loss))
    print()
    return model

In [42]:
new = train(model, data, target, iterations=500)

	Loss:[0.8324166]


Pruebas de encriptado

In [47]:
public_key, private_key = phe.generate_paillier_keypair(n_length=128)

In [49]:
aux = Sequential([Linear(3,2)])

In [143]:
def encrypt_tensor(matrix, pubkey):
    encrypt_weights = list()
    for vector in matrix:
        # print(vector)
        for val in vector:
            # print(val)
            encrypt_weights.append(pubkey.encrypt(val))
    restore = np.array(encrypt_weights).reshape(matrix.shape)
    # print(restore)
    return restore

def decrypt_tensor(matrix, privkey):
    decrypted_weights = list()
    for vector in matrix:
        # print(vector)
        for val in vector.flatten():
            # print(val)
            decrypted_weights.append(privkey.decrypt(val))
    restore = np.array(decrypted_weights).reshape(matrix.shape)
    # print(restore)
    return restore
    
def encrypt_sequential_model(model, pubkey):
    for layer in model.layers:
        if type(layer) == Linear:
            layer.weight.data = encrypt_tensor(layer.weight.data, pubkey)
    return model

def decrypt_sequential_model(model, privkey):
    for layer in model.layers:
        if type(layer) == Linear:
            layer.weight.data = decrypt_tensor(layer.weight.data, privkey)
    return model

In [144]:
aux.layers[0].weight.data

array([[-1.22988448, -0.36088954],
       [ 1.13453133, -0.87974676],
       [ 0.19472559,  1.37138796]])

In [145]:
encripted_tensor = encrypt_tensor(aux.layers[0].weight.data, pubkey=public_key)

In [146]:
decrypt_tensor(encripted_tensor, privkey=private_key)

array([[-1.22988448, -0.36088954],
       [ 1.13453133, -0.87974676],
       [ 0.19472559,  1.37138796]])

In [147]:
seq_aux = Sequential([Linear(2,3), Linear(3,2)])
print(seq_aux.layers[0].weight.data)
print()
encrypted_model = encrypt_sequential_model(seq_aux, pubkey=public_key)
print(encrypted_model.layers[0].weight.data)
print()
decrypted_model = decrypt_sequential_model(encrypted_model, private_key)
print(decrypted_model.layers[0].weight.data)

[[ 0.07290724 -0.90712387  0.16841519]
 [ 1.02510407  1.27103949 -2.65731471]]

[[<phe.paillier.EncryptedNumber object at 0x1196a6278>
  <phe.paillier.EncryptedNumber object at 0x1196a62e8>
  <phe.paillier.EncryptedNumber object at 0x1196a6198>]
 [<phe.paillier.EncryptedNumber object at 0x1196a6208>
  <phe.paillier.EncryptedNumber object at 0x119068390>
  <phe.paillier.EncryptedNumber object at 0x11ca61b70>]]

[[ 0.07290724 -0.90712387  0.16841519]
 [ 1.02510407  1.27103949 -2.65731471]]
