In [1]:
import os
import pandas as pd
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt   

# Work with Diagnostics

In [2]:
os.chdir('C:\\Users\\rober\\Desktop\\Humana_2\\Data_1')
data = pd.read_csv("BASE_TRAIN_MODELOS.csv", sep=',', encoding = 'ISO-8859-1')
data['AFILIADO'] = data['AFILIADO'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
data = data.drop_duplicates(subset=['AFILIADO', 'PERIODO'], keep='first', inplace=False).reset_index(drop=True)

In [3]:
data_touse = data[['AFILIADO', 'PERIODO', 'DIAGNOSTICO_A', 'PAGADO_A.1']].copy()
data_touse.head(10)

Unnamed: 0,AFILIADO,PERIODO,DIAGNOSTICO_A,PAGADO_A.1
0,ABAD AREVALO CARLA MARA,2014-2015,H605,15.0
1,ABAD CABANILLA CARMEN BEATRIZ,2014-2015,"M130, A09X, J304, J310, K30X, N709, M214, N925...",152.75
2,ABAD ROMERO CECILIA EUGENIA,2014-2015,"J00X, M170, I10X, I450, M179, J40X, N739, N72X...",370.48
3,ABARCA AGUIRRE MARIELA MARGARITA,2014-2015,"N850, N939, M350",210.55
4,ABARCA TORRES CARLOS VICTORIANO,2014-2015,ND,0.0
5,ABRIL MONTEROS DENNIS DIEGO,2014-2015,ND,0.0
6,ACERO GONZALEZ JORGE,2014-2015,B829,0.0
7,ACEVEDO RIPALDA ALLISON PATRICIA,2014-2015,"E039, E282, D539, G439, E221",288.26
8,ACHI GONZALEZ JACQUELINE PRICILA,2014-2015,ND,52.36
9,ACOSTA BALSECA EDGAR RAMIRO,2014-2015,"K760, J029, E789",40.15


It is important to check if afiliados-anio is unique:

In [4]:
(data_touse['AFILIADO'] + '_' + data_touse['PERIODO']).is_unique

True

## Create data matrix (Y)

For this first we create a dictionary of unique diagnostics, all with count zero. This is going to be our start data for each assured:

In [5]:
def update_set(mset, diag):
    mset.update([x.strip() for x in diag.split(',')])
    
mset = set()    

for ld in data_touse['DIAGNOSTICO_A']:
    update_set(mset, ld)   
    
mdict = dict.fromkeys(mset, 0)
mdict

{'H179': 0,
 'R104': 0,
 'N761': 0,
 'K528': 0,
 'K804': 0,
 'D698': 0,
 'I841': 0,
 'K290': 0,
 'I471': 0,
 'L200': 0,
 'M220': 0,
 'I499': 0,
 'T182': 0,
 'S345': 0,
 'N44X': 0,
 'M898': 0,
 'S335': 0,
 'D508': 0,
 'M233': 0,
 'S022': 0,
 'B581': 0,
 'I872': 0,
 'R896': 0,
 'E889': 0,
 'S136': 0,
 'I847': 0,
 'L928': 0,
 'R008': 0,
 'M089': 0,
 'I831': 0,
 'G575': 0,
 'C600': 0,
 'K099': 0,
 'K717': 0,
 'S927': 0,
 'L941': 0,
 'E220': 0,
 'N63X': 0,
 'K266': 0,
 'R402': 0,
 'D758': 0,
 'J853': 0,
 'M906': 0,
 'PM10': 0,
 'S833': 0,
 'S526': 0,
 'B814': 0,
 'I059': 0,
 'G811': 0,
 'I862': 0,
 'H830': 0,
 'M242': 0,
 'Q500': 0,
 'K031': 0,
 'Y496': 0,
 'B005': 0,
 'K710': 0,
 'E127': 0,
 'I709': 0,
 'L708': 0,
 'D649': 0,
 'B338': 0,
 'L26X': 0,
 'G441': 0,
 'M650': 0,
 'M508': 0,
 'M921': 0,
 'G719': 0,
 'R229': 0,
 'A080': 0,
 'N433': 0,
 'C754': 0,
 'L081': 0,
 'N600': 0,
 'C221': 0,
 'L405': 0,
 'Z431': 0,
 'N912': 0,
 'I509': 0,
 'G530': 0,
 'M503': 0,
 'N498': 0,
 'H599': 0,
 'M3

Now we create a matrix and populate with 1ones for each assured that suffered a diagnosis:

In [6]:
Y = np.zeros((data_touse.shape[0], len(mset)))

for i in range(Y.shape[0]):

    diags = data_touse['DIAGNOSTICO_A'][i]
    ldiags = [x.strip() for x in diags.split(',')]
    
    for d in ldiags:
        mdict[d] = 1
   
    Y[i,:] = list(mdict.values())
    mdict = dict.fromkeys(mset, 0)    

And we are done!

## Apply Matrix Filtering Algorithm:

We use the formulas for the improved algorithm:

In [7]:
def costFunction(Y, U, V):
    #Compute cost for given Y (data), U, V
    Y_h = np.dot(U.T,V)    
    S1 = Y - Y_h
    J = np.trace(np.dot(S1.T,S1))
    return J/(Y.shape[0]*Y.shape[1])

def compS(Y, U, V):
    Y_h = np.dot(U.T,V)
    S1 = Y - Y_h 
    return S1

def UGrad(S, U, V, ctnt):
    N = U.shape[1]  
    K = U.shape[0]
    U_grad = np.zeros((K,N))
    for i in range(N):
        U_grad[:,i] = -(2/ctnt) * (S[i,:] * V).sum(axis=1)
    return U_grad

def VGrad(S, U, V, ctnt):
    M = V.shape[1]  
    K = U.shape[0]
    V_grad = np.zeros((K,M))
    for j in range(M):
        V_grad[:,j] = -(2/ctnt) * (S[:,j] * U).sum(axis=1)
    return V_grad

def Update(U, U_grad, alpha):
    U = U - alpha*U_grad
    return U    

def Vpdate(V, V_grad, alpha):
    V = V - alpha*V_grad
    return V 

In [8]:
#U = np.random.randn(K,N)   #We give random starting values for user parameters
#V = np.random.randn(K,M)   #We give random starting values for product parameters
U = np.load('U.npy')
V = np.load('V.npy')

In [9]:
N = Y.shape[0]  #Number of users
M = Y.shape[1]  #Number of products
K = 3  #Latent Features

In [10]:
J = []  #Create an empty vector J
epochs = 4000  #Number of loops around our data
alpha = 0.0001
#ctnt = M * N
ctnt = 1
print_every = 20

In [11]:
for l in range(epochs):    
    
    S = compS(Y,U,V)
    
    U_grad = UGrad(S, U, V, ctnt)
    
    V_grad = VGrad(S, U, V, ctnt)      
    
    U = Update(U, U_grad, alpha)
    V = Vpdate(V, V_grad, alpha)
    
    e = costFunction(Y, U, V)
    J.append(e)
    
    if l+1 >= print_every and (l+1) % print_every == 0:
        print("Loop around data number {0} of {1}, error = {2}".format(l+1, epochs, e))

Loop around data number 20 of 4000, error = 0.0013163938332776163


KeyboardInterrupt: 

Now lets save them for later use in training:

In [12]:
np.save('U.npy', U)
np.save('V.npy', V)

## Save Results

And lets prepare them for modelling. First the persons features database:

In [13]:
df = pd.DataFrame(U.T)
df.columns = ['diag_f1','diag_f2','diag_f3']

In [14]:
data_touse = pd.concat([data_touse, df], axis=1)
data_touse.head(3)

Unnamed: 0,AFILIADO,PERIODO,DIAGNOSTICO_A,PAGADO_A.1,diag_f1,diag_f2,diag_f3
0,ABAD AREVALO CARLA MARA,2014-2015,H605,15.0,-0.526898,0.155419,0.012823
1,ABAD CABANILLA CARMEN BEATRIZ,2014-2015,"M130, A09X, J304, J310, K30X, N709, M214, N925...",152.75,0.228049,0.291613,-0.170468
2,ABAD ROMERO CECILIA EUGENIA,2014-2015,"J00X, M170, I10X, I450, M179, J40X, N739, N72X...",370.48,-0.857004,0.744933,-0.600182


In [15]:
data_touse.to_csv("nuevas\\pers_diag.csv", encoding='utf-8', index=False)
del [df]

Now the diagnostic features:

In [16]:
df = pd.concat([pd.DataFrame(mdict.keys()), pd.DataFrame(V.T)], axis=1)
df.columns = ['diag', 'diag2_f1','diag2_f2','diag2_f3']
df.to_csv("nuevas\\cdiag_diag.csv", encoding='utf-8', index=False)