In [1]:
import numpy as np
import pandas as pd
import csv

from sklearn.utils import shuffle
import torch
from torch.distributions.one_hot_categorical import OneHotCategorical 
import pyro
import pyro.distributions as dist

In [2]:
import urllib.request
import random

# Download a plain-text dictionary (one entry per line) to build fake names from.
word_url = "http://svnweb.freebsd.org/csrg/share/dict/words?view=co&content-type=text/plain"
# The context manager closes the HTTP connection even if reading/decoding fails.
with urllib.request.urlopen(word_url) as response:
    long_txt = response.read().decode()
words = long_txt.splitlines()

# Keep capitalised entries; `word[:1]` (rather than `word[0]`) tolerates blank
# lines in the download instead of raising IndexError.
upper_words = [word for word in words if word[:1].isupper()]
# Drop entries that are entirely upper-case, keeping mixed-case words only.
name_words  = [word for word in upper_words if not word.isupper()]
# `random.choice` stays within bounds; `randint(0, len(...))` is inclusive of
# its upper bound and could index one past the end of the list.
one_name = ' '.join(random.choice(name_words) for _ in range(2))


def rand_name(pool=None):
    """Build a random two-word name.

    Args:
        pool: Optional sequence of candidate words. When omitted, the
            module-level ``name_words`` list is used.

    Returns:
        Two words drawn independently (with replacement) from the pool,
        joined by a single space.

    Raises:
        IndexError: If the pool is empty.
    """
    if pool is None:
        pool = name_words
    # `random.choice` never indexes out of range, unlike
    # `random.randint(0, len(pool))`, whose upper bound is inclusive.
    return ' '.join(random.choice(pool) for _ in range(2))

In [3]:
def disease_model(prob_vec):
    """Sample one binary symptom vector from per-symptom probabilities.

    Each symptom is drawn from its own Bernoulli distribution, registered as
    a separate pyro sample site.

    Args:
        prob_vec: Indexable collection of six probabilities, ordered as
            paralysis, tongue, finger, member, pain, anger.

    Returns:
        A list of six ints (each 0 or 1) in the same order, or ``None`` after
        printing a message when ``prob_vec`` does not have exactly six entries.
    """
    if len(prob_vec) != 6:
        print ("Number of disease parameter are six")
        return None

    # Sample-site names; their order fixes both the order in which the sites
    # are sampled and the layout of the returned list.
    site_names = ('paralysis', 'tongue', 'finger', 'member', 'pain', 'anger')

    symptoms = []
    for position, site in enumerate(site_names):
        draw = pyro.sample(site, dist.Bernoulli(prob_vec[position]))
        # Collapse the sampled tensor into a plain 0/1 flag.
        symptoms.append(int(draw.item() == 1.0))

    return symptoms

In [4]:
def generate_data(n_cases = 20):
    """Simulate patient records with symptoms and a diagnosis label.

    For every record a disease is picked uniformly at random among four
    candidates, its symptoms are sampled with ``disease_model`` and a random
    name is attached with ``rand_name``.

    Args:
        n_cases: Number of records to generate.

    Returns:
        A list with ``n_cases`` rows, each of the form
        ``[name, paralysis, tongue, finger, member, pain, anger, diagnosis]``
        where the six symptom flags are 0/1 and ``diagnosis`` is 1
        (zulombriga), 2 (infection), 3 (viral) or 4 (fight).
    """
    # Per-disease symptom probabilities, in the order returned by
    # `disease_model`: paralysis, tongue, finger, member, pain, anger.
    zulombriga_prob = [0.0, 0.6, 0.9, 0.3, 0.0, 0.0]
    infection_prob = [0.5, 0.90, 0.0, 0.5, 0.0, 0.0]
    viral_prob = [0.1, 0.0, 0.0, 0.5, 0.95, 0.0]
    fight_prob = [0.0, 0.0, 0.0, 0.7, 0.5, 0.9]

    # None of these depend on the iteration, so they are built once instead
    # of on every pass through the loop.
    disease_vec = [1, 2, 3, 4]
    diseases_prob = [zulombriga_prob, infection_prob, viral_prob, fight_prob]
    disease_dist = OneHotCategorical(torch.tensor([0.25, 0.25, 0.25, 0.25]))

    cases = []

    for _ in range(n_cases):
        # The one-hot sample holds a single 1; its position selects the
        # disease. The tensor's own argmax plus `.item()` yields a plain
        # Python int, avoiding a NumPy round-trip on a torch tensor.
        d_idx = disease_dist.sample().argmax().item()
        diagnosis = disease_vec[d_idx]
        symptoms = disease_model(diseases_prob[d_idx])
        name = rand_name()

        cases.append([name] + symptoms + [diagnosis])

    return cases

In [5]:
def save_to_csv(mylist, filename):
    """Write case rows to a comma-separated file under a fixed header.

    Args:
        mylist: Iterable of rows; each row is an iterable of cell values
            matching the eight header columns.
        filename: Path of the file to create (overwritten if it exists).
    """
    header = ["Nome", "Paralisia", "Língua Amarela", "Dedo Tremendo", "Perda de Membro", "Dor no Peito", "Raiva Severa", "Diagnóstico"]

    # The header contains accented characters. Writing with an explicit UTF-8
    # encoding keeps the file identical across platforms and readable by
    # `pd.read_csv`, which assumes UTF-8 unless told otherwise.
    with open(filename, 'w', newline='', encoding='utf-8') as myfile:
        wr = csv.writer(myfile, delimiter=',')
        wr.writerow(header)
        wr.writerows(mylist)

In [6]:
# Simulate 500 patient records and write them to disk for the following cells.
cases = generate_data(n_cases=500)
save_to_csv(mylist=cases, filename="cases500.csv")

In [7]:
# Reload the generated records from disk and put the rows in random order.
df = shuffle(pd.read_csv('cases500.csv', sep=','))

In [8]:
# Keep the first 20 shuffled rows as a small sample and display it.
df20 = df.head(20)
df20

Unnamed: 0,Nome,Paralisia,Língua Amarela,Dedo Tremendo,Perda de Membro,Dor no Peito,Raiva Severa,Diagnóstico
46,Mesopotamia Hillel,0,0,0,0,1,1,4
488,Schafer Hayden,0,0,1,0,0,0,1
455,Byronic Heathkit,0,0,0,0,0,1,4
348,Guggenheim Byrd,0,0,0,1,1,0,3
157,Johnston Helena,0,1,0,0,0,0,2
181,Baldwin Asia,0,0,0,1,1,0,3
39,Benjamin Aubrey,0,0,0,0,1,1,4
284,Tuskegee Mizar,0,1,1,1,0,0,1
147,Yost Hapsburg,0,0,1,0,0,0,1
299,Percival Lind,0,1,0,0,0,0,2


In [9]:
# df20.to_csv("cases.csv", index=False)