In [None]:
# Load the raw diabetic_data.csv file into the `dados` DataFrame.
# NOTE(review): the path looks like a mounted Google Drive folder in Colab —
# confirm the drive is mounted before this cell runs.
import pandas as pd

dados = pd.read_csv('/content/drive/MyDrive/SistemasInteligentes/diabetic_data.csv', sep=',')

In [None]:
# Remove columns that are not used in the rest of the analysis.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: running it a second time no longer raises KeyError
# because the columns are already gone.
dados = dados.drop(
    columns=['encounter_id', 'patient_nbr', 'weight', 'payer_code'],
    errors='ignore',
)

In [None]:
# Mapping from admission_type_id code to its label.
# The labels are listed in code order; enumerate(start=1) assigns codes 1..8.
admission_type_map = dict(enumerate(
    (
        'Emergency',
        'Urgent',
        'Elective',
        'Newborn',
        'Not Available',
        'NULL',
        'Trauma Center',
        'Not Mapped',
    ),
    start=1,
))

# Swap the numeric codes in the column for their labels
# (values without an entry in the mapping are left unchanged by replace).
dados['admission_type_id'] = dados['admission_type_id'].replace(admission_type_map)

# Mapping from discharge_disposition_id code to its label.
# The labels are listed in code order; enumerate(start=1) assigns codes 1..30.
discharge_disposition_map = dict(enumerate(
    (
        "Discharged to home",  # 1
        "Discharged/transferred to another short term hospital",  # 2
        "Discharged/transferred to SNF",  # 3
        "Discharged/transferred to ICF",  # 4
        "Discharged/transferred to another type of inpatient care institution",  # 5
        "Discharged/transferred to home with home health service",  # 6
        "Left AMA",  # 7
        "Discharged/transferred to home under care of Home IV provider",  # 8
        "Admitted as an inpatient to this hospital",  # 9
        "Neonate discharged to another hospital for neonatal aftercare",  # 10
        "Expired",  # 11
        "Still patient or expected to return for outpatient services",  # 12
        "Hospice / home",  # 13
        "Hospice / medical facility",  # 14
        "Discharged/transferred within this institution to Medicare approved swing bed",  # 15
        "Discharged/transferred/referred another institution for outpatient services",  # 16
        "Discharged/transferred/referred to this institution for outpatient services",  # 17
        "NULL",  # 18
        "Expired at home. Medicaid only, hospice.",  # 19
        "Expired in a medical facility. Medicaid only, hospice.",  # 20
        "Expired, place unknown. Medicaid only, hospice.",  # 21
        "Discharged/transferred to another rehab fac including rehab units of a hospital .",  # 22
        "Discharged/transferred to a long term care hospital.",  # 23
        "Discharged/transferred to a nursing facility certified under Medicaid but not certified under Medicare.",  # 24
        "Not Mapped",  # 25
        "Unknown/Invalid",  # 26
        "Discharged/transferred to a federal health care facility.",  # 27
        "Discharged/transferred/referred to a psychiatric hospital of psychiatric distinct part unit of a hospital",  # 28
        "Discharged/transferred to a Critical Access Hospital (CAH).",  # 29
        "Discharged/transferred to another Type of Health Care Institution not Defined Elsewhere",  # 30
    ),
    start=1,
))

# Swap the numeric codes in the column for their labels.
dados['discharge_disposition_id'] = dados['discharge_disposition_id'].replace(discharge_disposition_map)

# Mapping from admission_source_id code to its label.
# Code 16 has no entry, so the table is assembled from two consecutive runs:
# codes 1..15 and codes 17..26.
admission_source_map = {
    **dict(enumerate(
        (
            "Physician Referral",  # 1
            "Clinic Referral",  # 2
            "HMO Referral",  # 3
            "Transfer from a hospital",  # 4
            "Transfer from a Skilled Nursing Facility (SNF)",  # 5
            "Transfer from another health care facility",  # 6
            "Emergency Room",  # 7
            "Court/Law Enforcement",  # 8
            "Not Available",  # 9
            "Transfer from critial access hospital",  # 10
            "Normal Delivery",  # 11
            "Premature Delivery",  # 12
            "Sick Baby",  # 13
            "Extramural Birth",  # 14
            "Not Available",  # 15
        ),
        start=1,
    )),
    **dict(enumerate(
        (
            "NULL",  # 17
            "Transfer From Another Home Health Agency",  # 18
            "Readmission to Same Home Health Agency",  # 19
            "Not Mapped",  # 20
            "Unknown/Invalid",  # 21
            "Transfer from hospital inpt/same fac reslt in a sep claim",  # 22
            "Born inside this hospital",  # 23
            "Born outside this hospital",  # 24
            "Transfer from Ambulatory Surgery Center",  # 25
            "Transfer from Hospice",  # 26
        ),
        start=17,
    )),
}

# Swap the numeric codes in the column for their labels.
dados['admission_source_id'] = dados['admission_source_id'].replace(admission_source_map)

In [None]:
import numpy as np

# Treat the '?' placeholder as a missing value ...
especialidade = dados['medical_specialty'].replace('?', np.nan)

# ... then impute every missing entry with the most frequent specialty (the mode).
especialidade_mais_frequente = especialidade.mode()[0]
dados['medical_specialty'] = especialidade.fillna(especialidade_mais_frequente)

In [None]:
# Convert the three diagnosis columns to numbers and impute what cannot be parsed.
for col in ['diag_1', 'diag_2', 'diag_3']:
    # Only convert columns that are not numeric yet. Checking "not numeric"
    # rather than `dtype == 'object'` also covers pandas string dtypes and
    # keeps the cell safe to re-run after the conversion has happened.
    if not pd.api.types.is_numeric_dtype(dados[col]):
        # errors='coerce': any value that cannot be parsed as a number becomes NaN.
        codigos = pd.to_numeric(dados[col], errors='coerce')
        moda = codigos.mode()[0]  # most frequent numeric value in the column
        # Assign the filled Series back to the frame. The previous
        # `dados[col].fillna(moda, inplace=True)` operated on an intermediate
        # object (chained assignment), which pandas warns about and which
        # stops updating `dados` in pandas 3.0.
        dados[col] = codigos.fillna(moda)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dados[col].fillna(moda, inplace=True)  # substitui NaN pela moda


In [None]:
# Partition the column labels of `dados` by dtype family.
dtypes_numericos = ['int64', 'float64']
dtypes_categoricos = ['object', 'category']

colunas_numericas = dados.select_dtypes(include=dtypes_numericos).columns
colunas_categoricas = dados.select_dtypes(include=dtypes_categoricos).columns

# Display the numeric column labels as the cell output.
colunas_numericas

Index(['time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses'],
      dtype='object')

In [None]:
# normalizar os dados
from sklearn import preprocessing
from pickle import dump, load

# Instantiate the min-max scaler.
normalizador = preprocessing.MinMaxScaler()

# Fit the scaler on the numeric columns.
modelo_normalizador_diabetes = normalizador.fit(dados[colunas_numericas])

# Persist the fitted scaler. The `with` block guarantees the file handle is
# flushed and closed; the previous `dump(..., open(...))` left it open.
with open('/content/drive/MyDrive/SistemasInteligentes/models/normalizador_diabetes.model', 'wb') as arquivo_modelo:
    dump(modelo_normalizador_diabetes, arquivo_modelo)

In [None]:
# Reload the fitted scaler from disk (only needed when resuming later).
# NOTE: pickle can execute arbitrary code while loading — only load model
# files that this notebook produced.
# The `with` block closes the file handle, which `load(open(...))` did not.
with open('/content/drive/MyDrive/SistemasInteligentes/models/normalizador_diabetes.model', 'rb') as arquivo_modelo:
    modelo_normalizador_diabetes = load(arquivo_modelo)

# Scale the numeric columns with the fitted scaler.
dados_normalizados = modelo_normalizador_diabetes.transform(dados[colunas_numericas])

# transform() returns a plain array, so the row labels are lost. Re-attach
# the index of `dados` explicitly so that a later index-based join with the
# categorical features lines up row by row even if `dados` does not have a
# default 0..n-1 index.
dados_num_normalizados = pd.DataFrame(
    dados_normalizados,
    columns=colunas_numericas,
    index=dados.index,
)

dados_normalizados

array([[0.        , 0.30534351, 0.        , ..., 0.27263581, 0.24799197,
        0.        ],
       [0.15384615, 0.44274809, 0.        , ..., 0.24648893, 0.25301205,
        0.53333333],
       [0.07692308, 0.07633588, 0.83333333, ..., 0.24647887, 0.24799197,
        0.33333333],
       ...,
       [0.        , 0.39694656, 0.        , ..., 0.58853119, 0.29417671,
        0.8       ],
       [0.69230769, 0.33587786, 0.33333333, ..., 0.28169014, 0.99899598,
        0.53333333],
       [0.38461538, 0.09160305, 0.5       , ..., 0.52816901, 0.78714859,
        0.53333333]])

In [None]:
# One-hot encode the categorical attributes (indicator variables) and, in the
# same expression, discard the indicator column for the '?' race placeholder.
dados_cat_dummies = (
    pd.get_dummies(dados[colunas_categoricas])
    .drop(columns=['race_?'])
)

dados_cat_dummies

Unnamed: 0,race_AfricanAmerican,race_Asian,race_Caucasian,race_Hispanic,race_Other,gender_Female,gender_Male,gender_Unknown/Invalid,age_[0-10),age_[10-20),...,metformin-rosiglitazone_Steady,metformin-pioglitazone_No,metformin-pioglitazone_Steady,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes,readmitted_<30,readmitted_>30,readmitted_NO
0,False,False,True,False,False,True,False,False,True,False,...,False,True,False,False,True,True,False,False,False,True
1,False,False,True,False,False,True,False,False,False,True,...,False,True,False,True,False,False,True,False,True,False
2,True,False,False,False,False,True,False,False,False,False,...,False,True,False,False,True,False,True,False,False,True
3,False,False,True,False,False,False,True,False,False,False,...,False,True,False,True,False,False,True,False,False,True
4,False,False,True,False,False,False,True,False,False,False,...,False,True,False,True,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,True,False,False,False,False,False,True,False,False,False,...,False,True,False,True,False,False,True,False,True,False
101762,True,False,False,False,False,True,False,False,False,False,...,False,True,False,False,True,False,True,False,False,True
101763,False,False,True,False,False,False,True,False,False,False,...,False,True,False,True,False,False,True,False,False,True
101764,False,False,True,False,False,True,False,False,False,False,...,False,True,False,True,False,False,True,False,False,True


In [None]:
# Convert the indicator columns to 0/1 integers.
# The earlier extra `pd.get_dummies(dados_cat_dummies, dtype='int')` call was
# removed: with `columns=None`, get_dummies only encodes object/string/category
# columns, so on a frame of boolean indicators it changed nothing and
# `astype(int)` was doing all the work.
dados_cat_normalizados = dados_cat_dummies.astype(int)

dados_cat_normalizados

Unnamed: 0,race_AfricanAmerican,race_Asian,race_Caucasian,race_Hispanic,race_Other,gender_Female,gender_Male,gender_Unknown/Invalid,age_[0-10),age_[10-20),...,metformin-rosiglitazone_Steady,metformin-pioglitazone_No,metformin-pioglitazone_Steady,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes,readmitted_<30,readmitted_>30,readmitted_NO
0,0,0,1,0,0,1,0,0,1,0,...,0,1,0,0,1,1,0,0,0,1
1,0,0,1,0,0,1,0,0,0,1,...,0,1,0,1,0,0,1,0,1,0
2,1,0,0,0,0,1,0,0,0,0,...,0,1,0,0,1,0,1,0,0,1
3,0,0,1,0,0,0,1,0,0,0,...,0,1,0,1,0,0,1,0,0,1
4,0,0,1,0,0,0,1,0,0,0,...,0,1,0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,1,0,0,0,0,0,1,0,0,0,...,0,1,0,1,0,0,1,0,1,0
101762,1,0,0,0,0,1,0,0,0,0,...,0,1,0,0,1,0,1,0,0,1
101763,0,0,1,0,0,0,1,0,0,0,...,0,1,0,1,0,0,1,0,0,1
101764,0,0,1,0,0,1,0,0,0,0,...,0,1,0,1,0,0,1,0,0,1


In [None]:
# Combine the scaled numeric features with the 0/1 categorical indicators.
# This is an index-based left join (the defaults of DataFrame.join, spelled out),
# so both frames must share the same row index for the rows to match.
dados_final = dados_num_normalizados.join(
    other=dados_cat_normalizados,
    how='left',
)
dados_final

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,...,metformin-rosiglitazone_Steady,metformin-pioglitazone_No,metformin-pioglitazone_Steady,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes,readmitted_<30,readmitted_>30,readmitted_NO
0,0.000000,0.305344,0.000000,0.0000,0.000000,0.0,0.000000,0.248825,0.272636,0.247992,...,0,1,0,0,1,1,0,0,0,1
1,0.153846,0.442748,0.000000,0.2125,0.000000,0.0,0.000000,0.274096,0.246489,0.253012,...,0,1,0,1,0,0,1,0,1,0
2,0.076923,0.076336,0.833333,0.1500,0.047619,0.0,0.047619,0.647590,0.246479,0.247992,...,0,1,0,0,1,0,1,0,0,1
3,0.076923,0.328244,0.166667,0.1875,0.000000,0.0,0.000000,0.005020,0.246911,0.401606,...,0,1,0,1,0,0,1,0,0,1
4,0.000000,0.381679,0.000000,0.0875,0.000000,0.0,0.000000,0.194779,0.152918,0.247992,...,0,1,0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,0.153846,0.381679,0.000000,0.1875,0.000000,0.0,0.000000,0.248122,0.287726,0.456827,...,0,1,0,1,0,0,1,0,1,0
101762,0.307692,0.244275,0.500000,0.2125,0.000000,0.0,0.047619,0.559237,0.272636,0.787149,...,0,1,0,0,1,0,1,0,0,1
101763,0.000000,0.396947,0.000000,0.1000,0.023810,0.0,0.000000,0.035141,0.588531,0.294177,...,0,1,0,1,0,0,1,0,0,1
101764,0.692308,0.335878,0.333333,0.2500,0.000000,0.0,0.047619,0.996988,0.281690,0.998996,...,0,1,0,1,0,0,1,0,0,1


In [None]:
# Draw a 10% random sample of the rows of dados_final; the fixed random_state
# makes the same rows come out on every run.
amostra = dados_final.sample(frac=0.1, random_state=42)

In [None]:
#hiperparametrização ou numero de clusters
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import math
from scipy.spatial.distance import cdist #determinador de distancias
import numpy as np

distortions = []  # mean distance to the nearest centroid, one entry per k
K = range(1, 50)  # candidate numbers of clusters to evaluate

for k in K:
    clusterModel = KMeans(n_clusters=k, random_state=42)
    clusterModel.fit(amostra)
    # Distortion: Euclidean distance from each sampled row to its closest
    # centroid, averaged over all rows. The reduction is done by NumPy
    # (min over axis 1, then mean) instead of Python's builtin sum(), which
    # walked the array one element at a time.
    distancia_ao_centroide = cdist(
        amostra, clusterModel.cluster_centers_, 'euclidean'
    ).min(axis=1)
    distortions.append(distancia_ao_centroide.mean())


In [None]:
# Choose the number of clusters at the "knee" of the distortion curve: the
# point (k, distortion) lying farthest from the straight line that joins the
# first and the last points of the curve.
x0, y0 = K[0], distortions[0]
xn, yn = K[-1], distortions[-1]

# Length of the first-to-last segment; it is the same for every point.
denominador = math.sqrt((yn - y0)**2 + (xn - x0)**2)

# Perpendicular distance from each curve point to that line.
distancias = [
    abs((yn - y0) * K[i] - (xn - x0) * y + (xn * y0) - (yn * x0)) / denominador
    for i, y in enumerate(distortions)
]

indice_cotovelo = distancias.index(max(distancias))
numero_clusters_otimo = K[indice_cotovelo]
print(f"Número ótimo de clusters: {numero_clusters_otimo}")

# Train the final model on the full feature table with the selected number of clusters.
diabetes_cluster_model = KMeans(n_clusters=numero_clusters_otimo, random_state=42)
diabetes_cluster_model.fit(dados_final)

# Persist the fitted model. The `with` block guarantees the file handle is
# flushed and closed; the previous `dump(..., open(...))` left it open.
with open('/content/drive/MyDrive/SistemasInteligentes/models/diabetes_clusters.model', 'wb') as arquivo_modelo:
    dump(diabetes_cluster_model, arquivo_modelo)


Número ótimo de clusters: 11
