In [3]:
#abrir os dados
import pandas as pd

dados = pd.read_csv('/content/drive/MyDrive/SistemasInteligentes/diabetic_data.csv', sep=',')

In [4]:
#removendo dados que não são interessantes
dados.drop(columns=['encounter_id', 'patient_nbr', 'weight', 'payer_code'], inplace=True)

In [5]:
# Dicionário de mapeamento
admission_type_map = {
    1: 'Emergency',
    2: 'Urgent',
    3: 'Elective',
    4: 'Newborn',
    5: 'Not Available',
    6: 'NULL',
    7: 'Trauma Center',
    8: 'Not Mapped'
}

# Substituir os valores na coluna
dados['admission_type_id'] = dados['admission_type_id'].replace(admission_type_map)

# Dicionário de mapeamento para discharge_disposition_id
discharge_disposition_map = {
    1: "Discharged to home",
    2: "Discharged/transferred to another short term hospital",
    3: "Discharged/transferred to SNF",
    4: "Discharged/transferred to ICF",
    5: "Discharged/transferred to another type of inpatient care institution",
    6: "Discharged/transferred to home with home health service",
    7: "Left AMA",
    8: "Discharged/transferred to home under care of Home IV provider",
    9: "Admitted as an inpatient to this hospital",
    10: "Neonate discharged to another hospital for neonatal aftercare",
    11: "Expired",
    12: "Still patient or expected to return for outpatient services",
    13: "Hospice / home",
    14: "Hospice / medical facility",
    15: "Discharged/transferred within this institution to Medicare approved swing bed",
    16: "Discharged/transferred/referred another institution for outpatient services",
    17: "Discharged/transferred/referred to this institution for outpatient services",
    18: "NULL",
    19: "Expired at home. Medicaid only, hospice.",
    20: "Expired in a medical facility. Medicaid only, hospice.",
    21: "Expired, place unknown. Medicaid only, hospice.",
    22: "Discharged/transferred to another rehab fac including rehab units of a hospital .",
    23: "Discharged/transferred to a long term care hospital.",
    24: "Discharged/transferred to a nursing facility certified under Medicaid but not certified under Medicare.",
    25: "Not Mapped",
    26: "Unknown/Invalid",
    27: "Discharged/transferred to a federal health care facility.",
    28: "Discharged/transferred/referred to a psychiatric hospital of psychiatric distinct part unit of a hospital",
    29: "Discharged/transferred to a Critical Access Hospital (CAH).",
    30: "Discharged/transferred to another Type of Health Care Institution not Defined Elsewhere"
}

# Substituir os valores na coluna
dados['discharge_disposition_id'] = dados['discharge_disposition_id'].replace(discharge_disposition_map)

# Dicionário de mapeamento para admission_source_id
admission_source_map = {
    1: "Physician Referral",
    2: "Clinic Referral",
    3: "HMO Referral",
    4: "Transfer from a hospital",
    5: "Transfer from a Skilled Nursing Facility (SNF)",
    6: "Transfer from another health care facility",
    7: "Emergency Room",
    8: "Court/Law Enforcement",
    9: "Not Available",
    10: "Transfer from critial access hospital",
    11: "Normal Delivery",
    12: "Premature Delivery",
    13: "Sick Baby",
    14: "Extramural Birth",
    15: "Not Available",
    17: "NULL",
    18: "Transfer From Another Home Health Agency",
    19: "Readmission to Same Home Health Agency",
    20: "Not Mapped",
    21: "Unknown/Invalid",
    22: "Transfer from hospital inpt/same fac reslt in a sep claim",
    23: "Born inside this hospital",
    24: "Born outside this hospital",
    25: "Transfer from Ambulatory Surgery Center",
    26: "Transfer from Hospice"
}

# Substituir os valores na coluna
dados['admission_source_id'] = dados['admission_source_id'].replace(admission_source_map)

In [6]:
import numpy as np

# Substituir '?' por NaN
dados['medical_specialty'] = dados['medical_specialty'].replace('?', np.nan)

# Substituir os NaN pela moda
dados['medical_specialty'] = dados['medical_specialty'].fillna(dados['medical_specialty'].mode()[0])

In [7]:
# Identificar colunas numéricas e categóricas
colunas_numericas = dados.select_dtypes(include=['int64', 'float64']).columns
colunas_categoricas = dados.select_dtypes(include=['object', 'category']).columns

colunas_numericas

Index(['time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'number_diagnoses'],
      dtype='object')

In [8]:
# normalizar os dados
from sklearn import preprocessing
from pickle import dump, load

# instanciar o normalizador
normalizador = preprocessing.MinMaxScaler()

# treinar o normalizador
modelo_normalizador_diabetes = normalizador.fit(dados[colunas_numericas])

# salvar o modelo
dump(modelo_normalizador_diabetes, open('/content/drive/MyDrive/SistemasInteligentes/models/normalizador_diabetes.model', 'wb'))

In [9]:
# abrir o modelo (só se precisar depois)
modelo_normalizador_diabetes = load(open('/content/drive/MyDrive/SistemasInteligentes/models/normalizador_diabetes.model', 'rb'))

# normalizar os dados
dados_normalizados = modelo_normalizador_diabetes.transform(dados[colunas_numericas])
dados_num_normalizados = pd.DataFrame(dados_normalizados, columns=colunas_numericas)

dados_normalizados

array([[0.        , 0.30534351, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.15384615, 0.44274809, 0.        , ..., 0.        , 0.        ,
        0.53333333],
       [0.07692308, 0.07633588, 0.83333333, ..., 0.        , 0.04761905,
        0.33333333],
       ...,
       [0.        , 0.39694656, 0.        , ..., 0.        , 0.        ,
        0.8       ],
       [0.69230769, 0.33587786, 0.33333333, ..., 0.        , 0.04761905,
        0.53333333],
       [0.38461538, 0.09160305, 0.5       , ..., 0.        , 0.        ,
        0.53333333]])

In [10]:
# transformar atributos categóricos em dummies (variáveis indicadoras)
dados_cat_dummies = pd.get_dummies(dados[colunas_categoricas], drop_first=True)
dados_cat_dummies

Unnamed: 0,race_AfricanAmerican,race_Asian,race_Caucasian,race_Hispanic,race_Other,gender_Male,gender_Unknown/Invalid,age_[10-20),age_[20-30),age_[30-40),...,glyburide-metformin_Steady,glyburide-metformin_Up,glipizide-metformin_Steady,glimepiride-pioglitazone_Steady,metformin-rosiglitazone_Steady,metformin-pioglitazone_Steady,change_No,diabetesMed_Yes,readmitted_>30,readmitted_NO
0,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,True
1,False,False,True,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,True,True,False
2,True,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,True,True,False,True
3,False,False,True,False,False,True,False,False,False,True,...,False,False,False,False,False,False,False,True,False,True
4,False,False,True,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,True,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
101762,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,True,False,True
101763,False,False,True,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True
101764,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True


In [11]:
#normalizar os dados categoricos
dados_cat_normalizados = pd.get_dummies(dados_cat_dummies, dtype='int')
dados_cat_normalizados = dados_cat_normalizados.astype(int)

dados_cat_normalizados

Unnamed: 0,race_AfricanAmerican,race_Asian,race_Caucasian,race_Hispanic,race_Other,gender_Male,gender_Unknown/Invalid,age_[10-20),age_[20-30),age_[30-40),...,glyburide-metformin_Steady,glyburide-metformin_Up,glipizide-metformin_Steady,glimepiride-pioglitazone_Steady,metformin-rosiglitazone_Steady,metformin-pioglitazone_Steady,change_No,diabetesMed_Yes,readmitted_>30,readmitted_NO
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
1,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,1,0
2,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,1,0,1
3,0,0,1,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,1,0,1
4,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
101762,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,1
101763,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
101764,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1


In [78]:
dados_final = dados_num_normalizados.join(dados_cat_normalizados)
dados_final

colunas_treino = dados_final.columns.tolist()

colunas_treino

['time_in_hospital',
 'num_lab_procedures',
 'num_procedures',
 'num_medications',
 'number_outpatient',
 'number_emergency',
 'number_inpatient',
 'number_diagnoses',
 'race_Caucasian',
 'gender_Female',
 'age_[10-20)',
 'admission_type_id_Emergency',
 'discharge_disposition_id_Discharged to home',
 'admission_source_id_Emergency Room',
 'medical_specialty_InternalMedicine',
 'diag_1_276',
 'diag_2_250.01',
 'diag_3_255',
 'max_glu_serum_None',
 'A1Cresult_None',
 'metformin_No',
 'repaglinide_No',
 'nateglinide_No',
 'chlorpropamide_No',
 'glimepiride_No',
 'acetohexamide_No',
 'glipizide_No',
 'glyburide_No',
 'tolbutamide_No',
 'pioglitazone_No',
 'rosiglitazone_No',
 'acarbose_No',
 'miglitol_No',
 'troglitazone_No',
 'tolazamide_No',
 'examide_No',
 'citoglipton_Up',
 'insulin_No',
 'glyburide-metformin_No',
 'glipizide-metformin_No',
 'glimepiride-pioglitazone_No',
 'metformin-rosiglitazone_No',
 'metformin-pioglitazone_Ch',
 'change_No',
 'diabetesMed_Yes',
 'readmitted_>30']

In [13]:
amostra = dados_final.sample(frac=0.1, random_state=42)

In [14]:
#hiperparametrização ou numero de clusters
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import math
from scipy.spatial.distance import cdist #determinador de distancias
import numpy as np

distortions = [] #retorno de distorcoes
K = range(1,30) #intervalo de clusters analisados para o numero otimo

for k in K:
    clusterModel = KMeans(n_clusters=k, random_state=42)
    clusterModel.fit(amostra)
    # calcular a distorção
    distortions.append(
        sum(np.min(cdist(amostra, clusterModel.cluster_centers_, 'euclidean'), axis=1)) / amostra.shape[0]
    )


In [15]:
#determinar o numero de clusters
x0 = K[0]
y0 = distortions[0]

xn = K[-1]
yn = distortions[-1]
distancias = []

for i in range(len(distortions)):
    x = K[i]
    y = distortions[i]
    numerador = abs((yn - y0) * x - (xn - x0) * y + (xn * y0) - (yn * x0))
    denominador = math.sqrt((yn - y0)**2 + (xn - x0)**2)
    distancias.append(numerador / denominador)

numero_clusters_otimo = K[distancias.index(max(distancias))]
print(f"Número ótimo de clusters: {numero_clusters_otimo}")

# treinar o modelo definitivo com o número ótimo de clusters
diabetes_cluster_model = KMeans(n_clusters=numero_clusters_otimo, random_state=42)
diabetes_cluster_model.fit(dados_final)

# salvar o modelo
dump(diabetes_cluster_model, open('/content/drive/MyDrive/SistemasInteligentes/models/diabetes_clusters.model', 'wb'))


Número ótimo de clusters: 9


In [79]:
import pandas as pd
import numpy as np

#colunas do dataset (as mesmas do seu DataFrame original)
colunas = [
    'encounter_id','patient_nbr','race','gender','age','weight','admission_type_id',
    'discharge_disposition_id','admission_source_id','time_in_hospital','payer_code',
    'medical_specialty','num_lab_procedures','num_procedures','num_medications',
    'number_outpatient','number_emergency','number_inpatient','diag_1','diag_2','diag_3',
    'number_diagnoses','max_glu_serum','A1Cresult','metformin','repaglinide','nateglinide',
    'chlorpropamide','glimepiride','acetohexamide','glipizide','glyburide','tolbutamide',
    'pioglitazone','rosiglitazone','acarbose','miglitol','troglitazone','tolazamide',
    'examide','citoglipton','insulin','glyburide-metformin','glipizide-metformin',
    'glimepiride-pioglitazone','metformin-rosiglitazone','metformin-pioglitazone',
    'change','diabetesMed','readmitted'
]

#nova instância como lista
nova_instancia = [
    149190,55629189,"Caucasian","Female","[10-20)","?",1,1,7,3,"?","?",59,0,18,0,0,0,
    "276","250.01","255",9,"None","None","No","No","No","No","No","No","No","No","No",
    "No","No","No","No","No","No","No","Up","No","No","No","No","No","Ch","No","Yes",">30"
]

#criar DataFrame da nova instância
nova_df = pd.DataFrame([nova_instancia], columns=colunas)

In [80]:
#dropar colunas nao uteis
colunas_drop = ['encounter_id', 'patient_nbr', 'weight', 'payer_code']
nova_df = nova_df.drop(columns=colunas_drop)

In [81]:
# substituir '?' por NaN
nova_df = nova_df.replace('?', np.nan)
nova_df['medical_specialty'] = nova_df['medical_specialty'].fillna('InternalMedicine')

  nova_df = nova_df.replace('?', np.nan)


In [82]:
# mapear categorias
admission_type_map = {
    1: "Emergency", 2: "Urgent", 3: "Elective", 4: "Newborn",
    5: "Not Available", 6: "NULL", 7: "Trauma Center", 8: "Not Mapped"
}
discharge_map = {
    1: "Discharged to home", 2: "Transferred to short term hospital", 3: "Transferred to SNF",
    4: "Transferred to ICF", 5: "Another inpatient care", 6: "Home health service",
    7: "Left AMA", 11: "Expired", 13: "Hospice / home", 14: "Hospice / medical facility", 25: "Not Mapped", 26: "Unknown"
}
admission_source_map = {
    1: "Physician Referral", 2: "Clinic Referral", 3: "HMO Referral", 4: "Transfer from hospital",
    7: "Emergency Room", 9: "Not Available", 17: "NULL", 20: "Not Mapped", 21: "Unknown/Invalid"
}

nova_df['admission_type_id'] = nova_df['admission_type_id'].map(admission_type_map)
nova_df['discharge_disposition_id'] = nova_df['discharge_disposition_id'].map(discharge_map)
nova_df['admission_source_id'] = nova_df['admission_source_id'].map(admission_source_map)

In [83]:
# Identificar colunas numéricas e categóricas
colunas_numericas = nova_df.select_dtypes(include=['int64', 'float64']).columns
colunas_categoricas = nova_df.select_dtypes(include=['object', 'category']).columns

colunas_categoricas

Index(['race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'medical_specialty',
       'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [84]:
# normalizar os dados 2
dados_normalizados = modelo_normalizador_diabetes.transform(nova_df[colunas_numericas])
dados_num_normalizados = pd.DataFrame(dados_normalizados, columns=colunas_numericas)

dados_normalizados

array([[0.15384615, 0.44274809, 0.        , 0.2125    , 0.        ,
        0.        , 0.        , 0.53333333]])

In [85]:
# transformar atributos categóricos em dummies (variáveis indicadoras)
dados_cat_dummies = pd.get_dummies(nova_df[colunas_categoricas])
dados_cat_dummies

Unnamed: 0,race_Caucasian,gender_Female,age_[10-20),admission_type_id_Emergency,discharge_disposition_id_Discharged to home,admission_source_id_Emergency Room,medical_specialty_InternalMedicine,diag_1_276,diag_2_250.01,diag_3_255,...,citoglipton_Up,insulin_No,glyburide-metformin_No,glipizide-metformin_No,glimepiride-pioglitazone_No,metformin-rosiglitazone_No,metformin-pioglitazone_Ch,change_No,diabetesMed_Yes,readmitted_>30
0,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True


In [86]:
#normalizar os dados categoricos
dados_cat_normalizados = pd.get_dummies(dados_cat_dummies, dtype='int')
dados_cat_normalizados = dados_cat_normalizados.astype(int)

dados_cat_normalizados

Unnamed: 0,race_Caucasian,gender_Female,age_[10-20),admission_type_id_Emergency,discharge_disposition_id_Discharged to home,admission_source_id_Emergency Room,medical_specialty_InternalMedicine,diag_1_276,diag_2_250.01,diag_3_255,...,citoglipton_Up,insulin_No,glyburide-metformin_No,glipizide-metformin_No,glimepiride-pioglitazone_No,metformin-rosiglitazone_No,metformin-pioglitazone_Ch,change_No,diabetesMed_Yes,readmitted_>30
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [87]:
dados_final = dados_num_normalizados.join(dados_cat_normalizados)
dados_final

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,race_Caucasian,gender_Female,...,citoglipton_Up,insulin_No,glyburide-metformin_No,glipizide-metformin_No,glimepiride-pioglitazone_No,metformin-rosiglitazone_No,metformin-pioglitazone_Ch,change_No,diabetesMed_Yes,readmitted_>30
0,0.153846,0.442748,0.0,0.2125,0.0,0.0,0.0,0.533333,1,1,...,1,1,1,1,1,1,1,1,1,1


In [77]:
colunas_faltando = [col for col in colunas_treino if col not in nova_df.columns]

for col in colunas_faltando:
    nova_df[col] = 0

nova_df

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,...,glyburide-metformin_Steady,glyburide-metformin_Up,glipizide-metformin_Steady,glimepiride-pioglitazone_Steady,metformin-rosiglitazone_Steady,metformin-pioglitazone_Steady,change_No,diabetesMed_Yes,readmitted_>30,readmitted_NO
0,Caucasian,Female,[10-20),Emergency,Discharged to home,Emergency Room,3,InternalMedicine,59,0,...,0,0,0,0,0,0,0,0,0,0


In [76]:
#carregar o modelo de clusters
diabetes_clusters_model = load(open('/content/drive/MyDrive/SistemasInteligentes/models/diabetes_clusters.model', 'rb'))

grupo = diabetes_clusters_model.predict(nova_df)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- A1Cresult
- acarbose
- acetohexamide
- admission_source_id
- admission_type_id
- ...
