In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Load the raw patient-priority CSV and preview its first five rows.
raw_csv_path = "../input/patient_priority.csv"
df_data = pd.read_csv(raw_csv_path)
df_data.head()

Unnamed: 0.1,Unnamed: 0,age,gender,chest pain type,blood pressure,cholesterol,max heart rate,exercise angina,plasma glucose,skin_thickness,insulin,bmi,diabetes_pedigree,hypertension,heart_disease,Residence_type,smoking_status,triage
0,0,40.0,1.0,2.0,140.0,294.0,172.0,0.0,108.0,43.0,92.0,19.0,0.467386,0.0,0.0,Urban,never smoked,yellow
1,1,49.0,0.0,3.0,160.0,180.0,156.0,0.0,75.0,47.0,90.0,18.0,0.467386,0.0,0.0,Urban,never smoked,orange
2,2,37.0,1.0,2.0,130.0,294.0,156.0,0.0,98.0,53.0,102.0,23.0,0.467386,0.0,0.0,Urban,never smoked,yellow
3,3,48.0,0.0,4.0,138.0,214.0,156.0,1.0,72.0,51.0,118.0,18.0,0.467386,0.0,0.0,Urban,never smoked,orange
4,4,54.0,1.0,3.0,150.0,195.0,156.0,0.0,108.0,90.0,83.0,21.0,0.467386,0.0,0.0,Urban,never smoked,yellow


#### Basic description of the dataset

In [3]:
# Remove the 'Unnamed: 0' column (presumably a row index saved into the CSV by
# an earlier export -- its previewed values simply count 0, 1, 2, ...).
# errors='ignore' keeps the cell safe to re-run: once the column is gone, a
# second execution is a no-op instead of raising KeyError.
df_data = df_data.drop(columns=['Unnamed: 0'], errors='ignore')

In [4]:
# Dimensions of the frame as a (rows, columns) tuple.
df_data.shape

(6962, 17)

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df_data.describe()

Unnamed: 0,age,gender,chest pain type,blood pressure,cholesterol,max heart rate,exercise angina,plasma glucose,skin_thickness,insulin,bmi,diabetes_pedigree,hypertension,heart_disease
count,6962.0,6961.0,6962.0,6962.0,6962.0,6962.0,6962.0,6962.0,6962.0,6962.0,6962.0,6962.0,6962.0,6962.0
mean,57.450014,0.531964,0.529015,109.629991,184.71129,163.502442,0.061764,98.394283,56.813416,111.09164,27.190908,0.467386,0.071531,0.0395
std,11.904948,0.499013,1.253791,21.534852,32.010359,15.458693,0.240743,28.598084,22.889316,17.470033,7.362886,0.102663,0.257729,0.194796
min,28.0,0.0,0.0,60.0,150.0,138.0,0.0,55.12,21.0,81.0,10.3,0.078,0.0,0.0
25%,48.0,0.0,0.0,92.0,164.0,150.0,0.0,78.7075,36.0,97.0,21.8,0.467386,0.0,0.0
50%,56.0,1.0,0.0,111.0,179.0,163.0,0.0,93.0,55.0,111.0,26.2,0.467386,0.0,0.0
75%,66.0,1.0,0.0,127.0,192.0,177.0,0.0,111.6325,77.0,125.0,31.0,0.467386,0.0,0.0
max,82.0,1.0,4.0,165.0,294.0,202.0,1.0,199.0,99.0,171.0,66.8,2.42,1.0,1.0


#### Check for null values

In [6]:
# Number of missing values in each column.
df_data.isna().sum()

age                    0
gender                 1
chest pain type        0
blood pressure         0
cholesterol            0
max heart rate         0
exercise angina        0
plasma glucose         0
skin_thickness         0
insulin                0
bmi                    0
diabetes_pedigree      0
hypertension           0
heart_disease          0
Residence_type         0
smoking_status         0
triage               410
dtype: int64

In [7]:
# 'triage' is the response column (the patient's care priority). It is removed
# before dropna() so that its 410 missing values do not discard those rows.
# errors='ignore' lets the cell be re-run without raising KeyError once the
# column has already been dropped.
df_data = df_data.drop(columns=['triage'], errors='ignore')
# Drop the remaining incomplete row (the single record with a missing gender).
df_data = df_data.dropna()

#### Check for duplicate rows

In [8]:
# Flag every row that exactly repeats an earlier row, then report how many exist.
duplicate_mask = df_data.duplicated()
print(f"Total duplicate rows: {duplicate_mask.sum()}")

Total duplicate rows: 0


#### View data types

In [9]:
# List the dtype of every column.
df_data.dtypes

age                  float64
gender               float64
chest pain type      float64
blood pressure       float64
cholesterol          float64
max heart rate       float64
exercise angina      float64
plasma glucose       float64
skin_thickness       float64
insulin              float64
bmi                  float64
diabetes_pedigree    float64
hypertension         float64
heart_disease        float64
Residence_type        object
smoking_status        object
dtype: object

In [10]:
# Columns that should be stored as integers (they were read in as float64).
cols_to_int = [
    'age',
    'gender',
    'chest pain type',
    'exercise angina',
    'hypertension',
    'heart_disease',
]

# Cast only those columns to int; every other column keeps its current dtype.
df_data = df_data.astype({col: int for col in cols_to_int})

In [11]:
# List the dtype of every column again to inspect the current types.
df_data.dtypes

age                    int64
gender                 int64
chest pain type        int64
blood pressure       float64
cholesterol          float64
max heart rate       float64
exercise angina        int64
plasma glucose       float64
skin_thickness       float64
insulin              float64
bmi                  float64
diabetes_pedigree    float64
hypertension           int64
heart_disease          int64
Residence_type        object
smoking_status        object
dtype: object

In [12]:
# Frequency of each distinct value in the Residence_type column.
df_data["Residence_type"].value_counts()

Residence_type
Urban    4449
Rural    2512
Name: count, dtype: int64

In [13]:
# Frequency of each distinct value in the smoking_status column.
df_data["smoking_status"].value_counts()

smoking_status
never smoked       3745
Unknown            1544
formerly smoked     883
smokes              789
Name: count, dtype: int64

#### Convert categorical column values to numeric values (one-hot encoding)

In [14]:
# One-hot encode the two categorical (object-typed) columns.
categorical_cols = ['Residence_type', 'smoking_status']
df_encoded = pd.get_dummies(df_data, columns=categorical_cols)

# Cast only the indicator columns produced by the one-hot encoding to int.
dummy_prefixes = ('Residence_type_', 'smoking_status_')
one_hot_columns = [col for col in df_encoded.columns if col.startswith(dummy_prefixes)]
df_encoded = df_encoded.astype({col: int for col in one_hot_columns})

In [15]:
# Write the encoded dataset to disk without the index column.
# The parent folder is created first: to_csv does not create missing
# directories and raises OSError when the target folder does not exist.
output_path = Path("../results/preprocessing/01_dataset_preprocessing.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df_encoded.to_csv(output_path, index=False)