### Import Libraries

In [12]:
import pandas as pd
import plotly
import plotly.express as px

### Read CSV File

In [3]:
# Load the dataset; the relative path is resolved against the current working directory.
diabetes = pd.read_csv(filepath_or_buffer='diabetic_data.csv')

### Sample Data (first 20 rows)

In [17]:
# Preview the first 20 rows of the dataset.
diabetes.head(20)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
5,35754,82637451,Caucasian,Male,[50-60),?,2,1,2,3,...,No,Steady,No,No,No,No,No,No,Yes,>30
6,55842,84259809,Caucasian,Male,[60-70),?,3,1,2,4,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
7,63768,114882984,Caucasian,Male,[70-80),?,1,1,7,5,...,No,No,No,No,No,No,No,No,Yes,>30
8,12522,48330783,Caucasian,Female,[80-90),?,2,1,4,13,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
9,15738,63555939,Caucasian,Female,[90-100),?,3,3,4,12,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


### Target Variable (y): `readmitted`

In [11]:
y = diabetes['readmitted']
# Tally every readmission class once: absolute counts and fractions of all rows.
readmit_counts = y.value_counts()
readmit_shares = y.value_counts(normalize=True)
# Look each class up by its label instead of by position. Integer keys on a
# string-labelled Series rely on a deprecated positional fallback, and they
# silently attach the wrong sentence to a class if the frequency ranking
# of the classes ever changes.
print(
    f"Percentage of patient readmitted for diabetes (>30 days): % {round(readmit_shares['>30']*100,2)} --> ({readmit_counts['>30']} patient)\n"
    f"Percentage of patient not readmitted for diabetes: % {round(readmit_shares['NO']*100,2)} --> ({readmit_counts['NO']} patient)\n"
    f"Percentage of patient readmitted for diabetes (<30 days): % {round(readmit_shares['<30']*100,2)} --> ({readmit_counts['<30']} patient)"
)

Percentage of patient readmitted for diabetes (>30 days): % 34.93 --> (35545 patient)
Percentage of patient not readmitted for diabetes: % 53.91 --> (54864 patient)
Percentage of patient readmitted for diabetes (<30 days): % 11.16 --> (11357 patient)


### Distribution of Readmission Classes (NO, <30, >30)

In [13]:
# Plot how many rows fall into each `readmitted` category.
fig = px.histogram(
    data_frame=diabetes,
    x="readmitted",
    title='Diabetes',
    width=400,
    height=400,
)
fig.show()

In [None]:
 diabetes.info()

## Summarize Missing Data per Column


In [20]:
def missing(diabetes):
    """Summarize null values per column of a DataFrame.

    Parameters
    ----------
    diabetes : pandas.DataFrame
        Frame to inspect. Only values pandas treats as null are counted;
        placeholder strings in the data are not.

    Returns
    -------
    pandas.DataFrame
        One row per input column with two columns:
        ``Missing_Number`` (count of nulls) and ``Missing_Percent``
        (nulls divided by row count, i.e. a fraction between 0 and 1,
        not a 0-100 percentage). Rows are ordered from most to fewest nulls.
    """
    null_mask = diabetes.isnull()
    nulls_per_column = null_mask.sum()
    # count() on the boolean mask is the number of rows, since the mask has no nulls.
    share_per_column = nulls_per_column / null_mask.count()
    summary_parts = {
        'Missing_Number': nulls_per_column.sort_values(ascending=False),
        'Missing_Percent': share_per_column.sort_values(ascending=False),
    }
    return pd.concat(summary_parts, axis=1)

# The raw file marks missing entries with the string "?", which isnull() does
# not count. Convert them to NA on a temporary copy so this summary gives the
# same result no matter which other cells have already been executed.
missing(diabetes.replace("?", pd.NA))

Unnamed: 0,Missing_Number,Missing_Percent
weight,98569,0.968585
medical_specialty,49949,0.490822
payer_code,40256,0.395574
race,2273,0.022336
diag_3,1423,0.013983
diag_2,358,0.003518
diag_1,21,0.000206
encounter_id,0,0.0
tolazamide,0,0.0
glyburide,0,0.0


## Custom missing values: this dataset marks missing entries with "?"
## Replace "?" with NA, because a "?" means no data was recorded for that field

In [18]:
# Placeholder strings that the raw data uses to mean "value not recorded".
custom_missing_values = ["?"]
# Swap every placeholder for pandas' NA marker, modifying `diabetes` itself.
diabetes.replace(to_replace=custom_missing_values, value=pd.NA, inplace=True)

# Show the (truncated) frame under a heading to confirm the substitution.
print("\nDataFrame with Missing Data Handled:", diabetes, sep="\n")


DataFrame with Missing Data Handled:
        encounter_id  patient_nbr             race  gender      age weight  \
0            2278392      8222157        Caucasian  Female   [0-10)   <NA>   
1             149190     55629189        Caucasian  Female  [10-20)   <NA>   
2              64410     86047875  AfricanAmerican  Female  [20-30)   <NA>   
3             500364     82442376        Caucasian    Male  [30-40)   <NA>   
4              16680     42519267        Caucasian    Male  [40-50)   <NA>   
...              ...          ...              ...     ...      ...    ...   
101761     443847548    100162476  AfricanAmerican    Male  [70-80)   <NA>   
101762     443847782     74694222  AfricanAmerican  Female  [80-90)   <NA>   
101763     443854148     41088789        Caucasian    Male  [70-80)   <NA>   
101764     443857166     31693671        Caucasian  Female  [80-90)   <NA>   
101765     443867222    175429310        Caucasian    Male  [70-80)   <NA>   

        admission_type_id