In [64]:
# Import statements
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [65]:
# Load the raw dataset from the working directory into a DataFrame.
# NOTE(review): right after loading, max_glu_serum and A1Cresult already
# report many NaNs -- presumably literal 'None' labels parsed as missing by
# read_csv's default NA handling; confirm that this is intended.
data = pd.read_csv(filepath_or_buffer='diabetic_data.csv')

In [66]:
# Report the dimensions of the loaded frame (row count, column count).
rows = data.shape[0]
columns = data.shape[1]
print(f'Rows : {rows}, Columns : {columns}')

Rows : 101766, Columns : 50


In [67]:
# Remove the encounter_id column. The membership guard keeps this cell
# safe to re-run once the column is already gone.
if 'encounter_id' not in data.columns:
    print('encounter_id column not found.')
else:
    data = data.drop(columns=['encounter_id'])
    print('Column encounter_id column deleted.')

Column encounter_id column deleted.


In [68]:
# Count the NaN entries in each column (placeholders such as '?' are not
# counted here; they are still ordinary strings at this point).
missing_values = data.isna().sum()
print(missing_values)

patient_nbr                     0
race                            0
gender                          0
age                             0
weight                          0
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                      0
medical_specialty               0
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                          0
diag_2                          0
diag_3                          0
number_diagnoses                0
max_glu_serum               96420
A1Cresult                   84748
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide                   0
glipizide     

In [69]:
# Count, per column, the cells that hold the literal '?' placeholder.
missing_values = data.eq('?').sum()
missing_values

patient_nbr                     0
race                         2273
gender                          0
age                             0
weight                      98569
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                  40256
medical_specialty           49949
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                         21
diag_2                        358
diag_3                       1423
number_diagnoses                0
max_glu_serum                   0
A1Cresult                       0
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide                   0
glipizide     

In [70]:
# Turn every '?' placeholder into a real NaN so the pandas missing-value
# tools (isna / dropna) can see it.
data = data.replace(to_replace='?', value=np.nan)

In [71]:
# Re-count NaN entries per column now that '?' has been converted to NaN.
missing_values_after_replacment = data.isna().sum()
missing_values_after_replacment

patient_nbr                     0
race                         2273
gender                          0
age                             0
weight                      98569
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                  40256
medical_specialty           49949
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                         21
diag_2                        358
diag_3                       1423
number_diagnoses                0
max_glu_serum               96420
A1Cresult                   84748
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide                   0
glipizide     

In [73]:
# Encode the readmitted target as a binary integer column:
#   '<30'       -> 1
#   '>30', 'NO' -> 0
# A single explicit mapping keeps the result a true int64 column. Chaining
# replace() through astype(str) would turn the already-encoded 1 into the
# string '1' and leave a mixed object column of '1' strings and integer 0s.
# The identity entries (1 -> 1, 0 -> 0) make the cell safe to re-run on an
# already-encoded column. Any label outside the mapping becomes NaN, and the
# integer cast then raises instead of silently miscoding it.
readmitted_encoding = {'<30': 1, '>30': 0, 'NO': 0, 1: 1, 0: 0}
data['readmitted'] = data['readmitted'].map(readmitted_encoding).astype('int64')

In [77]:
# Inspect the dtype pandas currently holds for every column of `data`.
# The result is kept in `data_types` and printed as plain text.
data_types = data.dtypes
print(data_types)

patient_nbr                  int64
race                        object
gender                      object
age                         object
weight                      object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride                 object
acetohexamide       

In [79]:
# Drop sparse columns: keep only columns that are at least 90% populated,
# i.e. discard every column with more than 10% missing values.
# dropna's `thresh` is the minimum number of NON-null values a column needs
# in order to be kept, so it is computed as an integer count here.
min_non_null_fraction = 0.9
min_non_null_count = int(np.ceil(min_non_null_fraction * len(data)))
data = data.dropna(axis=1, thresh=min_non_null_count)

In [87]:
# Drop the medication columns listed below (flagged by the author as having
# near-zero variance).
columns_to_drop = ['repaglinide',
                   'nateglinide',
                   'chlorpropamide',
                   'glimepiride',
                   'acetohexamide',
                   'tolbutamide',
                   'acarbose',
                   'miglitol',
                   'troglitazone',
                   'tolazamide',
                   'examide',
                   'citoglipton',
                   'glyburide-metformin',
                   'glipizide-metformin',
                   'glimepiride-pioglitazone',
                   'metformin-rosiglitazone',
                   'metformin-pioglitazone']
# One drop() call removes every listed column in a single pass instead of
# copying the frame once per column. errors='ignore' skips names that are
# already absent, so the cell stays safe to re-run.
data = data.drop(columns=columns_to_drop, errors='ignore')

In [90]:
# Discard every row that still contains at least one NaN in any column.
data = data.dropna(axis=0, how='any')

In [94]:
# Summary statistics of `data` via describe() with default arguments
# (count, mean, std, min, quartiles, max per reported column), kept in
# `summary_statistics` and printed as plain text.
# NOTE(review): patient_nbr and the *_id columns look like identifiers or
# category codes, so their mean/std are probably not meaningful -- confirm.
summary_statistics = data.describe()
print(summary_statistics)

        patient_nbr  admission_type_id  discharge_disposition_id  \
count  9.805300e+04       98053.000000              98053.000000   
mean   5.484792e+07           2.025813                  3.753368   
std    3.866175e+07           1.450117                  5.309392   
min    1.350000e+02           1.000000                  1.000000   
25%    2.350234e+07           1.000000                  1.000000   
50%    4.687790e+07           1.000000                  1.000000   
75%    8.800306e+07           3.000000                  4.000000   
max    1.895026e+08           8.000000                 28.000000   

       admission_source_id  time_in_hospital  num_lab_procedures  \
count         98053.000000      98053.000000        98053.000000   
mean              5.776692          4.421976           43.148073   
std               4.071640          2.993074           19.712033   
min               1.000000          1.000000            1.000000   
25%               1.000000          2.000000   