In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the raw encounter-level CSV. The path looks like a Colab Google Drive
# mount; adjust it when running in another environment.
file_path = '/content/drive/MyDrive/ECS_271/diabetic_data (2).csv'
data = pd.read_csv(file_path)

print("Original Dataset:")
# DataFrame.info() writes its summary itself and returns None, so it must not
# be wrapped in print() (that would append a stray "None" line).
data.info()
print("\nRows:")
print(data.head())

Original Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      101766 non-null  object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    101766 non-null  object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                101766 non-null  object
 11  medical_specialty         101766 non-null  object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures            101766 non-null

In [None]:
# The raw file marks unknown entries with '?'; turn them into real NaN so the
# pandas missing-value tooling can see them.
data = data.replace('?', np.nan)

# Per-column count of missing entries.
missing_values = data.isna().sum()

print("Columns with missing values:")
print(missing_values.loc[missing_values.gt(0)])

print("\nData types:")
print(data.dtypes)

Columns with missing values:
race                  2273
weight               98569
payer_code           40256
medical_specialty    49949
diag_1                  21
diag_2                 358
diag_3                1423
max_glu_serum        96420
A1Cresult            84748
dtype: int64

Data types:
encounter_id                 int64
patient_nbr                  int64
race                        object
gender                      object
age                         object
weight                      object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
di

In [None]:
# Remove the three columns that the missing-value report shows as mostly empty.
sparse_columns = ['weight', 'medical_specialty', 'payer_code']
data = data.drop(columns=sparse_columns)

In [None]:
# Row-level filtering.
#
# An earlier cell converts the '?' placeholder to NaN, after which a plain
# `!= '?'` comparison is True for every row and filters nothing. Treat both
# NaN and a literal '?' as "missing" so the filters work regardless of whether
# that conversion has already run.
diag_columns = ['diag_1', 'diag_2', 'diag_3']
diag_present = data[diag_columns].notna() & (data[diag_columns] != '?')

# Keep encounters that have at least one recorded diagnosis.
data = data[diag_present.any(axis=1)]

# Exclude discharge disposition 11.
# NOTE(review): presumably the "Expired" code in the dataset's ID mapping,
# i.e. patients who cannot be readmitted - confirm against the mapping file.
data = data[data['discharge_disposition_id'] != 11]

# Exclude encounters without a usable gender value.
data = data[data['gender'] != 'Unknown/Invalid']

# Exclude encounters with missing race.
data = data[data['race'].notna() & (data['race'] != '?')]

In [None]:
# A column holding a single distinct (non-null) value cannot help a model;
# collect those columns and drop them, reporting the shape before and after.
distinct_counts = data.nunique()
zero_variance_columns = distinct_counts[distinct_counts == 1].index.tolist()

print("\nShape of the dataset before dropping zero-variance columns:", data.shape)
print("Columns with Zero Variance: ")
print(zero_variance_columns)

data = data.drop(columns=zero_variance_columns)
print("\nShape of the dataset after dropping zero-variance columns:", data.shape)


Shape of the dataset before dropping zero-variance columns: (100121, 47)
Columns with Zero Variance: 
['examide', 'citoglipton']

Shape of the dataset after dropping zero-variance columns: (100121, 45)


In [None]:
# Identifier columns are removed before feature engineering.
identifier_columns = ['encounter_id', 'patient_nbr']
data = data.drop(columns=identifier_columns)

In [None]:
# Binary target: 1 only for a readmission within 30 days ('<30'); both
# '>30' and 'NO' count as 0.
readmitted_codes = {'<30': 1, '>30': 0, 'NO': 0}

# Each ten-year age bracket is replaced by its midpoint.
age_midpoints = {'[0-10)': 5, '[10-20)': 15, '[20-30)': 25, '[30-40)': 35,
                 '[40-50)': 45, '[50-60)': 55, '[60-70)': 65, '[70-80)': 75,
                 '[80-90)': 85, '[90-100)': 95}

# Series.replace on a string column depends on pandas silently downcasting the
# object result to int, which newer pandas flags as deprecated. Mapping through
# dict.get yields a properly inferred numeric dtype without that behaviour, and
# leaves values that are not in the mapping untouched, so re-running the cell
# on already-encoded data is harmless.
data['readmitted'] = data['readmitted'].map(lambda value: readmitted_codes.get(value, value))
data['age'] = data['age'].map(lambda value: age_midpoints.get(value, value))

  data['readmitted'] = data['readmitted'].replace({'<30': 1, '>30': 0, 'NO': 0})
  data['age'] = data['age'].replace({'[0-10)': 5, '[10-20)': 15, '[20-30)': 25, '[30-40)': 35,


In [None]:
# Total number of prior visits of any kind (outpatient + emergency + inpatient).
data['patient_activity'] = data['number_outpatient'] + data['number_emergency'] + data['number_inpatient']

# Medication columns; each holds one of 'No', 'Steady', 'Up' or 'Down'.
keys = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'glipizide', 'glyburide',
        'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'insulin', 'glyburide-metformin', 'tolazamide',
        'metformin-pioglitazone', 'metformin-rosiglitazone', 'glimepiride-pioglitazone', 'glipizide-metformin',
        'troglitazone', 'tolbutamide', 'acetohexamide']

medication_status = data[keys]

# Number of medications whose dosage changed, i.e. status is anything other
# than 'No' / 'Steady'. Vectorised isin() replaces a per-element apply().
data['medication_variation'] = (~medication_status.isin(['No', 'Steady'])).sum(axis=1)

# Number of medications the patient is on at all ('Steady', 'Up' or 'Down').
# Counting with isin() avoids the string-to-int Series.replace, which relies on
# deprecated silent downcasting in pandas.
data['total_medications'] = medication_status.isin(['Steady', 'Up', 'Down']).sum(axis=1)

  data['total_medications'] = sum([data[col].replace({'No': 0, 'Steady': 1, 'Up': 1, 'Down': 1}) for col in keys])


In [None]:
# Build numeric working copies of the three diagnosis code columns.
diag_sources = {
    'primary_diag': 'diag_1',
    'secondary_diag': 'diag_2',
    'additional_diag': 'diag_3',
}

for target_col, source_col in diag_sources.items():
    raw_codes = data[source_col]

    # A '?' placeholder becomes -1 so the column can be cast to float.
    data[target_col] = raw_codes.replace('?', -1)

    # Codes containing 'V' or 'E' are not numeric; set them to 0.
    has_letter_code = raw_codes.str.contains('V', na=False) | raw_codes.str.contains('E', na=False)
    data.loc[has_letter_code, target_col] = 0

    data[target_col] = data[target_col].astype(float)

def recode_diag_column(column, frame=None):
    """Replace numeric diagnosis codes in `column` with a category code, in place.

    Categories (the first matching rule wins, in this order):
        1 Circulatory      390 <= code < 460, or floor(code) == 785
        2 Respiratory      460 <= code < 520, or floor(code) == 786
        3 Digestive        520 <= code < 580, or floor(code) == 787
        4 Diabetes         floor(code) == 250
        5 Injury           800 <= code < 1000
        6 Musculoskeletal  710 <= code < 740
        7 Genitourinary    580 <= code < 630, or floor(code) == 788
        8 Neoplasms        140 <= code < 240
        0 Other            everything else, including NaN

    Parameters
    ----------
    column : str
        Name of a numeric column holding diagnosis codes.
    frame : pandas.DataFrame, optional
        Frame to modify. Defaults to the notebook-level `data` frame, which
        keeps existing single-argument calls working.

    Returns
    -------
    None
        The column is overwritten with float category codes.
    """
    if frame is None:
        frame = data

    codes = frame[column].to_numpy(dtype=float)
    whole = np.floor(codes)

    # One vectorised pass replaces the row-by-row iterrows()/.loc loop, which
    # performed a Python-level lookup and write per row.
    conditions = [
        ((codes >= 390) & (codes < 460)) | (whole == 785),  # Circulatory
        ((codes >= 460) & (codes < 520)) | (whole == 786),  # Respiratory
        ((codes >= 520) & (codes < 580)) | (whole == 787),  # Digestive
        whole == 250,                                       # Diabetes
        (codes >= 800) & (codes < 1000),                    # Injury
        (codes >= 710) & (codes < 740),                     # Musculoskeletal
        ((codes >= 580) & (codes < 630)) | (whole == 788),  # Genitourinary
        (codes >= 140) & (codes < 240),                     # Neoplasms
    ]
    categories = [1, 2, 3, 4, 5, 6, 7, 8]

    # np.select picks the first true condition per element; NaN fails every
    # comparison and therefore falls through to the default 0 (Other).
    frame[column] = np.select(conditions, categories, default=0).astype(float)

# Collapse each of the three numeric diagnosis columns into its category code.
for diag_col in ('primary_diag', 'secondary_diag', 'additional_diag'):
    recode_diag_column(diag_col)

In [None]:
def _recode_values(series, mapping):
    """Return `series` with every value found in `mapping` swapped for its code.

    Values without an entry (NaN included) pass through unchanged, so the cell
    can be re-run on already-encoded data. Unlike a string-to-int
    Series.replace, this does not depend on pandas' deprecated silent
    downcasting of object results.
    """
    return series.map(lambda value: mapping.get(value, value))


# Merge related ID codes into one representative code per group. These are
# int-to-int replacements, so the column dtype is unaffected.
data['admission_source_id'] = data['admission_source_id'].replace({2: 1, 3: 1, 5: 4, 6: 4, 10: 4, 22: 4, 25: 4,
                                                                    7: 9, 17: 9, 20: 9, 21: 9, 13: 11, 14: 11})
data['discharge_disposition_id'] = data['discharge_disposition_id'].replace({
    6: 1, 8: 1, 9: 1, 13: 1,
    3: 2, 4: 2, 5: 2, 14: 2, 22: 2, 23: 2, 24: 2,
    12: 10, 15: 10, 16: 10, 17: 10,
    25: 18, 26: 18
})

data['admission_type_id'] = data['admission_type_id'].replace({2: 1, 7: 1, 6: 5, 8: 5})

# Numeric codes for the string-valued categorical columns.
data['race'] = _recode_values(data['race'], {
    'Caucasian': 1,
    'AfricanAmerican': 2,
    'Hispanic': 3,
    'Asian': 4,
    'Other': 0
})

data['diabetesMed'] = _recode_values(data['diabetesMed'], {'Yes': 1, 'No': 0})

data['gender'] = _recode_values(data['gender'], {'Male': 1, 'Female': 0})

data['change'] = _recode_values(data['change'], {'Ch': 1, 'No': 0})

# Lab results: any elevated reading -> 1, normal -> 0, test not taken -> 99.
# The "not taken" rows show up as NaN in the missing-value report rather than
# as the string 'None', so the 'None' mapping alone never assigns 99; fillna
# applies the sentinel to those rows as the mapping intends.
data['A1Cresult'] = _recode_values(data['A1Cresult'], {
    '>7': 1,
    '>8': 1,
    'Norm': 0,
    'None': 99
}).fillna(99).astype(int)

data['max_glu_serum'] = _recode_values(data['max_glu_serum'], {
    '>200': 1,
    '>300': 1,
    'Norm': 0,
    'None': 99
}).fillna(99).astype(int)

# Persist the cleaned frame next to the notebook.
data.to_csv("cleaned_diabetes_data.csv", index=False)

print("Data Cleaning and Feature Engineering Complete!")

Data Cleaning and Feature Engineering Complete!


In [None]:
# Show the first five rows transposed, so each feature is a row and the wide
# frame stays readable.
data.head().T

Unnamed: 0,0,1,2,3,4
race,1.0,1.0,2.0,1.0,1.0
gender,0,0,0,1,1
age,5,15,25,35,45
admission_type_id,5,1,1,1,1
discharge_disposition_id,18,1,1,1,1
admission_source_id,1,9,9,9,9
time_in_hospital,1,3,2,2,1
num_lab_procedures,41,59,11,44,51
num_procedures,0,0,5,1,0
num_medications,1,18,13,16,8


In [None]:
# Display the dtype of every column after cleaning and encoding.
data.dtypes

Unnamed: 0,0
race,float64
gender,int64
age,int64
admission_type_id,int64
discharge_disposition_id,int64
admission_source_id,int64
time_in_hospital,int64
num_lab_procedures,int64
num_procedures,int64
num_medications,int64
