In [91]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import math
# Apply seaborn's "whitegrid" style to every figure drawn in this notebook.
sns.set_theme(style="whitegrid")

In [92]:
# Location of the EDA-stage dataset.
# NOTE(review): this is a machine-specific absolute path; the notebook will
# not run elsewhere without editing it.
DATA_EDA_CSV = 'F:/MyProjects/PraxisProjects/hospital-readmission-risk-prediction/notebooks/dataset/data_eda.csv'

# low_memory=False makes pandas parse the file in a single pass instead of
# in chunks when inferring column dtypes.
df = pd.read_csv(DATA_EDA_CSV, low_memory=False)

# Show the frame's dimensions and column labels, then preview the first rows.
display(df.shape, df.columns)
df.head()

(101766, 41)

Index(['race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient',
       'number_diagnoses', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted_binary',
       'diag_1_group', 'diag_2_group', 'diag_3_group'],
      dtype='object')

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted_binary,diag_1_group,diag_2_group,diag_3_group
0,Caucasian,Female,[0-10),6,25,1,1,41,0,1,...,No,No,No,No,No,No,0,Diabetes,Unknown,Unknown
1,Caucasian,Female,[10-20),1,1,7,3,59,0,18,...,No,No,No,No,Ch,Yes,0,Other,Diabetes,Other
2,AfricanAmerican,Female,[20-30),1,1,7,2,11,5,13,...,No,No,No,No,No,Yes,0,Other,Diabetes,Unknown
3,Caucasian,Male,[30-40),1,1,7,2,44,1,16,...,No,No,No,No,Ch,Yes,0,Other,Diabetes,Circulatory
4,Caucasian,Male,[40-50),1,1,7,1,51,0,8,...,No,No,No,No,Ch,Yes,0,Cancer,Cancer,Diabetes


In [93]:
# Frequency of each label found in the gender column.
gender_counts = df['gender'].value_counts()
gender_counts

gender
Female             54708
Male               47055
Unknown/Invalid        3
Name: count, dtype: int64

In [94]:
# Keep only rows whose gender is 'Female' or 'Male'; every other label is
# dropped.
# The explicit .copy() gives later cells an independent frame to add and
# overwrite columns on, instead of a slice of the originally loaded frame
# (which is what triggers pandas' SettingWithCopyWarning).
df = df.loc[df['gender'].isin(['Female', 'Male'])].copy()
df['gender'].value_counts()

gender
Female    54708
Male      47055
Name: count, dtype: int64

In [95]:
# Frequency of each age-bracket label.
age_counts = df['age'].value_counts()
age_counts

age
[70-80)     26066
[60-70)     22482
[50-60)     17256
[80-90)     17197
[40-50)      9685
[30-40)      3775
[90-100)     2793
[20-30)      1657
[10-20)       691
[0-10)        161
Name: count, dtype: int64

In [96]:
# Map every ten-year bracket label '[lo-hi)' to the midpoint of the bracket,
# e.g. '[0-10)' -> 5 and '[90-100)' -> 95.
age_mapping = {
    f'[{lower}-{lower + 10})': lower + 5
    for lower in range(0, 100, 10)
}

In [97]:
# Replace each bracket label with its numeric midpoint from age_mapping,
# store the result as integers, and show the resulting distribution.
age_midpoints = df['age'].map(age_mapping)
df['age'] = age_midpoints.astype(int)
display(df['age'].value_counts())

age
75    26066
65    22482
55    17256
85    17197
45     9685
35     3775
95     2793
25     1657
15      691
5       161
Name: count, dtype: int64

In [98]:
# Names of the three ID-coded columns (displayed only; nothing is assigned).
(
    'admission_type_id',
    'discharge_disposition_id',
    'admission_source_id',
)

('admission_type_id', 'discharge_disposition_id', 'admission_source_id')

In [99]:
# Collapse infrequent levels of the ID-coded columns into a single 'Other'
# bucket. Each column is converted to strings in the process.
cols_to_group = ['admission_type_id', 'discharge_disposition_id', 'admission_source_id']

# Minimum share of rows a level needs in order to be kept as its own category.
threshold = 0.01

for col in cols_to_group:
    level_shares = df[col].value_counts(normalize=True)
    frequent_levels = level_shares.loc[level_shares >= threshold].index.astype(str)

    col_as_text = df[col].astype(str)
    df[col] = col_as_text.where(col_as_text.isin(frequent_levels), 'Other')

    # Report the level counts after grouping.
    print(df[col].value_counts())

admission_type_id
1        53988
3        18868
2        18480
6         5291
5         4785
Other      351
Name: count, dtype: int64
discharge_disposition_id
1        60232
3        13954
6        12902
Other     4038
18        3691
2         2128
22        1992
11        1642
5         1184
Name: count, dtype: int64
admission_source_id
7        57492
1        29564
17        6781
4         3187
6         2264
Other     1371
2         1104
Name: count, dtype: int64


In [100]:
# total_visits is the row-wise sum of the three visit-count columns.
df['total_visits'] = (
    df['number_outpatient']
    .add(df['number_emergency'])
    .add(df['number_inpatient'])
)

visit_preview_cols = ['number_outpatient', 'number_emergency', 'number_inpatient', 'total_visits']
print(df[visit_preview_cols].head())

   number_outpatient  number_emergency  number_inpatient  total_visits
0                  0                 0                 0             0
1                  0                 0                 0             0
2                  2                 0                 1             3
3                  0                 0                 0             0
4                  0                 0                 0             0


In [101]:
# Per-drug status columns used to derive the medication features below.
med_cols = [
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
    'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
    'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
    'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
    'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone'
]

med_status = df[med_cols]

# Number of drugs per row whose status is 'Up' or 'Down'.
df['num_med_changes'] = med_status.isin(['Up', 'Down']).sum(axis=1)

# Number of drugs per row whose status is anything other than 'No'.
df['num_med_active'] = med_status.ne('No').sum(axis=1)

# Integer code for each insulin status label: No=0, Steady=1, Down=2, Up=3.
insulin_map = {
    status: code
    for code, status in enumerate(['No', 'Steady', 'Down', 'Up'])
}
df['insulin_coded'] = df['insulin'].map(insulin_map)

engineered_cols = ['num_med_changes', 'num_med_active', 'insulin_coded']
print(df[engineered_cols].head())

   num_med_changes  num_med_active  insulin_coded
0                0               0              0
1                1               1              3
2                0               1              0
3                1               1              3
4                0               2              1


In [102]:

# Interaction feature: product of total visits and active-medication count.
# `num_med_active` is already computed from `med_cols` in the
# medication-feature cell above, so it is reused here instead of being
# recomputed with an identical expression.
df['interaction_visits_meds'] = df['total_visits'] * df['num_med_active']

print(df[['total_visits', 'num_med_active', 'interaction_visits_meds']].head())

   total_visits  num_med_active  interaction_visits_meds
0             0               0                        0
1             0               1                        0
2             3               1                        3
3             0               1                        0
4             0               2                        0


In [103]:
# Preview the first five rows, now including the engineered columns.
df.head(n=5)

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,diabetesMed,readmitted_binary,diag_1_group,diag_2_group,diag_3_group,total_visits,num_med_changes,num_med_active,insulin_coded,interaction_visits_meds
0,Caucasian,Female,5,6,Other,1,1,41,0,1,...,No,0,Diabetes,Unknown,Unknown,0,0,0,0,0
1,Caucasian,Female,15,1,1,7,3,59,0,18,...,Yes,0,Other,Diabetes,Other,0,1,1,3,0
2,AfricanAmerican,Female,25,1,1,7,2,11,5,13,...,Yes,0,Other,Diabetes,Unknown,3,0,1,0,3
3,Caucasian,Male,35,1,1,7,2,44,1,16,...,Yes,0,Other,Diabetes,Circulatory,0,1,1,3,0
4,Caucasian,Male,45,1,1,7,1,51,0,8,...,Yes,0,Cancer,Cancer,Diabetes,0,0,2,1,0


In [104]:
from scipy.stats import f_oneway, chi2_contingency

In [105]:
# Print the column dtypes and non-null counts of the current frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 101763 entries, 0 to 101765
Data columns (total 46 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   race                      101763 non-null  object
 1   gender                    101763 non-null  object
 2   age                       101763 non-null  int64 
 3   admission_type_id         101763 non-null  object
 4   discharge_disposition_id  101763 non-null  object
 5   admission_source_id       101763 non-null  object
 6   time_in_hospital          101763 non-null  int64 
 7   num_lab_procedures        101763 non-null  int64 
 8   num_procedures            101763 non-null  int64 
 9   num_medications           101763 non-null  int64 
 10  number_outpatient         101763 non-null  int64 
 11  number_emergency          101763 non-null  int64 
 12  number_inpatient          101763 non-null  int64 
 13  number_diagnoses          101763 non-null  int64 
 14  metformin

In [106]:
#  df.to_csv('F:/MyProjects/PraxisProjects/hospital-readmission-risk-prediction/notebooks/dataset/data_fe.csv',index=False)

In [None]:
from sklearn.feature_selection import f_classif, chi2
from sklearn.preprocessing import LabelEncoder


In [84]:
# Target column as a Series.
# NOTE(review): no later cell visible here references `label_col`; the next
# cell assigns the same column to `y`. Confirm whether this is still needed.
label_col = df['readmitted_binary']

In [109]:
# Separate the target column (y) from the remaining feature columns (X).
target_name = 'readmitted_binary'
y = df[target_name]
X = df.drop(columns=[target_name])

In [110]:
# Column labels split by dtype: integer/float features vs. object features.
numeric_part = X.select_dtypes(include=['int', 'float'])
object_part = X.select_dtypes(include=['object'])

num_cols = numeric_part.columns
cat_cols = object_part.columns

In [111]:
# ANOVA F-test of each numeric feature against the target.
# Missing values are replaced with 0 before scoring.
X_numeric_filled = X[num_cols].fillna(0)
f_scores, p_values = f_classif(X_numeric_filled, y)

In [112]:
# One row per numeric feature with its F statistic and p-value.
anova_df = pd.DataFrame(
    {
        'Feature': num_cols,
        'F_Score': f_scores,
        'p_score': p_values,
    }
)

# Show the ten features with the largest F statistic.
anova_ranked = anova_df.sort_values(by='F_Score', ascending=False)
print(anova_ranked.head(10))

                    Feature      F_Score        p_score
7          number_inpatient  2853.058043   0.000000e+00
9              total_visits  1644.501141   0.000000e+00
13  interaction_visits_meds   872.925123  4.874380e-191
6          number_emergency   376.877119   8.449728e-84
8          number_diagnoses   250.133813   2.834299e-56
1          time_in_hospital   199.163345   3.508332e-45
12            insulin_coded   160.283856   1.045568e-36
4           num_medications   150.531135   1.403914e-34
10          num_med_changes   124.190546   7.953138e-29
2        num_lab_procedures    42.197358   8.289005e-11


In [113]:
# Integer-encode every categorical column with its own freshly fitted
# LabelEncoder; values are cast to str before encoding.
X_cat_encoded = pd.DataFrame(
    {
        name: LabelEncoder().fit_transform(X[name].astype(str))
        for name in cat_cols
    },
    index=X.index,
)

In [114]:
# Chi-squared statistic and p-value of each encoded categorical feature
# against the target.
chi2_result = chi2(X_cat_encoded, y)
chi2_scores, p_values_chi = chi2_result

In [None]:
# Chi-squared screening of the categorical features against the target.
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Integer-encode each categorical column (values cast to str first).
# LabelEncoder comes from the sklearn import cell earlier in the notebook, so
# it is not re-imported here.
# NOTE(review): sklearn's chi2 expects non-negative features such as booleans
# or frequencies; scores computed on integer label codes depend on the
# arbitrary code assigned to each level. Consider one-hot encoding or a
# contingency-table test and confirm before relying on this ranking.
X_encoded = X[cat_cols].copy()
for col in cat_cols:
    le = LabelEncoder()
    X_encoded[col] = le.fit_transform(X[col].astype(str))

# Unpack into `p_values_chi` (the name the previous chi2 cell uses) so the
# ANOVA `p_values` produced by f_classif are not overwritten.
chi2_scores, p_values_chi = chi2(X_encoded, y)

chi2_df = pd.DataFrame({
    'Feature': cat_cols,
    'Score': chi2_scores,
    'p_value': p_values_chi
})

print(chi2_df.sort_values(by='Score', ascending=False).head(10))

                     Feature        Score       p_value
3   discharge_disposition_id  1753.895435  0.000000e+00
4        admission_source_id    47.724100  4.906232e-12
2          admission_type_id    24.517018  7.365640e-07
26                    change    17.894612  2.334812e-05
27               diabetesMed    17.215418  3.337169e-05
28              diag_1_group    12.865901  3.346237e-04
5                  metformin     9.155850  2.479242e-03
29              diag_2_group     3.389482  6.561356e-02
13               tolbutamide     1.076588  2.994622e-01
19                tolazamide     0.540504  4.622238e-01
