In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

In [2]:
df1=pd.read_csv('synthetic_readmission_data_1.csv')

In [3]:
df1= df1.reindex(sorted(df1.columns), axis=1)

In [4]:
df1.head()

Unnamed: 0,Access_To_Healthy_Food,Age,Alchohol,Anxiety,Blood_Glucose_Levels,CKD,COPD,Cancer,Cholesterol_Levels,Creatinine_Levels,...,Osteoarthritis,Other,Patient_ID,Previous_Readmissions,Proactivity_In_Health,Readmission,Renal_Function_Tests,Side_Effects_And_Complications,Smoking,Support System Availability
0,Moderate,69,Frequent,Occasional,Significantly Elevated,Stage 5,Severe,Stage 0,Normal,Elevated,...,Moderate,No,1ea395de-999f-4b94-bcc0-14f06e9d03e3,4,Moderate,0,Abnormal,Mild,Severe,Moderate
1,Moderate,32,Occasional,Occasional,Significantly Elevated,Stage 1-2,Moderate,No,Normal,Normal,...,Mild,Mild,26140959-8f18-4a13-9729-5b127926e977,1,Moderate,1,Normal,Mild,Severe,Moderate
2,Moderate,89,No,Occasional,Elevated,Stage 3-4,Moderate,No,Normal,Elevated,...,Moderate,No,b1fa06db-01ee-41a9-b947-a40dd581efc7,3,Passive,1,Normal,Severe,Severe,Moderate
3,Easy,78,Occasional,No,Elevated,Stage 3-4,Moderate,Stage 0,Normal,Significantly Elevated,...,Moderate,Moderate,0b348404-07c4-4a65-bc78-a29ac5d2182f,2,Moderate,1,Normal,Severe,Severe,Weak
4,Limited,38,Frequent,No,Elevated,Stage 1-2,No,No,Normal,Normal,...,No,Mild,ba3dc0f4-2f37-4dd0-937d-d141414fe375,2,Moderate,1,Abnormal,Mild,Severe,Moderate


In [5]:
df1.columns

Index(['Access_To_Healthy_Food', 'Age', 'Alchohol', 'Anxiety',
       'Blood_Glucose_Levels', 'CKD', 'COPD', 'Cancer', 'Cholesterol_Levels',
       'Creatinine_Levels', 'Depression', 'Diabetes', 'Discharge_To',
       'Distance_From_Hospital', 'Drugs', 'Education_Level',
       'Emergency_Visits', 'Employment_Status', 'Ethinicity',
       'Follow_Up_Attendance', 'Follow_Up_Scheduled', 'Gender', 'HIV/AIDS',
       'Health_Literacy', 'Heart_Diseases', 'Hemoglobin_Levels',
       'Hospital_Stay_Duration', 'Household_Composition', 'Housing_Stability',
       'Hypertension', 'IBD', 'Income_Level', 'Inflammatory_Markers',
       'Insurance_Type', 'Insurence_Coverage', 'Liver_Function_Tests',
       'Liver_Related_Conditions', 'Medication_Adherence', 'Medication_Type',
       'Neighborhood_Safety', 'Neurological_Disorders',
       'Number_Of_Medications', 'Osteoarthritis', 'Other', 'Patient_ID',
       'Previous_Readmissions', 'Proactivity_In_Health', 'Readmission',
       'Renal_Function_Tes

In [6]:
df1.describe()

Unnamed: 0,Age,Distance_From_Hospital,Emergency_Visits,Follow_Up_Scheduled,Hospital_Stay_Duration,Previous_Readmissions,Readmission
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,53.3987,24.94774,0.99982,0.50104,7.5288,2.0015,0.6161
std,20.799006,14.186904,0.818091,0.500004,4.042016,1.417257,0.486339
min,18.0,1.0,0.0,0.0,1.0,0.0,0.0
25%,35.0,13.0,0.0,0.0,4.0,1.0,0.0
50%,53.0,25.0,1.0,1.0,8.0,2.0,1.0
75%,71.0,37.0,2.0,1.0,11.0,3.0,1.0
max,89.0,49.0,2.0,1.0,14.0,4.0,1.0


In [7]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 52 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   Access_To_Healthy_Food          50000 non-null  object
 1   Age                             50000 non-null  int64 
 2   Alchohol                        50000 non-null  object
 3   Anxiety                         50000 non-null  object
 4   Blood_Glucose_Levels            50000 non-null  object
 5   CKD                             50000 non-null  object
 6   COPD                            50000 non-null  object
 7   Cancer                          50000 non-null  object
 8   Cholesterol_Levels              50000 non-null  object
 9   Creatinine_Levels               50000 non-null  object
 10  Depression                      50000 non-null  object
 11  Diabetes                        50000 non-null  object
 12  Discharge_To                    50000 non-null

In [8]:
df1.isnull().sum()

Access_To_Healthy_Food            0
Age                               0
Alchohol                          0
Anxiety                           0
Blood_Glucose_Levels              0
CKD                               0
COPD                              0
Cancer                            0
Cholesterol_Levels                0
Creatinine_Levels                 0
Depression                        0
Diabetes                          0
Discharge_To                      0
Distance_From_Hospital            0
Drugs                             0
Education_Level                   0
Emergency_Visits                  0
Employment_Status                 0
Ethinicity                        0
Follow_Up_Attendance              0
Follow_Up_Scheduled               0
Gender                            0
HIV/AIDS                          0
Health_Literacy                   0
Heart_Diseases                    0
Hemoglobin_Levels                 0
Hospital_Stay_Duration            0
Household_Composition       

In [9]:
df1.shape

(50000, 52)

In [10]:
#Checking Counts of Train Data & Test Data
yes_count, no_count = df1.Readmission.value_counts()
print(no_count, yes_count)

19195 30805


In [11]:
#Handling Data Imbalance using Over Sampling
balanced_df  = df1.loc[df1.Readmission==0]
df_oversampled = balanced_df.sample(n=(yes_count-no_count),replace=True)

df1 = pd.concat([df1, df_oversampled], ignore_index=True)
df1.Readmission.value_counts()

Readmission
0    30805
1    30805
Name: count, dtype: int64

In [12]:
X = df1.drop(columns=['Readmission', 'Patient_ID'])  # Dropping target and irrelevant columns
y = df1['Readmission']

In [13]:
# List of numeric and categorical columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

In [14]:
# Defining transformations for numeric features (e.g., scaling) and categorical features (e.g., encoding)
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Handle missing values by filling with median
    ('scaler', StandardScaler())                   # Normalize numeric features
])

In [15]:
# Ordinal Columns: Categorical features that have an inherent order (e.g., stages, severity)
ordinal_features = ['Heart_Diseases', 'Hypertension', 'Diabetes', 'COPD', 'CKD', 'Cancer', 
                    'Osteoarthritis', 'Neurological_Disorders', 'Depression', 'Anxiety', 
                    'Smoking', 'Medication_Adherence', 'Medication_Type', 'Side_Effects_And_Complications']

In [16]:
ordinal_mappings = {
    'Heart_Diseases': ['No', 'Mild', 'Moderate', 'Severe'],
    'Hypertension': ['No', 'Controlled', 'Uncontrolled'],
    'Diabetes': ['No', 'Controlled', 'Uncontrolled'],
    'COPD': ['No', 'Mild', 'Moderate', 'Severe'],
    'CKD': ['No', 'Stage 1-2', 'Stage 3-4', 'Stage 5'],
    'Cancer': ['No', 'Stage 0', 'Stage 1', 'Stage 2', 'Stage 3', 'Stage 4'],
    'Osteoarthritis': ['No', 'Mild', 'Moderate', 'Severe'],
    'Neurological_Disorders': ['No', 'Mild', 'Moderate', 'Severe'],
    'Depression': ['No', 'Mild', 'Moderate', 'Severe'],
    'Anxiety': ['No', 'Occasional', 'Frequent'],
    'Smoking': ['No', 'Mild', 'Moderate', 'Severe'],
    'Medication_Adherence': ['No', 'Partially', 'Fully'],
    'Medication_Type': ['Low-risk', 'Moderate-risk', 'High-risk'],
    'Side_Effects_And_Complications': ['No', 'Mild', 'Severe']
}

In [17]:
# # Applying ordinal encoding for ordered features
# ordinal_transformer = ColumnTransformer(
#     transformers=[(col, OrdinalEncoder(categories=[ordinal_mappings[col]]), [col]) for col in ordinal_features],
#     remainder='passthrough'  # Keep other columns unchanged for now
# )

In [18]:
# Ordinal Encoding with missing value handling
ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with most frequent category
    ('ordinal', OrdinalEncoder(categories=[ordinal_mappings[col] for col in ordinal_features]))  # Apply ordinal encoding
])


In [19]:
# Categorical transformer for non-ordinal categorical features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Handle missing values by filling with most frequent
    ('onehot', OneHotEncoder(handle_unknown='ignore'))     # One-hot encode categorical features
])

In [20]:
# Combine transformers into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat_ord', ordinal_transformer, ordinal_features),
        ('cat', categorical_transformer, [col for col in categorical_features if col not in ordinal_features])
    ] 
)

MODEL TRAING USING RANDOMFORESTCLASSIFIER

In [21]:
# Build the final pipeline by combining preprocessor and a classifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
]) 

In [22]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

In [24]:
from sklearn import set_config
set_config(display='diagram')

In [25]:
# Make predictions
y_pred = pipeline.predict(X_test)

In [26]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.7615646810582698

In [27]:
# Output the pipeline structure for further steps if neede
pipeline.named_steps['preprocessor']

In [28]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import pickle

In [29]:
# Model evaluation function
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    print("Accuracy: ", accuracy)
    print("ROC AUC Score: ", roc_auc)
    print("Classification Report:\n", classification_report(y_test, y_pred))
    return accuracy, roc_auc


In [30]:
# Evaluate the model on test data
evaluate_model(pipeline, X_test, y_test)

Accuracy:  0.7615646810582698
ROC AUC Score:  0.8658971678947989
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.80      0.77      6164
           1       0.78      0.73      0.75      6158

    accuracy                           0.76     12322
   macro avg       0.76      0.76      0.76     12322
weighted avg       0.76      0.76      0.76     12322



(0.7615646810582698, np.float64(0.8658971678947989))

HYPERPARAMETER TUNING OF RANDOM FOREST CLASSIFIER

In [31]:
# Hyperparameter Tuning using GridSearchCV
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [10, 20, None],
    'classifier__min_samples_split': [2, 5, 10]
}

In [32]:
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train, y_train)

In [33]:
# Best estimator and hyperparameters
best_model = grid_search.best_estimator_
print("Best Parameters: ", grid_search.best_params_)

Best Parameters:  {'classifier__max_depth': 20, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}


In [34]:

# Evaluate the tuned model
evaluate_model(best_model, X_test, y_test)

Accuracy:  0.765703619542282
ROC AUC Score:  0.8689156426728635
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.80      0.77      6164
           1       0.78      0.74      0.76      6158

    accuracy                           0.77     12322
   macro avg       0.77      0.77      0.77     12322
weighted avg       0.77      0.77      0.77     12322



(0.765703619542282, np.float64(0.8689156426728635))

In [35]:
#pickle.dump(pipeline,open('pipe.pkl','wb'))

In [36]:
y_train

16739    1
30091    1
39718    0
10496    1
4914     0
        ..
54343    0
38158    0
860      1
15795    1
56422    0
Name: Readmission, Length: 49288, dtype: int64

In [37]:
# 1. Select a row from X_train, for example, the first row
row_to_predict = X_train.iloc[16739]
# 2. Reshape the row to 2D array
# 2. Convert the row to a DataFrame to maintain column names
row_to_predict_df = pd.DataFrame([row_to_predict], columns=X_train.columns)

# 3. Predict the class or target using the trained model
prediction = pipeline.predict(row_to_predict_df)

# 4. Display the prediction result
print(f'Prediction for the selected row: {prediction}')

Prediction for the selected row: [1]


MODEL TRAINING USING LOGISTIC REGREESION

In [38]:
# train a logistic regression model on the training set
from sklearn.linear_model import LogisticRegression

In [39]:
# Build the final pipeline by combining preprocessor and a classifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('logreg', LogisticRegression( random_state=0))
])

In [40]:
# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

In [41]:
# Make predictions
y_pred = pipeline.predict(X_test)

In [42]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.6111832494724883

HYPERPARAMETER TUNING USING LOGISTIC REGRESSION

In [43]:

# Define the parameter grid (note how 'logreg__' prefixes the parameters for LogisticRegression)
param_grid = {
    'logreg__C': [0.1, 1.0, 10.0],  # Regularization strength
    'logreg__solver': ['lbfgs', 'liblinear'],  # Solvers for LogisticRegression
    'logreg__penalty': ['l2'],  # Regularization type
    'logreg__max_iter': [100, 200, 300]  # Number of iterations
}
# Grid search for hyperparameter tuning
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

In [44]:
# Best estimator and hyperparameters
best_model = grid_search.best_estimator_
print("Best Parameters: ", grid_search.best_params_)
# Evaluate the tuned model
evaluate_model(best_model, X_test, y_test)

Best Parameters:  {'logreg__C': 0.1, 'logreg__max_iter': 100, 'logreg__penalty': 'l2', 'logreg__solver': 'lbfgs'}
Accuracy:  0.6111020938159389
ROC AUC Score:  0.6435044424993661
Classification Report:
               precision    recall  f1-score   support

           0       0.61      0.61      0.61      6164
           1       0.61      0.61      0.61      6158

    accuracy                           0.61     12322
   macro avg       0.61      0.61      0.61     12322
weighted avg       0.61      0.61      0.61     12322



(0.6111020938159389, np.float64(0.6435044424993661))

MODEL TRAINING USING DECISION TREE CLASSIFIER

In [45]:
from sklearn.tree import DecisionTreeClassifier

In [46]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('tree', DecisionTreeClassifier(random_state=0))  # DecisionTreeClassifier step
])

In [47]:
# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

In [48]:
# Make predictions
y_pred = pipeline.predict(X_test)
accuracy_score(y_test,y_pred)

0.7170102256127252

In [49]:
# Define the parameter grid for DecisionTreeClassifier (prefixed with 'tree__')
param_grid = {
    'tree__criterion': ['gini', 'entropy'],  
    'tree__max_depth': [None, 10, 20],  # Reduced the number of depths
    'tree__min_samples_split': [10, 20],  # Reduced options for splitting
    'tree__min_samples_leaf': [1, 5],  # Fewer options for leaf samples
    'tree__max_features': ['sqrt', 'log2'],  # Dropped 'None' as an option
    'tree__ccp_alpha': [0.0, 0.01]  # Reduced pruning options
}

# Grid search for hyperparameter tuning
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

In [50]:
# Best estimator and hyperparameters
best_model = grid_search.best_estimator_
print("Best Parameters: ", grid_search.best_params_)
# Evaluate the tuned model
evaluate_model(best_model, X_test, y_test)

Best Parameters:  {'tree__ccp_alpha': 0.0, 'tree__criterion': 'entropy', 'tree__max_depth': None, 'tree__max_features': 'sqrt', 'tree__min_samples_leaf': 1, 'tree__min_samples_split': 10}
Accuracy:  0.6317156305794513
ROC AUC Score:  0.6723094515841652
Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.67      0.65      6164
           1       0.64      0.59      0.62      6158

    accuracy                           0.63     12322
   macro avg       0.63      0.63      0.63     12322
weighted avg       0.63      0.63      0.63     12322



(0.6317156305794513, np.float64(0.6723094515841652))

MODEL TRAINING USING GRADIENT BOOSTING CLASSIFIER

In [51]:
from sklearn.ensemble import GradientBoostingClassifier

In [52]:
# Gradient Boosting Pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('gb', GradientBoostingClassifier(random_state=0))  # GradientBoostingClassifier step
])

In [53]:
# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

In [54]:

# Make predictions
y_pred = pipeline.predict(X_test)
accuracy_score(y_test,y_pred)

0.6722123031975329

In [55]:
# Define the parameter grid for GradientBoostingClassifier (prefixed with 'gb__')
param_grid = {
    'gb__n_estimators': [50, 100],  # Number of boosting stages
    'gb__learning_rate': [0.01, 0.1],  # Learning rate shrinks contribution of each tree
    'gb__max_depth': [5, 10],  # Maximum depth of individual estimators
    'gb__min_samples_split': [10, 20],  # Minimum samples required to split an internal node
    'gb__min_samples_leaf': [1, 5],  # Minimum samples required to be at a leaf node
    'gb__subsample': [0.8, 1.0],  # Fraction of samples used for fitting each base learner
    'gb__max_features': ['sqrt', 'log2']  # Number of features to consider for best split
}

# Grid search for hyperparameter tuning
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

In [56]:
# Best estimator and hyperparameters
best_model = grid_search.best_estimator_
print("Best Parameters: ", grid_search.best_params_)
# Evaluate the tuned model
evaluate_model(best_model, X_test, y_test)

Best Parameters:  {'gb__learning_rate': 0.1, 'gb__max_depth': 10, 'gb__max_features': 'sqrt', 'gb__min_samples_leaf': 1, 'gb__min_samples_split': 10, 'gb__n_estimators': 100, 'gb__subsample': 1.0}
Accuracy:  0.7471189741925012
ROC AUC Score:  0.8373563066377308
Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.79      0.76      6164
           1       0.77      0.70      0.73      6158

    accuracy                           0.75     12322
   macro avg       0.75      0.75      0.75     12322
weighted avg       0.75      0.75      0.75     12322



(0.7471189741925012, np.float64(0.8373563066377308))

MODEL TRAINING USING XGBOOST CLASSIFIER

In [57]:
%pip install xgboost


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [58]:
import xgboost as xgb

In [59]:
# XGBoost Pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('xgb', xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=0))  # XGBClassifier step
])

In [60]:
# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)
# Make predictions
y_pred = pipeline.predict(X_test)
accuracy_score(y_test,y_pred)

Parameters: { "use_label_encoder" } are not used.



0.7047557214737867

HYPERPARAMETER TUNING OF XGBOOSTING

In [None]:
# Define the parameter grid for XGBoost (prefixed with 'xgb__')
param_grid = {
    'xgb__n_estimators': [50, 100, 200],  # Number of boosting stages
    'xgb__learning_rate': [0.01, 0.1, 0.2],  # Learning rate
    'xgb__max_depth': [3, 5, 10],  # Maximum depth of individual estimators
    'xgb__min_child_weight': [1, 5, 10],  # Minimum sum of instance weight needed in a child
    'xgb__subsample': [0.8, 1.0],  # Fraction of samples used for fitting each tree
    'xgb__colsample_bytree': [0.8, 1.0],  # Fraction of features used at each split
    'xgb__gamma': [0, 0.1, 0.2],  # Minimum loss reduction required to make a further partition
    'xgb__reg_lambda': [1, 1.5, 2],  # L2 regularization term
    'xgb__reg_alpha': [0, 0.5, 1]  # L1 regularization term
}

# Grid search for hyperparameter tuning
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

In [None]:
# Best estimator and hyperparameters
best_model = grid_search.best_estimator_
print("Best Parameters: ", grid_search.best_params_)
# Evaluate the tuned model
evaluate_model(best_model, X_test, y_test)

*MODEL COMPARISION*