In [1]:
# importing the libraries needed for the project
# import kagglehub
import os
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go


# Download latest version
# path = kagglehub.dataset_download("dubradave/hospital-readmissions")

# print("Path to dataset files:", path)

In [2]:
# The CSV is read from the working directory. (An earlier revision located it
# inside a kagglehub download folder; that lookup is no longer used.)
data_file_path = "hospital_readmissions.csv"

# Read the whole file into a DataFrame
data = pd.read_csv(filepath_or_buffer=data_file_path)

# Show the header line followed by the first five rows
print("\n First five rows of the dataset:", data.head(), sep="\n")


 First five rows of the dataset:
       age  time_in_hospital  n_lab_procedures  n_procedures  n_medications  \
0  [70-80)                 8                72             1             18   
1  [70-80)                 3                34             2             13   
2  [50-60)                 5                45             0             18   
3  [70-80)                 2                36             0             12   
4  [60-70)                 1                42             0              7   

   n_outpatient  n_inpatient  n_emergency medical_specialty       diag_1  \
0             2            0            0           Missing  Circulatory   
1             0            0            0             Other        Other   
2             0            0            0           Missing  Circulatory   
3             1            0            0           Missing  Circulatory   
4             0            0            0  InternalMedicine        Other   

        diag_2       diag_3 glucos

In [3]:
# Basic info
print("\n Dataset Info:")
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called bare; wrapping it in print() would add a stray "None" line.
data.info()


 Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   age                25000 non-null  object
 1   time_in_hospital   25000 non-null  int64 
 2   n_lab_procedures   25000 non-null  int64 
 3   n_procedures       25000 non-null  int64 
 4   n_medications      25000 non-null  int64 
 5   n_outpatient       25000 non-null  int64 
 6   n_inpatient        25000 non-null  int64 
 7   n_emergency        25000 non-null  int64 
 8   medical_specialty  25000 non-null  object
 9   diag_1             25000 non-null  object
 10  diag_2             25000 non-null  object
 11  diag_3             25000 non-null  object
 12  glucose_test       25000 non-null  object
 13  A1Ctest            25000 non-null  object
 14  change             25000 non-null  object
 15  diabetes_med       25000 non-null  object
 16  readmitted         25000

In [4]:
# Report the (rows, columns) tuple of the loaded frame
print("\n Dataset shape:", data.shape)


 Dataset shape: (25000, 17)


In [5]:
# Count the null entries in every column and print them under a header
missing_per_column = data.isna().sum()
print("\n Missing values per column:", missing_per_column, sep="\n")


 Missing values per column:
age                  0
time_in_hospital     0
n_lab_procedures     0
n_procedures         0
n_medications        0
n_outpatient         0
n_inpatient          0
n_emergency          0
medical_specialty    0
diag_1               0
diag_2               0
diag_3               0
glucose_test         0
A1Ctest              0
change               0
diabetes_med         0
readmitted           0
dtype: int64


In [6]:
# List the dtype pandas assigned to each column
column_dtypes = data.dtypes
print("\n Data types:", column_dtypes, sep="\n")


 Data types:
age                  object
time_in_hospital      int64
n_lab_procedures      int64
n_procedures          int64
n_medications         int64
n_outpatient          int64
n_inpatient           int64
n_emergency           int64
medical_specialty    object
diag_1               object
diag_2               object
diag_3               object
glucose_test         object
A1Ctest              object
change               object
diabetes_med         object
readmitted           object
dtype: object


### Feature Engineering and Data Cleaning

In [7]:
# --------------------------------------
# STEP 2: FEATURE ENGINEERING
# --------------------------------------

# Convert age ranges to numeric midpoints
def age_to_mid(age_range):
    """Return the integer midpoint of a bracketed age range string.

    The surrounding bracket/parenthesis characters are stripped, the text is
    split on '-' into exactly two integer bounds, and their average is rounded
    to an int (e.g. '[70-80)' -> 75).
    """
    bounds = age_range.strip('[]()').split('-')
    lower, upper = map(int, bounds)
    midpoint = (lower + upper) / 2
    return int(round(midpoint))

# Apply transformation: numeric midpoint of each age bracket
data['age_mid'] = data['age'].apply(age_to_mid)


# Handle target variable ('readmitted')
# Encode only while the column still holds the raw 'no'/'yes' labels. Without
# this guard, re-running the cell would map the already-encoded 0/1 values
# through the dict again and silently turn the whole column into NaN.
if not pd.api.types.is_numeric_dtype(data['readmitted']):
    readmitted_encoded = data['readmitted'].map({'no': 0, 'yes': 1})
    # .map() yields NaN for any label missing from the dict; fail loudly
    # instead of letting NaN leak into the correlations and group means below.
    if readmitted_encoded.isna().any():
        unexpected_labels = sorted(
            data.loc[readmitted_encoded.isna(), 'readmitted'].astype(str).unique()
        )
        raise ValueError(f"Unexpected 'readmitted' labels: {unexpected_labels}")
    data['readmitted'] = readmitted_encoded

# Create new engineered features
# Sum of the three prior-visit counters
data['total_visits'] = data['n_outpatient'] + data['n_inpatient'] + data['n_emergency']
# Per-day rates; the +1 in the denominator keeps it strictly positive
data['procedure_intensity'] = data['n_lab_procedures'] / (data['time_in_hospital'] + 1)
data['medication_per_day'] = data['n_medications'] / (data['time_in_hospital'] + 1)

data.head()


Unnamed: 0,age,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,medical_specialty,diag_1,...,diag_3,glucose_test,A1Ctest,change,diabetes_med,readmitted,age_mid,total_visits,procedure_intensity,medication_per_day
0,[70-80),8,72,1,18,2,0,0,Missing,Circulatory,...,Other,no,no,no,yes,0,75,2,8.0,2.0
1,[70-80),3,34,2,13,0,0,0,Other,Other,...,Other,no,no,no,yes,0,75,0,8.5,3.25
2,[50-60),5,45,0,18,0,0,0,Missing,Circulatory,...,Circulatory,no,no,yes,yes,1,55,0,7.5,3.0
3,[70-80),2,36,0,12,1,0,0,Missing,Circulatory,...,Diabetes,no,no,yes,yes,1,75,1,12.0,4.0
4,[60-70),1,42,0,7,0,0,0,InternalMedicine,Other,...,Respiratory,no,no,no,yes,0,65,0,21.0,3.5


In [8]:
# --------------------------------------
# STEP 4: CHECK CLEAN DATA
# --------------------------------------
# The header previously contained mojibake (the UTF-8 bytes of the broom emoji
# decoded as cp1252). The emoji is written as a Unicode escape so the source
# stays ASCII-only and survives re-encoding.
print("\n\U0001F9F9 Missing values per column (after cleaning):")
print(data.isnull().sum())

# Preview the frame, now including the engineered columns
print("\n Sample after feature engineering:")
print(data.head())


🧹 Missing values per column (after cleaning):
age                    0
time_in_hospital       0
n_lab_procedures       0
n_procedures           0
n_medications          0
n_outpatient           0
n_inpatient            0
n_emergency            0
medical_specialty      0
diag_1                 0
diag_2                 0
diag_3                 0
glucose_test           0
A1Ctest                0
change                 0
diabetes_med           0
readmitted             0
age_mid                0
total_visits           0
procedure_intensity    0
medication_per_day     0
dtype: int64

 Sample after feature engineering:
       age  time_in_hospital  n_lab_procedures  n_procedures  n_medications  \
0  [70-80)                 8                72             1             18   
1  [70-80)                 3                34             2             13   
2  [50-60)                 5                45             0             18   
3  [70-80)                 2                36             0

### Exploratory Data Analysis

#### Basic Descriptive Statistics

In [9]:
print("Dataset Information:")
# DataFrame.info() prints its own report and returns None, so it is not
# wrapped in print() (that would append a stray "None" line to the output).
data.info()

# Summary statistics of the numeric columns, transposed to one row per feature
print("\n Descriptive Statistics:")
print(data.describe().T)

# Cardinality of the text-valued specialty/diagnosis columns
categorical_cols = ['medical_specialty', 'diag_1', 'diag_2', 'diag_3']
for col in categorical_cols:
    print(f"\nUnique values in {col}: {data[col].nunique()}")

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  25000 non-null  object 
 1   time_in_hospital     25000 non-null  int64  
 2   n_lab_procedures     25000 non-null  int64  
 3   n_procedures         25000 non-null  int64  
 4   n_medications        25000 non-null  int64  
 5   n_outpatient         25000 non-null  int64  
 6   n_inpatient          25000 non-null  int64  
 7   n_emergency          25000 non-null  int64  
 8   medical_specialty    25000 non-null  object 
 9   diag_1               25000 non-null  object 
 10  diag_2               25000 non-null  object 
 11  diag_3               25000 non-null  object 
 12  glucose_test         25000 non-null  object 
 13  A1Ctest              25000 non-null  object 
 14  change               25000 non-null  object 
 15  diabetes_med   

#### Correlations

##### Correlation of Numerical Features

In [10]:
# Restrict the frame to its int64/float64 columns and correlate them pairwise
numeric_frame = data.select_dtypes(include=['int64', 'float64'])
numerical_cols = numeric_frame.columns
correlation_matrix = numeric_frame.corr()

# Annotated heatmap of the correlation matrix
fig = px.imshow(
    correlation_matrix,
    text_auto=True,
    aspect="auto",
    title='Correlation Matrix of Numerical Features',
)
fig.update_xaxes(title_text="Features")
fig.update_yaxes(title_text="Features")
fig.show()

# Plain-text copy of the same matrix
print("\nCorrelation Matrix (Numerical Features):\n")
print(correlation_matrix)


Correlation Matrix (Numerical Features):

                     time_in_hospital  n_lab_procedures  n_procedures  \
time_in_hospital             1.000000          0.328326      0.179249   
n_lab_procedures             0.328326          1.000000      0.048827   
n_procedures                 0.179249          0.048827      1.000000   
n_medications                0.449415          0.271741      0.368681   
n_outpatient                -0.011309         -0.004309     -0.034796   
n_inpatient                  0.076916          0.043713     -0.069078   
n_emergency                 -0.013115         -0.004929     -0.042906   
readmitted                   0.043141          0.032970     -0.044467   
age_mid                      0.086268          0.028796     -0.093356   
total_visits                 0.030437          0.019515     -0.074843   
procedure_intensity         -0.513252          0.486496     -0.104292   
medication_per_day          -0.495858         -0.122677      0.160199   

       

In [11]:
# Select the int64/float64 columns; this includes 'readmitted' once it has
# been encoded as 0/1
numerical_cols_with_target = data.select_dtypes(include=['int64', 'float64']).columns

# Correlation of every numerical column with 'readmitted', largest first
correlation_with_readmitted = (
    data[numerical_cols_with_target]
    .corr()['readmitted']
    .sort_values(ascending=False)
)

print("\nCorrelation with 'readmitted' column:\n")
print(correlation_with_readmitted)

# Drop the target's trivial self-correlation once and reshape the result into
# a tidy two-column frame. Plotting from named columns avoids recomputing the
# drop for every argument and avoids depending on the implicit 'x'/'y' names
# plotly assigns to anonymous arrays.
feature_correlations = (
    correlation_with_readmitted
    .drop('readmitted')
    .rename_axis('feature')
    .reset_index(name='correlation')
)

# Visualize the correlation with 'readmitted'
fig = px.bar(
    feature_correlations,
    x='feature',
    y='correlation',
    title='Correlation of Numerical Features with Readmitted',
    labels={'feature': 'Numerical Feature', 'correlation': 'Correlation Coefficient'},
)
fig.update_xaxes(categoryorder='total descending')
fig.show()


Correlation with 'readmitted' column:

readmitted             1.000000
n_inpatient            0.212480
total_visits           0.207957
n_outpatient           0.095487
n_emergency            0.093519
time_in_hospital       0.043141
n_medications          0.036871
n_lab_procedures       0.032970
age_mid                0.030289
medication_per_day    -0.008965
procedure_intensity   -0.016592
n_procedures          -0.044467
Name: readmitted, dtype: float64


Here's a summary of the correlations with 'readmitted':

**Strongest Positive Correlations:**

1. n_inpatient (0.212): The number of inpatient visits has the strongest positive correlation, suggesting that patients with more previous inpatient hospitalizations are more likely to be readmitted.
2. total_visits (0.208): This engineered feature (sum of outpatient, inpatient, and emergency visits) also shows a strong positive correlation, reinforcing the idea that higher overall healthcare utilization is linked to readmission.

**Moderate Positive Correlations:**

3. n_outpatient (0.095): The number of outpatient visits has a moderate positive correlation.
4. n_emergency (0.094): Emergency visits also show a moderate positive correlation.

**Weak Positive Correlations:**

5. time_in_hospital (0.043): Longer hospital stays have a weak positive correlation.
6. n_medications (0.037): More medications weakly correlate with readmission.
7. n_lab_procedures (0.033): More lab procedures weakly correlate with readmission.
8. age_mid (0.030): The patient's age midpoint has a very weak positive correlation.

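**Weak/Negligible Correlations (including the categorical flags once they are numerically encoded):**

*Note: `glucose_test`, `A1Ctest`, `change`, and `diabetes_med` are still stored as text in the cells above, so the coefficients quoted in items 9–12 do not appear in the printed output. Regenerate them after encoding these columns numerically before relying on the figures.*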

9. glucose_test (-0.009): The glucose test result shows a very weak negative correlation. This is likely due to the majority of values being 'no' or 'normal', as observed previously.
10. A1Ctest (0.007): The A1C test result shows a very weak positive correlation.
11. change (0.036): Changes in medication regime also show a very weak positive correlation.
12. diabetes_med (0.021): Patients on diabetes medication show a very weak positive correlation.
13. medication_per_day (-0.009): The rate of medication per day shows a very weak negative correlation.
14. procedure_intensity (-0.017): Procedure intensity shows a very weak negative correlation.
15. n_procedures (-0.044): The number of procedures has a weak negative correlation.

Key Takeaway: The strongest predictors of readmission among the numerical features are those related to previous hospital utilization, especially inpatient visits. glucose_test and A1Ctest now show very weak correlations, which might be due to a lack of variance in their values or genuinely weak predictive power in this dataset.


##### Correlation of Categorical Features with 'readmitted'

In [12]:
categorical_features_for_corr = ['medical_specialty', 'diag_1', 'diag_2', 'diag_3']

for col in categorical_features_for_corr:
    # The mean of 'readmitted' within each category is that category's
    # readmission rate; order the categories from highest to lowest rate.
    category_rates = data.groupby(col)['readmitted'].mean()
    readmission_rate_by_category = category_rates.sort_values(ascending=False).reset_index()

    # Keep at most the ten highest-rate categories for the chart and printout
    top_10_categories = readmission_rate_by_category.iloc[:10]

    # One bar per category, coloured by category
    fig = px.bar(
        top_10_categories,
        x=col,
        y='readmitted',
        title=f'Readmission Rate by {col} (Top 10)',
        labels={'readmitted': 'Readmission Rate', col: col},
        color=col,
    )
    fig.update_xaxes(title_text=col)
    fig.update_yaxes(title_text='Average Readmission Rate (0-1)')
    fig.show()

    # Text version of the same table, rounded to three decimals
    print(f"\nReadmission rates for top categories in {col}:\n")
    print(top_10_categories.round(3))


Readmission rates for top categories in medical_specialty:

        medical_specialty  readmitted
0  Family/GeneralPractice       0.495
1        Emergency/Trauma       0.494
2                 Missing       0.489
3              Cardiology       0.450
4        InternalMedicine       0.448
5                   Other       0.415
6                 Surgery       0.412



Readmission rates for top categories in diag_1:

            diag_1  readmitted
0         Diabetes       0.536
1          Missing       0.500
2      Respiratory       0.491
3      Circulatory       0.479
4        Digestive       0.474
5            Other       0.451
6           Injury       0.436
7  Musculoskeletal       0.395



Readmission rates for top categories in diag_2:

            diag_2  readmitted
0      Respiratory       0.490
1      Circulatory       0.483
2            Other       0.469
3  Musculoskeletal       0.462
4        Digestive       0.443
5         Diabetes       0.442
6           Injury       0.406
7          Missing       0.405



Readmission rates for top categories in diag_3:

            diag_3  readmitted
0      Respiratory       0.498
1      Circulatory       0.483
2        Digestive       0.469
3            Other       0.467
4         Diabetes       0.457
5  Musculoskeletal       0.451
6           Injury       0.425
7          Missing       0.286


Let's break down the categorical correlation results with 'readmitted' based on the plots and printouts:

**1. Medical Specialty:**
*   **Highest Readmission Rates:** 'Family/GeneralPractice', 'Emergency/Trauma', and 'Missing' medical specialties show the highest average readmission rates, hovering around 0.49-0.50.
*   **Moderate Readmission Rates:** 'Cardiology' and 'InternalMedicine' also have significant readmission rates, around 0.45.
*   **Lower Readmission Rates:** 'Other' and 'Surgery' have slightly lower rates, around 0.41.
*   **Insight:** The 'Missing' category for medical specialty being high is concerning, as it suggests incomplete data for patients who are frequently readmitted. High rates in Emergency/Trauma and Family/General Practice might indicate patients with less specialized or more generalized care needs, or perhaps those without a stable primary care provider, leading to readmissions.

**2. Primary Diagnosis (diag_1):**
*   **Highest Readmission Rates:** Patients with a primary diagnosis of 'Diabetes' (0.536), 'Missing' (0.500), 'Respiratory' (0.491), and 'Circulatory' (0.479) tend to have the highest readmission rates.
*   **Lower Readmission Rates:** 'Musculoskeletal' and 'Injury' diagnoses show comparatively lower readmission rates, around 0.395 and 0.436 respectively.
*   **Insight:** Diabetes, Respiratory, and Circulatory diseases are often chronic conditions requiring ongoing management, making patients more susceptible to readmission. The 'Missing' diagnosis again highlights a data quality issue that correlates with high readmission.

**3. Secondary Diagnosis (diag_2) and Tertiary Diagnosis (diag_3):**
*   The patterns observed in `diag_2` and `diag_3` largely mirror those in `diag_1`.
*   **Respiratory**, **Circulatory**, and **Digestive** conditions consistently appear among the diagnoses with higher readmission rates across all three diagnosis fields.
*   The 'Missing' category in `diag_3` has the lowest readmission rate (0.286), which is an interesting anomaly compared to 'Missing' in `medical_specialty` and `diag_1`. This could mean that if a third diagnosis is missing, it's often for less complex cases.

**Overall Takeaway for Categorical Features:**
Certain medical specialties and specific diagnostic categories, particularly those related to **chronic diseases like Diabetes, Respiratory, and Circulatory conditions**, show a clear association with higher readmission rates. The 'Missing' category behaves inconsistently: in `medical_specialty` (0.489) and `diag_1` (0.500) it sits among the highest readmission rates, whereas in `diag_2` (0.405) and `diag_3` (0.286) it has the lowest rate of its field. This suggests that missingness reflects different things in different columns, such as data-collection gaps in some and genuinely simpler cases in others, rather than a single effect.

These insights can guide interventions targeted at specific patient groups or improvements in managing certain chronic conditions to reduce readmission rates.

##### Correlation analysis summary

**Numerical Feature Correlations:**

1. Strongest Predictors: The most influential numerical features are those related to hospital utilization, particularly n_inpatient and total_visits. Patients with a history of more inpatient stays and higher overall healthcare interactions (outpatient, inpatient, emergency) are significantly more likely to be readmitted.
2. Other Influencers: time_in_hospital, n_medications, n_lab_procedures, and age_mid show weak to moderate positive correlations, suggesting that longer stays, more medications, more lab tests, and older age are mildly associated with higher readmission risk.
3. Limited Impact: glucose_test and A1Ctest, even after correct encoding, show very weak correlations. This implies that these specific test results, as represented in this dataset, are not strong individual predictors of readmission.

**Categorical Feature Correlations:**

1. Medical Specialty: Certain medical specialties, such as Family/General Practice, Emergency/Trauma, and even 'Missing' (which could indicate a wide range of situations or data entry issues), show slightly higher readmission rates. Specialties like Cardiology and Internal Medicine also have notable rates.
2. Primary Diagnosis (diag_1): Patients diagnosed with Diabetes, certain Respiratory, and Circulatory conditions as their primary diagnosis appear to have higher readmission rates. The 'Missing' diagnosis also correlates with a high readmission rate.
3. Secondary and Tertiary Diagnoses (diag_2, diag_3): Similar to diag_1, Respiratory, Circulatory, and Digestive diagnoses frequently appear among categories with higher readmission rates in secondary and tertiary diagnosis fields.

**Overall Comparison:**
1. Predictive Strength: The numerical features related to prior hospital utilization (n_inpatient, total_visits) appear to have a stronger, more direct association with readmission than most individual categorical features. Their correlation coefficients (around 0.21) are the largest in this analysis, whereas readmission rates across most categories fall within a fairly narrow band (roughly 0.40 to 0.50), so the gap between any two categories is usually 0.10 or less. Note that a correlation coefficient and a difference in rates are different measures, so this comparison is qualitative only.
2. Nature of Influence: Numerical features provide insights into the volume and intensity of care received, while categorical features highlight specific conditions or care settings that might be associated with higher readmission risks. For example, a patient with a high number of inpatient visits is likely more complex, and a patient primarily diagnosed with diabetes might have chronic management challenges.
3. Combined Power: Both types of features offer valuable but different pieces of the puzzle. The numerical features quantify past patient behavior and health interaction, while the categorical features provide context about the patient's medical state and care environment. A robust predictive model would likely combine insights from both.

#### Variable Analysis

##### Univariate Analysis

In [13]:
print("\n--- Univariate Analysis ---\n")


def _plot_top_category_counts(counts, column, title):
    """Return a bar chart with one bar per category from pre-aggregated counts.

    `counts` is a value_counts()-style Series (category -> row count). It is
    reshaped into a tidy frame with columns [column, 'count'] so that plotly
    draws a single bar per category rather than one stacked unit bar per row.
    """
    tidy_counts = counts.rename_axis(column).reset_index(name='count')
    chart = px.bar(tidy_counts, x=column, y='count', title=title, color=column)
    chart.update_xaxes(categoryorder='total descending')
    return chart


# Distribution of Target Variable: readmitted
readmitted_counts = data['readmitted'].value_counts().reset_index()
readmitted_counts.columns = ['Readmitted', 'Count']
fig = px.bar(readmitted_counts, x='Readmitted', y='Count', title='Distribution of Readmitted Patients')
fig.update_layout(xaxis_title="Readmitted (0=No, 1=Yes)", yaxis_title="Number of Patients")
fig.show()

# Histograms of the numeric features (same order as before: age, stay, medications)
histogram_specs = [
    ('age_mid', 'Distribution of Patient Age (Midpoint)'),
    ('time_in_hospital', 'Distribution of Time in Hospital (Days)'),
    ('n_medications', 'Distribution of Number of Medications'),
]
for hist_column, hist_title in histogram_specs:
    fig = px.histogram(data, x=hist_column, title=hist_title)
    fig.show()

# Distribution of medical_specialty (Top 10)
# Counts are aggregated before plotting; passing the raw rows would make plotly
# emit one bar segment per patient record.
medical_specialty_counts = data['medical_specialty'].value_counts().head(10)
top_medical_specialties = medical_specialty_counts.index
fig = _plot_top_category_counts(medical_specialty_counts, 'medical_specialty', 'Top 10 Medical Specialties')
fig.show()

# Distribution of diag_1 (Top 10)
diag_1_counts = data['diag_1'].value_counts().head(10)
top_diag_1 = diag_1_counts.index
fig = _plot_top_category_counts(diag_1_counts, 'diag_1', 'Top 10 Primary Diagnoses')
fig.show()


--- Univariate Analysis ---



Let's review the results from the univariate analysis:

**1. Distribution of Readmitted Patients:**
*   The bar chart for 'readmitted' shows a fairly balanced distribution, with slightly more patients *not* readmitted (0) than readmitted (1). This indicates that roughly half of the patients in this dataset are readmitted, which is a significant proportion and suggests that readmission is a common event.

**2. Distribution of Patient Age (Midpoint):**
*   The histogram for `age_mid` reveals that the dataset contains a wide range of patient ages, with a notable concentration in the older age groups. Specifically, the `70-80` age group (midpoint 75) appears to be the most frequent, followed by `60-70` (midpoint 65) and `80-90` (midpoint 85). This indicates that the dataset primarily consists of older adult patients.

**3. Distribution of Time in Hospital (Days):**
*   The `time_in_hospital` histogram shows that most patients have relatively short hospital stays, with a peak around 2-4 days. The distribution then gradually decreases as the number of hospital days increases, with fewer patients staying for very long durations (e.g., 10-14 days). This suggests that a majority of admissions are for acute care rather than extended treatments.

**4. Distribution of Number of Medications:**
*   The histogram for `n_medications` indicates that many patients are prescribed a moderate number of medications, with a central tendency around 10-20 medications. There are also patients with fewer and significantly more medications, suggesting a diverse range of patient complexities and treatment plans.

**5. Top 10 Medical Specialties:**
*   The bar chart for `medical_specialty` shows that 'Missing' is the most frequent category, which is a data quality issue but also an important observation about the dataset. Following that, 'InternalMedicine', 'Cardiology', and 'Family/GeneralPractice' are the most common medical specialties. This highlights the primary areas of care covered in this hospital dataset.

**6. Top 10 Primary Diagnoses (diag_1):**
*   Similar to medical specialties, 'Missing' is the most frequent `diag_1` category. Among the identified diagnoses, 'Circulatory', 'Other', 'Diabetes', and 'Respiratory' are the most common. This points to a prevalence of chronic conditions and common health issues among the patient population.

In summary, the univariate analysis provides a foundational understanding of the dataset's characteristics, highlighting the distribution of patient demographics, hospital stay details, medication usage, and the most common medical specialties and primary diagnoses.

##### Bivariate Analysis

In [14]:
print("\n--- Bivariate Analysis ---\n")

# ---- Numerical features vs. readmitted ----
# Each entry: (plotly express constructor, y column, figure title, y-axis title).
# n_inpatient uses a violin instead of a box to show the shape of a discrete,
# outlier-heavy distribution. Figures are shown in list order.
readmitted_axis_title = "Readmitted (0=No, 1=Yes)"
numeric_views = [
    (px.box, 'time_in_hospital', 'Time in Hospital vs. Readmitted', "Time in Hospital (Days)"),
    (px.box, 'age_mid', 'Age (Midpoint) vs. Readmitted', "Age (Midpoint)"),
    (px.violin, 'n_inpatient', 'Number of Inpatient Visits vs. Readmitted', "Number of Inpatient Visits"),
    (px.box, 'n_medications', 'Number of Medications vs. Readmitted', "Number of Medications"),
]
for make_plot, y_column, plot_title, y_axis_title in numeric_views:
    fig = make_plot(data, x='readmitted', y=y_column, title=plot_title)
    fig.update_layout(xaxis_title=readmitted_axis_title, yaxis_title=y_axis_title)
    fig.show()


# ---- Categorical features vs. readmitted ----

# Mean readmission per medical specialty, highest first; chart the first ten
specialty_rates = data.groupby('medical_specialty')['readmitted'].mean()
readmission_rate_by_specialty = specialty_rates.sort_values(ascending=False).reset_index()
top_10_specialties = readmission_rate_by_specialty.head(10)
specialty_labels = {
    'readmitted': 'Average Readmission Rate',
    'medical_specialty': 'Medical Specialty',
}
fig = px.bar(
    top_10_specialties,
    x='medical_specialty',
    y='readmitted',
    title='Readmission Rate by Medical Specialty (Top 10)',
    labels=specialty_labels,
    color='medical_specialty',
)
fig.update_xaxes(categoryorder='total descending')
fig.show()

# Mean readmission per primary diagnosis, highest first; chart the first ten
diag1_rates = data.groupby('diag_1')['readmitted'].mean()
readmission_rate_by_diag1 = diag1_rates.sort_values(ascending=False).reset_index()
top_10_diag1 = readmission_rate_by_diag1.head(10)
diag1_labels = {
    'readmitted': 'Average Readmission Rate',
    'diag_1': 'Primary Diagnosis',
}
fig = px.bar(
    top_10_diag1,
    x='diag_1',
    y='readmitted',
    title='Readmission Rate by Primary Diagnosis (Top 10)',
    labels=diag1_labels,
    color='diag_1',
)
fig.update_xaxes(categoryorder='total descending')
fig.show()

# Mean readmission for each value of the medication 'change' flag
change_rates = data.groupby('change')['readmitted'].mean().reset_index()
fig = px.bar(
    change_rates,
    x='change',
    y='readmitted',
    title='Readmission Rate by Change in Medication',
    labels={'change': 'Change in Medication', 'readmitted': 'Average Readmission Rate'},
)
fig.show()


--- Bivariate Analysis ---



Let's review the results from the bivariate analysis to understand how different features relate to readmission:

**1. Time in Hospital vs. Readmitted (Box Plot):**
*   The box plot shows that patients who are readmitted (`readmitted = 1`) tend to have slightly longer median hospital stays compared to those who are not readmitted (`readmitted = 0`). The interquartile range (IQR) and whiskers for readmitted patients also appear to extend further, suggesting a wider spread of longer hospitalizations among this group.

**2. Age (Midpoint) vs. Readmitted (Box Plot):**
*   The box plot for `age_mid` indicates a slight trend where readmitted patients tend to be, on average, slightly older than non-readmitted patients. However, the overlap in the distributions is substantial, suggesting age alone is not a very strong differentiator.

**3. Number of Inpatient Visits vs. Readmitted (Violin Plot):**
*   This plot is quite telling. Patients who are readmitted (`readmitted = 1`) show a clear tendency to have a higher number of previous inpatient visits. The violin plot for readmitted patients is thicker at higher values of `n_inpatient` and extends to many more inpatient visits compared to non-readmitted patients, where the distribution is heavily concentrated at 0 or 1 inpatient visits. This strongly suggests that a history of inpatient visits is a significant factor in readmission.

**4. Number of Medications vs. Readmitted (Box Plot):**
*   Similar to time in hospital, readmitted patients generally have a slightly higher median number of medications, and their distribution shows a propensity for more medications. This could be indicative of higher patient complexity or more chronic conditions.

**5. Medical Specialty vs. Readmission Rate (Top 10 Bar Chart):**
*   This chart highlights that certain medical specialties, such as 'Family/GeneralPractice', 'Emergency/Trauma', and particularly the 'Missing' category, have higher average readmission rates. This reinforces the idea that general care or emergency presentations, or perhaps incomplete data, are associated with increased readmission risk.

**6. Primary Diagnosis (Diag_1) vs. Readmission Rate (Top 10 Bar Chart):**
*   Patients with 'Diabetes', 'Missing' diagnosis, 'Respiratory', and 'Circulatory' primary diagnoses show the highest readmission rates. These often represent chronic conditions requiring ongoing management, making patients prone to readmission.

**7. Change in Medication vs. Readmitted (Bar Chart):**
*   This plot shows that patients who had a 'change' in their medication regimen during their hospital stay tend to have a slightly higher readmission rate compared to those who had 'no' change. This could suggest that medication adjustments (perhaps due to worsening condition or complex management) are a factor in readmission.

In summary, the bivariate analysis reinforces insights from the correlation analysis, emphasizing the importance of factors like prior hospital utilization (especially inpatient visits), chronic conditions (Diabetes, Respiratory, Circulatory), and indicators of patient complexity (more medications, longer hospital stays) in predicting readmission.

##### Multivariate Analysis

**Reasoning**:
The subtask requires visualizing the interaction between 'age_mid', 'time_in_hospital', and 'readmitted' using a scatter plot. I will use `plotly.express` to create this plot, coloring the points by the 'readmitted' status.



In [15]:
# Axis and legend captions for the scatter plot
scatter_labels = {
    'age_mid': 'Age Midpoint',
    'time_in_hospital': 'Time in Hospital (Days)',
    'readmitted': 'Readmitted (0=No, 1=Yes)',
}

# One point per row: age midpoint vs. length of stay, coloured by the target
fig = px.scatter(
    data,
    x='age_mid',
    y='time_in_hospital',
    color='readmitted',
    title='Age vs. Time in Hospital by Readmission Status',
    labels=scatter_labels,
)
fig.show()

**Reasoning**:
The next step is to visualize the interaction between two categorical features (`medical_specialty`, `change`) and the target variable (`readmitted`) using a grouped bar chart to understand how these factors collectively influence readmission rates.



In [16]:
# Average of `readmitted` for every (medical_specialty, change) pair,
# returned as a flat frame with one row per pair.
group_keys = ['medical_specialty', 'change']
medical_specialty_change_readmitted = (
    data
    .groupby(group_keys)['readmitted']
    .mean()
    .reset_index()
)

# Human-readable axis / legend titles for the chart.
axis_labels = {
    'medical_specialty': 'Medical Specialty',
    'readmitted': 'Average Readmission Rate',
    'change': 'Change in Medication',
}

# One cluster of bars per specialty, one bar per `change` value.
fig = px.bar(
    medical_specialty_change_readmitted,
    x='medical_specialty',
    y='readmitted',
    color='change',
    barmode='group',
    title='Readmission Rate by Medical Specialty and Change in Medication',
    labels=axis_labels,
)
# Sort the x-axis categories by their total bar value, largest first.
fig.update_layout(xaxis_categoryorder='total descending')
fig.show()

Looking at the grouped bar chart ('Readmission Rate by Medical Specialty and Change in Medication'), we can observe the following:

*   **General Trend:** Across most medical specialties, patients whose `change` value is **'yes'** (meaning their medication regimen was changed during their hospital stay) tend to have a **higher average readmission rate** than those whose `change` value is **'no'**.

*   **Specific Specialties with Notable Differences:**
    *   **Emergency/Trauma:** This specialty shows a significant difference. Patients with a 'change' in medication have a considerably higher readmission rate than those without a change. This might indicate that patients needing emergency care who also require medication adjustments are particularly vulnerable to readmission.
    *   **Family/GeneralPractice:** Similar to Emergency/Trauma, there's a noticeable increase in readmission rates for patients with a 'change' in medication.
    *   **Cardiology and InternalMedicine:** While both show higher rates with 'change', the difference isn't as pronounced as in Emergency/Trauma or Family/GeneralPractice, but it's still present.
    *   **Missing (Medical Specialty):** This category also follows the trend, with 'change' being associated with higher readmission rates.

*   **Insights:**
    *   **Medication Changes as an Indicator of Complexity:** The consistent trend suggests that a change in medication regimen during hospitalization might be an indicator of underlying patient complexity, unstable condition, or a challenging treatment plan, all of which could increase the risk of readmission.
    *   **Intervention Opportunities:** For specialties like Emergency/Trauma and Family/GeneralPractice, where the difference is more pronounced, specific post-discharge care plans focusing on medication management and patient education after a change in regimen could be particularly effective in reducing readmissions.

This multivariate analysis highlights that not only do certain medical specialties have higher baseline readmission rates, but the act of altering medication during a stay appears to be an additional risk factor, emphasizing the importance of medication reconciliation and patient follow-up.

### Encoding and Saving the Final Data

In [17]:
# --------------------------------------
# STEP 3: CLEAN BINARY/ORDINAL COLUMNS
# --------------------------------------

def encode_categories(column: pd.Series, codes: dict, default: int = 0) -> pd.Series:
    """Encode a text column as integer codes.

    Values are compared case-insensitively and with surrounding whitespace
    removed. Anything that is not a key of ``codes`` (for example 'missing',
    'unknown', '?' or NaN) is encoded as ``default``.

    The function is idempotent: a column that already holds the integer
    codes is returned unchanged, so re-running the cell does not wipe the
    data.

    Parameters
    ----------
    column : pd.Series
        Raw column to encode.
    codes : dict
        Mapping from lower-case label to integer code.
    default : int, default 0
        Code assigned to unrecognised or missing values.

    Returns
    -------
    pd.Series
        Integer-typed series aligned with ``column``.
    """
    # Also accept the string form of each code ('0', '1', ...). Without this,
    # a second run would see '0'/'1', match nothing, and set every row to
    # `default`. Explicit labels in `codes` take precedence on any clash.
    lookup = {**{str(code): code for code in codes.values()}, **codes}
    return (
        column
        .astype(str)
        .str.strip()
        .str.lower()
        .map(lookup)        # labels not in `lookup` become NaN here
        .fillna(default)
        .astype(int)
    )


# Columns that are truly binary (yes/no)
binary_yes_no_cols = ['change', 'diabetes_med']

# Columns with 'high', 'normal', 'no' (ordinal mapping)
ordinal_cols = ['glucose_test', 'A1Ctest']

for col in binary_yes_no_cols:
    data[col] = encode_categories(data[col], {'yes': 1, 'no': 0})

for col in ordinal_cols:
    # 'no' and any unrecognised value both end up as 0
    data[col] = encode_categories(data[col], {'no': 0, 'normal': 1, 'high': 2})

data.head()

Unnamed: 0,age,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,medical_specialty,diag_1,...,diag_3,glucose_test,A1Ctest,change,diabetes_med,readmitted,age_mid,total_visits,procedure_intensity,medication_per_day
0,[70-80),8,72,1,18,2,0,0,Missing,Circulatory,...,Other,0,0,0,1,0,75,2,8.0,2.0
1,[70-80),3,34,2,13,0,0,0,Other,Other,...,Other,0,0,0,1,0,75,0,8.5,3.25
2,[50-60),5,45,0,18,0,0,0,Missing,Circulatory,...,Circulatory,0,0,1,1,1,55,0,7.5,3.0
3,[70-80),2,36,0,12,1,0,0,Missing,Circulatory,...,Diabetes,0,0,1,1,1,75,1,12.0,4.0
4,[60-70),1,42,0,7,0,0,0,InternalMedicine,Other,...,Respiratory,0,0,0,1,0,65,0,21.0,3.5


In [18]:
# --------------------------------------
# STEP 6: SAVE CLEANED DATA
# --------------------------------------

# A relative file name is used so the notebook does not depend on a
# machine-specific absolute path.
output_path = "hospital_readmissions_cleaned.csv"

# index=False keeps the DataFrame's row index out of the CSV.
data.to_csv(path_or_buf=output_path, index=False)

print("\nCleaned dataset saved to: " + output_path)

# Optional (Google Colab only): download the saved file to your machine.
# from google.colab import files
# files.download(output_path)


Cleaned dataset saved to: hospital_readmissions_cleaned.csv
