# Hospital Readmission - Exploratory Data Analysis

This notebook explores the UCI Diabetes 130-US Hospitals dataset (`../data/diabetic_data.csv`) to understand:
1. Data structure and quality
2. Feature distributions
3. Readmission patterns
4. Key risk factors

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Plot appearance: apply the matplotlib style sheet first, then the seaborn
# palette, so the palette's colour cycle is the one left in effect.
plt.style.use("seaborn-v0_8")
sns.set_palette(palette="husl")

# Simple confirmation that the setup cell ran to completion.
print("Libraries imported successfully!")

## 1. Load the Dataset

In [None]:
# Load the dataset.
# The UCI diabetes CSV is documented as marking missing entries with a literal
# '?' instead of an empty field. Mapping '?' to NaN at read time lets the
# isnull()-based checks further down count those entries as missing.
# NOTE(review): confirm the local copy of the file uses the same '?' marker.
df = pd.read_csv('../data/diabetic_data.csv', na_values='?')

print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
print("\nFirst few rows:")
# Last expression in the cell: rendered as a rich table by the notebook.
df.head()

## 2. Data Overview

In [None]:
# High-level summary: row/column counts and how many columns use each dtype.
print("Dataset Info:")
print("=" * 50)
print(f"Total records: {df.shape[0]:,}")
print(f"Total features: {df.shape[1]}")
print("\nData types:")
dtype_counts = df.dtypes.value_counts()
print(dtype_counts)

# Per-column NaN counts; only columns with at least one NaN are shown.
print("\nMissing values per column:")
missing = df.isna().sum()
print(missing.loc[missing.gt(0)])

## 3. Target Variable Analysis (Readmission)

In [None]:
# Target column breakdown: absolute counts first, then the same split in percent.
readmitted = df['readmitted']

print("Readmission Distribution:")
print(readmitted.value_counts())

print("\nPercentages:")
print(readmitted.value_counts(normalize=True) * 100)

## 4. Feature Analysis

In [None]:
# Bar chart of record counts per age group, ordered by the group label.
fig, ax = plt.subplots(figsize=(10, 6))
age_counts = df['age'].value_counts().sort_index()
age_counts.plot(kind='bar', ax=ax)
ax.set_title('Age Distribution')
ax.set_xlabel('Age Group')
ax.set_ylabel('Count')
# Tilt the x tick labels by 45 degrees.
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

## 5. Next Steps

1. Clean the data
2. Handle missing values
3. Encode categorical variables
4. Train ML models