# 🧪 Exploratory Data Analysis - Medical Diagnostic System

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from PIL import Image
import numpy as np

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's "whitegrid" style to the plots drawn below.
sns.set(style="whitegrid")


## 🔍 Dataset Overview

In [None]:
# Sample path structure (update with your own path)
image_folder = './data/train/NORMAL'

# Count number of files.
# os.listdir returns entries in arbitrary order and may include stray
# non-image files (e.g. .DS_Store), so keep only image files and sort them
# to make both the count and the preview reproducible.
image_exts = ('.jpeg', '.jpg', '.png', '.bmp', '.gif', '.tif', '.tiff')
normal_files = sorted(
    name for name in os.listdir(image_folder)
    if name.lower().endswith(image_exts)
)
print(f"Number of NORMAL images: {len(normal_files)}")

# Show a few images
fig, axs = plt.subplots(1, 5, figsize=(15,3))
for ax, file in zip(axs, normal_files[:5]):
    # Image.open is lazy and keeps the file handle open; the context manager
    # releases it once imshow has copied the pixel data.
    with Image.open(os.path.join(image_folder, file)) as img:
        ax.imshow(img, cmap='gray')
# Hide the axes on every panel, including any left empty when the folder
# holds fewer than five images.
for ax in axs:
    ax.axis('off')
plt.suptitle("Sample NORMAL Images")
plt.show()


## 📊 Class Distribution

In [None]:
# Assuming data is structured as ./data/train/NORMAL and ./data/train/PNEUMONIA
data_dir = './data/train'

# Only sub-directories are classes. Plain files sitting next to them
# (e.g. .DS_Store, a README) would make the nested os.listdir raise
# NotADirectoryError. Sorting keeps the bar order stable between runs.
class_names = sorted(
    entry for entry in os.listdir(data_dir)
    if os.path.isdir(os.path.join(data_dir, entry))
)
class_counts = {
    cls: len(os.listdir(os.path.join(data_dir, cls))) for cls in class_names
}

sns.barplot(x=list(class_counts.keys()), y=list(class_counts.values()))
plt.title("Image Count per Class")
plt.xlabel("Class")
plt.ylabel("Number of Images")
plt.show()


## 🖼 Image Dimension Analysis

In [None]:
# Collect (width, height) for a sample of images from every class folder.
dims = []
for cls in sorted(os.listdir(data_dir)):
    cls_dir = os.path.join(data_dir, cls)
    # Skip stray files in the dataset root; only directories are classes.
    if not os.path.isdir(cls_dir):
        continue
    # Sort before slicing so the sample is the same on every run
    # (os.listdir order is arbitrary).
    files = sorted(os.listdir(cls_dir))
    for file in files[:100]:  # Sample 100 images per class
        # Image.open only reads the header here; the context manager closes
        # the file handle instead of leaking one per image.
        with Image.open(os.path.join(cls_dir, file)) as img:
            dims.append(img.size)

dims_df = pd.DataFrame(dims, columns=["Width", "Height"])
# Both histograms share one axes, so label each series and add a legend;
# otherwise the x axis is auto-labelled with just one of the column names.
sns.histplot(dims_df["Width"], bins=20, kde=True, color="skyblue", label="Width")
sns.histplot(dims_df["Height"], bins=20, kde=True, color="salmon", label="Height")
plt.title("Distribution of Image Dimensions")
plt.xlabel("Pixels")
plt.legend()
plt.show()
