# Mammography data Analysis

The goal of the project is to find the Cancer type, position and radious of the cyst from mammography

### Init

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import math
import seaborn as sns
import cv2
import os

### Functions

In [None]:
def load_data(file_path):
    df = pd.read_csv(file_path, sep=' ', header=None)
    df.columns = ['ImageID', 'BreastDensity', 'AbnormalityType', 'Assessment', 'X', 'Y', 'Radius']
    return df

def initial_analysis(df):
    print("\n--- First 5 rows of the DataFrame ---")
    print(df.head(5)) # Display the first 5 rows of the DataFrame
    print("\n--- Dataset Info ---")
    print(df.info()) # Display information about the DataFrame
    print("\nTotal NaN values in DataFrame:", df.isna().sum().sum())  # Count all NaNs
    print("\n--- Unique Categories in Each Column ---")



def convert_coordinates_to_float(df):
    # Convert specified columns to float, handling errors
    df_copy = df.copy()
    for col in ['X', 'Y', 'Radius']:
        if col in df_copy.columns:
            df_copy[col] = pd.to_numeric(df_copy[col], errors='coerce').astype(float)
    return df_copy

def fill_nan(df):
    df_copy = df.copy()

    # Fill numeric coordinate/size columns with 0
    for col in ['X', 'Y', 'Radius']:
        if col in df_copy.columns:
            df_copy[col] = df_copy[col].fillna(0)

    # Fill Assessment with 'N' for Normal
    if 'Assessment' in df_copy.columns:
        df_copy['Assessment'] = df_copy['Assessment'].fillna('N')

    return df_copy


In [None]:
def initial_plot(df, figsize=(5, 4), dropna=False, max_cols_per_row=4):
    # Columns to include
    include_cols = ['Assessment', 'BreastDensity', 'AbnormalityType']

    # Filter to keep only columns present in the DataFrame
    categorical_cols = [col for col in include_cols if col in df.columns]

    if not categorical_cols:
        print("No specified columns found in the DataFrame.")
        return

    # Grid dimensions
    total_plots = len(categorical_cols)
    n_rows = math.ceil(total_plots / max_cols_per_row)
    n_cols = min(total_plots, max_cols_per_row)

    # Create figure and axes
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(figsize[0] * n_cols, figsize[1] * n_rows))
    axes = axes.flatten() if total_plots > 1 else [axes]

    # Plot each categorical column
    for i, col in enumerate(categorical_cols):
        counts = df[col].value_counts(dropna=dropna)
        axes[i].bar(counts.index.astype(str), counts.values)
        axes[i].set_title(col)
        axes[i].set_xlabel('')
        axes[i].set_ylabel('Count')
        axes[i].tick_params(axis='x', rotation=45)

    # Hide unused subplots
    for j in range(i + 1, len(axes)):
        axes[j].axis('off')

    plt.tight_layout()
    plt.show()

In [None]:
def enforce_numeric_coords(df):
    for col in ["X", "Y", "Radius"]:
        df[col] = pd.to_numeric(df[col], errors='coerce').astype(float)
    return df

In [None]:

def plot_breastdensity_vs_abnormalitytype(df):
    """Plot Breast Density vs Abnormality Type heatmap."""
    plt.figure(figsize=(8,5))
    ct_density_abn = pd.crosstab(df["BreastDensity"], df["AbnormalityType"])
    sns.heatmap(ct_density_abn, annot=True, fmt="d", cmap="YlGnBu")
    plt.title("Breast Density vs Abnormality Type")
    plt.ylabel("Breast Density")
    plt.xlabel("Abnormality Type")
    plt.show()


In [None]:
def plot_abnormalitytype_vs_assessment(df):
    """Plot Abnormality Type vs Assessment stacked bar."""
    plt.figure(figsize=(8,5))
    ct_abn_assess = pd.crosstab(df["AbnormalityType"], df["Assessment"])
    ct_abn_assess.plot(kind="bar", stacked=True, colormap="coolwarm", figsize=(8,5))
    plt.title("Abnormality Type vs Assessment")
    plt.ylabel("Count")
    plt.xlabel("Abnormality Type")
    plt.xticks(rotation=45)
    plt.show()


In [None]:
def plot_breastdensity_vs_assessment(df):
    """Plot Breast Density vs Assessment grouped bar."""
    plt.figure(figsize=(6,4))
    sns.countplot(data=df.dropna(subset=["Assessment"]), 
                  x="BreastDensity", hue="Assessment", palette="Set2")
    plt.title("Breast Density vs Assessment")
    plt.ylabel("Count")
    plt.xlabel("Breast Density")
    plt.show()


In [None]:

def plot_mias_image(df, image_folder, image_id):
    """
    Plots one MIAS dataset image with abnormality marked.
    
    df            : DataFrame containing parsed MIAS data
    image_folder  : Path to folder containing PGM images
    image_id      : ImageID from the DataFrame (e.g., 'mdb001')
    """
    
    # Get the row for this image
    row = df[df["ImageID"] == image_id].iloc[0]
    
    # Build image file path
    img_path = os.path.join(image_folder, f"{image_id}.pgm")
    
    # Read image
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    
    # Plot image
    plt.figure(figsize=(6, 6))
    plt.imshow(img, cmap="gray")
    plt.title(f"{image_id} | {row['BreastDensity']} | {row['AbnormalityType']} | {row['Assessment']}")
    
    # Draw circle only if Radius > 0
    if row["Radius"] > 0:
        circle = plt.Circle((row["X"], img.shape[0] - row["Y"]),  # Flip Y for image coordinates
                            row["Radius"], 
                            color='red', fill=False, linewidth=2)
        plt.gca().add_patch(circle)
    
    plt.axis("off")
    plt.show()


### load data

In [None]:
df = load_data('C:/Users/Sreerag/Documents/ML_chellange/Detection-of-Breast-Abnormalities/Detection-of-Breast-Abnormalities-from-Mammogram/Data/Raw_data/Parse_file/Parse_data.txt')

### Initial Analysis

Analysing for nan and empty values

In [None]:
initial_analysis(df) # Initial analysis of the DataFrame

Data Descriptions

- **X**  
  The horizontal coordinate (in pixels) of the abnormality's center within the mammogram image.

- **Y**  
  The vertical coordinate (in pixels) of the abnormality's center within the mammogram image.

- **Radius**  
  The approximate radius (in pixels) of the abnormality region, measured from the center `(X, Y)` to the boundary.  
  This is useful for defining a region of interest (ROI) for further analysis, such as cropping or segmentation.


In [None]:
initial_plot(df)

### Preprocessing

Filling nan and empty data with suitable values

In this dataset, the X, Y, and Radius columns represent the coordinates and size of a detected abnormality (tumor).
When there is no tumor present, these values are not applicable, and are recorded as NaN.
Since NaN here specifically means “no tumor,” replacing these missing values with 0 is appropriate —
it clearly indicates the absence of a tumor and keeps the columns in a numeric format, which is important for further analysis and modeling. Also, replace the nan in assesment with "N" means norman because all other images without B or M comes under Normal so replace with N (normal).


In [None]:
df = fill_nan(df)

Recheck for nan

In [None]:
initial_analysis(df)  # Re-check after filling NaN values

In [None]:
enforce_numeric_coords(df)  # Ensure numeric coordinates
df.info()  # Display DataFrame info after enforcing numeric coordinates

Initial analysis of data after preprocessing

In [None]:
initial_plot(df)

### Save data

In [None]:
path = "C:/Users/Sreerag/Documents/ML_chellange/Detection-of-Breast-Abnormalities/Detection-of-Breast-Abnormalities-from-Mammogram/Data/Preprocessed_data/Parse_file/Parse_data_preprocessed"
df.to_csv(path, index=False)

### Exploratory Data Analysis (EDA)

Targets
1) Cancer type
2) X, Y, Radious

In [None]:
plot_breastdensity_vs_abnormalitytype(df)


D – Dense-glandular, 
F – Fatty ,
G – Fatty-glandular ,

images of Dense-grandular and Fatty type have more upnormalities in this dataset.


In [None]:
plot_abnormalitytype_vs_assessment(df)

MIAS Abnormality Categories

- **CALC** – Calcification (tiny calcium deposits, may be benign or malignant)  
- **CIRC** – Well-defined/circumscribed mass (round/oval with clear edges)  
- **SPIC** – Spiculated mass (irregular shape with radiating lines, often malignant)  
- **MISC** – Other ill-defined mass (not clearly round/oval, fuzzy borders)  
- **ARCH** – Architectural distortion (normal breast structure is disrupted)  
- **ASYM** – Asymmetry (one breast shows more tissue density than the other)  
- **NORM** – Normal (no abnormality detected)  

The chance of tumor being manignen is less for CIRC condeation compaired to all other upnormalities

In [None]:
plot_breastdensity_vs_assessment(df)

The dataset have similar quantity of images with tumors for each brest density category. 

### Image analysis

Plot image 

In [None]:
Image_folder = "C:/Users/Sreerag/Documents/ML_chellange/Detection-of-Breast-Abnormalities/Detection-of-Breast-Abnormalities-from-Mammogram/Data/Raw_data/Images"
plot_mias_image(df, Image_folder, 'mdb002')  # Example image ID

## Model

Imports


In [49]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
from sklearn.model_selection import train_test_split

In [None]:
# ===== 1. Dataset Class =====
class MIASDataset(Dataset):
    def __init__(self, df, image_folder, transform=None):
        self.df = df.reset_index(drop=True)
        self.image_folder = image_folder
        self.transform = transform
        self.label_map = {label: idx for idx, label in enumerate(df['Assessment'].unique())}

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img_path = os.path.join(self.image_folder, row['ImageID'] + ".pgm")
        image = Image.open(img_path).convert("L")  # grayscale

        if self.transform:
            image = self.transform(image)

        label = self.label_map[row['Assessment']]
        return image, label

In [42]:
# ===== 2. Data Preprocessing =====
def create_datasets(df, image_folder, test_size=0.3):
    transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.Grayscale(num_output_channels=3),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ColorJitter(brightness=0.1, contrast=0.1),
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5])
])

    train_df, test_df = train_test_split(df, test_size=test_size, stratify=df['Assessment'], random_state=42)
    train_dataset = MIASDataset(train_df, image_folder, transform=transform)
    test_dataset = MIASDataset(test_df, image_folder, transform=transform)

    return train_dataset, test_dataset


In [43]:
# ===== 3. Model Setup =====
def build_model(num_classes):
    model = models.resnet18(pretrained=True)
    model.fc = nn.Linear(model.fc.in_features, num_classes)
    return model

In [44]:
# ===== 4. Training =====
def train_model(model, train_loader, criterion, optimizer, device, epochs=5):
    model.train()
    for epoch in range(epochs):
        running_loss = 0
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f"Epoch [{epoch+1}/{epochs}] - Loss: {running_loss/len(train_loader):.4f}")


In [45]:
# ===== 5. Evaluation =====
def evaluate_model(model, test_loader, device):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, preds = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (preds == labels).sum().item()
    print(f"Test Accuracy: {100 * correct / total:.2f}%")

In [46]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_dataset, test_dataset = create_datasets(df, Image_folder)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [47]:
model = build_model(num_classes=len(df['Assessment'].unique())).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)



In [48]:
train_model(model, train_loader, criterion, optimizer, device, epochs=5)

Epoch [1/5] - Loss: 0.9990
Epoch [2/5] - Loss: 0.8958
Epoch [3/5] - Loss: 0.8545
Epoch [4/5] - Loss: 0.8548
Epoch [5/5] - Loss: 0.8065


In [58]:
evaluate_model(model, test_loader, device)

Test Accuracy: 62.63%
