In [1]:
# Mount to Google Drive
from google.colab import drive
drive.mount("/content/drive", force_remount=True)
FOLDERNAME = "poster"
%cd drive/MyDrive/$FOLDERNAME

Mounted at /content/drive
/content/drive/MyDrive/poster


In [19]:
import pandas as pd
import matplotlib.pyplot as plt

def analyze_label_distribution(csv_file):
    # Load CSV
    df = pd.read_csv(csv_file)

    # Filter out IDs without an underscore
    df = df[df['id'].str.contains('_')]

    # Extract Scan ID and Slice Index
    df[['ScanID', 'SliceIndex']] = df['id'].str.rsplit('_', n=1, expand=True)
    df['SliceIndex'] = df['SliceIndex'].astype(int)

    # Count total slices per scan
    scan_counts = df.groupby('ScanID')['SliceIndex'].count()

    # Count slices with appendicitis (Label = 1) per scan
    positive_counts = df[df['label'] == 1].groupby('ScanID')['SliceIndex'].count()

    # Merge counts and compute percentage
    distribution = pd.DataFrame({'TotalSlices': scan_counts, 'PositiveSlices': positive_counts}).fillna(0)
    distribution['PositivePercentage'] = (distribution['PositiveSlices'] / distribution['TotalSlices']) * 100

    # Get first and last occurrence of label = 1 for each scan
    first_positive_slice = df[df['label'] == 1].groupby('ScanID')['SliceIndex'].min()
    last_positive_slice = df[df['label'] == 1].groupby('ScanID')['SliceIndex'].max()

    # Add to distribution dataframe
    distribution['FirstPositiveSlice'] = first_positive_slice
    distribution['LastPositiveSlice'] = last_positive_slice

    # Fill NaN values with -1 (for scans with no positive slices)
    distribution[['FirstPositiveSlice', 'LastPositiveSlice']] = distribution[['FirstPositiveSlice', 'LastPositiveSlice']].fillna(-1)

    # Print summary statistics
    print("Summary Statistics:")
    print(distribution.describe())

    # Find ScanIDs where FirstPositiveSlice max
    scans_firstmax = distribution[distribution['FirstPositiveSlice'] == distribution['FirstPositiveSlice'].max()].index.tolist()

    print(f"ScanIDs where FirstPositiveSlice:{distribution['FirstPositiveSlice'].max()}")
    print(scans_firstmax)

    return distribution


In [20]:
# Example usage:
csv_path = "dataset/500_healthy_appen_new.csv"
label_distribution = analyze_label_distribution(csv_path)

Summary Statistics:
       TotalSlices  PositiveSlices  PositivePercentage  FirstPositiveSlice  \
count   500.000000      500.000000          500.000000          500.000000   
mean     96.118000        4.152000            4.303884           36.666000   
std      12.367133        2.868685            2.830808            8.967009   
min      71.000000        1.000000            0.621118           18.000000   
25%      89.000000        2.000000            2.168069           31.000000   
50%      94.500000        3.000000            3.571429           36.000000   
75%     100.000000        5.000000            5.698145           42.000000   
max     163.000000       17.000000           15.555556           73.000000   

       LastPositiveSlice  
count         500.000000  
mean           39.818000  
std             9.349637  
min            19.000000  
25%            33.000000  
50%            39.000000  
75%            45.000000  
max            79.000000  
ScanIDs where FirstPositiveSlice:7

In [21]:
# Example usage:
csv_path = "dataset/TrainValid_ground_truth.csv"
label_distribution = analyze_label_distribution(csv_path)

Summary Statistics:
       TotalSlices  PositiveSlices  PositivePercentage  FirstPositiveSlice  \
count  1000.000000     1000.000000         1000.000000         1000.000000   
mean     95.203000        4.498000            4.794409           15.174000   
std      10.889186        4.978834            5.322019           16.884455   
min      51.000000        0.000000            0.000000           -1.000000   
25%      89.000000        0.000000            0.000000           -1.000000   
50%      94.000000        1.500000            1.530612            5.500000   
75%     100.000000        9.000000            9.183673           31.000000   
max     163.000000       20.000000           21.505376           60.000000   

       LastPositiveSlice  
count        1000.000000  
mean           19.175000  
std            20.819842  
min            -1.000000  
25%            -1.000000  
50%            10.000000  
75%            39.000000  
max            66.000000  
ScanIDs where FirstPositiveSlice:6