In [3]:
import pandas as pd
import os

# Folder holding the report CSVs. The default is the path confirmed to work on
# the original machine; set the REPORTS_DIR environment variable to run this
# notebook elsewhere without editing the code.
BASE_PATH = os.environ.get(
    "REPORTS_DIR",
    r"C:\Users\rthakre\reports-20251124T181120Z-1-001\reports",
)
TRAIN_FILE = os.path.join(BASE_PATH, "train_reports.csv")

# Read just the first few rows: this cell only lists the column names, so
# parsing the whole file is unnecessary.
df = pd.read_csv(TRAIN_FILE, nrows=5)

print("--- COLUMN NAMES IN YOUR FILE ---")
print(df.columns.tolist())
print("---------------------------------")
print("Look for the one that contains the text report.")

--- COLUMN NAMES IN YOUR FILE ---
['uid', 'filename', 'projection', 'findings_final', 'impression_final', 'full_report', 'findings_len', 'impression_len', 'full_report_len', 'MeSH', 'Problems', 'split']
---------------------------------
Look for the one that contains the text report.


In [4]:
# Cell: Task 1 - Frequency Baseline (FIXED NLTK ERROR)

import pandas as pd
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from collections import Counter
import numpy as np
import os

# =============================================================================
# FIX: DOWNLOAD MISSING NLTK DATA
# =============================================================================
# Probe for each tokenizer resource in turn and download only those that are
# not already installed. nltk.data.find raises LookupError for a missing one.
for resource_path, package_name in (
    ("tokenizers/punkt", "punkt"),
    ("tokenizers/punkt_tab", "punkt_tab"),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name)

# =============================================================================
# 1. CONFIGURATION
# =============================================================================
# Path to the report folder. The default is the original local path; set the
# REPORTS_DIR environment variable to point at the data on another machine.
BASE_PATH = os.environ.get(
    "REPORTS_DIR",
    r"C:\Users\rthakre\reports-20251124T181120Z-1-001\reports",
)

TRAIN_FILE = os.path.join(BASE_PATH, "train_reports.csv")
TEST_FILE  = os.path.join(BASE_PATH, "test_reports.csv")

# Column that holds the report text used for the baseline
TEXT_COLUMN = 'findings_final'

# =============================================================================
# 2. BASELINE LOGIC
# =============================================================================

def load_and_find_mode(train_path, text_column=None):
    """Return the most frequent non-empty report text in the training CSV.

    Reports are compared after stripping surrounding whitespace. Rows whose
    report is missing or blank are ignored when picking the mode, so that an
    empty string can never be chosen as the "prediction"; they still count
    towards the dataset size used for the printed percentage.

    Args:
        train_path: Path to the training CSV file.
        text_column: Name of the column holding the report text. When omitted,
            the notebook-level TEXT_COLUMN constant is used.

    Returns:
        The most common stripped report string, or None when the file does not
        exist or contains no non-empty report.
    """
    if text_column is None:
        text_column = TEXT_COLUMN

    print("\n1. Analyzing Training Data...")
    if not os.path.exists(train_path):
        print(f"ERROR: File not found at {train_path}")
        return None

    df = pd.read_csv(train_path)

    # Drop missing values first, then discard reports that are only whitespace.
    reports = df[text_column].dropna().astype(str).str.strip()
    non_empty_reports = reports[reports != ""]
    if non_empty_reports.empty:
        print(f"ERROR: No non-empty reports found in column '{text_column}'")
        return None

    # Find the most common report text
    counts = Counter(non_empty_reports.tolist())
    most_common_text, frequency = counts.most_common(1)[0]

    # len(df) is > 0 here because at least one non-empty report exists.
    percentage = (frequency / len(df)) * 100

    print(f"   Most frequent report appears {frequency} times ({percentage:.2f}% of dataset).")
    print(f"   Text: '{most_common_text[:100]}...'")

    return most_common_text

def evaluate_baseline(test_path, baseline_prediction):
    """Score one fixed prediction against every reference report in the test CSV.

    The same `baseline_prediction` text is used as the hypothesis for every
    row. Rows whose reference report is missing or blank are excluded, because
    there is nothing to compare against (the KNN cell of this notebook likewise
    drops rows without report text).

    Args:
        test_path: Path to the test CSV file.
        baseline_prediction: The report text predicted for every test row.

    Returns:
        A tuple (mean BLEU-1, mean BLEU-4, mean ROUGE-L F-measure).
        (None, None, None) when the file does not exist; a tuple of NaNs when
        the file holds no non-empty reference report.
    """
    print("\n2. Evaluating on Test Data...")
    if not os.path.exists(test_path):
        print(f"ERROR: File not found at {test_path}")
        return None, None, None

    df = pd.read_csv(test_path)

    references = df[TEXT_COLUMN].dropna().astype(str).str.strip()
    ground_truth = references[references != ""].tolist()

    skipped = len(df) - len(ground_truth)
    if skipped:
        print(f"   Skipping {skipped} rows without reference text.")

    if not ground_truth:
        # Avoid np.mean([]) and its RuntimeWarning; NaN signals "no data".
        print("   No reference reports to compare against.")
        nan = float("nan")
        return nan, nan, nan

    print(f"   Comparing {len(ground_truth)} predictions...")

    # Metrics
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    smooth = SmoothingFunction().method1

    # The "model" predicts the same text for every row, so tokenize it once
    # instead of once per reference.
    hyp_tokens = nltk.word_tokenize(baseline_prediction.lower())

    bleu1_scores = []
    bleu4_scores = []
    rouge_scores = []

    for ref in ground_truth:
        ref_tokens = nltk.word_tokenize(ref.lower())

        # BLEU-1
        bleu1_scores.append(sentence_bleu([ref_tokens], hyp_tokens, weights=(1, 0, 0, 0), smoothing_function=smooth))

        # BLEU-4
        bleu4_scores.append(sentence_bleu([ref_tokens], hyp_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth))

        # ROUGE-L (scored on the raw strings; the scorer does its own tokenization)
        r_score = scorer.score(ref, baseline_prediction)
        rouge_scores.append(r_score['rougeL'].fmeasure)

    return np.mean(bleu1_scores), np.mean(bleu4_scores), np.mean(rouge_scores)

# Run the main logic
if os.path.exists(TRAIN_FILE) and os.path.exists(TEST_FILE):
    # 1. Train (Find the most common sentence)
    baseline_text = load_and_find_mode(TRAIN_FILE)

    if baseline_text:
        # 2. Test (Score that sentence against the test set)
        b1, b4, rouge = evaluate_baseline(TEST_FILE, baseline_text)

        if b1 is None:
            # evaluate_baseline returns None scores when it cannot read the file.
            print("ERROR: Evaluation did not produce scores.")
        else:
            print("\n" + "="*40)
            print(" FINAL BASELINE SCORES (Frequency Model)")
            print("="*40)
            print(f"BLEU-1:  {b1:.4f}")
            print(f"BLEU-4:  {b4:.4f}")
            print(f"ROUGE-L: {rouge:.4f}")
            print("="*40)
    else:
        print("ERROR: No usable baseline report found in the training data; skipping evaluation.")
else:
    # Say which input is missing instead of finishing silently.
    for label, path in (("Train", TRAIN_FILE), ("Test", TEST_FILE)):
        if not os.path.exists(path):
            print(f"ERROR: {label} file not found at {path}")


1. Analyzing Training Data...
   Most frequent report appears 265 times (5.07% of dataset).
   Text: 'normal...'

2. Evaluating on Test Data...
   Comparing 1121 predictions...

🏆 FINAL BASELINE SCORES (Frequency Model)
BLEU-1:  0.0473
BLEU-4:  0.0084
ROUGE-L: 0.1020


In [5]:
# Part 2: KNN baseline

In [10]:
import sys
import subprocess

try:
    # Invoke pip through the interpreter running this kernel so the package
    # lands in the same environment the notebook imports from.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "scikit-learn"])
    print(" scikit-learn installed!")
except (subprocess.CalledProcessError, OSError) as exc:
    # Catch only what a failed pip run / failed process launch can raise, so
    # interrupting the cell still works, and show why the install failed.
    print(" Installation failed.")
    print(f"   Reason: {exc}")

✅ scikit-learn installed!


In [11]:
# Cell: Task 2 - KNN Baseline (Pixel-Match Version)
# This version DOES NOT require PyTorch, avoiding the DLL error.

import os
import pandas as pd
import numpy as np
from PIL import Image
from sklearn.neighbors import NearestNeighbors
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from tqdm import tqdm

# =============================================================================
# 1. CONFIGURATION
# =============================================================================
# Report CSV folder. The default is the original local path; set the
# REPORTS_DIR environment variable to point at the data on another machine.
REPORT_PATH = os.environ.get(
    "REPORTS_DIR",
    r"C:\Users\rthakre\reports-20251124T181120Z-1-001\reports",
)
TRAIN_CSV = os.path.join(REPORT_PATH, "train_reports.csv")
TEST_CSV  = os.path.join(REPORT_PATH, "test_reports.csv")

# Image folder with "train" and "test" sub-directories. Override with the
# IMAGES_DIR environment variable.
IMAGE_ROOT = os.environ.get(
    "IMAGES_DIR",
    r"C:\Users\rthakre\images-20251124T181116Z-1-001\images",
)
TRAIN_IMG_DIR = os.path.join(IMAGE_ROOT, "train")
TEST_IMG_DIR = os.path.join(IMAGE_ROOT, "test")

# Column names in the report CSVs
TEXT_COL = 'findings_final'
FILENAME_COL = 'filename'

# =============================================================================
# 2. HELPER FUNCTIONS
# =============================================================================

def load_and_flatten_images(image_dir, filenames, target_size=(64, 64)):
    """Load images as flattened grayscale thumbnails to use as raw-pixel features.

    Each image is opened, converted to grayscale, resized to `target_size` and
    flattened into a 1-D vector. Images that cannot be found (even after
    trying a ".png" suffix) or cannot be decoded are skipped; a summary of the
    skipped files is printed so that dropped rows do not go unnoticed.

    Args:
        image_dir: Directory containing the image files.
        filenames: Sequence of file names, joined onto `image_dir`.
        target_size: (width, height) passed to PIL's resize. Defaults to the
            64x64 thumbnails used by this notebook.

    Returns:
        A tuple (matrix, valid_indices): `matrix` is a NumPy array with one
        flattened image per row, and `valid_indices` lists the positions in
        `filenames` that were loaded, in the same order as the rows.
    """
    matrix = []
    valid_indices = []
    skipped = []  # (filename, reason) for every image that could not be used

    print(f"Processing {len(filenames)} images from {image_dir}...")

    for idx, fname in enumerate(tqdm(filenames)):
        img_path = os.path.join(image_dir, fname)

        # Handle missing extensions if necessary
        if not os.path.exists(img_path):
            if os.path.exists(img_path + ".png"):
                img_path += ".png"
            else:
                skipped.append((fname, "file not found"))
                continue

        try:
            # The context manager closes the underlying file even when
            # decoding fails part-way through.
            with Image.open(img_path) as img:
                # Grayscale (L) -> thumbnail -> single vector of pixel values
                pixels = np.array(img.convert("L").resize(target_size)).flatten()
        except Exception as exc:
            # Best effort: one unreadable image should not abort the whole run.
            skipped.append((fname, f"{type(exc).__name__}: {exc}"))
            continue

        matrix.append(pixels)
        valid_indices.append(idx)

    if skipped:
        print(f"Skipped {len(skipped)} of {len(filenames)} images. First few:")
        for fname, reason in skipped[:5]:
            print(f"   {fname}: {reason}")

    return np.array(matrix), valid_indices

def calculate_metrics(references, hypotheses):
    """Compute mean BLEU-1, BLEU-4 and ROUGE-L over paired reports.

    Args:
        references: Sequence of ground-truth report strings.
        hypotheses: Sequence of predicted report strings, aligned one-to-one
            with `references`.

    Returns:
        A tuple (mean BLEU-1, mean BLEU-4, mean ROUGE-L F-measure). A tuple of
        NaNs is returned when there are no pairs to score.

    Raises:
        ValueError: If the two sequences differ in length. Without this check
            zip() would silently score only the shorter prefix.
    """
    references = list(references)
    hypotheses = list(hypotheses)
    if len(references) != len(hypotheses):
        raise ValueError(
            f"references and hypotheses must have the same length "
            f"(got {len(references)} and {len(hypotheses)})"
        )

    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    smooth = SmoothingFunction().method1

    b1_scores, b4_scores, rouge_scores = [], [], []

    print("Calculating scores...")
    if not references:
        # Avoid np.mean([]) and its RuntimeWarning; NaN signals "no data".
        nan = float("nan")
        return nan, nan, nan

    for ref, hyp in zip(references, hypotheses):
        # BLEU works on lower-cased word tokens
        ref_tok = nltk.word_tokenize(ref.lower())
        hyp_tok = nltk.word_tokenize(hyp.lower())

        b1_scores.append(sentence_bleu([ref_tok], hyp_tok, weights=(1,0,0,0), smoothing_function=smooth))
        b4_scores.append(sentence_bleu([ref_tok], hyp_tok, weights=(0.25,0.25,0.25,0.25), smoothing_function=smooth))
        # ROUGE-L is scored on the raw strings; the scorer tokenizes internally
        rouge_scores.append(scorer.score(ref, hyp)['rougeL'].fmeasure)

    return np.mean(b1_scores), np.mean(b4_scores), np.mean(rouge_scores)

# =============================================================================
# 3. MAIN LOGIC
# =============================================================================

# A. Load Data
print("\n1. Loading CSVs...")
if os.path.exists(TRAIN_CSV) and os.path.exists(TEST_CSV):
    # Rows without a report or without an image filename cannot be used.
    train_df = pd.read_csv(TRAIN_CSV).dropna(subset=[TEXT_COL, FILENAME_COL])
    test_df = pd.read_csv(TEST_CSV).dropna(subset=[TEXT_COL, FILENAME_COL])

    # B. Process Images (Flatten Pixels)
    print("\n2. extracting 'pixel features' (No GPU needed)...")
    train_feats, train_indices = load_and_flatten_images(TRAIN_IMG_DIR, train_df[FILENAME_COL].tolist())
    # Keep only the rows whose image loaded, so row i of the frame matches
    # row i of the feature matrix.
    train_df_clean = train_df.iloc[train_indices].reset_index(drop=True)

    test_feats, test_indices = load_and_flatten_images(TEST_IMG_DIR, test_df[FILENAME_COL].tolist())
    test_df_clean = test_df.iloc[test_indices].reset_index(drop=True)

    if len(train_feats) > 0 and len(test_feats) > 0:
        # Every lookup below relies on features and reports being aligned.
        assert len(train_feats) == len(train_df_clean), "train features/reports misaligned"
        assert len(test_feats) == len(test_df_clean), "test features/reports misaligned"

        # C. Find Nearest Neighbors based on Pixels
        print(f"\n3. Finding closest matches among {len(train_feats)} training images...")
        # 'cityblock' (Manhattan distance): sum of absolute pixel differences
        knn = NearestNeighbors(n_neighbors=1, metric='cityblock')
        knn.fit(train_feats)

        distances, indices = knn.kneighbors(test_feats)

        # D. Generate Report Predictions
        # indices has one column (n_neighbors=1): the position of the closest
        # training image. Reuse that image's report as the prediction, looked
        # up in a single positional selection instead of row by row.
        nearest_train_rows = indices[:, 0]
        predictions = train_df_clean[TEXT_COL].iloc[nearest_train_rows].tolist()

        # E. Score
        ground_truth = test_df_clean[TEXT_COL].tolist()
        b1, b4, rouge = calculate_metrics(ground_truth, predictions)

        print("\n" + "="*40)
        print(" FINAL KNN BASELINE SCORES")
        print("="*40)
        print(f"BLEU-1:  {b1:.4f}")
        print(f"BLEU-4:  {b4:.4f}")
        print(f"ROUGE-L: {rouge:.4f}")
        print("="*40)
        print("This is the score to beat. If the complex model is lower than this,")
        print("it means the AI isn't even looking at the image correctly.")
    else:
        print("Error: No images were successfully loaded. Check paths.")
else:
    print("Error: CSV files not found.")


1. Loading CSVs...

2. extracting 'pixel features' (No GPU needed)...
Processing 5223 images from C:\Users\rthakre\images-20251124T181116Z-1-001\images\train...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5223/5223 [00:15<00:00, 340.16it/s]


Processing 1121 images from C:\Users\rthakre\images-20251124T181116Z-1-001\images\test...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1121/1121 [00:04<00:00, 263.70it/s]



3. Finding closest matches among 5223 training images...
Calculating scores...

🏆 FINAL KNN BASELINE SCORES
BLEU-1:  0.2618
BLEU-4:  0.0636
ROUGE-L: 0.2107
This is the score to beat. If the complex model is lower than this,
it means the AI isn't even looking at the image correctly.


In [12]:
#####

C:\Users\rthakre
