In [None]:
!du -sh /kaggle/input/notebook-qwen

## Installing Dependencies through a Utility Notebook

In [None]:
!pip install --no-index --find-links=/kaggle/input/notebook-qwen -U torch==2.6.0 torchaudio==2.6.0 torchvision==0.21.0 trl==0.17.0 bitsandbytes==0.45.5 pyzmq==26.4 vllm==0.8.5 

In [None]:
%cd /kaggle/input/vllm-deepth/vllm

In [4]:
# !VLLM_USE_PRECOMPILED=1 pip install --editable .

In [None]:
%cd /kaggle/working

In [None]:
!pip show transformers trl bitsandbytes torch

## Running the Actual code

In [None]:
!python /kaggle/input/qwen-confidence-try/qwen_confdemo.py

In [8]:
# vllm==0.8.5 

In [None]:
# !python /kaggle/input/qwen-kuch-4b/qwen_base.py 

## Visualise Results (Same as Sample Notebook)

In [None]:
# Add visualization functions to the notebook
import os
import json
# fake_mode is True whenever KAGGLE_IS_COMPETITION_RERUN is unset or empty.
fake_mode = not os.environ.get('KAGGLE_IS_COMPETITION_RERUN')

# In fake mode the evaluation challenges are used; otherwise the test challenges.
arc_challenge_file = (
    '/kaggle/input/arc-prize-2025/arc-agi_evaluation_challenges.json'
    if fake_mode
    else '/kaggle/input/arc-prize-2025/arc-agi_test_challenges.json'
)
training_solution_path = '/kaggle/input/arc-prize-2025/arc-agi_training_solutions.json'
evaluation_solution_path = '/kaggle/input/arc-prize-2025/arc-agi_evaluation_solutions.json'

# Parse the challenge set and both solution files into module-level objects.
with open(arc_challenge_file) as f:
    arc_data = json.load(f)
with open(training_solution_path) as f:
    training_solution = json.load(f)
with open(evaluation_solution_path) as f:
    evaluation_solution = json.load(f)
        
def visualize_arc_results(submission_path='submission.json', max_samples=10):
    """Visualize and score the ARC predictions stored in a submission file.

    The submission is read as a JSON object mapping each task id to a list
    with one entry per test input; every entry must provide the keys
    'attempt_1' and 'attempt_2'. Tasks that are not present in the
    module-level ``arc_data`` are ignored. Ground truth is looked up first in
    the module-level ``training_solution`` and then in ``evaluation_solution``.

    A prediction without any value greater than zero is treated as "no
    answer": it never counts as a match, test entries where both attempts
    are like that are not plotted, and they are excluded from the accuracy
    denominators.

    Args:
        submission_path: Path of the submission JSON file. Defaults to
            'submission.json' in the current working directory.
        max_samples: Maximum number of test entries to plot, taken after
            sorting so that entries with ground truth and the highest score
            come first. ``None`` plots every entry.

    Returns:
        None. Figures are shown with matplotlib and all statistics are
        printed. If the submission file does not exist, a message is printed
        and the function returns early. The overall accuracy summary is only
        printed when the module-level ``fake_mode`` is truthy.
    """
    import matplotlib.pyplot as plt
    from matplotlib import colors
    import json
    import os
    import numpy as np

    print("\n" + "="*80)
    print("VISUALIZING ARC SOLUTION RESULTS")
    print("="*80)

    # Nothing to visualize without a submission file.
    if not os.path.exists(submission_path):
        print(f"Submission file not found at {submission_path}")
        return

    print(f"Found submission file: {submission_path}")

    with open(submission_path, 'r') as f:
        submission_data = json.load(f)

    print(f"Loaded submission with {len(submission_data)} tasks")

    # ARC color map - one color per cell value 0-9
    cmap = colors.ListedColormap(
        ['#000000', '#0074D9', '#FF4136', '#2ECC40', '#FFDC00',
         '#AAAAAA', '#F012BE', '#FF851B', '#7FDBFF', '#870C25'])
    norm = colors.Normalize(vmin=0, vmax=9)

    def is_non_trivial_prediction(pred_array):
        """Return whether the prediction contains any value greater than zero."""
        return np.any(np.array(pred_array) > 0)

    def lookup_ground_truth(task_id, test_idx):
        """Return (found, grid) for a test entry, preferring training solutions."""
        for solutions in (training_solution, evaluation_solution):
            if task_id in solutions and len(solutions[task_id]) > test_idx:
                return True, solutions[task_id][test_idx]
        return False, None

    def is_correct(pred, ground_truth):
        """Return True only for a non-trivial prediction that equals the ground truth."""
        if not is_non_trivial_prediction(pred):
            return False
        return np.array_equal(pred, ground_truth)

    def draw_grid(ax, grid, title):
        """Render one ARC grid on the given axes without tick marks."""
        ax.imshow(grid, cmap=cmap, norm=norm)
        ax.grid(True, which='both', color='lightgrey', linewidth=0.5)
        ax.set_title(title)
        ax.set_xticks([])
        ax.set_yticks([])

    def visualize_submission_result(task_id, task_data, submission_output, test_idx):
        """Plot one test entry; return False if it was skipped as all-zero."""
        pred_1 = np.array(submission_output['attempt_1'])
        pred_2 = np.array(submission_output['attempt_2'])

        if not is_non_trivial_prediction(pred_1) and not is_non_trivial_prediction(pred_2):
            print(f"  Skipping visualization for Task {task_id} - Test #{test_idx+1} (all predictions are zeros)")
            return False

        fig = plt.figure(figsize=(15, 8))
        try:
            grid_spec = plt.GridSpec(2, 3, width_ratios=[1, 1, 1])

            # Left column: first training pair only, for context.
            if task_data['train']:
                first_pair = task_data['train'][0]
                draw_grid(fig.add_subplot(grid_spec[0, 0]), first_pair['input'], "Training Input")
                draw_grid(fig.add_subplot(grid_spec[1, 0]), first_pair['output'], "Training Output")

            # Middle column, top: the test input this prediction answers.
            if test_idx < len(task_data['test']):
                draw_grid(fig.add_subplot(grid_spec[0, 1]),
                          task_data['test'][test_idx]['input'],
                          f"Test Input (Test #{test_idx+1})")

            # Right column: both attempts.
            ax_pred_1 = fig.add_subplot(grid_spec[0, 2])
            draw_grid(ax_pred_1, pred_1, "Model Prediction (Attempt 1)")
            ax_pred_2 = fig.add_subplot(grid_spec[1, 2])
            draw_grid(ax_pred_2, pred_2, "Model Prediction (Attempt 2)")

            # Middle column, bottom: ground truth, when a solution is available.
            ground_truth = None
            if task_id in arc_data:
                _, ground_truth = lookup_ground_truth(task_id, test_idx)

            if ground_truth:
                draw_grid(fig.add_subplot(grid_spec[1, 1]), ground_truth, "Ground Truth")

                match_1 = is_correct(pred_1, ground_truth)
                match_2 = is_correct(pred_2, ground_truth)

                # Replace the generic prediction titles with match indicators.
                ax_pred_1.set_title(f"Prediction 1: {'✓' if match_1 else '✗'}")
                ax_pred_2.set_title(f"Prediction 2: {'✓' if match_2 else '✗'}")

                print(f"  Results: Attempt 1: {'✓' if match_1 else '✗'}, Attempt 2: {'✓' if match_2 else '✗'}")
                print(f"  Shape - Ground Truth: {np.array(ground_truth).shape}, "
                      f"Prediction 1: {pred_1.shape}, Prediction 2: {pred_2.shape}")
                print(f"  Values - Ground Truth unique values: {np.unique(ground_truth)}")
                print(f"          Prediction 1 unique values: {np.unique(pred_1)}")
                print(f"          Prediction 2 unique values: {np.unique(pred_2)}")

            plt.suptitle(f"Task {task_id} - Test Example #{test_idx+1}", fontsize=16)
            plt.tight_layout()
            plt.subplots_adjust(top=0.9)
            plt.show()
        finally:
            # Release the figure even if drawing failed, so repeated calls
            # do not accumulate open figures.
            plt.close(fig)
        return True

    def print_accuracy_statistics(records):
        """Print the overall accuracy summary from the collected records."""
        total_tests = 0
        total_scored_tests = 0
        correct_attempt1 = 0
        correct_attempt2 = 0
        correct_any = 0
        zero_predictions = 0

        for record in records:
            if not record['has_ground_truth']:
                continue
            total_tests += 1

            # An empty ground-truth grid cannot be scored.
            if not record['ground_truth']:
                continue

            # Entries where both attempts are all zeros are counted separately.
            if not record['has_non_zero_pred']:
                zero_predictions += 1
                continue

            total_scored_tests += 1
            if record['match_1']: correct_attempt1 += 1
            if record['match_2']: correct_attempt2 += 1
            if record['match_1'] or record['match_2']: correct_any += 1

        if total_tests > 0:
            print("\n" + "="*80)
            print("OVERALL ACCURACY STATISTICS")
            print("="*80)
            print(f"Total test examples: {total_tests}")
            print(f"Test examples with zero predictions (excluded from accuracy): {zero_predictions}")
            print(f"Test examples included in accuracy calculation: {total_scored_tests}")

            if total_scored_tests > 0:
                print(f"Correct on attempt 1: {correct_attempt1}/{total_scored_tests} ({correct_attempt1/total_scored_tests:.2%})")
                print(f"Correct on attempt 2: {correct_attempt2}/{total_scored_tests} ({correct_attempt2/total_scored_tests:.2%})")
                print(f"Correct on either attempt: {correct_any}/{total_scored_tests} ({correct_any/total_scored_tests:.2%})")
            else:
                print("No non-zero predictions to calculate accuracy")

            print(f"Overall completion rate: {total_scored_tests/total_tests:.2%} of tests have non-zero predictions")
            print("="*80)

    # Collect one record per (task, test index) with everything the later
    # steps need, so predictions are parsed and scored exactly once.
    all_predictions = []
    for task_id, task_predictions in submission_data.items():
        if task_id not in arc_data:
            continue
        for test_idx, test_prediction in enumerate(task_predictions):
            pred_1 = np.array(test_prediction['attempt_1'])
            pred_2 = np.array(test_prediction['attempt_2'])
            has_non_zero_pred = bool(
                is_non_trivial_prediction(pred_1) or is_non_trivial_prediction(pred_2))

            has_ground_truth, ground_truth = lookup_ground_truth(task_id, test_idx)
            match_1 = has_ground_truth and is_correct(pred_1, ground_truth)
            match_2 = has_ground_truth and is_correct(pred_2, ground_truth)

            all_predictions.append({
                'task_id': task_id,
                'test_idx': test_idx,
                'correct_count': int(match_1) + int(match_2),
                'has_ground_truth': has_ground_truth,
                'has_non_zero_pred': has_non_zero_pred,
                'ground_truth': ground_truth,
                'match_1': match_1,
                'match_2': match_2,
            })

    # Entries with ground truth first, then by descending number of correct attempts.
    all_predictions.sort(key=lambda r: (-int(r['has_ground_truth']), -r['correct_count']))

    print(f"\nFound {len(all_predictions)} total predictions to visualize")

    # Limit plotting to the first max_samples entries for performance.
    samples_to_show = all_predictions[:max_samples]

    print(f"Showing {len(samples_to_show)} of {len(all_predictions)} prediction samples")

    visualized_count = 0
    skipped_count = 0
    for record in samples_to_show:
        task_id = record['task_id']
        test_idx = record['test_idx']

        if record['has_ground_truth'] and record['has_non_zero_pred']:
            score_info = f" (Score: {record['correct_count']}/2)"
        elif not record['has_ground_truth']:
            score_info = " (no ground truth)"
        else:
            score_info = " (all zeros - no score)"
        print(f"\nTask: {task_id} - Test #{test_idx+1}{score_info}")

        # Only count entries that were actually plotted.
        if visualize_submission_result(task_id, arc_data[task_id],
                                       submission_data[task_id][test_idx], test_idx):
            visualized_count += 1
        else:
            skipped_count += 1

    print(f"\nVisualized {visualized_count} inference results (skipped {skipped_count} with all-zero predictions)")

    # Overall accuracy statistics are only reported in fake/debug mode.
    if fake_mode:
        print_accuracy_statistics(all_predictions)

# Run the visualization defined above. This must execute after
# submission.json has been created, because visualize_arc_results() reads it.
# The call is gated on fake_mode, so it is skipped when
# KAGGLE_IS_COMPETITION_RERUN is set.
if fake_mode:
    visualize_arc_results()