In [1]:
# Import libraries for dataset downloading
import kagglehub

# Import libraries for file and directory operations
import os
from pathlib import Path

# Import libraries for numerical operations and array handling
import numpy as np

# Import library for image processing
from PIL import Image

# Import library for random sampling
import random

print("All libraries imported successfully!")

All libraries imported successfully!


In [2]:
# Download the rice image dataset from Kaggle
path = kagglehub.dataset_download("muratkokludataset/rice-image-dataset")

print(f"Dataset downloaded to: {path}")
print(f"Download path type: {type(path)}")

# Verify the path exists
if os.path.exists(path):
    print("✓ Download path verified and exists")
else:
    print("✗ Warning: Download path does not exist")

Dataset downloaded to: /Users/rheanibert/.cache/kagglehub/datasets/muratkokludataset/rice-image-dataset/versions/1
Download path type: <class 'str'>
✓ Download path verified and exists


In [3]:
# Define the five rice variety classes we need to find
rice_classes = ['Arborio', 'Basmati', 'Ipsala', 'Jasmine', 'Karacadag']

print(f"Looking for {len(rice_classes)} rice varieties:")
for variety in rice_classes:
    print(f"  - {variety}")

def find_rice_folders(base_path, target_folders):
    """
    Recursively search for the target folders containing rice images.
    
    Parameters:
    - base_path: The root directory to start searching from
    - target_folders: List of folder names to search for
    
    Returns:
    - Dictionary mapping class names to their full paths
    """
    
    # Initialize dictionary to store found folders
    found_folders = {}
    
    # Walk through all directories and subdirectories
    for root, dirs, files in os.walk(base_path):
        
        # Check each directory name
        for dir_name in dirs:
            
            # If this directory is one we're looking for and hasn't been found yet
            if dir_name in target_folders and dir_name not in found_folders:
                
                # Get the full path to this folder
                folder_path = os.path.join(root, dir_name)
                
                # Get list of image files in this folder
                image_files = [f for f in os.listdir(folder_path) 
                              if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff'))]
                
                # Only add folder if it contains images
                if len(image_files) > 0:
                    found_folders[dir_name] = folder_path
                    print(f"  ✓ Found {dir_name} with {len(image_files)} images")
    
    return found_folders

print("\nFunction defined successfully!")

Looking for 5 rice varieties:
  - Arborio
  - Basmati
  - Ipsala
  - Jasmine
  - Karacadag

Function defined successfully!


In [4]:
print("Searching for rice image folders...\n")

# Execute the search function
rice_folders = find_rice_folders(path, rice_classes)

print(f"\n{'='*50}")
print(f"Search complete! Found {len(rice_folders)}/{len(rice_classes)} folders")

# Verify all classes were found
if len(rice_folders) != len(rice_classes):
    
    # Find which folders are missing
    missing = set(rice_classes) - set(rice_folders.keys())
    print(f"\n✗ ERROR: Could not find folders for: {missing}")
    raise FileNotFoundError(f"Missing folders: {missing}")
else:
    print("✓ All rice variety folders found successfully!")
    
    # Display the paths
    print("\nFolder locations:")
    for variety, folder_path in rice_folders.items():
        print(f"  {variety}: .../{os.path.basename(os.path.dirname(folder_path))}/{variety}")

Searching for rice image folders...

  ✓ Found Karacadag with 15000 images
  ✓ Found Ipsala with 15000 images
  ✓ Found Arborio with 15000 images
  ✓ Found Basmati with 15000 images
  ✓ Found Jasmine with 15000 images

Search complete! Found 5/5 folders
✓ All rice variety folders found successfully!

Folder locations:
  Karacadag: .../Rice_Image_Dataset/Karacadag
  Ipsala: .../Rice_Image_Dataset/Ipsala
  Arborio: .../Rice_Image_Dataset/Arborio
  Basmati: .../Rice_Image_Dataset/Basmati
  Jasmine: .../Rice_Image_Dataset/Jasmine


In [5]:
# Define sampling parameters
samples_per_class = 15000
image_size = (224, 224)
total_samples = samples_per_class * len(rice_classes)

print("Dataset Parameters:")
print(f"  - Samples per rice variety: {samples_per_class}")
print(f"  - Image size: {image_size[0]}x{image_size[1]} pixels")
print(f"  - Total number of samples: {total_samples}")
print(f"  - Color channels: 3 (RGB)")

# Calculate memory requirement
memory_mb = (total_samples * image_size[0] * image_size[1] * 3 * 4) / (1024 * 1024)
print(f"  - Estimated memory requirement: {memory_mb:.2f} MB")

# Initialize arrays to store processed images and labels
images = np.zeros((total_samples, image_size[0], image_size[1], 3), dtype=np.float32)
labels = []

print(f"\n✓ Initialized images array with shape: {images.shape}")
print(f"✓ Initialized empty labels list")

Dataset Parameters:
  - Samples per rice variety: 15000
  - Image size: 224x224 pixels
  - Total number of samples: 75000
  - Color channels: 3 (RGB)
  - Estimated memory requirement: 43066.41 MB

✓ Initialized images array with shape: (75000, 224, 224, 3)
✓ Initialized empty labels list


In [6]:
# Install memory_profiler if not already installed
import subprocess
import sys

try:
    import psutil
except ImportError:
    print("Installing psutil...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "psutil"])
    import psutil

# Import garbage collection for memory management
import gc

def get_memory_info():
    """
    Get detailed memory information about the system and current process.
    """
    # Get system memory info
    memory = psutil.virtual_memory()
    
    # Get current process memory info
    process = psutil.Process()
    process_memory = process.memory_info()
    
    return {
        'total_gb': memory.total / (1024**3),
        'available_gb': memory.available / (1024**3),
        'used_gb': memory.used / (1024**3),
        'percent': memory.percent,
        'process_gb': process_memory.rss / (1024**3)
    }

def print_memory_status(label="Current"):
    """
    Print formatted memory status.
    """
    info = get_memory_info()
    print(f"\n{'='*50}")
    print(f"{label} Memory Status:")
    print(f"  System Total: {info['total_gb']:.2f} GB")
    print(f"  System Available: {info['available_gb']:.2f} GB")
    print(f"  System Used: {info['used_gb']:.2f} GB ({info['percent']:.1f}%)")
    print(f"  This Process: {info['process_gb']:.2f} GB")
    print(f"{'='*50}")

# Initial memory check
print_memory_status("Initial")

# Calculate theoretical memory requirements
image_size_bytes = 224 * 224 * 3 * 4  # float32
total_images = 75000
required_gb = (image_size_bytes * total_images) / (1024**3)

print(f"\nTheoretical Memory Requirements:")
print(f"  Per image: {image_size_bytes / (1024**2):.2f} MB")
print(f"  For 75,000 images: {required_gb:.2f} GB")

available = get_memory_info()['available_gb']
if required_gb > available:
    print(f"\n⚠ WARNING: Need {required_gb:.2f} GB but only {available:.2f} GB available!")
    max_images = int((available * 0.8 * 1024**3) / image_size_bytes)  # Use only 80% of available
    print(f"  Recommended max images: {max_images:,}")
else:
    print(f"\n✓ Sufficient memory available")


Initial Memory Status:
  System Total: 8.00 GB
  System Available: 1.51 GB
  System Used: 3.20 GB (81.1%)
  This Process: 0.16 GB

Theoretical Memory Requirements:
  Per image: 0.57 MB
  For 75,000 images: 42.06 GB

  Recommended max images: 2,160


In [7]:
# Test with progressively larger batches to find the limit
test_sizes = [100, 500, 1000, 2000, 5000, 10000, 15000, 20000, 30000, 40000, 50000, 60000, 75000]

print("Testing maximum image capacity...\n")

max_successful = 0
failure_point = None

for test_size in test_sizes:
    # Check if we have enough memory for this test
    required_gb = (224 * 224 * 3 * 4 * test_size) / (1024**3)
    available_gb = get_memory_info()['available_gb']
    
    print(f"Testing {test_size:,} images (requires {required_gb:.2f} GB)...")
    
    # Less conservative - only skip if we need MORE than available memory (no buffer)
    if required_gb > available_gb:
        print(f"  ✗ Skipping - insufficient memory (only {available_gb:.2f} GB available)")
        failure_point = test_size
        break
    
    try:
        # Try to allocate array
        test_array = np.zeros((test_size, 224, 224, 3), dtype=np.float32)
        
        # If successful, update max
        max_successful = test_size
        print(f"  ✓ Success - {get_memory_info()['process_gb']:.2f} GB used by process")
        
        # Clean up
        del test_array
        gc.collect()
        
    except MemoryError:
        print(f"  ✗ Failed - MemoryError")
        failure_point = test_size
        break
    except Exception as e:
        print(f"  ✗ Failed - {type(e).__name__}: {str(e)[:50]}")
        failure_point = test_size
        break

print(f"\n{'='*50}")
print(f"Maximum successful allocation: {max_successful:,} images")
if failure_point:
    print(f"Failed at: {failure_point:,} images")

# Less conservative recommendation - use 95% of max instead of 80%
recommended = int(max_successful * 0.95)
print(f"Recommended limit: {recommended:,} images (95% of max)")

# Also show what's possible with different safety margins
print(f"\nSafety margin options:")
print(f"  100% (no margin): {max_successful:,} images")
print(f"   95% (tight):     {int(max_successful * 0.95):,} images")
print(f"   90% (standard):  {int(max_successful * 0.90):,} images")
print(f"   85% (safe):      {int(max_successful * 0.85):,} images")

Testing maximum image capacity...

Testing 1,000 images (requires 0.56 GB)...
  ✓ Success - 0.33 GB process / 1.41 GB available
Testing 5,000 images (requires 2.80 GB)...
  ✓ Success - 0.36 GB process / 1.50 GB available
Testing 10,000 images (requires 5.61 GB)...
  ✓ Success - 0.36 GB process / 1.35 GB available
Testing 15,000 images (requires 8.41 GB)...
  ✓ Success - 0.36 GB process / 1.39 GB available
Testing 25,000 images (requires 14.02 GB)...
  ✓ Success - 0.36 GB process / 1.45 GB available
Testing 50,000 images (requires 28.04 GB)...
  ✓ Success - 0.36 GB process / 1.43 GB available
Testing 75,000 images (requires 42.06 GB)...
  ✓ Success - 0.36 GB process / 1.40 GB available

Maximum successful allocation: 75,000 images
Recommended limit: 67,500 images (90% of max)
Recommended samples_per_class: 13,500


In [None]:
# Set the number of samples based on previous test
# Using 95% of max capacity instead of 80%
samples_per_class = int(max_successful * 0.95 / len(rice_classes))  # Divide among 5 classes

print(f"Based on memory test, using {samples_per_class:,} samples per class")
print(f"Total images to process: {samples_per_class * len(rice_classes):,}")

# Show what we know works
print(f"\nKnown working configurations:")
print(f"  - Confirmed working: 5,000 images (1,000 per class)")
print(f"  - Maximum tested: {max_successful:,} images")
print(f"  - Attempting now: {samples_per_class * len(rice_classes):,} images")

# Confirm before proceeding
response = input("\nProceed with this configuration? (yes/no): ")
if response.lower() != 'yes':
    print("Aborted. Adjust samples_per_class manually.")
    # Allow manual override
    manual = input("Enter desired samples_per_class (or press Enter to skip): ")
    if manual.isdigit():
        samples_per_class = int(manual)
        print(f"Using manual setting: {samples_per_class:,} samples per class")
else:
    
    # Initialize with memory monitoring
    print_memory_status("Before Allocation")
    
    total_samples = samples_per_class * len(rice_classes)
    
    # Allocate arrays with monitoring
    try:
        images = np.zeros((total_samples, 224, 224, 3), dtype=np.float32)
        labels = []
        print(f"✓ Successfully allocated array for {total_samples:,} images")
        print_memory_status("After Allocation")
        
    except MemoryError as e:
        print(f"✗ Failed to allocate memory for {total_samples:,} images")
        print("Try reducing samples_per_class")
        raise e
    
    # Process images with detailed monitoring
    current_index = 0
    random.seed(42)
    np.random.seed(42)
    
    print("\nStarting image processing with memory monitoring...\n")
    
    for class_num, class_name in enumerate(rice_classes):
        
        print(f"\n[{class_num + 1}/{len(rice_classes)}] Processing {class_name}")
        print_memory_status(f"Before {class_name}")
        
        folder_path = rice_folders[class_name]
        
        # Get all image files
        all_images = [f for f in os.listdir(folder_path) 
                      if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff'))]
        
        print(f"  Total images available: {len(all_images)}")
        
        # Sample images
        if len(all_images) >= samples_per_class:
            sampled_images = random.sample(all_images, samples_per_class)
        else:
            sampled_images = all_images[:samples_per_class]
        
        print(f"  Processing {len(sampled_images)} images...")
        
        # Process with progress monitoring
        processed_count = 0
        failed_count = 0
        
        # Process in chunks to monitor memory
        chunk_size = 100
        
        for chunk_start in range(0, len(sampled_images), chunk_size):
            chunk_end = min(chunk_start + chunk_size, len(sampled_images))
            
            for img_name in sampled_images[chunk_start:chunk_end]:
                img_path = os.path.join(folder_path, img_name)
                
                try:
                    # Load and process image
                    img = Image.open(img_path)
                    img = img.convert('RGB')
                    img = img.resize((224, 224), Image.LANCZOS)
                    img_array = np.array(img, dtype=np.float32) / 255.0
                    
                    # Store
                    images[current_index] = img_array
                    labels.append(class_name)
                    current_index += 1
                    processed_count += 1
                    
                    # Close image to free memory
                    img.close()
                    
                except Exception as e:
                    failed_count += 1
                    if failed_count <= 5:  # Only show first 5 errors
                        print(f"    ✗ Error: {str(e)[:50]}")
            
            # Memory check every chunk - increased to every 1000 images
            if (chunk_start + chunk_size) % 1000 == 0:
                mem_info = get_memory_info()
                print(f"    Processed {chunk_start + chunk_size}/{len(sampled_images)}: "
                      f"Process using {mem_info['process_gb']:.2f} GB, "
                      f"{mem_info['available_gb']:.2f} GB available")
                
                # Less conservative warning - only at 95%
                if mem_info['percent'] > 95:
                    print("    ⚠ WARNING: System memory usage above 95%!")
        
        print(f"  ✓ Completed: {processed_count} processed, {failed_count} failed")
        
        # Garbage collection after each class
        gc.collect()
    
    print_memory_status("Final")
    print(f"\n✓ Successfully processed {current_index} images total")

Based on memory test, using 12,000 samples per class
Total images to process: 60,000



Proceed with this configuration? (yes/no):  yes



Before Allocation Memory Status:
  System Total: 8.00 GB
  System Available: 1.75 GB
  System Used: 3.13 GB (78.1%)
  This Process: 0.15 GB
✓ Successfully allocated array for 60,000 images

After Allocation Memory Status:
  System Total: 8.00 GB
  System Available: 1.75 GB
  System Used: 3.13 GB (78.1%)
  This Process: 0.15 GB

Starting image processing with memory monitoring...


[1/5] Processing Arborio

Before Arborio Memory Status:
  System Total: 8.00 GB
  System Available: 1.75 GB
  System Used: 3.13 GB (78.1%)
  This Process: 0.15 GB
  Total images available: 15000
  Processing 12000 images...
    Processed 1000/12000: Process using 0.52 GB, 1.50 GB available
    Processed 2000/12000: Process using 0.50 GB, 1.47 GB available
    Processed 3000/12000: Process using 0.52 GB, 1.44 GB available
    Processed 4000/12000: Process using 0.46 GB, 1.41 GB available
    Processed 5000/12000: Process using 0.34 GB, 1.39 GB available
    Processed 6000/12000: Process using 0.43 GB, 1.28 GB

In [None]:
# Final memory analysis
print("="*60)
print("MEMORY USAGE ANALYSIS")
print("="*60)

# Get current memory stats
final_info = get_memory_info()

# Calculate actual memory used by images array
if 'images' in locals():
    actual_array_gb = images.nbytes / (1024**3)
    images_per_gb = len(images) / actual_array_gb
    
    print(f"\nActual Array Statistics:")
    print(f"  Array size: {images.shape}")
    print(f"  Memory used by array: {actual_array_gb:.2f} GB")
    print(f"  Images per GB: {images_per_gb:.0f}")
    print(f"  Memory per image: {(actual_array_gb * 1024) / len(images):.2f} MB")
    
    print(f"\nMemory Breakdown:")
    print(f"  Image array: {actual_array_gb:.2f} GB")
    print(f"  Process total: {final_info['process_gb']:.2f} GB")
    print(f"  Overhead: {final_info['process_gb'] - actual_array_gb:.2f} GB")
    
    print(f"\nSystem Resources:")
    print(f"  Total RAM: {final_info['total_gb']:.2f} GB")
    print(f"  Currently available: {final_info['available_gb']:.2f} GB")
    print(f"  Could theoretically hold: {int(final_info['total_gb'] * 0.8 * images_per_gb):,} images")
    
    # Recommendations
    print(f"\n{'='*60}")
    print("RECOMMENDATIONS:")
    print(f"{'='*60}")
    
    max_safe_images = int(final_info['total_gb'] * 0.7 * images_per_gb)
    
    if max_safe_images >= 75000:
        print(f"✓ Your system can handle all 75,000 images!")
        print(f"  Set samples_per_class = 15000")
    else:
        recommended_per_class = max_safe_images // 5
        print(f"✗ Your system can safely handle up to {max_safe_images:,} images")
        print(f"  Recommended samples_per_class = {recommended_per_class:,}")
        print(f"\nAlternative solutions needed for 75,000 images:")
        print(f"  - Use batch processing with disk storage")
        print(f"  - Use data generators for on-the-fly loading")
        print(f"  - Reduce image size to {int(224 * np.sqrt(max_safe_images/75000))}")
        print(f"  - Use float16 instead of float32 (doubles capacity)")

In [None]:
# Convert labels list to numpy array
labels = np.array(labels)

print("Array Finalization:")
print(f"  Original allocated size: {total_samples}")
print(f"  Actually processed: {current_index}")

# Trim arrays if some images failed to load
if current_index < total_samples:
    print(f"  ⚠ Trimming arrays to actual size...")
    images = images[:current_index]
    labels = labels[:current_index]
    print(f"  ✓ Arrays trimmed to size: {current_index}")
else:
    print(f"  ✓ All images processed successfully!")

# Verify array shapes
print(f"\nFinal array shapes:")
print(f"  Images: {images.shape}")
print(f"  Labels: {labels.shape}")

# Verify data types
print(f"\nData types:")
print(f"  Images: {images.dtype}")
print(f"  Labels: {labels.dtype}")

In [None]:
print("Shuffling dataset for random distribution...")

# Generate random permutation of indices
shuffle_indices = np.random.permutation(len(labels))

# Apply shuffle to both arrays
images = images[shuffle_indices]
labels = labels[shuffle_indices]

print(f"✓ Dataset shuffled successfully!")

# Show first 10 labels to verify shuffling
print(f"\nFirst 10 labels after shuffling:")
for i in range(10):
    print(f"  Index {i}: {labels[i]}")

In [None]:
print("="*60)
print("DATASET PREPARATION COMPLETE")
print("="*60)

# Basic statistics
print("\nDataset Summary:")
print(f"  Total samples: {len(labels)}")
print(f"  Image dimensions: {images.shape[1]}x{images.shape[2]} pixels")
print(f"  Color channels: {images.shape[3]}")
print(f"  Data type: {images.dtype}")

# Pixel value statistics
print(f"\nPixel Value Statistics:")
print(f"  Min value: {images.min():.4f}")
print(f"  Max value: {images.max():.4f}")
print(f"  Mean value: {images.mean():.4f}")
print(f"  Std deviation: {images.std():.4f}")

# Class distribution
print(f"\nClass Distribution:")
unique_classes, counts = np.unique(labels, return_counts=True)
for class_name, count in zip(unique_classes, counts):
    percentage = (count / len(labels)) * 100
    print(f"  {class_name:10s}: {count:4d} images ({percentage:.1f}%)")

# Memory usage
memory_usage = images.nbytes / (1024 * 1024)
print(f"\nMemory Usage:")
print(f"  Images array: {memory_usage:.2f} MB")

# Verify normalization
if images.min() >= 0 and images.max() <= 1:
    print(f"\n✓ Images properly normalized to [0, 1] range")
else:
    print(f"\n✗ Warning: Images not in [0, 1] range!")

# Sample a random image to display its properties
sample_idx = np.random.randint(0, len(labels))
print(f"\nSample Image Check (index {sample_idx}):")
print(f"  Label: {labels[sample_idx]}")
print(f"  Shape: {images[sample_idx].shape}")
print(f"  Pixel range: [{images[sample_idx].min():.3f}, {images[sample_idx].max():.3f}]")

In [None]:
# Import matplotlib for visualization
import matplotlib.pyplot as plt

# Create a grid of sample images
fig, axes = plt.subplots(2, 5, figsize=(15, 6))
fig.suptitle('Sample Processed Rice Images', fontsize=16)

# Randomly select 10 images to display
random_indices = np.random.choice(len(labels), 10, replace=False)

for idx, ax in enumerate(axes.flat):
    
    # Get image and label
    img_idx = random_indices[idx]
    img = images[img_idx]
    label = labels[img_idx]
    
    # Display image
    ax.imshow(img)
    ax.set_title(f'{label}\nIndex: {img_idx}')
    ax.axis('off')

plt.tight_layout()
plt.show()

print("✓ Visualization complete!")
print("\nYour dataset is ready for CNN training:")
print(f"  - images: shape {images.shape}")
print(f"  - labels: shape {labels.shape}")