In [16]:
"""
AgriGenAI - Day 1: Phenotype Feature Extraction
================================================
Goal: Extract visual phenotype features from tomato plant images
Time: 8 hours

What this does:
1. Loads PlantVillage + Laboro datasets
2. Uses ResNet50 (pretrained) to extract 2048-dim feature vectors
3. Saves features + metadata
4. Visualizes feature distributions

NO TRAINING - Just feature extraction!
"""

import os
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import preprocess_input
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Startup banner: a 60-character rule above and below the notebook title.
print("=" * 60, "🌱 AgriGenAI - Day 1: Phenotype Feature Extraction", "=" * 60, sep="\n")


🌱 AgriGenAI - Day 1: Phenotype Feature Extraction


In [8]:
# ============================================
# 1. PROJECT SETUP
# ============================================


from pathlib import Path

class Config:
    """Static settings for the Day 1 feature-extraction run.

    All paths are relative to the notebook's working directory. Nothing is
    created on disk by this class itself; it only holds values.
    """

    # --- Root folders -------------------------------------------------
    BASE_PATH = Path('../AgriGenAI_Dataset')
    OUTPUT_PATH = Path('../AgriGenAI_Output')

    # --- Input image folders (one per dataset) ------------------------
    PLANTVILLAGE_PATH = BASE_PATH.joinpath('PlantVillage', 'images')
    LABORO_PATH = BASE_PATH.joinpath('Laboro', 'images')

    # --- Output folders -----------------------------------------------
    FEATURES_PATH = OUTPUT_PATH.joinpath('features')
    METADATA_PATH = OUTPUT_PATH.joinpath('metadata')
    VISUALIZATIONS_PATH = OUTPUT_PATH.joinpath('visualizations')

    # --- Model settings -----------------------------------------------
    IMG_SIZE = (224, 224)   # size every image is resized to before the CNN
    BATCH_SIZE = 32         # images per extraction batch
    FEATURE_DIM = 2048      # ResNet50 output

    # --- Sampling -----------------------------------------------------
    # Set to 100 for quick test, None for full dataset
    SAMPLE_SIZE = None

# Make sure every output folder exists before anything is written to it.
# parents=True also creates OUTPUT_PATH itself; exist_ok=True makes re-running
# this cell harmless.
for path in (
    Config.FEATURES_PATH,
    Config.METADATA_PATH,
    Config.VISUALIZATIONS_PATH,
):
    path.mkdir(exist_ok=True, parents=True)

print(f"✅ Output directories created at: {Config.OUTPUT_PATH}")


✅ Output directories created at: ..\AgriGenAI_Output


In [9]:
# Sanity check: both dataset folders should exist (prints two booleans,
# PlantVillage first, then Laboro).
print(*(dataset_dir.exists() for dataset_dir in (Config.PLANTVILLAGE_PATH, Config.LABORO_PATH)))


True True


In [10]:
# ============================================
# 2. DATA LOADER
# ============================================


class TomatoDatasetLoader:
    """Load and organize tomato images from multiple sources.

    The loader walks the PlantVillage and Laboro image folders named in the
    config and records one metadata row per image. ``load_all`` returns the
    rows as a DataFrame with the columns listed in ``COLUMNS``.
    """

    # Fixed schema so that an empty result still has the expected columns
    # (otherwise column lookups such as df['source'] raise KeyError).
    COLUMNS = ('image_path', 'source', 'category', 'organ')

    # Accepted file suffixes, compared case-insensitively so that '.jpg' and
    # '.JPG' are treated the same on every operating system.
    IMAGE_SUFFIXES = ('.jpg',)

    def __init__(self, config):
        """Store the config; it must expose PLANTVILLAGE_PATH, LABORO_PATH
        and SAMPLE_SIZE."""
        self.config = config
        self.image_data = []

    @classmethod
    def _iter_images(cls, folder):
        """Yield image files located directly in ``folder``, in sorted order.

        Sorting makes the row order (and therefore seeded sampling)
        independent of the order the filesystem happens to list entries in.
        """
        for entry in sorted(folder.iterdir()):
            if entry.is_file() and entry.suffix.lower() in cls.IMAGE_SUFFIXES:
                yield entry

    def load_plantvillage(self):
        """Load PlantVillage tomato leaf images.

        Each sub-folder of PLANTVILLAGE_PATH is treated as one category and
        its name is stored in the 'category' column.

        Returns:
            int: number of images appended to ``self.image_data`` (0 when the
            folder does not exist).
        """
        print("\n📂 Loading PlantVillage dataset...")

        pv_path = self.config.PLANTVILLAGE_PATH
        if not pv_path.exists():
            print(f"⚠️  PlantVillage path not found: {pv_path}")
            return 0

        count = 0
        for disease_folder in sorted(pv_path.iterdir()):
            if not disease_folder.is_dir():
                continue
            for img_file in self._iter_images(disease_folder):
                self.image_data.append({
                    'image_path': str(img_file),
                    'source': 'PlantVillage',
                    'category': disease_folder.name,
                    'organ': 'leaf'
                })
                count += 1

        print(f"✅ Loaded {count} PlantVillage images")
        return count

    def load_laboro(self):
        """Load Laboro tomato fruit images.

        Returns:
            int: number of images appended to ``self.image_data`` (0 when the
            folder does not exist).
        """
        print("\n📂 Loading Laboro dataset...")

        laboro_path = self.config.LABORO_PATH
        if not laboro_path.exists():
            print(f"⚠️  Laboro path not found: {laboro_path}")
            return 0

        count = 0
        for img_file in self._iter_images(laboro_path):
            self.image_data.append({
                'image_path': str(img_file),
                'source': 'Laboro',
                'category': 'fruit_detection',
                'organ': 'fruit'
            })
            count += 1

        print(f"✅ Loaded {count} Laboro images")
        return count

    def load_all(self):
        """Load all datasets and return their metadata.

        Returns:
            pandas.DataFrame: one row per image with the columns in
            ``COLUMNS``. The frame is empty (but still has those columns)
            when no images are found. When ``config.SAMPLE_SIZE`` is set, a
            seeded random sample of at most that many rows is returned.
        """
        # Start from a clean slate so that re-running this method (e.g.
        # re-executing the cell) does not duplicate rows.
        self.image_data = []
        self.load_plantvillage()
        self.load_laboro()

        # Convert to DataFrame with an explicit schema
        df = pd.DataFrame(self.image_data, columns=list(self.COLUMNS))

        # Sample if needed (nothing to sample from an empty frame)
        sample_size = self.config.SAMPLE_SIZE
        if sample_size and len(df) > 0:
            df = df.sample(n=min(sample_size, len(df)), random_state=42)
            print(f"\n⚠️  Using sample of {len(df)} images for quick testing")

        print(f"\n📊 Total dataset: {len(df)} images")
        print(f"   - PlantVillage (leaves): {int((df['source'] == 'PlantVillage').sum())}")
        print(f"   - Laboro (fruits): {int((df['source'] == 'Laboro').sum())}")

        return df


In [12]:
# ============================================
# 3. CNN FEATURE EXTRACTOR
# ============================================


class PhenotypeFeatureExtractor:
    """Extract phenotype features using pretrained ResNet50.

    The network is loaded without its classification head and with global
    average pooling, so each image is mapped to a single feature vector.
    No training happens here; the model is only used for inference.
    """

    def __init__(self, config):
        """Load the ImageNet-pretrained ResNet50 backbone.

        Args:
            config: object exposing IMG_SIZE (height, width) and BATCH_SIZE.
        """
        self.config = config
        print("\n🔧 Loading ResNet50 model (pretrained on ImageNet)...")

        # Load ResNet50 WITHOUT top classification layer.
        # The input shape is derived from config.IMG_SIZE so that the model
        # always matches the size images are resized to in preprocess_image.
        self.model = ResNet50(
            weights='imagenet',
            include_top=False,
            pooling='avg',
            input_shape=(*self.config.IMG_SIZE, 3)
        )

        print(f"✅ ResNet50 loaded. Output shape: {self.model.output_shape}")
        print(f"   This will extract {self.model.output_shape[-1]}-dimensional feature vectors")

    def preprocess_image(self, img_path):
        """Load and preprocess single image.

        Returns:
            The preprocessed array with a leading batch axis of length 1, or
            None when the image could not be loaded (the error is printed and
            the image is skipped rather than aborting the whole run).
        """
        try:
            img = image.load_img(img_path, target_size=self.config.IMG_SIZE)
            img_array = image.img_to_array(img)
            img_array = np.expand_dims(img_array, axis=0)
            img_array = preprocess_input(img_array)
            return img_array
        except Exception as e:
            # Deliberate best-effort: one unreadable file must not stop the
            # extraction of the remaining images.
            print(f"❌ Error loading {img_path}: {e}")
            return None

    def extract_features_batch(self, image_paths):
        """Extract features for a batch of images.

        Args:
            image_paths: iterable of image file paths.

        Returns:
            tuple: (features, valid_paths). ``features`` has one row per
            successfully loaded image, in the same order as ``valid_paths``;
            it is None when no image in the batch could be loaded.
        """
        batch_arrays = []
        valid_paths = []

        for img_path in image_paths:
            img_array = self.preprocess_image(img_path)
            if img_array is not None:
                batch_arrays.append(img_array)
                valid_paths.append(img_path)

        if not batch_arrays:
            return None, valid_paths

        # Stack the per-image (1, H, W, 3) arrays into one batch and predict
        batch_arrays = np.vstack(batch_arrays)
        features = self.model.predict(batch_arrays, verbose=0)

        return features, valid_paths

    def extract_all_features(self, df):
        """Extract features for all images with progress bar.

        Args:
            df: DataFrame with an 'image_path' column; rows are processed in
                positional order.

        Returns:
            tuple: (features, paths) where ``features`` is a 2-D array with
            one row per successfully processed image and ``paths`` lists the
            corresponding image paths in the same order. Returns
            (None, None) when no image could be processed.
        """
        print("\n🚀 Extracting phenotype features from all images...")
        print(f"   Processing {len(df)} images in batches of {self.config.BATCH_SIZE}")

        all_features = []
        all_paths = []

        # Process in batches
        for i in tqdm(range(0, len(df), self.config.BATCH_SIZE)):
            batch_df = df.iloc[i:i+self.config.BATCH_SIZE]
            batch_paths = batch_df['image_path'].tolist()

            features, valid_paths = self.extract_features_batch(batch_paths)

            if features is not None:
                all_features.append(features)
                all_paths.extend(valid_paths)

        # Combine all features
        if all_features:
            all_features = np.vstack(all_features)
            print(f"\n✅ Feature extraction complete!")
            print(f"   Shape: {all_features.shape}")
            print(f"   Successfully processed: {len(all_paths)}/{len(df)} images")
            return all_features, all_paths
        else:
            print("❌ No features extracted!")
            return None, None



In [13]:
# ============================================
# 4. FEATURE ANALYSIS & VISUALIZATION
# ============================================

class FeatureVisualizer:
    """Visualize extracted features.

    Every plot is written as a PNG into ``config.VISUALIZATIONS_PATH`` and
    the figure is closed afterwards, so nothing is displayed inline.
    """

    def __init__(self, config):
        """Store the config; it must expose VISUALIZATIONS_PATH."""
        self.config = config

    def plot_feature_distribution(self, features, df):
        """Plot histograms of the per-dimension feature mean and std.

        Args:
            features: 2-D array, one row per image.
            df: metadata frame; accepted for a uniform plotting signature but
                not used by this plot.
        """
        print("\n📊 Creating feature distribution plot...")

        plt.figure(figsize=(12, 6))

        # Plot 1: Feature mean distribution
        plt.subplot(1, 2, 1)
        feature_means = features.mean(axis=0)
        plt.hist(feature_means, bins=50, color='green', alpha=0.7)
        plt.xlabel('Feature Value')
        plt.ylabel('Frequency')
        plt.title('Distribution of Feature Means')
        plt.grid(True, alpha=0.3)

        # Plot 2: Feature std distribution
        plt.subplot(1, 2, 2)
        feature_stds = features.std(axis=0)
        plt.hist(feature_stds, bins=50, color='blue', alpha=0.7)
        plt.xlabel('Standard Deviation')
        plt.ylabel('Frequency')
        plt.title('Distribution of Feature Std Dev')
        plt.grid(True, alpha=0.3)

        plt.tight_layout()
        save_path = self.config.VISUALIZATIONS_PATH / 'feature_distribution.png'
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"✅ Saved: {save_path}")
        plt.close()

    def plot_pca(self, features, df):
        """Reduce dimensions with PCA and visualize, colored by 'source'.

        Args:
            features: 2-D array, one row per image.
            df: metadata frame whose rows are positionally aligned with
                ``features`` and which has a 'source' column.

        Returns:
            The fitted PCA object.
        """
        print("\n📊 Creating PCA visualization...")

        # Apply PCA
        pca = PCA(n_components=2)
        features_2d = pca.fit_transform(features)

        plt.figure(figsize=(10, 8))

        # Color by source
        for source in df['source'].unique():
            # Convert to a plain boolean array so the mask is applied by
            # position, independent of the frame's index labels.
            mask = (df['source'] == source).to_numpy()
            plt.scatter(
                features_2d[mask, 0],
                features_2d[mask, 1],
                label=source,
                alpha=0.6,
                s=20
            )

        plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
        plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
        plt.title('PCA: Phenotype Feature Space')
        plt.legend()
        plt.grid(True, alpha=0.3)

        save_path = self.config.VISUALIZATIONS_PATH / 'pca_visualization.png'
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"✅ Saved: {save_path}")
        plt.close()

        return pca

    def plot_tsne(self, features, df, sample_size=1000, random_state=42):
        """Reduce dimensions with t-SNE and visualize, colored by 'organ'.

        Args:
            features: 2-D array, one row per image.
            df: metadata frame whose rows are positionally aligned with
                ``features`` and which has an 'organ' column.
            sample_size: maximum number of points embedded; larger inputs are
                randomly subsampled without replacement.
            random_state: seed used for both the subsampling and t-SNE, so
                the saved figure is reproducible across runs.
        """
        print("\n📊 Creating t-SNE visualization...")

        # Sample if too large (seeded, so reruns pick the same points)
        if len(features) > sample_size:
            rng = np.random.default_rng(random_state)
            indices = rng.choice(len(features), sample_size, replace=False)
            features_sample = features[indices]
            df_sample = df.iloc[indices]
            print(f"   Using {sample_size} samples for t-SNE (computational efficiency)")
        else:
            features_sample = features
            df_sample = df

        n_samples = len(features_sample)
        if n_samples < 2:
            print("⚠️  Skipping t-SNE: at least 2 samples are required")
            return

        # t-SNE requires perplexity < n_samples; cap it so that small
        # quick-test samples do not raise.
        perplexity = min(30, n_samples - 1)

        # Apply t-SNE
        tsne = TSNE(n_components=2, random_state=random_state, perplexity=perplexity)
        features_2d = tsne.fit_transform(features_sample)

        plt.figure(figsize=(10, 8))

        # Color by organ type
        for organ in df_sample['organ'].unique():
            # Positional boolean mask (df_sample keeps the original index
            # labels after iloc, so avoid any label-based alignment).
            mask = (df_sample['organ'] == organ).to_numpy()
            plt.scatter(
                features_2d[mask, 0],
                features_2d[mask, 1],
                label=organ,
                alpha=0.6,
                s=20
            )

        plt.xlabel('t-SNE Dimension 1')
        plt.ylabel('t-SNE Dimension 2')
        plt.title('t-SNE: Phenotype Feature Clustering')
        plt.legend()
        plt.grid(True, alpha=0.3)

        save_path = self.config.VISUALIZATIONS_PATH / 'tsne_visualization.png'
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"✅ Saved: {save_path}")
        plt.close()

In [14]:
# ============================================
# 5. SAVE OUTPUTS
# ============================================

def save_features_and_metadata(features, df, config):
    """Persist the feature matrix, its metadata and summary statistics.

    Three files are written:
      * ``phenotype_features.npy``  (in config.FEATURES_PATH): the raw matrix
      * ``image_metadata.csv``      (in config.METADATA_PATH): ``df`` without its index
      * ``feature_statistics.npz``  (in config.FEATURES_PATH): per-column
        mean / std / min / max of ``features``

    Args:
        features: NumPy array of extracted features.
        df: pandas DataFrame with the image metadata.
        config: object exposing FEATURES_PATH and METADATA_PATH as paths.
    """
    print("\n💾 Saving features and metadata...")

    features_dir = config.FEATURES_PATH

    # 1) Feature matrix
    npy_target = features_dir / 'phenotype_features.npy'
    np.save(npy_target, features)
    print(f"✅ Features saved: {npy_target}")
    print(f"   Shape: {features.shape}")

    # 2) Metadata table
    csv_target = config.METADATA_PATH / 'image_metadata.csv'
    df.to_csv(csv_target, index=False)
    print(f"✅ Metadata saved: {csv_target}")
    print(f"   Columns: {list(df.columns)}")

    # 3) Column-wise summary statistics, stored under their reducer names
    npz_target = features_dir / 'feature_statistics.npz'
    np.savez(
        npz_target,
        mean=features.mean(axis=0),
        std=features.std(axis=0),
        min=features.min(axis=0),
        max=features.max(axis=0),
    )
    print(f"✅ Statistics saved: {npz_target}")


In [15]:
# ============================================
# 6. MAIN EXECUTION
# ============================================

def main():
    """Run the Day 1 pipeline end to end.

    Stages, in order: load image metadata, extract CNN features, drop
    metadata rows whose image could not be processed, write the three
    visualizations, save features/metadata/statistics, print a summary.
    Returns early (after printing a message) when no images are found or
    when feature extraction yields nothing.
    """
    rule = "=" * 60

    print("\n" + rule)
    print("STARTING DAY 1 PIPELINE")
    print(rule)

    config = Config()

    # Stage 1: gather image metadata from both datasets
    df = TomatoDatasetLoader(config).load_all()
    if not len(df):
        print("❌ No images found! Check your dataset paths.")
        return

    # Stage 2: run every image through the pretrained CNN
    features, valid_paths = PhenotypeFeatureExtractor(config).extract_all_features(df)
    if features is None:
        print("❌ Feature extraction failed!")
        return

    # Keep only the metadata rows whose image was actually processed
    processed = df['image_path'].isin(valid_paths)
    df = df[processed].reset_index(drop=True)

    # Stage 3: diagnostic plots
    visualizer = FeatureVisualizer(config)
    visualizer.plot_feature_distribution(features, df)
    visualizer.plot_pca(features, df)
    visualizer.plot_tsne(features, df)

    # Stage 4: persist results
    save_features_and_metadata(features, df, config)

    # Closing summary
    n_vectors, n_dims = features.shape[0], features.shape[1]
    n_leaves = len(df[df['source'] == 'PlantVillage'])
    n_fruits = len(df[df['source'] == 'Laboro'])

    print("\n" + rule)
    print("✅ DAY 1 COMPLETE!")
    print(rule)
    print("\n📊 Summary:")
    print(f"   - Total images processed: {len(df)}")
    print(f"   - Feature vectors extracted: {n_vectors}")
    print(f"   - Feature dimensions: {n_dims}")
    print(f"   - PlantVillage (leaves): {n_leaves}")
    print(f"   - Laboro (fruits): {n_fruits}")
    print(f"\n📁 Outputs saved in: {config.OUTPUT_PATH}")
    print("   - Features: phenotype_features.npy")
    print("   - Metadata: image_metadata.csv")
    print("   - Visualizations: 3 PNG files")
    print("\n🚀 Ready for Day 2: Genotype-Trait Mapping!")


# Run the pipeline only when executed directly (not when imported).
if __name__ == "__main__":
    main()


STARTING DAY 1 PIPELINE

📂 Loading PlantVillage dataset...
✅ Loaded 14509 PlantVillage images

📂 Loading Laboro dataset...
✅ Loaded 804 Laboro images

📊 Total dataset: 15313 images
   - PlantVillage (leaves): 14509
   - Laboro (fruits): 804

🔧 Loading ResNet50 model (pretrained on ImageNet)...
✅ ResNet50 loaded. Output shape: (None, 2048)
   This will extract 2048-dimensional feature vectors

🚀 Extracting phenotype features from all images...
   Processing 15313 images in batches of 32


100%|██████████████████████████████████████████████████████████████████████████████| 479/479 [1:06:32<00:00,  8.34s/it]



✅ Feature extraction complete!
   Shape: (15313, 2048)
   Successfully processed: 15313/15313 images

📊 Creating feature distribution plot...
✅ Saved: ..\AgriGenAI_Output\visualizations\feature_distribution.png

📊 Creating PCA visualization...
✅ Saved: ..\AgriGenAI_Output\visualizations\pca_visualization.png

📊 Creating t-SNE visualization...
   Using 1000 samples for t-SNE (computational efficiency)
✅ Saved: ..\AgriGenAI_Output\visualizations\tsne_visualization.png

💾 Saving features and metadata...
✅ Features saved: ..\AgriGenAI_Output\features\phenotype_features.npy
   Shape: (15313, 2048)
✅ Metadata saved: ..\AgriGenAI_Output\metadata\image_metadata.csv
   Columns: ['image_path', 'source', 'category', 'organ']
✅ Statistics saved: ..\AgriGenAI_Output\features\feature_statistics.npz

✅ DAY 1 COMPLETE!

📊 Summary:
   - Total images processed: 15313
   - Feature vectors extracted: 15313
   - Feature dimensions: 2048
   - PlantVillage (leaves): 14509
   - Laboro (fruits): 804

📁 Output