In [1]:
import os

def print_tree(start=".", max_files=5):
    """Print an indented directory tree rooted at ``start``.

    Each directory is printed as ``name/``, indented four spaces per nesting
    level, followed by at most ``max_files`` of its files. When a directory
    holds more files than are shown, a ``...`` line marks the truncation.

    Args:
        start: Directory to walk. Defaults to the current directory.
        max_files: Maximum number of file names listed per directory.
            ``None`` lists every file.
    """
    for root, dirs, files in os.walk(start, topdown=True):
        # Sorting in place makes the listing reproducible: with topdown=True,
        # os.walk descends into sub-directories in the order left in `dirs`,
        # and "the first N files" is otherwise whatever order the OS returns.
        dirs.sort(key=str.casefold)
        files.sort(key=str.casefold)

        # Depth is measured relative to `start`, so the indentation is correct
        # for any starting path, not only for ".".
        rel = os.path.relpath(root, start)
        level = 0 if rel == os.curdir else rel.count(os.sep) + 1
        indent = " " * 4 * level
        # normpath drops a trailing separator so basename is never empty.
        print(f"{indent}{os.path.basename(os.path.normpath(root))}/")

        subindent = " " * 4 * (level + 1)
        shown = files if max_files is None else files[:max_files]
        for f in shown:
            print(f"{subindent}{f}")
        if len(shown) < len(files):
            print(f"{subindent}...")


# __name__ is "__main__" inside a notebook kernel, so this runs when the cell
# is executed but not when the code is imported as a module.
if __name__ == "__main__":
    print_tree(".", max_files=5)



./
    structure_of_data.ipynb
    .ipynb_checkpoints/
        model-checkpoint.ipynb
        structure_of_data-checkpoint.ipynb
        Untitled-checkpoint.ipynb
    AgriGenAI_Dataset/
        CropsDB/
            Solanum_lycopersicum.xlsx
        Laboro/
            annotations/
                test.json
                train.json
            images/
                IMG_0983.jpg
                IMG_0984.jpg
                IMG_0985.jpg
                IMG_0986.jpg
                IMG_0987.jpg
                ...
            labels/
                IMG_0983.txt
                IMG_0984.txt
                IMG_0985.txt
                IMG_0986.txt
                IMG_0987.txt
                ...
        metadata/
            master_dataset.csv
        PlantVillage/
            images/
                Bacterial_spot/
                    Bs10.jpg
                    Bs100.jpg
                    Bs1000.jpg
                    Bs1001.jpg
                    Bs1002.jpg
                    

In [2]:
import requests
from bs4 import BeautifulSoup
import os
import time
from pathlib import Path

class SGNImageDownloader:
    """Helper for collecting images from SGN (solgenomics.net).

    Creating an instance creates ``output_dir`` (including parents) if it
    does not exist yet; downloaded images are written there.
    """

    def __init__(self, output_dir='AgriGenAI_Dataset/SGN/downloaded_images'):
        """Remember the SGN base URL and make sure ``output_dir`` exists.

        Args:
            output_dir: Directory that downloaded images are written to.
        """
        self.base_url = 'https://solgenomics.net'
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def search_tomato_images(self, query='tomato fruit', max_images=500):
        """Placeholder for an automated SGN image search.

        No request is made: the method only prints the recommended manual
        workflow. ``query`` and ``max_images`` are accepted for a future
        implementation (the search page would be
        ``f'{self.base_url}/search/images'``) and are currently ignored.

        Returns:
            An empty list, always.
        """
        # Note: SGN requires manual interaction for most searches
        # This is a template - you'll need to adapt based on actual SGN structure

        print("⚠️ SGN doesn't support automated bulk downloads.")
        print("Recommended approach:")
        print("1. Manual search on SGN website")
        print("2. Use their API if available (check documentation)")
        print("3. Contact SGN for research dataset access")

        return []

    def download_from_url_list(self, url_file, delay=0.5, timeout=10):
        """Download every URL listed in a text file into ``self.output_dir``.

        The file is read as UTF-8 with one URL per line; blank lines are
        ignored. Each successful response (HTTP 200) is saved as
        ``sgn_image_<index>.jpg``, where ``<index>`` is the zero-based
        position of the URL in the list, so failed URLs leave gaps in the
        numbering. Failures are reported on stdout and do not stop the run.

        Args:
            url_file: Path of the text file containing the URLs.
            delay: Seconds to wait between consecutive requests, applied
                after failures as well as successes. ``0`` disables it.
            timeout: Timeout in seconds passed to ``requests.get``.

        Returns:
            List of ``Path`` objects for the files written, in download order.

        Raises:
            OSError: If ``url_file`` cannot be opened.
        """
        with open(url_file, 'r', encoding='utf-8') as url_handle:
            urls = [line.strip() for line in url_handle if line.strip()]

        print(f"📥 Downloading {len(urls)} images...")

        saved = []
        for idx, url in enumerate(urls):
            # Be respectful to the server: pause before every request except
            # the first, regardless of whether the previous one succeeded.
            if idx and delay > 0:
                time.sleep(delay)

            try:
                response = requests.get(url, timeout=timeout)
                if response.status_code == 200:
                    filename = f"sgn_image_{idx:04d}.jpg"
                    filepath = self.output_dir / filename
                    filepath.write_bytes(response.content)
                    saved.append(filepath)

                    print(f"✅ Downloaded: {filename}")
                else:
                    print(f"❌ Failed: {url}")
            except Exception as e:
                # Deliberately broad: one bad URL or write error must not
                # abort the remaining downloads.
                print(f"❌ Error downloading {url}: {e}")

        print(f"\n✅ Download complete! Images saved to {self.output_dir}")
        return saved

# Usage
if __name__ == "__main__":
    # Instantiating the downloader also creates its output directory.
    downloader = SGNImageDownloader()

    # Option 1: If you have a URL list
    # downloader.download_from_url_list('sgn_urls.txt')

    # Option 2: Manual recommendation, printed one line at a time.
    strategy_lines = (
        "🌱 SGN Image Collection Strategy:",
        "\n1. Visit: https://solgenomics.net/search/images",
        "2. Search for 'tomato fruit', 'tomato leaf', 'tomato plant'",
        "3. Right-click images and save manually (or use browser extension)",
        "4. Target: 300-500 images from SGN",
        "\nAlternatively, focus on existing datasets which are already comprehensive!",
    )
    for strategy_line in strategy_lines:
        print(strategy_line)

🌱 SGN Image Collection Strategy:

1. Visit: https://solgenomics.net/search/images
2. Search for 'tomato fruit', 'tomato leaf', 'tomato plant'
3. Right-click images and save manually (or use browser extension)
4. Target: 300-500 images from SGN

Alternatively, focus on existing datasets which are already comprehensive!


In [2]:
import os
import csv
import pandas as pd

# Paths
plantvillage_root = 'AgriGenAI_Dataset/PlantVillage/images'
laboro_image_dir = 'AgriGenAI_Dataset/Laboro/images'
laboro_label_dir = 'AgriGenAI_Dataset/Laboro/labels'
cropsdb_excel = 'AgriGenAI_Dataset/CropsDB/Solanum_lycopersicum.xlsx'
output_csv = 'AgriGenAI_Dataset/metadata/master_dataset.csv'

# Column order of the master CSV; every row produced below follows it.
MASTER_COLUMNS = [
    'image_id', 'image_type', 'source_dataset',
    'trait_name', 'trait_value',
    'genotype_marker', 'genotype_value',
    'image_path'
]

# Laboro class id (first field of each label line) -> fruit-quality trait.
# NOTE(review): confirm this id-to-name mapping against the Laboro label
# definition; ids not listed here are written as 'unknown'.
label_map = {'0': 'ripe', '1': 'unripe', '2': 'reject'}


def plantvillage_rows(root, extensions=('.jpg', '.jpeg')):
    """Yield one master-CSV row per image under ``root/<disease_folder>/``.

    Args:
        root: Directory whose sub-directories are named after the leaf
            condition of the images they contain.
        extensions: File extensions treated as images. Matching is
            case-insensitive so files such as ``x.JPG`` are not skipped.

    Yields:
        Row lists in ``MASTER_COLUMNS`` order. The trait value is the folder
        name lower-cased with underscores replaced by spaces. ``image_path``
        always uses forward slashes so the CSV is identical on every OS.
    """
    suffixes = tuple(ext.lower() for ext in extensions)
    # sorted(): os.listdir order is arbitrary; sorting keeps the CSV stable.
    for disease_folder in sorted(os.listdir(root)):
        folder_path = os.path.join(root, disease_folder)
        if not os.path.isdir(folder_path):
            continue
        trait_value = disease_folder.replace('_', ' ').lower()
        for img_file in sorted(os.listdir(folder_path)):
            if img_file.lower().endswith(suffixes):
                yield [
                    img_file, 'leaf', 'PlantVillage',
                    'leaf_health', trait_value,
                    '', '', '/'.join(('PlantVillage/images', disease_folder, img_file))
                ]


def laboro_rows(label_dir, class_names=None):
    """Yield one master-CSV row per annotation line in the Laboro label files.

    Every ``*.txt`` file in ``label_dir`` is read; the first whitespace-
    separated field of each non-blank line is taken as the class id. An image
    with several annotation lines therefore produces several rows.

    Args:
        label_dir: Directory containing the ``.txt`` label files.
        class_names: Mapping from class id (as a string) to trait value.
            Defaults to the module-level ``label_map``.

    Yields:
        Row lists in ``MASTER_COLUMNS`` order. The image id is the label file
        name with its extension replaced by ``.jpg``.
    """
    names = label_map if class_names is None else class_names
    for label_file in sorted(os.listdir(label_dir)):
        if not label_file.endswith('.txt'):
            continue
        # splitext swaps only the final extension, unlike str.replace which
        # would rewrite every '.txt' occurring in the name.
        image_id = os.path.splitext(label_file)[0] + '.jpg'
        image_path = '/'.join(('Laboro/images', image_id))
        with open(os.path.join(label_dir, label_file), 'r') as label_handle:
            for line in label_handle:
                fields = line.split()
                if not fields:
                    # Blank line (e.g. trailing newline): nothing to record.
                    continue
                yield [
                    image_id, 'fruit', 'Laboro',
                    'fruit_quality', names.get(fields[0], 'unknown'),
                    '', '', image_path
                ]


def cropsdb_rows(df, gene_col='gene_symbol', trait_col='Direct_annotation'):
    """Yield one genotype row per CropsDB record that has both gene and trait.

    Args:
        df: DataFrame loaded from the CropsDB spreadsheet.
        gene_col: Column holding the gene symbol.
        trait_col: Column holding the trait annotation.

    Yields:
        Row lists in ``MASTER_COLUMNS`` order with the trait in ``trait_name``
        and the gene in ``genotype_marker``. Records where either value is
        missing (NaN) are skipped. A column absent from ``df`` is treated as
        empty strings, matching the previous ``row.get(..., '')`` behaviour.
    """
    def column(name):
        if name in df.columns:
            return df[name]
        return pd.Series('', index=df.index, dtype=object)

    genes = column(gene_col)
    traits = column(trait_col)
    keep = genes.notna() & traits.notna()
    for gene, trait in zip(genes[keep], traits[keep]):
        yield [
            '', 'genotype', 'CropsDB',
            trait, '', gene, '', ''
        ]


def write_master_dataset(output_path, rows):
    """Write the header plus ``rows`` to ``output_path`` as UTF-8 CSV.

    The parent directory is created when missing.

    Args:
        output_path: Destination CSV path; an existing file is overwritten.
        rows: Iterable of row lists in ``MASTER_COLUMNS`` order.

    Returns:
        Number of data rows written (header excluded).
    """
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    written = 0
    with open(output_path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(MASTER_COLUMNS)
        for row in rows:
            writer.writerow(row)
            written += 1
    return written


# __name__ is "__main__" inside a notebook kernel, so this runs when the cell
# is executed but not when the code is imported as a module.
if __name__ == "__main__":
    # 3. CropsDB spreadsheet (kept as module-level `df`, as before)
    df = pd.read_excel(cropsdb_excel)

    # Collect every row BEFORE opening the output file: if a dataset folder is
    # missing the error surfaces here and an existing CSV is left untouched.
    master_rows = [
        *plantvillage_rows(plantvillage_root),   # 1. PlantVillage
        *laboro_rows(laboro_label_dir),          # 2. Laboro
        *cropsdb_rows(df),                       # 3. CropsDB
    ]
    write_master_dataset(output_csv, master_rows)
