# 📥 Téléchargement et Stockage de Données - Corpus AMI

Ce notebook télécharge le corpus AMI et le sauvegarde sur Google Drive pour une utilisation dans l'entraînement de diarisation.

## 🎯 Objectifs
- Télécharger le corpus AMI complet
- Organiser les données audio et annotations
- Sauvegarder sur Google Drive
- Préparer les chemins pour le notebook d'entraînement

## 🔧 Configuration de l'Environnement

In [None]:
# Imports needed by this setup cell
import os
from pathlib import Path

from google.colab import drive

# Mount Google Drive so it is reachable under /content/drive
drive.mount('/content/drive')

# Destination folder on Drive; created (with parents) if it does not exist yet
DRIVE_DATA_DIR = Path('/content/drive/MyDrive') / 'Speaker_Diarization_Data'
DRIVE_DATA_DIR.mkdir(parents=True, exist_ok=True)

print(f"üìÅ Dossier de donn√©es cr√©√©: {DRIVE_DATA_DIR}")

In [None]:
# Install the third-party packages used for downloading (IPython shell magic).
# NOTE(review): beautifulsoup4 is not imported in any visible cell — confirm it is still needed.
!pip install -q requests tqdm beautifulsoup4

## 📊 Téléchargement du Corpus AMI

In [None]:
import urllib.request
import zipfile
import tarfile
import requests
from tqdm import tqdm
import subprocess
import json
from datetime import datetime

class AMIDataDownloader:
    """Download AMI corpus audio and annotations into a base directory.

    Three sub-directories are created under the base directory:
    ``ami_audio`` for WAV files, ``ami_annotations`` for the extracted
    annotation archive (or generated demo RTTM files) and ``temp_downloads``
    as scratch space that is removed by :meth:`setup_complete_dataset`.
    """

    # (connect, read) timeouts in seconds so a stalled server cannot hang the notebook.
    REQUEST_TIMEOUT = (10, 60)
    # Number of bytes pulled from the HTTP stream per iteration.
    CHUNK_SIZE = 8192
    # An existing audio file must be larger than this many bytes to be reused.
    MIN_AUDIO_BYTES = 10000

    def __init__(self, drive_base_dir):
        """Create the audio, annotation and temporary directories.

        Args:
            drive_base_dir: Root folder (``str`` or ``Path``) that will hold
                the dataset.
        """
        self.base_dir = Path(drive_base_dir)
        self.audio_dir = self.base_dir / "ami_audio"
        self.annotation_dir = self.base_dir / "ami_annotations"
        self.temp_dir = self.base_dir / "temp_downloads"

        for dir_path in (self.audio_dir, self.annotation_dir, self.temp_dir):
            dir_path.mkdir(parents=True, exist_ok=True)

    def download_with_progress(self, url, filename, description="T√©l√©chargement"):
        """Stream ``url`` to ``filename`` while showing a tqdm progress bar.

        The payload is first written to ``<filename>.part`` and renamed to
        ``filename`` only after the whole body was received, so an interrupted
        transfer never leaves a truncated file under the final name.

        Args:
            url: Address of the file to fetch.
            filename: Destination path (``str`` or ``Path``).
            description: Label used in the log line and on the progress bar.

        Returns:
            ``True`` when the file was written completely, ``False`` on any
            error (the error is printed, not raised).
        """
        print(f"üîΩ {description}: {url}")

        target = Path(filename)
        partial = target.with_name(target.name + ".part")

        try:
            # The context manager releases the connection even when streaming fails.
            with requests.get(url, stream=True, timeout=self.REQUEST_TIMEOUT) as response:
                response.raise_for_status()

                total_size = int(response.headers.get('content-length', 0))

                with open(partial, 'wb') as file, tqdm(
                    desc=description,
                    # tqdm expects None (not 0) when the size is unknown.
                    total=total_size or None,
                    unit='B',
                    unit_scale=True,
                    unit_divisor=1024,
                ) as pbar:
                    for chunk in response.iter_content(chunk_size=self.CHUNK_SIZE):
                        pbar.update(file.write(chunk))

            # Publish the file under its final name only once it is complete.
            partial.replace(target)
            return True

        except Exception as e:
            # Deliberately broad: a failed download must not abort the whole run.
            print(f"‚ùå Erreur t√©l√©chargement: {e}")
            try:
                partial.unlink()
            except OSError:
                # Nothing to clean up (or the file cannot be removed); keep going.
                pass
            return False

    def download_ami_corpus(self):
        """Download the hard-coded sample of AMI ``Mix-Headset`` recordings.

        Files already present in the audio directory and larger than
        ``MIN_AUDIO_BYTES`` are reused instead of being downloaded again.

        Returns:
            list[str]: File names that are available locally after the call
            (reused or freshly downloaded).
        """
        print("üéµ T√©l√©chargement du corpus AMI complet...")

        # Sample of meeting recordings: local file name -> download URL.
        ami_files = {
            "ES2002a.Mix-Headset.wav": "https://groups.inf.ed.ac.uk/ami/AMICorpusAudio/amicorpus/ES2002a/audio/ES2002a.Mix-Headset.wav",
            "ES2002b.Mix-Headset.wav": "https://groups.inf.ed.ac.uk/ami/AMICorpusAudio/amicorpus/ES2002b/audio/ES2002b.Mix-Headset.wav",
            "ES2002c.Mix-Headset.wav": "https://groups.inf.ed.ac.uk/ami/AMICorpusAudio/amicorpus/ES2002c/audio/ES2002c.Mix-Headset.wav",
            "ES2002d.Mix-Headset.wav": "https://groups.inf.ed.ac.uk/ami/AMICorpusAudio/amicorpus/ES2002d/audio/ES2002d.Mix-Headset.wav",
            "IS1000a.Mix-Headset.wav": "https://groups.inf.ed.ac.uk/ami/AMICorpusAudio/amicorpus/IS1000a/audio/IS1000a.Mix-Headset.wav",
            "IS1000b.Mix-Headset.wav": "https://groups.inf.ed.ac.uk/ami/AMICorpusAudio/amicorpus/IS1000b/audio/IS1000b.Mix-Headset.wav",
        }

        downloaded_files = []

        for filename, url in ami_files.items():
            audio_path = self.audio_dir / filename

            # Skip files that are already there and not suspiciously small.
            if audio_path.exists() and audio_path.stat().st_size > self.MIN_AUDIO_BYTES:
                print(f"‚úÖ D√©j√† pr√©sent: {filename}")
                downloaded_files.append(filename)
                continue

            if self.download_with_progress(url, str(audio_path), f"Audio {filename}"):
                downloaded_files.append(filename)
                print(f"‚úÖ T√©l√©charg√©: {filename}")
            else:
                print(f"‚ùå √âchec: {filename}")

        return downloaded_files

    def download_ami_annotations(self):
        """Download and extract the AMI annotation archive.

        Each URL is tried in order; the first archive that downloads and
        extracts successfully wins. When every URL fails, demo RTTM
        annotations are generated instead.

        Returns:
            list[str]: Names of the annotation files now available, either the
            members extracted from the archive or the generated demo files.
        """
        print("üìù T√©l√©chargement des annotations AMI...")

        annotation_urls = [
            "https://groups.inf.ed.ac.uk/ami/AMICorpusAnnotations/ami_public_manual_1.6.2.zip",
        ]

        # setup_complete_dataset() deletes the scratch folder, so recreate it
        # to keep repeated runs on the same instance working.
        self.temp_dir.mkdir(parents=True, exist_ok=True)

        for i, url in enumerate(annotation_urls):
            zip_path = self.temp_dir / f"ami_annotations_{i}.zip"

            if not self.download_with_progress(url, str(zip_path), f"Annotations AMI ({i+1})"):
                continue

            try:
                extracted = self._extract_annotation_archive(zip_path)
            except Exception as e:
                # Best effort: a corrupt archive falls through to the next URL / demo data.
                print(f"‚ùå Erreur annotations {url}: {e}")
                continue

            print(f"‚úÖ Annotations extraites: {zip_path.name}")
            return extracted

        print("‚ö†Ô∏è T√©l√©chargement annotations √©chou√© - cr√©ation d'annotations de d√©monstration")
        return self.create_demo_annotations()

    def _extract_annotation_archive(self, zip_path):
        """Unpack ``zip_path`` into the annotation directory and delete the archive.

        Returns:
            list[str]: Sorted archive member names, directories excluded.
        """
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(self.annotation_dir)
            members = sorted(
                info.filename for info in zip_ref.infolist() if not info.is_dir()
            )

        # The archive is only a transport container; drop it once unpacked.
        Path(zip_path).unlink()
        return members

    def create_demo_annotations(self):
        """Write hard-coded demo RTTM files for three meetings.

        Each RTTM line has the form
        ``SPEAKER <meeting> 1 <start> <duration> <NA> <NA> <speaker> <NA> <NA>``.

        Returns:
            list[str]: Names of the RTTM files that were written.
        """
        print("üè∑Ô∏è Cr√©ation d'annotations de d√©monstration...")

        # Each entry is "start end speaker", times in seconds.
        demo_annotations = {
            "ES2002a": [
                "0.00 30.50 A",
                "30.50 65.20 B",
                "65.20 120.80 A",
                "120.80 180.30 C",
                "180.30 240.00 B",
            ],
            "ES2002b": [
                "0.00 45.30 B",
                "45.30 90.60 A",
                "90.60 135.90 C",
                "135.90 200.00 A",
            ],
            "IS1000a": [
                "0.00 60.00 A",
                "60.00 120.00 B",
                "120.00 180.00 A",
                "180.00 240.00 C",
            ],
        }

        created_annotations = []

        for meeting_id, segments in demo_annotations.items():
            rttm_path = self.annotation_dir / f"{meeting_id}.rttm"

            with open(rttm_path, 'w', encoding='utf-8') as f:
                for segment in segments:
                    start_time, end_time, speaker = segment.split()
                    duration = float(end_time) - float(start_time)
                    # Two decimals keep float noise (e.g. 45.30000000000001) out of the file.
                    f.write(f"SPEAKER {meeting_id} 1 {start_time} {duration:.2f} <NA> <NA> {speaker} <NA> <NA>\n")

            created_annotations.append(f"{meeting_id}.rttm")
            print(f"‚úÖ Annotation cr√©√©e: {meeting_id}.rttm")

        return created_annotations

    def create_metadata(self, audio_files, annotation_files):
        """Write ``dataset_metadata.json`` describing the downloaded dataset.

        Args:
            audio_files: Iterable of audio file names.
            annotation_files: Iterable of annotation file names.

        Returns:
            dict: The metadata that was written to disk.
        """
        # Materialise the inputs so len() and JSON serialisation work for any iterable.
        audio_files = list(audio_files)
        annotation_files = list(annotation_files)

        metadata = {
            "download_date": datetime.now().isoformat(),
            "corpus": "AMI",
            "audio_files": audio_files,
            "annotation_files": annotation_files,
            "audio_dir": str(self.audio_dir),
            "annotation_dir": str(self.annotation_dir),
            "total_audio_files": len(audio_files),
            "total_annotation_files": len(annotation_files),
        }

        metadata_path = self.base_dir / "dataset_metadata.json"
        with open(metadata_path, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=2)

        print(f"‚úÖ M√©tadonn√©es sauvegard√©es: {metadata_path}")
        return metadata

    def setup_complete_dataset(self):
        """Run the whole pipeline: audio, annotations, metadata, cleanup.

        Returns:
            dict: ``success`` (``True`` when at least one audio file and one
            annotation file are available), ``audio_dir``, ``annotation_dir``
            and the ``metadata`` dictionary.
        """
        print("üöÄ D√©but du t√©l√©chargement du corpus AMI complet...")
        print("="*60)

        # Step 1: audio files
        audio_files = self.download_ami_corpus()

        # Step 2: annotations (always a list of file names)
        annotation_files = self.download_ami_annotations()

        # Step 3: metadata
        metadata = self.create_metadata(audio_files, annotation_files)

        # Step 4: remove the scratch folder; a leftover folder is harmless.
        if self.temp_dir.exists():
            import shutil
            shutil.rmtree(self.temp_dir, ignore_errors=True)

        print("\n" + "="*60)
        print(f"‚úÖ T√©l√©chargement termin√©!")
        print(f"üìÅ Donn√©es audio: {self.audio_dir} ({len(audio_files)} fichiers)")
        print(f"üìù Annotations: {self.annotation_dir} ({len(annotation_files)} fichiers)")

        return {
            # Report failure when nothing usable ended up on disk.
            'success': bool(audio_files) and bool(annotation_files),
            'audio_dir': str(self.audio_dir),
            'annotation_dir': str(self.annotation_dir),
            'metadata': metadata
        }

## 🚀 Exécution du Téléchargement

In [None]:
# Build the downloader on top of the Drive folder and run the full pipeline
downloader = AMIDataDownloader(DRIVE_DATA_DIR)
result = downloader.setup_complete_dataset()

if not result['success']:
    print("‚ùå Erreur lors du t√©l√©chargement")
else:
    print("\nüéâ Dataset AMI pr√™t pour l'entra√Ænement!")
    print("\nüìã Informations pour le notebook d'entra√Ænement:")
    # Echo the two paths in a form that can be pasted into the training notebook
    for constant_name, result_key in (
        ("AUDIO_DIR", 'audio_dir'),
        ("ANNOTATION_DIR", 'annotation_dir'),
    ):
        print(f"{constant_name} = '{result[result_key]}'")

## 🔍 Vérification des Données

In [None]:
# Inspect what ended up in the Drive data folder
import os

audio_dir = DRIVE_DATA_DIR / "ami_audio"
annotation_dir = DRIVE_DATA_DIR / "ami_annotations"
metadata_file = DRIVE_DATA_DIR / "dataset_metadata.json"

print("üìä V√©rification des donn√©es t√©l√©charg√©es:")
print("=" * 50)

# Audio files: show the count, then name and size of the first five
if not audio_dir.exists():
    print("‚ùå Dossier audio non trouv√©")
else:
    audio_files = list(audio_dir.glob("*.wav"))
    print(f"üéµ Fichiers audio: {len(audio_files)}")
    for wav_path in audio_files[:5]:
        size_mb = wav_path.stat().st_size / (1024 * 1024)
        print(f"  - {wav_path.name} ({size_mb:.1f} MB)")
    hidden_audio = len(audio_files) - 5
    if hidden_audio > 0:
        print(f"  ... et {hidden_audio} autres")

print()

# Annotation files: top-level .rttm files first, then top-level .txt files
if not annotation_dir.exists():
    print("‚ùå Dossier annotations non trouv√©")
else:
    annotation_files = [
        *annotation_dir.glob("*.rttm"),
        *annotation_dir.glob("*.txt"),
    ]
    print(f"üìù Fichiers d'annotations: {len(annotation_files)}")
    for annotation_path in annotation_files[:5]:
        print(f"  - {annotation_path.name}")
    hidden_annotations = len(annotation_files) - 5
    if hidden_annotations > 0:
        print(f"  ... et {hidden_annotations} autres")

# Metadata summary, printed only when the JSON file exists
if metadata_file.exists():
    metadata = json.loads(metadata_file.read_text())
    print("\nüìã M√©tadonn√©es:")
    for label, key in (
        ("Date de t√©l√©chargement", 'download_date'),
        ("Corpus", 'corpus'),
        ("Total fichiers audio", 'total_audio_files'),
        ("Total fichiers annotations", 'total_annotation_files'),
    ):
        print(f"  - {label}: {metadata[key]}")

print("\n‚úÖ V√©rification termin√©e!")

## 📝 Instructions d'Utilisation

Les données sont maintenant sauvegardées sur votre Google Drive. Pour utiliser ces données dans le notebook d'entraînement :

1. **Montez Google Drive** dans votre notebook d'entra√Ænement
2. **Utilisez ces chemins** :
   ```python
   AUDIO_DIR = '/content/drive/MyDrive/Speaker_Diarization_Data/ami_audio'
   ANNOTATION_DIR = '/content/drive/MyDrive/Speaker_Diarization_Data/ami_annotations'
   ```
3. **Chargez les métadonnées** si nécessaire :
   ```python
   import json
   with open('/content/drive/MyDrive/Speaker_Diarization_Data/dataset_metadata.json', 'r') as f:
       metadata = json.load(f)
   ```

### 🔄 Avantages de cette approche :
- **Pas de re-téléchargement** : Les données restent sur votre Drive
- **Accès rapide** : Chargement direct depuis le Drive monté
- **Persistance** : Les données survivent aux redémarrages de Colab
- **Organisation** : Structure claire et métadonnées incluses