# 🎤 DOWNLOAD E PREPARAZIONE DATASET ITALIANO PER PIPER
## Dataset: giacomoarienti/female-LJSpeech-italian

**Questo notebook scarica automaticamente il dataset italiano e lo prepara nel formato corretto per Piper TTS**

---

### Cosa fa questo notebook:
- ✅ Scarica 5856 file audio in italiano da Hugging Face
- ✅ Converte tutto a 16000Hz, Mono, 16-bit
- ✅ Crea la struttura directory corretta (wavs/ + metadata.csv)
- ✅ Verifica completezza e correttezza del dataset

### Risultato finale:
- 📁 Directory: `/content/ljspeech_italian/`
- 🎵 5856 file audio (8h 23m totali)
- 📄 metadata.csv con tutte le trascrizioni

---

In [None]:
# ============================================================
# STEP 1: Installazione dipendenze
# ============================================================
print("üì¶ Installazione dipendenze...")

!pip install -q datasets librosa soundfile pandas tqdm

print("‚úÖ Dipendenze installate!\n")

In [None]:
# ============================================================
# STEP 2: Download the Italian dataset
# ============================================================
# Fetches the "train" split from the Hugging Face Hub and switches the
# "audio" column to undecoded mode, so later cells receive the stored
# audio entry instead of a decoded array.
banner_rule = "=" * 60
print(banner_rule, "  üì• DOWNLOAD DATASET ITALIANO", banner_rule, sep="\n")

import os
from datasets import Audio, load_dataset

print(
    "\nüìö Scarico dataset 'giacomoarienti/female-LJSpeech-italian'...",
    "‚è±Ô∏è  Questo richieder√† alcuni minuti...\n",
    sep="\n",
)

try:
    # Load the split first ...
    dataset = load_dataset(
        "giacomoarienti/female-LJSpeech-italian",
        split="train",
    )

    # ... then turn off automatic audio decoding for the "audio" column.
    dataset = dataset.cast_column("audio", Audio(decode=False))

    print(f"‚úÖ Dataset scaricato: {len(dataset)} campioni audio\n")

except Exception as download_error:
    # Report the failure in the cell output, then let it propagate so the
    # notebook run stops here.
    print(f"‚ùå Errore durante il download: {download_error}")
    raise

In [None]:
# ============================================================
# STEP 3: Create the directory layout
# ============================================================
# Builds <dataset_dir>/wavs/; the dataset root is created implicitly
# because os.makedirs creates intermediate directories.
banner_rule = "=" * 60
print(banner_rule, "  üìÅ CREAZIONE STRUTTURA DATASET", banner_rule, sep="\n")

dataset_dir = "/content/ljspeech_italian"
wavs_dir = dataset_dir + "/wavs"

# exist_ok=True makes the cell safe to re-run.
os.makedirs(wavs_dir, exist_ok=True)

print("\n‚úÖ Directory create:")
print(
    f"   üìÅ {dataset_dir}/",
    f"   üìÅ {wavs_dir}/\n",
    sep="\n",
)

In [None]:
# ============================================================
# STEP 4: Audio conversion and transcript collection
# ============================================================
# For every dataset row: pick a transcript, decode the audio, convert it to
# mono / 16000 Hz / 16-bit PCM, write it to <wavs_dir>/<name>.wav and queue a
# "name|text" line in `metadata_lines`. Rows that fail for any reason are
# counted in `errors` and leave no WAV behind.
print("="*60)
print("  üîä CONVERSIONE AUDIO E TRASCRIZIONI")
print("="*60)

import soundfile as sf
import librosa
import numpy as np
from tqdm import tqdm
import os
import re
import io

TARGET_SR = 16000          # sample rate of every written WAV
PEAK_LEVEL = 0.95          # peak amplitude after normalisation
TEXT_FIELDS = ('text', 'sentence', 'transcription', 'transcript')


def _extract_transcript(item):
    """Return the first usable transcript of a dataset row, or '' if none.

    The candidate fields in TEXT_FIELDS are tried in order. The value is
    flattened to a single line: every whitespace run (including newlines)
    becomes one space, and '|' is replaced by a space because it is the
    column separator of metadata.csv. A field that is empty after cleaning
    is skipped so the next candidate field can still be used.
    """
    for field in TEXT_FIELDS:
        if field in item and item[field]:
            flattened = str(item[field]).replace('|', ' ')
            cleaned = re.sub(r'\s+', ' ', flattened).strip()
            if cleaned:
                return cleaned
    return ''


def _load_mono_16k(audio_entry):
    """Decode one undecoded audio entry into mono float samples at TARGET_SR.

    Raises whatever soundfile/librosa/numpy raise on unreadable or empty
    audio; the caller treats any exception as a failed row.
    """
    raw_bytes = audio_entry['bytes']
    if raw_bytes is not None:
        source = io.BytesIO(raw_bytes)
    else:
        # NOTE(review): assumes an undecoded entry may carry only a 'path'
        # when the bytes are not embedded - confirm against the datasets docs.
        source = audio_entry.get('path')
        if source is None:
            raise ValueError("audio entry has neither bytes nor path")

    samples, source_sr = sf.read(source)

    # Multi-channel audio comes back as (frames, channels): average to mono.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    if source_sr != TARGET_SR:
        samples = librosa.resample(samples, orig_sr=source_sr, target_sr=TARGET_SR)

    # Peak-normalise; silent clips (peak == 0) are left untouched.
    peak = np.max(np.abs(samples))
    if peak > 0:
        samples = samples / peak * PEAK_LEVEL

    return samples


metadata_lines = []
errors = 0
success = 0

print("\n‚öôÔ∏è  Conversione in corso (16000Hz, Mono, 16-bit)...\n")

for idx, item in enumerate(tqdm(dataset, desc="Processando")):
    # Reset per row so the cleanup in the except branch can never remove a
    # file that belongs to a previous, successfully converted row.
    wav_path = None
    try:
        # Check the transcript first: without text the row is unusable, so
        # there is no point in decoding and writing its audio.
        text = _extract_transcript(item)
        if not text:
            errors += 1
            continue

        # File name: the row's id when present, otherwise a zero-padded index.
        if 'id' in item and item['id']:
            filename = item['id']
        else:
            filename = f"audio_{idx:06d}"
        wav_path = f"{wavs_dir}/{filename}.wav"

        audio = _load_mono_16k(item['audio'])

        # Save as WAV (16000 Hz, mono, 16-bit PCM).
        sf.write(wav_path, audio, TARGET_SR, subtype='PCM_16')

        metadata_lines.append(f"{filename}|{text}")
        success += 1

    except Exception as e:
        errors += 1
        # Only the first few failures are printed to keep the output readable.
        if errors <= 5:
            print(f"\n‚ö†Ô∏è  Errore file {idx}: {e}")
        # Remove a partially written file so WAVs and metadata stay in sync.
        if wav_path is not None and os.path.exists(wav_path):
            try:
                os.remove(wav_path)
            except OSError:
                # Best-effort cleanup: a leftover file is reported later by
                # the verification step as a count mismatch.
                pass

print(f"\n‚úÖ Conversione completata!")
print(f"   ‚úì File salvati: {success}")
print(f"   ‚úó Errori: {errors}\n")

In [None]:
# ============================================================
# STEP 5: Write metadata.csv
# ============================================================
# Dumps the "filename|text" lines collected during conversion into
# <dataset_dir>/metadata.csv as UTF-8, one entry per line, with newlines
# only between entries (no trailing newline).
banner_rule = "=" * 60
print(banner_rule, "  üíæ SALVATAGGIO METADATA", banner_rule, sep="\n")

metadata_path = dataset_dir + "/metadata.csv"

with open(metadata_path, 'w', encoding='utf-8') as metadata_file:
    metadata_body = '\n'.join(metadata_lines)
    metadata_file.write(metadata_body)

row_count = len(metadata_lines)
print(f"\n‚úÖ metadata.csv salvato: {row_count} righe\n")

In [None]:
# ============================================================
# STEP 6: Full dataset verification
# ============================================================
# Cross-checks metadata.csv against the WAV directory, spot-checks the
# format of 5 random files, previews the first transcripts, estimates the
# total duration from a sample of up to 100 files and prints a final verdict.
print("="*60)
print("  üîç VERIFICA DATASET")
print("="*60)

import csv
import pandas as pd
import wave
from pathlib import Path
import random

# Read the metadata as plain text columns:
#  - QUOTE_NONE: a '"' inside a transcript is ordinary text; with pandas'
#    default quoting an unbalanced quote can swallow the following lines
#    and distort the row count.
#  - dtype=str / keep_default_na=False: ids such as "000123" keep their
#    leading zeros and transcripts such as "NA" or "null" stay strings
#    instead of becoming NaN (which would break len() further down).
metadata = pd.read_csv(
    metadata_path,
    sep='|',
    header=None,
    names=['filename', 'text'],
    dtype=str,
    quoting=csv.QUOTE_NONE,
    keep_default_na=False,
    encoding='utf-8',
)
num_metadata = len(metadata)

# Collect the WAV files; sorted so the duration sample below is the same on
# every run instead of depending on directory listing order.
wav_files = sorted(Path(wavs_dir).glob("*.wav"))
num_wavs = len(wav_files)

print(f"\nüìä Statistiche:")
print(f"   üìÑ Righe metadata: {num_metadata}")
print(f"   üéµ File WAV: {num_wavs}")

if num_metadata == num_wavs:
    print(f"   ‚úÖ Corrispondenza perfetta!")
else:
    print(f"   ‚ö†Ô∏è  Discrepanza: {abs(num_metadata - num_wavs)} file")

# Check the audio format of 5 random files
print(f"\nüîä Test 5 file audio casuali:")

sample_files = random.sample(wav_files, min(5, len(wav_files)))
all_correct = True

for wav_path in sample_files:
    try:
        with wave.open(str(wav_path), 'rb') as wav:
            sr = wav.getframerate()
            channels = wav.getnchannels()
            sampwidth = wav.getsampwidth()
            frames = wav.getnframes()
            duration = frames / sr

            # Expected format: 16000 Hz, 1 channel, 2 bytes (16 bit) per sample.
            is_correct = (sr == 16000 and channels == 1 and sampwidth == 2)
            status = "‚úÖ" if is_correct else "‚ùå"

            print(f"{status} {wav_path.name}: {sr}Hz, {channels}ch, {sampwidth*8}bit, {duration:.2f}s")

            if not is_correct:
                all_correct = False
    except Exception as e:
        # An unreadable file counts as a format failure.
        print(f"‚ùå {wav_path.name}: Errore - {e}")
        all_correct = False

# Show the first 5 transcripts (truncated to 70 characters)
print(f"\nüìù Prime 5 trascrizioni:")
for idx, row in metadata.head(5).iterrows():
    text_preview = (row['text'][:70] + "...") if len(row['text']) > 70 else row['text']
    print(f"   {row['filename']}: {text_preview}")

# Estimate the total duration from a sample of files
print(f"\n‚è±Ô∏è  Calcolo durata totale...")
total_duration = 0.0
measured_files = 0
sample_size = min(100, len(wav_files))

for wav_file in tqdm(wav_files[:sample_size], desc="Campionamento"):
    try:
        with wave.open(str(wav_file), 'rb') as wav:
            total_duration += wav.getnframes() / wav.getframerate()
        measured_files += 1
    except (wave.Error, EOFError, OSError, ZeroDivisionError):
        # Unreadable files are left out of the average rather than being
        # counted as 0 seconds, which would bias the estimate downwards.
        continue

# Extrapolate to the whole directory. The values are reset first so a
# previous run of this cell cannot leak stale numbers into the summary.
hours = minutes = seconds = None
if measured_files > 0:
    avg_duration = total_duration / measured_files
    estimated_total = avg_duration * len(wav_files)

    hours = int(estimated_total // 3600)
    minutes = int((estimated_total % 3600) // 60)
    seconds = int(estimated_total % 60)

    print(f"\nüìä Durata stimata totale: {hours}h {minutes}m {seconds}s")
    print(f"üìä Durata media per file: {avg_duration:.2f}s")

# FINAL SUMMARY
print("\n" + "="*60)
print("  üìä RIEPILOGO FINALE")
print("="*60)

if all_correct and num_metadata == num_wavs and num_wavs > 0:
    print("\nüéâ DATASET PRONTO PER IL TRAINING!")
    print(f"\n‚úÖ Tutto corretto:")
    print(f"   ‚úì Dataset: giacomoarienti/female-LJSpeech-italian")
    print(f"   ‚úì {num_wavs} file audio")
    print(f"   ‚úì {num_metadata} trascrizioni")
    print(f"   ‚úì Formato: 16000Hz, Mono, 16-bit")
    if hours is not None:
        print(f"   ‚úì Durata: ~{hours}h {minutes}m")
    print(f"\nüìç Path dataset: {dataset_dir}")
    print(f"\nüí° Usa questo path nel notebook di training:")
    print(f'   DATASET_DIR = "{dataset_dir}"')
else:
    print("\n‚ö†Ô∏è  Alcuni problemi rilevati:")
    if not all_correct:
        print("   ‚Ä¢ Formato audio non uniforme")
    if num_metadata != num_wavs:
        print(f"   ‚Ä¢ Discrepanza metadata/audio: {num_metadata} vs {num_wavs}")
    print("\nVerifica i dettagli sopra prima di procedere.")

print("\n" + "="*60)