{
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Whisper Technical Terms Fine-tuning Demo\n",
    "\n",
    "This notebook demonstrates how to fine-tune Whisper models for better recognition of technical terms like Maven, GitHub, Git, Portkey, OpenAI, ChatGPT, LLM, Groq, and Grok.\n",
    "\n",
    "## Setup\n",
    "\n",
    "Make sure you've installed all dependencies and have a GPU available for training."
   ]
  }

In [18]:
{
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "import os\n",
    "from pathlib import Path\n",
    "import torch\n",
    "import yaml\n",
    "import json\n",
    "import asyncio\n",
    "\n",
    "# Add project root to path\n",
    "project_root = Path().cwd().parent if Path().cwd().name == 'notebooks' else Path().cwd()\n",
    "sys.path.append(str(project_root))\n",
    "\n",
    "print(f\"Project root: {project_root}\")\n",
    "print(f\"CUDA available: {torch.cuda.is_available()}\")\n",
    "if torch.cuda.is_available():\n",
    "    print(f\"GPU: {torch.cuda.get_device_name()}\")\n",
    "    print(f\"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB\")"
   ]
  },

({'cell_type': 'code',
  'execution_count': None,
  'metadata': {},
  'outputs': [],
  'source': ['import sys\n',
   'import os\n',
   'from pathlib import Path\n',
   'import torch\n',
   'import yaml\n',
   'import json\n',
   'import asyncio\n',
   '\n',
   '# Add project root to path\n',
   "project_root = Path().cwd().parent if Path().cwd().name == 'notebooks' else Path().cwd()\n",
   'sys.path.append(str(project_root))\n',
   '\n',
   'print(f"Project root: {project_root}")\n',
   'print(f"CUDA available: {torch.cuda.is_available()}")\n',
   'if torch.cuda.is_available():\n',
   '    print(f"GPU: {torch.cuda.get_device_name()}")\n',
   '    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")']},)

{
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Load Configuration\n",
    "\n",
    "Load the project configuration and examine the technical terms we'll be working with."
   ]
  },

In [19]:
{
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load configuration\n",
    "config_path = project_root / \"config\" / \"config.yaml\"\n",
    "with open(config_path, 'r') as f:\n",
    "    config = yaml.safe_load(f)\n",
    "\n",
    "print(\"Configuration loaded:\")\n",
    "print(f\"Model: {config['model']['name']}\")\n",
    "print(f\"Training epochs: {config['training']['num_epochs']}\")\n",
    "print(f\"Batch size: {config['training']['batch_size']}\")\n",
    "print(f\"Learning rate: {config['training']['learning_rate']}\")"
   ]
  },

({'cell_type': 'code',
  'execution_count': None,
  'metadata': {},
  'outputs': [],
  'source': ['# Load configuration\n',
   'config_path = project_root / "config" / "config.yaml"\n',
   "with open(config_path, 'r') as f:\n",
   '    config = yaml.safe_load(f)\n',
   '\n',
   'print("Configuration loaded:")\n',
   'print(f"Model: {config[\'model\'][\'name\']}")\n',
   'print(f"Training epochs: {config[\'training\'][\'num_epochs\']}")\n',
   'print(f"Batch size: {config[\'training\'][\'batch_size\']}")\n',
   'print(f"Learning rate: {config[\'training\'][\'learning_rate\']}")']},)

In [20]:
{
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load and display technical terms\n",
    "terms_file = project_root / \"data\" / \"tech_terms.json\"\n",
    "if terms_file.exists():\n",
    "    with open(terms_file, 'r') as f:\n",
    "        tech_terms = json.load(f)\n",
    "    \n",
    "    print(\"Technical Terms by Category:\")\n",
    "    for category, terms in tech_terms['technical_terms'].items():\n",
    "        if isinstance(terms, list):\n",
    "            print(f\"\\n{category.upper()}:\")\n",
    "            for term_data in terms:\n",
    "                term = term_data['term']\n",
    "                variations = term_data.get('variations', [])\n",
    "                print(f\"  - {term} ({', '.join(variations) if variations else 'no variations'})\")\nelse:\n",
    "    print(\"Technical terms file not found. Please run the setup first.\")"
   ]
  },

({'cell_type': 'code',
  'execution_count': None,
  'metadata': {},
  'outputs': [],
  'source': ['# Load and display technical terms\n',
   'terms_file = project_root / "data" / "tech_terms.json"\n',
   'if terms_file.exists():\n',
   "    with open(terms_file, 'r') as f:\n",
   '        tech_terms = json.load(f)\n',
   '    \n',
   '    print("Technical Terms by Category:")\n',
   "    for category, terms in tech_terms['technical_terms'].items():\n",
   '        if isinstance(terms, list):\n',
   '            print(f"\\n{category.upper()}:")\n',
   '            for term_data in terms:\n',
   "                term = term_data['term']\n",
   "                variations = term_data.get('variations', [])\n",
   '                print(f"  - {term} ({\', \'.join(variations) if variations else \'no variations\'})")\nelse:\n',
   '    print("Technical terms file not found. Please run the setup first.")']},)

{
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Generate Training Data\n",
    "\n",
    "Generate synthetic audio data using Text-to-Speech for training the model."
   ]
  },

In [21]:
{
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "from data.audio_generator import TechTermAudioGenerator\n",
    "\n",
    "# Configure audio generation\n",
    "audio_config = {\n",
    "    'voices': ['en-US-AriaNeural', 'en-US-JennyNeural'],  # Reduced for demo\n",
    "    'samples_per_term': 5,  # Reduced for demo\n",
    "    'sample_rate': 16000,\n",
    "    'apply_noise': True,\n",
    "    'speed_perturbation': True,\n",
    "    'templates': [\n",
    "        \"I'm using {term} for this project\",\n",
    "        \"Can you help me with {term}?\",\n",
    "        \"The {term} documentation is great\",\n",
    "        \"Let's configure {term} properly\"\n",
    "    ]\n",
    "}\n",
    "\n",
    "print(\"Audio generation configuration:\")\n",
    "for key, value in audio_config.items():\n",
    "    print(f\"  {key}: {value}\")"
   ]
 }

{'cell_type': 'code',
 'execution_count': None,
 'metadata': {},
 'outputs': [],
 'source': ['from data.audio_generator import TechTermAudioGenerator\n',
  '\n',
  '# Configure audio generation\n',
  'audio_config = {\n',
  "    'voices': ['en-US-AriaNeural', 'en-US-JennyNeural'],  # Reduced for demo\n",
  "    'samples_per_term': 5,  # Reduced for demo\n",
  "    'sample_rate': 16000,\n",
  "    'apply_noise': True,\n",
  "    'speed_perturbation': True,\n",
  "    'templates': [\n",
  '        "I\'m using {term} for this project",\n',
  '        "Can you help me with {term}?",\n',
  '        "The {term} documentation is great",\n',
  '        "Let\'s configure {term} properly"\n',
  '    ]\n',
  '}\n',
  '\n',
  'print("Audio generation configuration:")\n',
  'for key, value in audio_config.items():\n',
  '    print(f"  {key}: {value}")']}

In [22]:
{
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generate training data (this will take a few minutes)\n",
    "async def generate_demo_data():\n",
    "    generator = TechTermAudioGenerator(audio_config)\n",
    "    generator.load_technical_terms(str(terms_file))\n",
    "    \n",
    "    print(\"Generating training data...\")\n",
    "    dataset = await generator.generate_audio_samples(str(project_root / \"data\"), train_split=0.8)\n",
    "    generator.save_dataset_metadata(dataset, str(project_root / \"data\"))\n",
    "    \n",
    "    print(f\"Generated {len(dataset['train'])} training samples\")\n",
    "    print(f\"Generated {len(dataset['validation'])} validation samples\")\n",
    "    return dataset\n",
    "\n",
    "# Run if data doesn't exist\n",
    "train_file = project_root / \"data\" / \"train_transcripts.json\"\n",
    "if not train_file.exists():\n",
    "    dataset = await generate_demo_data()\n",
    "else:\n",
    "    print(\"Training data already exists. Loading...\")\n",
    "    with open(train_file, 'r') as f:\n",
    "        train_data = json.load(f)\n",
    "    with open(project_root / \"data\" / \"val_transcripts.json\", 'r') as f:\n",
    "        val_data = json.load(f)\n",
    "    dataset = {'train': train_data, 'validation': val_data}\n",
    "    print(f\"Loaded {len(dataset['train'])} training samples\")\n",
    "    print(f\"Loaded {len(dataset['validation'])} validation samples\")"
   ]
  },

({'cell_type': 'code',
  'execution_count': None,
  'metadata': {},
  'outputs': [],
  'source': ['# Generate training data (this will take a few minutes)\n',
   'async def generate_demo_data():\n',
   '    generator = TechTermAudioGenerator(audio_config)\n',
   '    generator.load_technical_terms(str(terms_file))\n',
   '    \n',
   '    print("Generating training data...")\n',
   '    dataset = await generator.generate_audio_samples(str(project_root / "data"), train_split=0.8)\n',
   '    generator.save_dataset_metadata(dataset, str(project_root / "data"))\n',
   '    \n',
   '    print(f"Generated {len(dataset[\'train\'])} training samples")\n',
   '    print(f"Generated {len(dataset[\'validation\'])} validation samples")\n',
   '    return dataset\n',
   '\n',
   "# Run if data doesn't exist\n",
   'train_file = project_root / "data" / "train_transcripts.json"\n',
   'if not train_file.exists():\n',
   '    dataset = await generate_demo_data()\n',
   'else:\n',
   '    print("Train

{
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Examine Generated Data\n",
    "\n",
    "Let's look at some examples of the generated training data."
   ]
  },

In [23]:
{
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import random\n",
    "\n",
    "# Display sample training data\n",
    "sample_data = random.sample(dataset['train'], min(10, len(dataset['train'])))\n",
    "df = pd.DataFrame(sample_data)\n",
    "\n",
    "print(\"Sample Training Data:\")\n",
    "for i, row in df.iterrows():\n",
    "    print(f\"\\n{i+1}. Term: {row['term']}\")\n",
    "    print(f\"   Text: {row['transcription']}\")\n",
    "    print(f\"   Voice: {row['voice']}\")\n",
    "    print(f\"   Audio: {Path(row['audio_path']).name}\")"
   ]
  },

({'cell_type': 'code',
  'execution_count': None,
  'metadata': {},
  'outputs': [],
  'source': ['import pandas as pd\n',
   'import random\n',
   '\n',
   '# Display sample training data\n',
   "sample_data = random.sample(dataset['train'], min(10, len(dataset['train'])))\n",
   'df = pd.DataFrame(sample_data)\n',
   '\n',
   'print("Sample Training Data:")\n',
   'for i, row in df.iterrows():\n',
   '    print(f"\\n{i+1}. Term: {row[\'term\']}")\n',
   '    print(f"   Text: {row[\'transcription\']}")\n',
   '    print(f"   Voice: {row[\'voice\']}")\n',
   '    print(f"   Audio: {Path(row[\'audio_path\']).name}")']},)

In [24]:
{
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Analyze data distribution\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "# Count samples per term\n",
    "train_df = pd.DataFrame(dataset['train'])\n",
    "term_counts = train_df['term'].value_counts()\n",
    "\n",
    "plt.figure(figsize=(12, 6))\n",
    "plt.subplot(1, 2, 1)\n",
    "term_counts.plot(kind='bar')\n",
    "plt.title('Samples per Technical Term')\n",
    "plt.xlabel('Terms')\n",
    "plt.ylabel('Number of Samples')\n",
    "plt.xticks(rotation=45)\n",
    "\n",
    "# Count samples per voice\n",
    "plt.subplot(1, 2, 2)\n",
    "voice_counts = train_df['voice'].value_counts()\n",
    "voice_counts.plot(kind='pie', autopct='%1.1f%%')\n",
    "plt.title('Distribution by Voice')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "print(f\"\\nDataset Statistics:\")\n",
    "print(f\"Total training samples: {len(dataset['train'])}\")\n",
    "print(f\"Total validation samples: {len(dataset['validation'])}\")\n",
    "print(f\"Unique terms: {len(term_counts)}\")\n",
    "print(f\"Voices used: {len(voice_counts)}\")"
   ]
  },

({'cell_type': 'code',
  'execution_count': None,
  'metadata': {},
  'outputs': [],
  'source': ['# Analyze data distribution\n',
   'import matplotlib.pyplot as plt\n',
   'import seaborn as sns\n',
   '\n',
   '# Count samples per term\n',
   "train_df = pd.DataFrame(dataset['train'])\n",
   "term_counts = train_df['term'].value_counts()\n",
   '\n',
   'plt.figure(figsize=(12, 6))\n',
   'plt.subplot(1, 2, 1)\n',
   "term_counts.plot(kind='bar')\n",
   "plt.title('Samples per Technical Term')\n",
   "plt.xlabel('Terms')\n",
   "plt.ylabel('Number of Samples')\n",
   'plt.xticks(rotation=45)\n',
   '\n',
   '# Count samples per voice\n',
   'plt.subplot(1, 2, 2)\n',
   "voice_counts = train_df['voice'].value_counts()\n",
   "voice_counts.plot(kind='pie', autopct='%1.1f%%')\n",
   "plt.title('Distribution by Voice')\n",
   '\n',
   'plt.tight_layout()\n',
   'plt.show()\n',
   '\n',
   'print(f"\\nDataset Statistics:")\n',
   'print(f"Total training samples: {len(dataset[\'train\'])}

{
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Initialize and Train Model\n",
    "\n",
    "Now let's set up the Whisper model for fine-tuning with LoRA."
   ]
  },

In [25]:
{
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "from src.model.whisper_trainer import WhisperTechTrainer\n",
    "\n",
    "# Adjust config for demo (smaller values for faster training)\n",
    "demo_config = config.copy()\n",
    "demo_config['training'].update({\n",
    "    'num_epochs': 3,  # Reduced for demo\n",
    "    'batch_size': 2,  # Reduced for demo\n",
    "    'save_steps': 50,\n",
    "    'eval_steps': 50,\n",
    "    'logging_steps': 25,\n",
    "    'output_dir': str(project_root / \"models\" / \"whisper-demo-finetuned\")\n",
    "})\n",
    "\n",
    "print(\"Demo training configuration:\")\n",
    "for key, value in demo_config['training'].items():\n",
    "    print(f\"  {key}: {value}\")"
   ]
  },

({'cell_type': 'code',
  'execution_count': None,
  'metadata': {},
  'outputs': [],
  'source': ['from src.model.whisper_trainer import WhisperTechTrainer\n',
   '\n',
   '# Adjust config for demo (smaller values for faster training)\n',
   'demo_config = config.copy()\n',
   "demo_config['training'].update({\n",
   "    'num_epochs': 3,  # Reduced for demo\n",
   "    'batch_size': 2,  # Reduced for demo\n",
   "    'save_steps': 50,\n",
   "    'eval_steps': 50,\n",
   "    'logging_steps': 25,\n",
   '    \'output_dir\': str(project_root / "models" / "whisper-demo-finetuned")\n',
   '})\n',
   '\n',
   'print("Demo training configuration:")\n',
   "for key, value in demo_config['training'].items():\n",
   '    print(f"  {key}: {value}")']},)

In [26]:
{
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize trainer\n",
    "print(\"Initializing Whisper trainer...\")\n",
    "trainer = WhisperTechTrainer(demo_config)\n",
    "trainer.setup_model()\n",
    "print(\"✓ Model initialized with LoRA configuration\")"
   ]
  },

({'cell_type': 'code',
  'execution_count': None,
  'metadata': {},
  'outputs': [],
  'source': ['# Initialize trainer\n',
   'print("Initializing Whisper trainer...")\n',
   'trainer = WhisperTechTrainer(demo_config)\n',
   'trainer.setup_model()\n',
   'print("✓ Model initialized with LoRA configuration")']},)

In [27]:
{
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prepare datasets\n",
    "print(\"Preparing datasets...\")\n",
    "train_dataset = trainer.load_dataset(\n",
    "    str(project_root / \"data\" / \"train_audio\"),\n",
    "    str(project_root / \"data\" / \"train_transcripts.json\")\n",
    ")\n",
    "train_dataset = trainer.prepare_dataset(train_dataset)\n",
    "\n",
    "val_dataset = trainer.load_dataset(\n",
    "    str(project_root / \"data\" / \"val_audio\"),\n",
    "    str(project_root / \"data\" / \"val_transcripts.json\")\n",
    ")\n",
    "val_dataset = trainer.prepare_dataset(val_dataset)\n",
    "\n",
    "print(f\"✓ Training dataset: {len(train_dataset)} samples\")\n",
    "print(f\"✓ Validation dataset: {len(val_dataset)} samples\")"
   ]
  },

({'cell_type': 'code',
  'execution_count': None,
  'metadata': {},
  'outputs': [],
  'source': ['# Prepare datasets\n',
   'print("Preparing datasets...")\n',
   'train_dataset = trainer.load_dataset(\n',
   '    str(project_root / "data" / "train_audio"),\n',
   '    str(project_root / "data" / "train_transcripts.json")\n',
   ')\n',
   'train_dataset = trainer.prepare_dataset(train_dataset)\n',
   '\n',
   'val_dataset = trainer.load_dataset(\n',
   '    str(project_root / "data" / "val_audio"),\n',
   '    str(project_root / "data" / "val_transcripts.json")\n',
   ')\n',
   'val_dataset = trainer.prepare_dataset(val_dataset)\n',
   '\n',
   'print(f"✓ Training dataset: {len(train_dataset)} samples")\n',
   'print(f"✓ Validation dataset: {len(val_dataset)} samples")']},)

In [28]:
{
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Start training (this will take some time)\n",
    "print(\"Starting training...\")\n",
    "print(\"Note: This may take 15-30 minutes depending on your hardware\")\n",
    "\n",
    "try:\n",
    "    trainer.train(train_dataset, val_dataset)\n",
    "    print(\"\\n✓ Training completed successfully!\")\nexcept Exception as e:\n",
    "    print(f\"Training failed: {str(e)}\")\n",
    "    print(\"This might be due to insufficient GPU memory or other hardware limitations.\")"
   ]
  },

({'cell_type': 'code',
  'execution_count': None,
  'metadata': {},
  'outputs': [],
  'source': ['# Start training (this will take some time)\n',
   'print("Starting training...")\n',
   'print("Note: This may take 15-30 minutes depending on your hardware")\n',
   '\n',
   'try:\n',
   '    trainer.train(train_dataset, val_dataset)\n',
   '    print("\\n✓ Training completed successfully!")\nexcept Exception as e:\n',
   '    print(f"Training failed: {str(e)}")\n',
   '    print("This might be due to insufficient GPU memory or other hardware limitations.")']},)

{
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Evaluate the Model\n",
    "\n",
    "Test the fine-tuned model and compare it with the original Whisper model."
   ]
  },

In [29]:
{
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Test inference on a sample\n",
    "if val_dataset and len(val_dataset) > 0:\n",
    "    # Get a random validation sample\n",
    "    import random\n",
    "    sample_idx = random.randint(0, len(val_dataset) - 1)\n",
    "    sample = val_dataset[sample_idx]\n",
    "    \n",
    "    # Get the audio path from the original validation data\n",
    "    val_data = dataset['validation']\n",
    "    audio_path = val_data[sample_idx]['audio_path']\n",
    "    reference_text = val_data[sample_idx]['transcription']\n",
    "    \n",
    "    print(f\"Testing on sample: {Path(audio_path).name}\")\n",
    "    print(f\"Reference text: {reference_text}\")\n",
    "    \n",
    "    # Transcribe with fine-tuned model\n",
    "    prediction = trainer.inference(audio_path)\n",
    "    print(f\"Fine-tuned prediction: {prediction}\")\n",
    "    \n",
    "    # Calculate simple accuracy\n",
    "    import jiwer\n",
    "    wer = jiwer.wer([reference_text], [prediction])\n",
    "    print(f\"Word Error Rate: {wer:.2%}\")\nelse:\n",
    "    print(\"No validation data available for testing\")"
   ]
  },

({'cell_type': 'code',
  'execution_count': None,
  'metadata': {},
  'outputs': [],
  'source': ['# Test inference on a sample\n',
   'if val_dataset and len(val_dataset) > 0:\n',
   '    # Get a random validation sample\n',
   '    import random\n',
   '    sample_idx = random.randint(0, len(val_dataset) - 1)\n',
   '    sample = val_dataset[sample_idx]\n',
   '    \n',
   '    # Get the audio path from the original validation data\n',
   "    val_data = dataset['validation']\n",
   "    audio_path = val_data[sample_idx]['audio_path']\n",
   "    reference_text = val_data[sample_idx]['transcription']\n",
   '    \n',
   '    print(f"Testing on sample: {Path(audio_path).name}")\n',
   '    print(f"Reference text: {reference_text}")\n',
   '    \n',
   '    # Transcribe with fine-tuned model\n',
   '    prediction = trainer.inference(audio_path)\n',
   '    print(f"Fine-tuned prediction: {prediction}")\n',
   '    \n',
   '    # Calculate simple accuracy\n',
   '    import jiwer\n',
  

{
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Test on Custom Audio (Optional)\n",
    "\n",
    "If you have your own audio files, you can test them here."
   ]
  },

In [30]:
{
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Test on custom audio file\n",
    "# Replace with your own audio file path\n",
    "custom_audio_path = \"/path/to/your/audio/file.wav\"\n",
    "\n",
    "if Path(custom_audio_path).exists():\n",
    "    print(f\"Testing on custom audio: {Path(custom_audio_path).name}\")\n",
    "    \n",
    "    # Transcribe\n",
    "    result = trainer.inference(custom_audio_path)\n",
    "    print(f\"Transcription: {result}\")\n",
    "    \n",
    "    # Check for technical terms\n",
    "    found_terms = []\n",
    "    result_lower = result.lower()\n",
    "    \n",
    "    for category in tech_terms['technical_terms'].values():\n",
    "        if isinstance(category, list):\n",
    "            for term_data in category:\n",
    "                term = term_data['term'].lower()\n",
    "                variations = [v.lower() for v in term_data.get('variations', [])]\n",
    "                \n",
    "                if term in result_lower or any(var in result_lower for var in variations):\n",
    "                    found_terms.append(term_data['term'])\n",
    "    \n",
    "    if found_terms:\n",
    "        print(f\"Technical terms detected: {', '.join(found_terms)}\")\n",
    "    else:\n",
    "        print(\"No technical terms detected in transcription\")\nelse:\n",
    "    print(\"Custom audio file not found. Skipping custom test.\")\n",
    "    print(\"To test your own audio, replace 'custom_audio_path' with your file path.\")"
   ]
  },

({'cell_type': 'code',
  'execution_count': None,
  'metadata': {},
  'outputs': [],
  'source': ['# Test on custom audio file\n',
   '# Replace with your own audio file path\n',
   'custom_audio_path = "/path/to/your/audio/file.wav"\n',
   '\n',
   'if Path(custom_audio_path).exists():\n',
   '    print(f"Testing on custom audio: {Path(custom_audio_path).name}")\n',
   '    \n',
   '    # Transcribe\n',
   '    result = trainer.inference(custom_audio_path)\n',
   '    print(f"Transcription: {result}")\n',
   '    \n',
   '    # Check for technical terms\n',
   '    found_terms = []\n',
   '    result_lower = result.lower()\n',
   '    \n',
   "    for category in tech_terms['technical_terms'].values():\n",
   '        if isinstance(category, list):\n',
   '            for term_data in category:\n',
   "                term = term_data['term'].lower()\n",
   "                variations = [v.lower() for v in term_data.get('variations', [])]\n",
   '                \n',
   '             

{
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Model Analysis\n",
    "\n",
    "Analyze the trained model's performance and characteristics."
   ]
  },

In [31]:
{
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Model information\n",
    "print(\"Model Information:\")\n",
    "print(f\"Base model: {demo_config['model']['name']}\")\n",
    "print(f\"Language: {demo_config['model']['language']}\")\n",
    "print(f\"Task: {demo_config['model']['task']}\")\n",
    "print(f\"Training epochs: {demo_config['training']['num_epochs']}\")\n",
    "print(f\"LoRA enabled: {demo_config['training']['use_lora']}\")\n",
    "\n",
    "if demo_config['training']['use_lora']:\n",
    "    print(f\"LoRA rank: {demo_config['training']['lora_rank']}\")\n",
    "    print(f\"LoRA alpha: {demo_config['training']['lora_alpha']}\")\n",
    "\n",
    "# Model size information\n",
    "model_path = Path(demo_config['training']['output_dir'])\n",
    "if model_path.exists():\n",
    "    model_size = sum(f.stat().st_size for f in model_path.rglob('*') if f.is_file())\n",
    "    print(f\"\\nModel directory size: {model_size / (1024**2):.1f} MB\")\n",
    "    print(f\"Model saved to: {model_path}\")\nelse:\n",
    "    print(\"\\nModel directory not found - training may not have completed successfully\")"
   ]
  },

({'cell_type': 'code',
  'execution_count': None,
  'metadata': {},
  'outputs': [],
  'source': ['# Model information\n',
   'print("Model Information:")\n',
   'print(f"Base model: {demo_config[\'model\'][\'name\']}")\n',
   'print(f"Language: {demo_config[\'model\'][\'language\']}")\n',
   'print(f"Task: {demo_config[\'model\'][\'task\']}")\n',
   'print(f"Training epochs: {demo_config[\'training\'][\'num_epochs\']}")\n',
   'print(f"LoRA enabled: {demo_config[\'training\'][\'use_lora\']}")\n',
   '\n',
   "if demo_config['training']['use_lora']:\n",
   '    print(f"LoRA rank: {demo_config[\'training\'][\'lora_rank\']}")\n',
   '    print(f"LoRA alpha: {demo_config[\'training\'][\'lora_alpha\']}")\n',
   '\n',
   '# Model size information\n',
   "model_path = Path(demo_config['training']['output_dir'])\n",
   'if model_path.exists():\n',
   "    model_size = sum(f.stat().st_size for f in model_path.rglob('*') if f.is_file())\n",
   '    print(f"\\nModel directory size: {model_size /

{
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. Next Steps\n",
    "\n",
    "This demo showed you how to:\n",
    "1. Generate training data with technical terms\n",
    "2. Fine-tune Whisper using LoRA\n",
    "3. Test the fine-tuned model\n",
    "\n",
    "For production use:\n",
    "- Increase `samples_per_term` and `num_epochs` for better results\n",
    "- Add more diverse sentence templates\n",
    "- Include real recorded audio in your training data\n",
    "- Run full evaluation with the evaluation script\n",
    "- Compare with the original model using the comparison tools"
   ]
  },

In [32]:
{
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Demo completed! 🎉\")\n",
    "print(\"\\nTo run full training and evaluation:\")\n",
    "print(\"1. python scripts/generate_training_data.py\")\n",
    "print(\"2. python scripts/train_model.py --model_size large-v3 --epochs 10\")\n",
    "print(\"3. python scripts/evaluate_model.py --compare_original\")\n",
    "print(\"4. python scripts/inference.py --audio_path your_audio.wav\")"
   ]
  }

{'cell_type': 'code',
 'execution_count': None,
 'metadata': {},
 'outputs': [],
 'source': ['print("Demo completed! 🎉")\n',
  'print("\\nTo run full training and evaluation:")\n',
  'print("1. python scripts/generate_training_data.py")\n',
  'print("2. python scripts/train_model.py --model_size large-v3 --epochs 10")\n',
  'print("3. python scripts/evaluate_model.py --compare_original")\n',
  'print("4. python scripts/inference.py --audio_path your_audio.wav")']}

In [33]:
{
  "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }

{'metadata': {'kernelspec': {'display_name': 'Python 3',
   'language': 'python',
   'name': 'python3'},
  'language_info': {'codemirror_mode': {'name': 'ipython', 'version': 3},
   'file_extension': '.py',
   'mimetype': 'text/x-python',
   'name': 'python',
   'nbconvert_exporter': 'python',
   'pygments_lexer': 'ipython3',
   'version': '3.8.0'}},
 'nbformat': 4,
 'nbformat_minor': 4}