In [None]:
import os
import pandas as pd
import whisper
import requests

# Directory that is scanned for audio files to transcribe.
AUDIO_FOLDER = "Public_Airwaves"  
# Path of the CSV file the collected results are written to.
OUTPUT_CSV = "transcriptions.csv"

# Load the Whisper speech-to-text model once at module level so every
# transcription call reuses the same instance. The "small" checkpoint is
# selected here; "base" is presumably a lighter/faster alternative
# (NOTE(review): confirm against the Whisper model list).
model = whisper.load_model("small")

def transcribe_audio(file_path):
    """Run the shared Whisper model on one audio file and return its text.

    Args:
        file_path: Location of the audio file handed to ``model.transcribe``.

    Returns:
        The value stored under the ``"text"`` key of the transcription result.
    """
    return model.transcribe(file_path)["text"]

def fetch_external_text(timeout=10):
    """Fetch external news headlines as additional text data.

    The call is best-effort: a network failure, a non-200 response or a
    malformed payload yields an empty list instead of raising, so a news
    outage cannot abort the surrounding transcription run.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        Up to five non-empty headline strings, in the order the API returned
        them, or an empty list when nothing usable could be fetched.
    """
    url = "https://newsapi.org/v2/top-headlines?country=us&apiKey=YOUR_NEWSAPI_KEY"
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            return []
        payload = response.json()
    except (requests.RequestException, ValueError):
        # Connection errors, timeouts and undecodable bodies are handled the
        # same way as a non-200 status: no headlines.
        return []

    if not isinstance(payload, dict):
        return []

    # "articles" may be absent or null; treat both as "no articles".
    articles = payload.get("articles") or []

    # Keep only real, non-empty string titles: a null title would otherwise
    # make a later " | ".join(...) over the result raise TypeError.
    titles = [
        article["title"]
        for article in articles
        if isinstance(article, dict)
        and isinstance(article.get("title"), str)
        and article["title"]
    ]
    return titles[:5]  # Return top 5 titles

# Collect the audio files to process. The extension check is case-insensitive
# so names like "CLIP.WAV" are picked up too, and the listing is sorted
# because os.listdir returns entries in arbitrary order (keeps the CSV row
# order reproducible between runs).
audio_files = sorted(
    f for f in os.listdir(AUDIO_FOLDER) if f.lower().endswith((".wav", ".mp3"))
)

# The headlines do not depend on the audio file, so fetch them once instead
# of issuing one identical HTTP request per file. Skip the request entirely
# when there is nothing to transcribe.
external_text = fetch_external_text() if audio_files else []
external_joined = " | ".join(external_text)

data = []
for file in audio_files:
    file_path = os.path.join(AUDIO_FOLDER, file)
    text = transcribe_audio(file_path)

    data.append({
        "File": file,
        "Transcription": text,
        "External_Text": external_joined,
    })

# Name the columns explicitly so the CSV still gets its header row when no
# audio files were found.
df = pd.DataFrame(data, columns=["File", "Transcription", "External_Text"])
df.to_csv(OUTPUT_CSV, index=False)

print(f"Transcriptions saved to {OUTPUT_CSV}")
