In [31]:
!pip install -q --upgrade bitsandbytes accelerate

In [32]:
# imports

import os

import requests
import torch
from dotenv import load_dotenv
from huggingface_hub import login
from IPython.display import Markdown, display, update_display
from openai import OpenAI
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextStreamer,
    pipeline,
)

In [33]:
# Constants
# Load variables from a local .env file (if present) so the os.getenv calls
# in this notebook can see them.
load_dotenv()

# Model used to write the minutes. The 3B checkpoint is kept as a lighter alternative.
# LLAMA = "meta-llama/Llama-3.2-3B-Instruct"
LLAMA = "meta-llama/Llama-3.1-70B-Instruct"

# Audio file to transcribe. Set AUDIO_FILENAME in the environment (or .env) to
# point at a different recording; otherwise the original local path is used.
audio_filename = os.getenv(
    "AUDIO_FILENAME",
    r"C:\Users\ukorm\Documents\AI Large Language Models Projects\myllm_engineering\denver_extract.mp3",
)

In [34]:
# Authenticate against the Hugging Face Hub with the token from the environment.
hf_token = os.environ.get('HF_TOKEN')
login(token=hf_token, add_to_git_credential=True)

# Binary read handle on the audio file; it is passed to the OpenAI
# transcription call later in the notebook.
audio_file = open(audio_filename, mode="rb")


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [35]:
# Using Open Source for Transcription - Hugging Face Pipelines

# Run Whisper on the GPU when one is available. Half precision is only used
# there: on CPU, float16 is poorly supported and very slow, so fall back to
# float32 instead.
asr_device = "cuda" if torch.cuda.is_available() else "cpu"
asr_dtype = torch.float16 if asr_device == "cuda" else torch.float32

pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-medium.en",
    dtype=asr_dtype,
    device=asr_device,
    return_timestamps=True
)

# The pipeline accepts a path to the audio file; the full transcript is
# returned under the "text" key.
result = pipe(audio_filename)
transcription = result["text"]
print(transcription)

Device set to use cpu
Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.


KeyboardInterrupt: 

In [None]:
# Keep the Whisper pipeline result under its own name so it can be shown
# next to the OpenAI transcript later on.
open_source_transcription = transcription

In [None]:
# Using OpenAI

AUDIO_MODEL = "gpt-4o-mini-transcribe"

openai_api_key = os.getenv('OPENAI_API_KEY')
openai = OpenAI(api_key=openai_api_key)

# Open a fresh handle for the upload: a handle that has already been read
# sits at end-of-file, so re-running this cell would otherwise send no audio.
# The `with` block also guarantees the file is closed afterwards.
with open(audio_filename, "rb") as audio_stream:
    openai_transcription = openai.audio.transcriptions.create(
        model=AUDIO_MODEL,
        file=audio_stream,
        response_format="text",
    )

# Print the OpenAI result (not the open-source `transcription` variable).
print(openai_transcription)

In [None]:
# Render the two transcripts one after the other, separated by blank lines.
open_source_md = Markdown(open_source_transcription)
openai_md = Markdown(openai_transcription)

display(open_source_md)
print("\n\n")
display(openai_md)

In [None]:
# Analyzing the transcript and generating the report

# System prompt: fixes the assistant's role and the expected output format.
system_message = """
You produce minutes of meetings from transcripts, with summary, key discussion points,
takeaways and action items with owners, in markdown format without code blocks.
"""

# User prompt: the request itself, with the transcript interpolated at the end.
user_prompt = f"""
Below is an extract transcript of a Denver council meeting.
Please write minutes in markdown without code blocks, including:
- a summary with attendees, location and date
- discussion points
- takeaways
- action items with owners

Transcription:
{transcription}
"""

# Chat-format conversation handed to the tokenizer's chat template.
messages = [
    dict(role="system", content=system_message),
    dict(role="user", content=user_prompt),
]


In [None]:
# Quantization settings passed to the model loader: 4-bit weights using the
# "nf4" quant type with double quantization, and bfloat16 as the compute dtype.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
# Load the tokenizer and the quantized model, then stream the generated minutes.
tokenizer = AutoTokenizer.from_pretrained(LLAMA)
tokenizer.pad_token = tokenizer.eos_token

# Load the model before building the inputs so they can be moved to whatever
# device device_map="auto" selected, instead of a hard-coded "cuda".
model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map="auto", quantization_config=quant_config)

# add_generation_prompt=True appends the assistant header so the model writes
# a reply rather than continuing the user turn. return_dict=True also yields
# the attention mask, which generate() needs because pad and eos tokens are
# the same here.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

# skip_prompt=True keeps the (long) transcript prompt out of the streamed output.
streamer = TextStreamer(tokenizer, skip_prompt=True)
outputs = model.generate(**inputs, max_new_tokens=2000, streamer=streamer)

In [None]:
# generate() returns the prompt tokens followed by the new tokens. Decode only
# the newly generated part and drop special tokens, so `response` contains just
# the minutes rather than the system prompt and the whole transcript.
# `inputs` may be a plain tensor of ids or a dict-like encoding with "input_ids".
prompt_ids = inputs if isinstance(inputs, torch.Tensor) else inputs["input_ids"]
response = tokenizer.decode(outputs[0][prompt_ids.shape[-1]:], skip_special_tokens=True)

In [None]:
# Render the decoded model output as Markdown in the cell output.
display(Markdown(response))