In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

In [2]:
# Tokenizer and weights are read from the same local checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained("model/llama-3.2-Korean-Bllossom-3B")

# torch_dtype="auto" / device_map="auto" leave dtype and device placement
# to transformers.
model = AutoModelForCausalLM.from_pretrained(
    "model/llama-3.2-Korean-Bllossom-3B",
    device_map="auto",
    torch_dtype="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

End-to-end LLM

## System Prompt

In [None]:
class Prompting:
    """Score meeting utterances for topic coherence and repetition with a chat LLM.

    An instance loads a causal LM and its tokenizer from ``model/<model_>``,
    builds a system prompt around the meeting topic, and keeps a running,
    newline-separated history of every utterance passed to :meth:`eval` so
    that later utterances can be compared against earlier ones.
    """

    def __init__(self, topic, model_ = "llama-3.2-Korean-Bllossom-3B"):
        """Load the checkpoint and build the system prompt.

        Args:
            topic: Meeting topic; interpolated into the system prompt.
            model_: Directory name under ``model/`` that holds the checkpoint.
                Used for both the weights and the tokenizer.
        """
        # Load weights and tokenizer from the same directory so a non-default
        # `model_` cannot pair one checkpoint's tokenizer with another's weights.
        model_path = f"model/{model_}"

        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype="auto",
            device_map="auto"
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.topic = topic
        self.system = f'''

You are an evaluation model for meeting transcripts. Please follow these guidelines for each utterance in the conversation:

1. Topic Coherence (0.0–10.0):
   - The topic of this meeting is {self.topic}.
   - For each utterance, evaluate how closely it aligns to the meeting topic.
   - Provide a numerical score between 0.0 and 10.0, where 0.0 indicates no relevance and 10.0 indicates strong, direct relevance.
   - Alongside the score, explain briefly why you assigned that score by korean.

2. Repetition Avoidance (0.0–10.0):
   - Compare each utterance with mentioned ideas, suggestions, or statements in previously utterances.
   - If the content is largely duplicative, assign a lower score. If it adds new perspectives or content, assign a higher score.
   - Provide a numerical score between 0.0 and 10.0, along with a short rationale explaining the basis for your judgment by korean.

3. Additional Considerations:
   - Base your evaluations primarily on these two criteria, maintaining consistency.
   - After evaluating all utterances, compile an overall summary or feedback addressing which parts of the conversation showed strong topic coherence and introduced fresh insights, and which parts repeated prior points.

Your output for each utterance should follow only this format:

- Topic Coherence: <Score> (Rationale: <Reasoning>)
- Repetition Avoidance: <Score> (Rationale: <Reasoning>)
'''
        # Newline-separated history of utterances already evaluated.
        self.utterances = ""

    def eval(self, text):
        """Evaluate one utterance and append it to the running history.

        Args:
            text: The utterance to score.

        Returns:
            The decoded model reply: only the newly generated tokens, with
            special tokens stripped.
        """
        # Earlier utterances travel inside the user turn. A custom role such
        # as "previous utterances" is not one of the standard chat roles.
        if self.utterances:
            user_content = (
                f"Previous utterances:\n{self.utterances}\n"
                f"Current utterance:\n{text}"
            )
        else:
            user_content = text

        chat = [
            {"role": "system", "content": self.system},
            {"role": "user", "content": user_content},
        ]

        prompt = self.tokenizer.apply_chat_template(
            chat, tokenize=False, add_generation_prompt=True
        )

        # The rendered chat prompt already carries the special tokens it
        # needs; adding them again here would duplicate them (e.g. BOS).
        encoded = self.tokenizer(
            prompt,
            return_tensors="pt",
            add_special_tokens=False
        )

        # Stop on either end-of-text or end-of-turn.
        terminators = [
            self.tokenizer.convert_tokens_to_ids("<|end_of_text|>"),
            self.tokenizer.convert_tokens_to_ids("<|eot_id|>")
        ]

        # Drop token_type_ids and move tensors to this instance's own model
        # device, not to whatever the notebook-level name `model` points at.
        device = self.model.device
        model_inputs = {
            key: value.to(device)
            for key, value in encoded.items()
            if key != "token_type_ids"
        }

        generated_ids = self.model.generate(
            **model_inputs,
            max_new_tokens=1024,
            eos_token_id=terminators,
            do_sample=True,
            temperature=0.6,
            top_p=0.9
        )

        # Keep only the continuation: slice off the echoed prompt tokens.
        prompt_length = model_inputs["input_ids"].shape[1]
        result = self.tokenizer.decode(
            generated_ids[0][prompt_length:], skip_special_tokens=True
        )

        self.utterances = self.utterances + text + "\n"

        return result

In [6]:
# Meeting topic ("Is modern South Korean society happy?") that each utterance
# is scored against.
topic = "현대 대한민국 사회는 행복한가?"
# NOTE(review): this rebinds `model`, which an earlier cell bound to the raw
# causal LM; from here on `model` is the Prompting evaluator.
model = Prompting(topic)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Prompting.eval() takes the utterance to score as a required argument;
# calling it with no arguments raises TypeError.
sample_utterance = "요즘 청년들은 취업과 주거 문제 때문에 행복을 느끼기 어렵다고 생각합니다."
print(model.eval(sample_utterance))

## 음성 데이터 -> Whisper -> txt 파일 -> LLM: "이 문장의 주제를 한 문장으로 알려줘" = Topic
## Topic, txt 파일 -> LLM: Evaluation = 평가 결과 (model의 prediction)
## txt 파일 -> Human feedback = 정답지

In [None]:
import torch
import librosa
import numpy as np
from transformers import WhisperProcessor, WhisperForConditionalGeneration


# Whisper checkpoint shared by the feature processor and the model.
model_name = "openai/whisper-small"
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)

# Put the model in evaluation mode.
model.eval()

# Decide whether an audio chunk is silent
def is_silent(audio_chunk, threshold=0.01):
    """
    Return True when the chunk's peak absolute amplitude is below `threshold`.

    Args:
        audio_chunk: Sequence or array of audio samples.
        threshold: Peak amplitude below which the chunk counts as silent
            (default 0.01). The comparison is strict, so a peak equal to
            the threshold is not silent.

    Returns:
        bool: True if the chunk is silent. An empty chunk has no samples
        and is reported as silent instead of raising from `np.max`.
    """
    samples = np.asarray(audio_chunk)
    if samples.size == 0:
        return True
    return bool(np.max(np.abs(samples)) < threshold)

# Chunked speech recognition (Korean only)
def fast_transcription(audio_path, chunk_size=7.0, sampling_rate=16000):
    """
    Transcribe an audio file in fixed-length chunks and print each result.

    The audio is loaded with librosa at `sampling_rate` and split into
    consecutive chunks of `chunk_size` seconds; the last chunk may be
    shorter. Chunks that `is_silent` reports as silent are skipped. Every
    other chunk is decoded by the notebook-level Whisper `model` and
    `processor` with Korean transcription forced, and one line per chunk
    is printed as "[start - end]: text".

    Args:
        audio_path: Path of the audio file to load.
        chunk_size: Chunk length in seconds (default 7.0).
        sampling_rate: Sampling rate passed to librosa and to the processor.

    Raises:
        ValueError: If `chunk_size * sampling_rate` is less than one sample.
    """
    # Number of samples per chunk; must be positive to step through the audio.
    chunk_length = int(chunk_size * sampling_rate)
    if chunk_length <= 0:
        raise ValueError(
            f"chunk_size * sampling_rate must be at least one sample, got {chunk_length}"
        )

    # Load the audio file.
    audio, _ = librosa.load(audio_path, sr=sampling_rate)
    total_samples = len(audio)

    # The forced-Korean decoder prompt is the same for every chunk, so build it once.
    forced_decoder_ids = processor.get_decoder_prompt_ids(language="ko", task="transcribe")

    for start in range(0, total_samples, chunk_length):
        # The final chunk may be shorter than chunk_length.
        end = min(start + chunk_length, total_samples)
        audio_chunk = audio[start:end]

        # Skip silent chunks.
        if is_silent(audio_chunk):
            continue

        # Prepare the model input features.
        input_features = processor(
            audio_chunk, sampling_rate=sampling_rate, return_tensors="pt"
        ).input_features

        # Inference only; no gradients needed.
        with torch.no_grad():
            predicted_ids = model.generate(
                input_features,
                forced_decoder_ids=forced_decoder_ids
            )

        # Decode the predicted token ids to text.
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

        # Report the chunk's time span and its transcription.
        print(f"[{start / sampling_rate:.2f}s - {end / sampling_rate:.2f}s]: {transcription}")

# Run the transcription
audio_path = "data1.wav"  # path to the audio file
fast_transcription(audio_path=audio_path, chunk_size=4.0, sampling_rate=16000)
