In [None]:
import os
import torch
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

from tqdm import tqdm
from transformers import set_seed
from itertools import permutations
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForCausalLM, set_seed

# Seed-fixing helper
def set_all_seeds(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy, PyTorch (CPU and CUDA) and Transformers,
    exports ``PYTHONHASHSEED``, and sets cuDNN to deterministic mode with
    benchmarking turned off.

    Args:
        seed: Integer seed applied to every library (default 42).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    set_seed(seed)  # Transformers' own seeding helper

# Fix all seeds before anything stochastic runs
set_all_seeds(42)

# Pick the compute device: CUDA when available, otherwise CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Matplotlib font settings for Korean labels; unicode_minus is disabled so the
# minus sign is drawn with the ASCII hyphen
plt.rcParams.update({
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,
})

In [None]:
# Load the test set and the sample submission template
test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/project/sentence/data/test.csv',
        '/content/drive/MyDrive/project/sentence/data/sample_submission.csv',
    )
)

In [None]:
# Load the saved models and tokenizers; each directory is read for both the
# tokenizer and the sequence-classification model
bert_output_dir = "/content/drive/MyDrive/project/sentence/bert-kor-base-pt"
roberta_output_dir = "/content/drive/MyDrive/project/sentence/roberta-large-pt"
electra_output_dir = "/content/drive/MyDrive/project/sentence/electra-kor-base-pt"

bert_tokenizer, bert_model = (
    AutoTokenizer.from_pretrained(bert_output_dir),
    AutoModelForSequenceClassification.from_pretrained(bert_output_dir),
)

roberta_tokenizer, roberta_model = (
    AutoTokenizer.from_pretrained(roberta_output_dir),
    AutoModelForSequenceClassification.from_pretrained(roberta_output_dir),
)

electra_tokenizer, electra_model = (
    AutoTokenizer.from_pretrained(electra_output_dir),
    AutoModelForSequenceClassification.from_pretrained(electra_output_dir),
)

In [None]:
# Move every classifier to the selected device and switch it to eval mode
for _classifier in (bert_model, roberta_model, electra_model):
    _classifier.to(device).eval()

# Dataset class (inference only)
class SentenceOrderDataset(Dataset):
    """Torch dataset that tokenizes (sentence A, sentence B) pairs on access.

    Args:
        pairs: Indexable collection of 2-item ``(sent_A, sent_B)`` entries.
        tokenizer: Object exposing ``encode_plus``; its result must provide
            ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        max_len: ``max_length`` passed to the tokenizer (with
            ``padding='max_length'`` and ``truncation=True``).
    """

    def __init__(self, pairs, tokenizer, max_len):
        self.pairs, self.tokenizer, self.max_len = pairs, tokenizer, max_len

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        """Encode the pair at ``idx`` and return its three flattened tensors."""
        first, second = self.pairs[idx]

        encoded = self.tokenizer.encode_plus(
            first,
            second,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # flatten() collapses each returned tensor to 1-D so the DataLoader's
        # default collation can stack items along a new batch axis.
        feature_keys = ('input_ids', 'attention_mask', 'token_type_ids')
        return {key: encoded[key].flatten() for key in feature_keys}

# Inference helper
def predict_probabilities(model, tokenizer, sentence_pairs, device, max_length=256, batch_size=64):
    """Return, for each sentence pair, the softmax probability of class index 1.

    Args:
        model: Callable taking ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` keyword tensors and returning an object with
            a ``.logits`` attribute.
        tokenizer: Tokenizer handed to ``SentenceOrderDataset``.
        sentence_pairs: Sequence of ``(sent_A, sent_B)`` pairs.
        device: Device the batch tensors are moved to before the forward pass.
        max_length: Tokenizer ``max_length`` for every pair.
        batch_size: Number of pairs per forward pass.

    Returns:
        NumPy array with one probability per input pair, in input order.
    """
    loader = DataLoader(
        SentenceOrderDataset(sentence_pairs, tokenizer, max_length),
        batch_size=batch_size,
        shuffle=False,  # keep outputs aligned with the input order
    )
    feature_names = ('input_ids', 'attention_mask', 'token_type_ids')

    collected = []
    with torch.no_grad():
        for batch in loader:
            model_inputs = {name: batch[name].to(device) for name in feature_names}
            logits = model(**model_inputs).logits
            class_probs = torch.softmax(logits, dim=-1)
            # Column 1 is kept as the score for the pair.
            collected.extend(class_probs[:, 1].cpu().numpy())

    return np.array(collected)

In [None]:
# Sentence-order prediction (ensemble, with probabilities)
def predict_sentence_order_ensemble_with_probs(sentences, models, tokenizers, device, weights=None):
    """Score every ordering of ``sentences`` with an ensemble and pick the best.

    For each model, every ordered sentence pair (A, B) is scored once with
    ``predict_probabilities``. A permutation's score for that model is the
    product of the scores of its adjacent pairs, and the ensemble score is
    the weighted average over models.

    Args:
        sentences: Sequence of sentences to order (four in this notebook).
        models: Pair classifiers accepted by ``predict_probabilities``.
        tokenizers: Tokenizers, aligned one-to-one with ``models``.
        device: Device forwarded to ``predict_probabilities``.
        weights: Optional per-model weights; equal weights when ``None``.

    Returns:
        Tuple ``(best_perm, best_prob, perm_probs, all_perms)``:
        ``best_perm`` is the highest-scoring permutation as a list of indices,
        ``best_prob`` its normalized score, ``perm_probs`` the normalized
        scores of all permutations, and ``all_perms`` the permutations in the
        same order as ``perm_probs``.
    """
    if weights is None:
        weights = [1.0 / len(models)] * len(models)

    num_sentences = len(sentences)
    all_perms = list(permutations(range(num_sentences)))

    # Every permutation is built from the same ordered pairs, so score each
    # pair once per model instead of once per permutation.
    ordered_pairs = list(permutations(range(num_sentences), 2))
    pair_texts = [(sentences[a], sentences[b]) for a, b in ordered_pairs]

    per_model_scores = []
    for model, tokenizer in zip(models, tokenizers):
        pair_probs = predict_probabilities(model, tokenizer, pair_texts, device)
        prob_of = dict(zip(ordered_pairs, pair_probs))
        per_model_scores.append([
            np.prod([prob_of[(perm[i], perm[i + 1])] for i in range(num_sentences - 1)])
            for perm in all_perms
        ])

    perm_scores = np.average(
        np.asarray(per_model_scores, dtype=float), axis=0, weights=weights
    )

    # Normalize into a distribution over permutations. If every score is zero
    # (or the sum is not finite), fall back to a uniform distribution so
    # callers never receive NaN confidences.
    total_score = perm_scores.sum()
    if np.isfinite(total_score) and total_score > 0:
        perm_probs = perm_scores / total_score
    else:
        perm_probs = np.full(len(all_perms), 1.0 / len(all_perms))

    best_perm_idx = int(np.argmax(perm_scores))
    best_perm = list(all_perms[best_perm_idx])
    best_prob = perm_probs[best_perm_idx]

    return best_perm, best_prob, perm_probs, all_perms

# Predict the test set (with probabilities)

# Ensemble members. These names were used below without being defined
# anywhere in the notebook, which fails on a fresh kernel; they are built here
# from the classifiers loaded earlier.
models = [bert_model, roberta_model, electra_model]
tokenizers = [bert_tokenizer, roberta_tokenizer, electra_tokenizer]
# NOTE(review): no ensemble weights are defined in the notebook. None makes
# the prediction function weight all models equally; replace with tuned
# weights if they exist.
weights = None

predictions = []
confidence_scores = []

print("테스트 데이터 예측 시작...")
for idx, row in tqdm(test.iterrows(), total=len(test)):
    sentences = [row[f'sentence_{i}'] for i in range(4)]

    predicted_order, confidence, all_probs, all_perms = predict_sentence_order_ensemble_with_probs(
        sentences, models, tokenizers, device, weights
    )

    predictions.append(predicted_order)
    confidence_scores.append(confidence)

    # Print the top-3 permutations for the first five rows (debugging aid)
    if idx < 5:
        print(f"\n샘플 {idx}:")
        top_perm_indices = np.argsort(all_probs)[::-1][:3]
        for rank, idx_perm in enumerate(top_perm_indices):
            print(f"  {rank+1}위: {list(all_perms[idx_perm])} - {all_probs[idx_perm]:.4%}")

# Summary of the confidence values: mean, minimum and maximum
print(f"\n평균 예측 신뢰도: {np.mean(confidence_scores):.4%}")
print(f"최소 신뢰도: {np.min(confidence_scores):.4%}")
print(f"최대 신뢰도: {np.max(confidence_scores):.4%}")

In [None]:
# Confidence analysis after prediction
# 1. Histogram
fig, ax = plt.subplots(figsize=(10, 6))

# Draw the histogram once and reuse its counts for the labels and the table.
# A second hist() call would paint another set of bars on the same axes.
counts, bins, patches = ax.hist(
    confidence_scores, bins=10, range=(0, 1), edgecolor='black', alpha=0.7
)
ax.set_xlabel('예측 신뢰도 (%)')
ax.set_ylabel('데이터 개수')
ax.set_title('테스트 데이터 예측 신뢰도 분포')

# Show the x axis as percentages
ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'{int(x*100)}%'))

# Write the count above each bar
label_offset = counts.max() * 0.01
bin_centers = (bins[:-1] + bins[1:]) / 2
for count, bin_center in zip(counts, bin_centers):
    ax.text(bin_center, count + label_offset, f'{int(count)}',
            ha='center', va='bottom', fontsize=10)

ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

# 2. Per-bin counts as a DataFrame, built from the histogram counts so the
# table and the chart always agree
total_count = len(confidence_scores)
confidence_df = pd.DataFrame({
    '신뢰도 구간': [f'{i*10}%-{(i+1)*10}%' for i in range(10)],
    '데이터 개수': counts.astype(int),
})

# Share of each bin
confidence_df['비율(%)'] = (confidence_df['데이터 개수'] / total_count * 100).round(2)

# Cumulative share, computed from cumulative counts so that rounding errors of
# the per-bin shares do not accumulate
confidence_df['누적 비율(%)'] = (
    confidence_df['데이터 개수'].cumsum() / total_count * 100
).round(2)

print("\n=== 신뢰도 구간별 분포 ===")
print(confidence_df)

# 3. Basic statistics
print("\n=== 신뢰도 통계 정보 ===")
print(f"평균 신뢰도: {np.mean(confidence_scores):.2%}")
print(f"중앙값: {np.median(confidence_scores):.2%}")
print(f"표준편차: {np.std(confidence_scores):.2%}")
print(f"최소값: {np.min(confidence_scores):.2%}")
print(f"최대값: {np.max(confidence_scores):.2%}")

# 4. Rows whose confidence is below 50%
low_confidence_mask = np.array(confidence_scores) < 0.5
low_confidence_count = np.sum(low_confidence_mask)

print(f"\n50% 미만 신뢰도 데이터: {low_confidence_count}개 ({low_confidence_count/total_count*100:.1f}%)")

In [None]:
# Attach the predicted order and its confidence to the submission frame
for position in range(4):
    answer_column = f'answer_{position}'
    submission[answer_column] = [order[position] for order in predictions]

submission['confidence'] = confidence_scores

# Index labels of the rows whose confidence is 0.5 or lower
is_low_confidence = submission['confidence'] <= 0.5
low_confidence_indices = submission.loc[is_low_confidence].index.tolist()
print(f"신뢰도 0.5 이하인 데이터의 개수: {len(low_confidence_indices)}")

In [None]:
# Load the Qwen3-14B model
qwen_model_name = "Qwen/Qwen3-14B"  # noted by the author as the instruct variant
print(f"Qwen 모델 로드 중: {qwen_model_name}")

qwen_tokenizer = AutoTokenizer.from_pretrained(qwen_model_name)

qwen_load_options = dict(
    torch_dtype=torch.float16,  # half precision to save memory
    device_map="auto",
)
qwen_model = AutoModelForCausalLM.from_pretrained(qwen_model_name, **qwen_load_options)

print("모델 로드 완료!")

In [None]:
# Sentence-order prediction with Qwen3-14B
def predict_sentence_order_with_qwen(sentences, model, tokenizer):
    """Ask a causal LM for the order of four scrambled sentences.

    The model is prompted to answer with the sentence numbers separated by
    commas. Only the first line of the reply is parsed. When it does not
    contain a valid permutation of 0-3, the function falls back to
    ``predict_with_scoring``.

    Args:
        sentences: The four sentences; indices 0-3 are used in the prompt.
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Matching tokenizer (callable, with ``decode``,
            ``pad_token_id`` and ``eos_token_id``).

    Returns:
        List of four ints, a permutation of ``[0, 1, 2, 3]``.
    """
    import re  # local import keeps this cell self-contained

    # Build the prompt (English)
    # NOTE(review): the prompt is sent as raw text; whether wrapping it in the
    # tokenizer's chat template gives better answers has not been checked here.
    prompt = f"""You are given 4 Korean sentences that are currently in a scrambled order. Your task is to rearrange them in the most natural and logical sequence.

[Scrambled Sentences]
Sentence 0: {sentences[0]}
Sentence 1: {sentences[1]}
Sentence 2: {sentences[2]}
Sentence 3: {sentences[3]}

[Instructions]
1. Consider temporal order, cause-and-effect relationships, and logical flow.
2. Your answer must be ONLY the sentence numbers separated by commas, like "0,1,2,3".
3. Do not provide any explanation, just the sequence numbers.

The correct order is:"""

    # Place the inputs on the device the model reports, not on the
    # notebook-level `device`; the model is loaded with device_map="auto".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Greedy decoding. No temperature is passed because it has no effect when
    # do_sample=False.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=20,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens (everything after the prompt)
    prompt_length = inputs['input_ids'].shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Parse the integers on the first line; this accepts "0,1,2,3" as well as
    # harmless variants such as "2, 0, 1, 3." or "Order: 2,0,1,3".
    first_line = response.strip().split('\n')[0]
    number_tokens = re.findall(r'\d+', first_line)

    if not number_tokens:
        print(f"Parsing failed: {response}")
        return predict_with_scoring(sentences, model, tokenizer)

    order = [int(token) for token in number_tokens]

    # Accept the answer only if it is a permutation of 0..3
    if len(order) == 4 and set(order) == {0, 1, 2, 3}:
        return order

    print(f"Invalid response format: {response}")
    # Fallback: scoring-based method
    return predict_with_scoring(sentences, model, tokenizer)

def _leading_token_ids(tokenizer, texts):
    """Return the distinct first token ids the tokenizer produces for ``texts``."""
    token_ids = []
    for text in texts:
        encoded = tokenizer.encode(text, add_special_tokens=False)
        if encoded and encoded[0] not in token_ids:
            token_ids.append(encoded[0])
    return token_ids


def predict_with_scoring(sentences, model, tokenizer, max_perms=None):
    """Scoring-based order prediction (fallback method).

    Each candidate permutation is joined into one text and the model is asked
    whether the sequence is coherent. The permutation's score is the model's
    next-token log-probability of answering "Yes" minus that of "No".

    Args:
        sentences: Sentences to order.
        model: Causal LM; called with the tokenized prompt and expected to
            return an object with ``.logits``. Must expose ``device``.
        tokenizer: Matching tokenizer (callable, with ``encode``).
        max_perms: Optional cap on how many permutations (in
            ``itertools.permutations`` order) are evaluated. ``None``
            evaluates all of them; a cap of 6 with four sentences only covers
            orders that start with sentence 0.

    Returns:
        The best-scoring permutation as a list of indices. Ties keep the
        earliest permutation.

    Raises:
        ValueError: If the tokenizer yields no token for "Yes" or for "No".
    """
    candidate_perms = list(permutations(range(len(sentences))))
    if max_perms is not None:
        candidate_perms = candidate_perms[:max_perms]

    # The answer may be tokenized with or without a leading space, so both
    # variants are considered.
    yes_ids = _leading_token_ids(tokenizer, ("Yes", " Yes"))
    no_ids = _leading_token_ids(tokenizer, ("No", " No"))
    if not yes_ids or not no_ids:
        raise ValueError("tokenizer produced no token ids for 'Yes'/'No'")

    best_score = -float('inf')
    best_perm = list(range(len(sentences)))

    for perm in candidate_perms:
        # Text of the sentences in the candidate order
        ordered_text = " ".join([sentences[i] for i in perm])

        prompt = f"""Please evaluate if the following Korean sentences are logically well-connected in the given order:

{ordered_text}

Is this sequence natural and coherent? Answer with only 'Yes' or 'No':"""

        inputs = tokenizer(
            prompt, return_tensors="pt", max_length=1024, truncation=True
        ).to(model.device)

        with torch.no_grad():
            outputs = model(**inputs)

        # Compare "Yes" against "No" at the next-token position. The maximum
        # logit over the whole vocabulary would say nothing about which answer
        # the model prefers.
        next_token_log_probs = torch.log_softmax(outputs.logits[0, -1, :].float(), dim=-1)
        yes_log_prob = torch.logsumexp(next_token_log_probs[yes_ids], dim=0)
        no_log_prob = torch.logsumexp(next_token_log_probs[no_ids], dim=0)
        score = (yes_log_prob - no_log_prob).item()

        if score > best_score:
            best_score = score
            best_perm = list(perm)

    return best_perm

In [None]:
# Re-predict the rows whose confidence is 0.5 or lower
new_predictions = []
original_predictions = []  # ensemble predictions kept for later comparison

print("\nQwen 모델로 재예측 시작...")
progress_bar = tqdm(low_confidence_indices, desc="Qwen 재예측")
total_to_redo = len(low_confidence_indices)

for idx in progress_bar:
    # Remember the existing prediction for this row
    original_predictions.append(
        [submission.loc[idx, f'answer_{j}'] for j in range(4)]
    )

    # The submission index label is used as the row position in `test`
    test_idx = idx
    test_row = test.iloc[test_idx]
    sentences = [test_row[f'sentence_{i}'] for i in range(4)]

    # Predict with Qwen
    predicted_order = predict_sentence_order_with_qwen(sentences, qwen_model, qwen_tokenizer)
    new_predictions.append(predicted_order)

    # Progress display
    progress_bar.set_postfix({
        "처리 개수": f"{len(new_predictions)}/{total_to_redo}",
    })

In [None]:
# Apply the Qwen re-predictions

# Rows whose order changed, stored as (submission index, position in the
# low-confidence list)
changed_indices = [
    (idx, i)
    for i, idx in enumerate(low_confidence_indices)
    if original_predictions[i] != new_predictions[i]
]
changed_count = len(changed_indices)

# Overwrite the answer columns of every re-predicted row
for idx, new_order in zip(low_confidence_indices, new_predictions):
    for j, sentence_index in enumerate(new_order[:4]):
        submission.loc[idx, f'answer_{j}'] = sentence_index

In [None]:
# Save the final submission file (ID and the four answer columns only)
submission_final = submission[['ID', 'answer_0', 'answer_1', 'answer_2', 'answer_3']]

output_path = './project/sentence/submission_final.csv'
# to_csv does not create missing parent directories, so make sure the target
# directory exists before writing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
submission_final.to_csv(output_path, index=False)