In [None]:
import sys 
import os 
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
sys.path.append('/ocean/projects/cis250068p/jhwang4/idl-project')
import torch as t
import numpy as np
import pandas as pd
import torch.nn.functional as F
from tqdm.auto import tqdm

from transformers import GPTNeoXForCausalLM, AutoTokenizer
from datasets import load_dataset, Dataset
from sentence_transformers import SentenceTransformer, util

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from nltk.tokenize import sent_tokenize

from sentence_transformers import SentenceTransformer
from rouge_score import rouge_scorer
from sklearn.feature_extraction.text import TfidfVectorizer
import ast
import re
from scipy.spatial.distance import cosine
from src.model import load_fo_model, load_ba_model, DEVICE
from src.data import load_cnn_dataset
#from src.utils import *
from src.utils_batch_v2 import *
#from src.search import *
from src.search_batch_v2 import *


In [None]:
# Notebook display settings: show up to 200 characters per text column and
# render floats with four decimals.
pd.options.display.max_colwidth = 200
pd.options.display.float_format = '{:.4f}'.format
# Download the NLTK "punkt_tab" resource.
nltk.download("punkt_tab")


In [None]:
# Display the DEVICE value imported from src.model so the reader can see
# which device the models will be moved to.
DEVICE

In [None]:
# Load both models with their tokenizers through the project loaders, then
# move each model to DEVICE and cast it to half precision.
fo_model, fo_tokenizer = load_fo_model()
ba_model, ba_tokenizer = load_ba_model()
fo_model = fo_model.to(DEVICE).half()
ba_model = ba_model.to(DEVICE).half()

# Use multiple GPUs automatically when more than one is visible.
n_gpus = t.cuda.device_count()
if n_gpus > 1:
    print(n_gpus)
    fo_model = t.nn.DataParallel(fo_model)
    ba_model = t.nn.DataParallel(ba_model)

In [None]:
# Load 95704 samples through the project loader and wrap them in a DataFrame.
dataset = pd.DataFrame(load_cnn_dataset(num_samples=95704))

In [None]:
# Preview the first five rows (head() defaults to n=5).
dataset.head()

In [None]:
# Number of rows in the dataset.
print(len(dataset))

In [None]:
# Run the linear attribution search over the dataset with both model/tokenizer
# pairs. The result is later iterated by process_data(), which reads the keys
# 'id', 'highlight' and the optional '*_citation' entries from each item.
# NOTE(review): linear_attribution_search comes from one of the star imports
# (presumably src.search_batch_v2) — confirm.
linear_results = linear_attribution_search(dataset, fo_model, fo_tokenizer, ba_model, ba_tokenizer, sentence_batch_size=1000)

In [None]:
# Shared scorer objects used by the calculate_* helpers defined below.
EMB_MODEL    = SentenceTransformer('all-MiniLM-L6-v2')
ROUGE_SCORER = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
TFIDF_VEC    = TfidfVectorizer(
    token_pattern=r"(?u)\b\w+\b",  # looser tokenization: runs of one or more word characters
    stop_words=None               # stop-word filtering turned off
)

def calculate_embedding_similarity(highlight, citation):
    """Return the cosine similarity between the EMB_MODEL embeddings of two texts.

    Each text is encoded on its own (highlight first, then citation) and the
    similarity is computed as ``1 - cosine_distance``.
    """
    highlight_vec, citation_vec = (
        EMB_MODEL.encode([text])[0] for text in (highlight, citation)
    )
    distance = cosine(highlight_vec, citation_vec)
    return 1 - distance

def calculate_rouge_score(highlight, citation):
    """Return the ROUGE-L F-measure from ROUGE_SCORER.score(highlight, citation)."""
    scores = ROUGE_SCORER.score(highlight, citation)
    rouge_l = scores['rougeL']
    return rouge_l.fmeasure

def calculate_tfidf_score(highlight, citation):
    """Return the TF-IDF cosine similarity between ``highlight`` and ``citation``.

    TFIDF_VEC is re-fitted on just these two texts for every call, and the
    similarity is ``1 - cosine_distance`` of the two resulting rows.

    Returns 0.0 when no similarity can be defined:
      * the vectorizer raises ValueError (no vocabulary at all), or
      * either text produces an all-zero vector (it contains no token),
        in which case the cosine is undefined and would otherwise come back
        as NaN instead of the 0.0 used for the empty-vocabulary case.
    """
    try:
        mat = TFIDF_VEC.fit_transform([highlight, citation]).toarray()
    except ValueError:
        # No vocabulary at all -> score 0.0
        return 0.0

    highlight_vec, citation_vec = mat[0], mat[1]
    # A zero vector has no direction; treat it as "no overlap" rather than
    # dividing by a zero norm.
    if not highlight_vec.any() or not citation_vec.any():
        return 0.0
    return 1 - cosine(highlight_vec, citation_vec)

def process_data(data):
    """Score every item's highlight against each of its citations.

    For each item (a mapping with 'id' and 'highlight', plus optional
    'base_citation', 'fo_citation' and 'ba_citation') one record is built
    holding the id, the highlight, and three scores per citation type:
    '<prefix>_emb_similarity', '<prefix>_rouge_score', '<prefix>_tfidf_score'.
    A missing or falsy citation yields None for all three of its scores.

    Returns the list of records in input order.
    """
    citation_keys = ['base_citation', 'fo_citation', 'ba_citation']
    records = []
    for entry in data:
        highlight = entry['highlight']
        record = {'id': entry['id'], 'highlight': highlight}
        for key in citation_keys:
            tag = key.split('_')[0]
            citation = entry.get(key, None)
            if not citation:
                # Nothing to compare against: leave all three scores empty.
                record[f'{tag}_emb_similarity'] = None
                record[f'{tag}_rouge_score'] = None
                record[f'{tag}_tfidf_score'] = None
                continue
            record[f'{tag}_emb_similarity'] = calculate_embedding_similarity(highlight, citation)
            record[f'{tag}_rouge_score'] = calculate_rouge_score(highlight, citation)
            record[f'{tag}_tfidf_score'] = calculate_tfidf_score(highlight, citation)
        records.append(record)
    return records

# Score every search result: one record per item with the embedding, ROUGE
# and TF-IDF scores for each citation type.
linear_final_results    = process_data(linear_results)

In [None]:
# Column-wise mean of every score; the id and highlight columns are dropped first.
r = pd.DataFrame(linear_final_results)
r.drop(columns=['id', 'highlight']).mean()

In [None]:
# Summarise the per-item scores into a 3x3 table (metric x citation type)
# and save it as CSV.
df_linear = pd.DataFrame(linear_final_results)
mean_series = df_linear.drop(['id', 'highlight'], axis=1).mean()

# process_data() names its score columns '<prefix>_<metric>' (for example
# 'base_rouge_score') with no '_linear' suffix, so mean_series must be indexed
# with exactly those labels; looking up '<prefix>_<metric>_linear' raises
# KeyError. The '_linear' suffix belongs only on the output column names.
metric_columns = ['emb_similarity', 'rouge_score', 'tfidf_score']
data = {
    f'{prefix.capitalize()}_linear': [
        mean_series[f'{prefix}_{metric}'] for metric in metric_columns
    ]
    for prefix in ['base', 'fo', 'ba']
}

# Row labels follow the order of metric_columns.
table_df = pd.DataFrame(data, index=['Embedding', 'Rouge', 'Tfidf'])
table_df.to_csv("/ocean/projects/cis250068p/jhwang4/idl-project/notebooks/linear_result.csv", index=True)
# Keep the table as the last expression so the notebook actually displays it.
table_df