In [19]:
import pandas as pd
import os
from dotenv import load_dotenv
import openai
from rouge import Rouge # 모델의 성능을 평가하기 위한 라이브러리입니다.

# --- Configuration ---
# Assuming the script is run from the root of the project
PROJECT_DIR = "/mnt/c/SKH/ai_lab_13/projects/nlp-text-summarization/song"
DATA_DIR = os.path.join(PROJECT_DIR, 'data')
TRAIN_CSV = os.path.join(DATA_DIR, "train.csv")
DEV_CSV = os.path.join(DATA_DIR, "dev.csv")

load_dotenv(os.path.join(PROJECT_DIR, ".env"))
UPSTAGE_API_KEY = os.getenv("UPSTAGE_API_KEY")
print(UPSTAGE_API_KEY[:10],"...")

train_df = pd.read_csv(TRAIN_CSV)
val_df = pd.read_csv(DEV_CSV)

up_ITw2Ag9 ...


In [20]:
def get_NER(dialogue):
    # not implemented
    return None
def get_sample_by_index(dataset_name, index, use_NER=True):
    df = train_df if dataset_name == "train" else val_df
    index = int(index)
    if 0 <= index < len(df):
        sample = df.iloc[index]

        # get NER by NER-model if exists.
        ner = get_NER(sample['dialogue']) if use_NER else None
        if ner is None:
            ner = "NER model is not ready."

        return sample['fname'], sample['dialogue'], sample['summary'], ner, sample['topic'], index
    return "N/A", "N/A", "N/A", "N/A", "N/A", index

def call_Solar(dataset_name, index):
    fname, dialogue, _, _, _, _ = get_sample_by_index(dataset_name, index, use_NER=False)
    
    return fname, f"```\nSolar API is not ready.\n```",f"```\nSolar API is not ready.\n```",f"```\nSolar API is not ready.\n```",f"```\nSolar API is not ready.\n```"

In [7]:
fname, dialogue, _, _, _, _ = get_sample_by_index('train', 0, use_NER=False)

In [9]:
print(dialogue)

#Person1#: 안녕하세요, 스미스씨. 저는 호킨스 의사입니다. 오늘 왜 오셨나요?
#Person2#: 건강검진을 받는 것이 좋을 것 같아서요.
#Person1#: 그렇군요, 당신은 5년 동안 건강검진을 받지 않았습니다. 매년 받아야 합니다.
#Person2#: 알고 있습니다. 하지만 아무 문제가 없다면 왜 의사를 만나러 가야 하나요?
#Person1#: 심각한 질병을 피하는 가장 좋은 방법은 이를 조기에 발견하는 것입니다. 그러니 당신의 건강을 위해 최소한 매년 한 번은 오세요.
#Person2#: 알겠습니다.
#Person1#: 여기 보세요. 당신의 눈과 귀는 괜찮아 보입니다. 깊게 숨을 들이쉬세요. 스미스씨, 담배 피우시나요?
#Person2#: 네.
#Person1#: 당신도 알다시피, 담배는 폐암과 심장병의 주요 원인입니다. 정말로 끊으셔야 합니다. 
#Person2#: 수백 번 시도했지만, 습관을 버리는 것이 어렵습니다.
#Person1#: 우리는 도움이 될 수 있는 수업과 약물들을 제공하고 있습니다. 나가기 전에 더 많은 정보를 드리겠습니다.
#Person2#: 알겠습니다, 감사합니다, 의사선생님.


- Solar Chat API

In [21]:
client = openai.OpenAI(
    api_key=UPSTAGE_API_KEY,
    base_url="https://api.upstage.ai/v1/solar"
)

In [104]:
# Prompt를 생성하는 함수를 수정합니다.
def build_prompt(dialogue, type='summarization'):
    if type=='summarization':
        system_prompt = "You are a expert in the field of dialogue summarization, summarize the given dialogue in a concise manner. Follow the user's instruction carefully and provide a summary that is relevant to the dialogue."

        user_prompt = (
            "Following the instructions below, summarize the given document.\n"
            "Instructions:\n"
            "1. Read the dialogue carefully.\n"
            "2. Preserve named entities in the summary.\n"
            "3. Among special characters and symbols, only Arabic numerals, commas, and periods may be used.\n"
            "4. Reflect discourse relations, speech acts, and conversational intentions in the summary.\n"
            "5. Keep the summary concise and brief.\n"
            "6. Response in KOREAN.\n\n"
            "Dialogue:\n"
            f"{dialogue}\n\n"
            "Summary:\n"
        )
    elif type=='ko2en':
        system_prompt = "You are a expert in the field of translation. Translate the given Korean dialogue into English. Follow the user's instruction carefully and provide a translation that is relevant to the original korean dialogue."

        user_prompt = (
            "Following the instructions below, translate the given dialogue.\n"
            "Instructions:\n"
            "1. Read the dialogue carefully.\n"
            "2. Preserve named entities or english name in the dialogue.\n"
            "3. Each turn is distinguished by line feed, preserve the number of turns and representation of speaker such as #Person1#.\n"
            "4. Translate Korean to English.\n\n"
            "Korean Dialogue:\n"
            f"{dialogue}\n\n"
            "Translation:\n"
        )
    elif type=='en2ko':
        system_prompt = "You are a expert in the field of translation. Translate the given English dialogue into Korean. Follow the user's instruction carefully and provide a translation that is relevant to the original english dialogue."

        user_prompt = (
            "Following the instructions below, translate the given dialogue.\n"
            "Instructions:\n"
            "1. Read the dialogue carefully.\n"
            "2. Preserve named entities or english name in the dialogue.\n"
            "3. Each turn is distinguished by line feed, preserve the number of turns and representation of speaker such as #Person1#.\n"
            "4. Preserve Personal Identity Information masking such as #Person1#, #Email#, #Address#, etc."
            "5. Translate English to Korean.\n\n"
            "English Dialogue:\n"
            f"{dialogue}\n\n"
            "Translation:\n"
        )
    elif type=='topic':
        system_prompt = "You are a expert in the field of topic classification. Extract discourse relations, speech acts, and conversational intentions in the summary and represents it as topic. Follow the user's instruction carefully and provide a topic that is relevant to the dialogue."

        user_prompt = (
            "Following the instructions below, extract topic in the given dialogue.\n"
            "Instructions:\n"
            "1. Read the dialogue carefully.\n"
            "2. Focus on named entities in the dialogue.\n"
            "3. Topic must be at most 3 words.\n"
            "4. Response in KOREAN with no prefix or suffix, only the topic.\n\n"
            "Dialogue:\n"
            f"{dialogue}\n\n"
        )
    
    return [
        {
            "role": "system",
            "content": system_prompt
        },
        {
            "role": "user",
            "content": user_prompt
        }
    ]



In [40]:
# 모델 성능에 대한 평가 지표를 정의합니다. 본 대회에서는 ROUGE 점수를 통해 모델의 성능을 평가합니다.
rouge = Rouge()
def compute_metrics(pred, gold):
    results = rouge.get_scores(pred, gold, avg=True)
    result = {key: value["f"] for key, value in results.items()}
    return result

In [69]:
len(train_df.iloc[0]['dialogue'].split(' ')) * 5.2

566.8000000000001

In [105]:
def chat_solar(dialogue, type='summarization'):
    max_tokens = 170
    if type in ['en2ko', 'ko2en']:
        max_tokens = None # 따로 설정하지 않는다.
    elif type == 'topic':
        max_tokens = 15
    prompt = build_prompt(dialogue, type)
    # print(prompt)
    if max_tokens is not None:
        output = client.chat.completions.create(
            model="solar-mini",
            messages=prompt,
            temperature=0.2,
            top_p=0.3,
            max_tokens=max_tokens,
        )
    else:
        output = client.chat.completions.create(
            model="solar-mini",
            messages=prompt,
            temperature=0.2,
            top_p=0.3,
        )

    return output.choices[0].message.content

In [80]:
# Train data 중 처음 3개의 대화를 요약합니다.
def test_on_train_data(num_samples=3):
    for idx, row in train_df[:num_samples].iterrows():
        dialogue = row['dialogue']
        summary = chat_solar(dialogue)
        print(f"Dialogue:\n{dialogue}\n")
        print(f"Pred Summary: {summary}\n")
        print(f"Gold Summary: {row['summary']}\n")
        print("=="*50)

In [106]:
from tqdm import tqdm
def retranslate_all(df):
    results = []
    idx = -1  # 초기화 필요 (finally에서 사용되므로)
    try:
        for idx,row in tqdm(df.iterrows()):
            dialogue = row['dialogue']
            summary, ko2en, en2ko, re_summary, topic = None, None, None, None, None

            # 각 처리 결과에 대해 예외 처리
            try:
                summary = chat_solar(dialogue, type='summarization')
                # print(summary)
            except Exception as e:
                print(f"[{idx}] Error in summarization: {e}")
                results.append([summary, ko2en, en2ko, re_summary, topic])
                continue

            try:
                ko2en = chat_solar(dialogue, type='ko2en') # 한국어를 영어로
            except Exception as e:
                print(f"[{idx}] Error in ko2en: {e}")
                results.append([summary, None, None, None, None])
                continue

            try:
                en2ko = chat_solar(ko2en, type='en2ko') # 영어를 입력받아 다시 한국어로
            except Exception as e:
                print(f"[{idx}] Error in en2ko: {e}")
                results.append([summary, ko2en, None, None, None])
                continue

            try:
                re_summary = chat_solar(en2ko, type='summarization') # en2ko로 요약문 생성
            except Exception as e:
                print(f"[{idx}] Error in re_summary: {e}")
                results.append([summary, ko2en, en2ko, None, None])
                continue

            try:
                topic = chat_solar(en2ko, type='topic') # en2ko로 토픽 생성
            except Exception as e:
                print(f"[{idx}] Error in topic: {e}")
                results.append([summary, ko2en, en2ko, re_summary, None])
                continue

            results.append([summary, topic, ko2en, en2ko, re_summary])
    except Exception as e:
        print("Error:", e)
    finally:
        print(f"Finished at index: {idx} / {len(df)-1}")
        results_df = pd.DataFrame(
            results, columns=['summary_solar', 'dialogue_ko2en', 'dialogue_en2ko', 're_summary_solar', 'topic_solar']
        )
        return results_df

In [57]:
display(train_df.head(2))
display(val_df.head(2))

Unnamed: 0,fname,dialogue,summary,topic
0,train_0,"#Person1#: 안녕하세요, Mr. Smith. 저는 Dr. Hawkins입니다...","Mr. Smith는 Dr. Hawkins에게 건강검진을 받으러 와서, 매년 검진 필...",건강검진
1,train_1,"#Person1#: 안녕하세요, Mrs. Parker. 잘 지내셨나요?\n#Pers...","Mrs. Parker가 Ricky와 함께 백신 접종을 위해 방문하였고, Dr. Pe...",백신 접종


Unnamed: 0,fname,dialogue,summary,topic
0,dev_0,"#Person1#: 안녕하세요, 오늘 기분이 어떠세요?\n#Person2#: 요즘 ...",#Person2#는 숨쉬기 어려워합니다. 의사는 #Person2#에게 증상을 확인하...,의사 상담
1,dev_1,"#Person1#: 야 Jimmy, 오늘 좀 이따 운동하러 가자.\n#Person2...",#Person1#는 Jimmy를 운동하러 초대하고 팔과 복근 운동을 하도록 설득합니다.,운동 계획


In [None]:
train_results = retranslate_all(train_df)

746it [2:31:00,  9.18s/it]

In [None]:
train_results.head(3)

Unnamed: 0,summary_solar,dialogue_ko2en,dialogue_en2ko,re_summary_solar,topic_solar
0,Dr. Hawkins는 Mr. Smith를 반갑게 맞이하며 건강검진을 받으러 온 이...,"주제: 건강검진, 흡연 경고","#Person1#: Hello, Mr. Smith. I'm Dr. Hawkins. ...",아래의 지침을 따라 주어진 영어 대화를 한국어로 번역해드리겠습니다.\n지침:\n1....,"#Person1#: 안녕하세요, 스미스씨. 저는 박사 호킨스입니다. 오늘은 무슨 일..."


In [None]:
train_results['fname'] = train_df['fname'].iloc[:len(train_results)]
train_results.to_csv(os.path.join(PROJECT_DIR,"data","train_solar_results.csv"), index=False)

In [None]:
val_results = retranslate_all(val_df)

In [None]:
val_results.head(3)

In [None]:
val_results['fname'] = val_df['fname'].iloc[:len(val_results)]
val_results.to_csv(os.path.join(PROJECT_DIR,"data","val_solar_results.csv"), index=False)

In [73]:
topics = train_df[['topic']]
topics['len'] = topics['topic'].apply(lambda x: len(x.split(" ")))
topics['보정'] = topics['len'] * 5.2
topics.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  topics['len'] = topics['topic'].apply(lambda x: len(x.split(" ")))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  topics['보정'] = topics['len'] * 5.2


Unnamed: 0,topic,len,보정
0,건강검진,1,5.2
1,백신 접종,2,10.4
2,열쇠 분실,2,10.4
3,여자친구와의 결혼,2,10.4
4,춤 제안,2,10.4


In [74]:
topics[topics['len']==topics['len'].max()].head()

Unnamed: 0,topic,len,보정
6236,비행기 내 식사 주문 및 자리 변경 요청,8,41.6
6449,비행기 좌석 조절 및 귀 압력 문제 해결,8,41.6
