# 0. Import

In [1]:
import pandas as pd
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def switch_from_string_to_integer(mungchi_string):
    # 슬래시와 공백을 제외한 글자 수를 계산
    parts = mungchi_string.split('/')  # 슬래시를 기준으로 문자열을 나눔
    mungchi_integer = [len(part.strip()) for part in parts]  # 각 부분을 공백 제거 후 길이 계산
    return mungchi_integer

# 1. Load Dataset

In [3]:
dataset = load_dataset("AcapeLlama/AcapeLlama_v2.0_test", 'line')
dataframe = pd.DataFrame(dataset['test'])
print(dataframe.columns)
print(dataframe.shape)

Index(['title', 'genre', 'mungchi', 'output', 'instruction',
       '__index_level_0__', 'lyrics'],
      dtype='object')
(47678, 7)


In [4]:
df = dataframe.copy()

# 2. Inference
- 가상의 테스트셋에 해당하는 노래 10곡
- 여기서 input으로 요청하는 음절수가 golden_mungchi_integer에 저장되어야 함.

In [5]:
temp_test_df = df.sample(n=10, random_state=42)
temp_test_df.shape

(10, 7)

# 3. Evaluation

In [7]:
def switch_from_string_to_integer(mungchi_string):
    # 슬래시와 공백을 제외한 글자 수를 계산
    parts = mungchi_string.split('/')  # 슬래시를 기준으로 문자열을 나눔
    mungchi_integer = [len(part.strip()) for part in parts]  # 각 부분을 공백 제거 후 길이 계산
    return mungchi_integer

In [9]:
from transformers import AutoModel, AutoTokenizer
import similarity_metric as sm
import formality_metric as fm

def evaluation(golden_lyrics_list, golden_mungchi_integer_list, predict_mungchi_string_list, result_dir, sample_strategy):
    predict_mungchi_integer_list = []
    semantic_sim_list = []
    lexical_sim_list = []
    acc_form_list = []
    mse_form_list = []
    our_form_list = []

    # 의미 유사도 산출을 위한 encoder 불러오기
    model = AutoModel.from_pretrained("klue/roberta-base")
    tokenizer = AutoTokenizer.from_pretrained("klue/roberta-base")

    for golden_lyrics, golden_mungchi_integer, predict_mungchi_string in zip(golden_lyrics_list, golden_mungchi_integer_list, predict_mungchi_string_list):
        
        # switch from predict_mungchi string to integer
        predict_mungchi_integer = switch_from_string_to_integer(predict_mungchi_string)
        predict_mungchi_integer_list.append(predict_mungchi_integer)
        
        # evaluate test data
        semantic_sim = sm.eval_semantic_sim(model, tokenizer, golden_lyrics, predict_mungchi_string)
        lexical_sim = sm.eval_lexical_sim_precision(golden_lyrics, predict_mungchi_string)
        acc_form, mse_form = fm.eval_form(golden_mungchi_integer, predict_mungchi_integer)
        our_form = fm.eval_our_form(golden_mungchi_integer, predict_mungchi_integer)
        
        # save scores
        semantic_sim_list.append(semantic_sim)
        lexical_sim_list.append(lexical_sim)
        acc_form_list.append(acc_form)
        mse_form_list.append(mse_form)
        our_form_list.append(our_form)

    # create evaluation dataframe to matching with the original lyrics
    eval_df = pd.DataFrame({'original_lyrics' : temp_test_df['lyrics'],
                            'input_mungchi_integer' : golden_mungchi_integer_list,
                            'generated_mungchi_string' : predict_mungchi_string_list,
                            'generated_mungchi_integer' : predict_mungchi_integer_list,
                            'semantic_sim' : semantic_sim_list,
                            'lexical_sim' : lexical_sim_list,
                            'acc_form' : acc_form_list,
                            'mse_form' : mse_form_list,
                            'our_form' : our_form_list})

    # get average scores
    keys_to_average = ['semantic_sim', 'lexical_sim', 'acc_form', 'mse_form', 'our_form']
    averages_dict = {key: eval_df[key].mean() for key in keys_to_average}

    # save evaluation result
    eval_df.to_json(f'{result_dir}/eval_df.json', orient='records', lines=True)
    return averages_dict

In [10]:
temp_test_df.sample()

Unnamed: 0,title,genre,mungchi,output,instruction,__index_level_0__,lyrics
15122,기대해도 좋은 날,댄스,[7],기대해도 좋은 날,다음 조건에 어울리는 가사를 써주실 수 있나요? 주어진 음절 수를 절대 벗어나면 안...,415182,Woo Hoo\nWoo Hoo Hoo Hoo\nWoo Hoo Hoo Hoo\n저기 ...


In [11]:
golden_lyrics_list = temp_test_df['lyrics'].tolist()
golden_mungchi_integer_list = temp_test_df['mungchi'].tolist()
predict_mungchi_string_list = temp_test_df['output'].tolist()
result_dir = '/workspace/codes/evaluation'
sample_strategy = 'line'
    
evaluation(golden_lyrics_list, golden_mungchi_integer_list, predict_mungchi_string_list, result_dir, sample_strategy)

Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'semantic_sim': 0.97839755,
 'lexical_sim': 1.0,
 'acc_form': 0.3,
 'mse_form': 4.8,
 'our_form': 0.45}