# 아이펠톤에서 사용할 텍스트 생성 모델 테스트

##### 기반지식
- Text Generation task : 주어진 입력에 따라 텍스트를 이어서 생성하는 작업

##### 코드 흐름
- 허깅페이스에서 텍스트 생성쪽에 좋은 성능을 내는 모델을 선별
    - **멘토님 추천** 최근 연구는 라마 3.1 8b를 기준으로 사용함. GPU 자원이 없으면 3b, 잘안되면 Qwen 2.5 모델
        - IFEVAL 지표 기준으로 찾기 : 지시에 맞춰 잘 생성하는가 (70점 넘어야 적절, 안되도 60후반) 
    - [open_llm_leaderboard](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard) 에서 
        - 논리적 일관성과 문맥 유지를 잘하는 모델을 찾기 위한 지표
            - Perplexity (PPL) : 모델이 텍스트를 얼마나 잘 예측하는지를 나타내는 지표, 낮을 수록 좋음 
            - BLEU (Bilingual Evaluation Understudy) : 생성된 텍스트가 참조 텍스트(즉, 정답 텍스트)와 얼마나 일치하는지를 측정, 좊을 수록 좋음 - 이건 우선순위가 아닐듯 
            - ROUGE (Recall-Oriented Understudy for Gisting Evaluation) : 모델의 출력 텍스트와 참조 텍스트 간의 유사도를 평가하는 지표. 요약 작업에서 많이 사용되며, 일관성 있는지 측정 가능?, 높을 수록 좋음 
            - Human Evaluation 지표도 존재하는 듯 (일관성, 창의성, 자연스러움 등)
    - [text generation task](https://huggingface.co/models?pipeline_tag=text-generation&sort=likes) 에서 인기 좋은 것
    - [챗봇 arena의 leaderboad](https://lmarena.ai/)에서 허깅페이스에 오픈된모델 
        리더보드 점수는 **쌍대 비교(pairwise comparison)**를 기반으로 매겨집니다. 이 방식에서 두 모델의 답변을 비교한 후, 사용자가 더 나은 답변을 선택하는 방식으로 모델의 성능을 평가 -> 특정 task가 아닌 전반전인 성능을 사람이 선호하는 정도에 따라 결정됨
    - [task > text-generation](https://huggingface.co/tasks/text-generation) 의 글 참고
- (옵션) paperswithocde 에서 모델 선별
- 모델 다운로드
- 프롬프트 프리셋 생성, 변수 set 세팅
- n개 테스트로 생성
- OpenAI API 이용하여 평가 - 라이브러리 충돌되면 이건 다른 venv 코드에서 해야 할 수도 있음
- 비교할 수 있게 엑셀로 만들기

1차적으로 모델, 프롬프트 테스트를 위해 간단히 만들어봄 



## 생성하는 파트

In [1]:
import pandas as pd
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import torch
from dotenv import load_dotenv
import os
import time
import gc

# CUDA가 사용 가능한지 확인하여 device 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# 출력해서 현재 선택된 device 확인
print(f"Using device: {device}")

# .env 파일에서 환경 변수 로드
load_dotenv()
# .env 파일에서 Hugging Face 토큰 불러오기
token = os.getenv('HUGGINGFACE_TOKEN')

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


In [2]:
# 특정 모델 이름에 해당하는 파라미터 불러오기
def get_model_parameters(row):  
    params = {
        'max_new_tokens': int(row['max_new_tokens']) if pd.notna(row['max_new_tokens']) else None,
        'temperature': row['temperature'] if pd.notna(row['temperature']) else None,
        'top_p': row['top_p'] if pd.notna(row['top_p']) else None,
        'top_k': int(row['top_k']) if pd.notna(row['top_k']) else None,
        'repetition_penalty': row['repetition_penalty'] if pd.notna(row['repetition_penalty']) else None
    }

    # None 값을 제거하여 기본값을 사용하게 함
    params = {k: v for k, v in params.items() if v is not None}
    
    return params

# 모델 결과를 출력하는 함수
def get_result(model, parameter_set, prompt, tokenizer):
    
    # 입력을 토큰화하고 attention_mask 추가
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # 모델 파라미터 불러오기
    params = get_model_parameters(parameter_set)
    print(params)
    
    # 결과 생성
    with torch.no_grad():
        result = model.generate(
            inputs['input_ids'], 
            attention_mask=inputs['attention_mask'],  # attention_mask 추가
            pad_token_id=model.config.pad_token_id,  # pad_token_id 추가
            **params
        )
        
    # 생성된 텍스트 디코딩
    generated_text = tokenizer.decode(result[0], skip_special_tokens=True)
        
    return generated_text


In [3]:
# 변수 설정
## 파일 이름
raw_filename = 'raw_data/meta0911.csv'
prompt_text_list_filename = 'config/text_prompt_lst.csv' # 프롬프트 raw 텍스트 목록
model_parameter_filename = 'config/model_parameter.csv'# 모델 파라미터 목록
prompt_filename = 'config/prompt_list.csv' # 최종 프롬프트 목록
gen_result_filename = 'data/meta_gen01.csv' # 생성 결과 파일 이름

In [4]:
# 데이터 로드
raw_data = pd.read_csv(raw_filename, encoding='latin1')
print(raw_data.columns)
print("--------------------------")

prompt_text_lst = pd.read_csv(prompt_text_list_filename)
print(prompt_text_lst.head(1))
print("--------------------------")

model_list = pd.read_csv(model_parameter_filename)
# print(model_list.head(2), type(model_list))

# 기존에 'unique_name' 열이 있으면 삭제
if 'unique_name' in model_list.columns:
    model_list.drop(columns=['unique_name'], inplace=True)

# 각 행마다 컬럼 데이터를 "_"로 연결, 값이 없으면 "N" 대체
# 모든 데이터를 문자열로 변환한 후 결합
model_list['unique_name'] = model_list.fillna('N').astype(str).agg('_'.join, axis=1)

print(model_list.head(2))
model_list.to_csv(model_parameter_filename, index=False)

Index(['persona', 'pattern', 'pattern_def', 'thought', 'scenario',
       'persona_in_scenario', 'thought_in_scenario'],
      dtype='object')
--------------------------
                                              prompt
0  Write a 4-6 sentence paragraph based on the fo...
--------------------------
                         model_name  max_new_tokens  temperature  top_p  \
0  meta-llama/Llama-3.2-1B-Instruct            1024          NaN    NaN   
1  meta-llama/Llama-3.2-1B-Instruct            1024          0.8    1.0   

   top_k  repetition_penalty  \
0    NaN                 NaN   
1   20.0                 1.2   

                                         unique_name  
0      meta-llama/Llama-3.2-1B-Instruct_1024_N_N_N_N  
1  meta-llama/Llama-3.2-1B-Instruct_1024_0.8_1.0_...  


### 하나만 생성해보기

In [36]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import gc

# 모델 및 토크나이저 로드
model_name = 'meta-llama/Llama-3.2-1B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

# pad_token_id가 설정되지 않았다면 eos_token_id로 설정
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.eos_token_id

# 프롬프트 설정
prompt = (
    "I want you to write a paragraph including 4 to 6 sentences in the form of a diary of an individual with mental issues. "
    "Please include {given sentence} as it is, and make the paragraph feel {emotion}. "
    "Given sentence: I'm a vegan, and the restaurant served me a dish with fish in it. "
    "Given sentence: They're trying to kill me. "
    "Emotion: nervous"
)

# 입력을 토큰화하고 attention_mask 추가
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# 파라미터 설정
params = {'max_new_tokens': 512, 'temperature': 0.7, 'top_p': 0.95}

# 결과 생성
with torch.no_grad():
    result = model.generate(
        inputs['input_ids'], 
        attention_mask=inputs['attention_mask'],  # attention_mask 추가
        pad_token_id=model.config.pad_token_id,  # pad_token_id 추가
        **params
    )

# 생성된 텍스트 디코딩
generated_text = tokenizer.decode(result[0], skip_special_tokens=True)
print(generated_text)

# 메모리 관리
gc.collect()
torch.cuda.empty_cache()


I want you to write a paragraph including 4 to 6 sentences in the form of a diary of an individual with mental issues. Please include {given sentence} as it is, and make the paragraph feel {emotion}. Given sentence: I'm a vegan, and the restaurant served me a dish with fish in it. Given sentence: They're trying to kill me. Emotion: nervous.

I'm a vegan, and the restaurant served me a dish with fish in it. I'm a vegan, and the restaurant served me a dish with fish in it. My heart is racing, my palms are sweating, and I feel like I'm going to throw up. I'm a vegan, and the restaurant served me a dish with fish in it. I'm a vegan, and the restaurant served me a dish with fish in it. I'm a vegan, and the restaurant served me a dish with fish in it. My mind is racing, and I feel like I'm going to lose it. They're trying to kill me. I'm a vegan, and the restaurant served me a dish with fish in it. I'm a vegan, and the restaurant served me a dish with fish in it. I'm a vegan, and the restaur

### 프롬프트 조합을 미리 저장해둠 

In [5]:
# 다양한 프롬프트 구조에 맞춰 프롬프트 리스트 생성 
model_lst = model_list.model_name.tolist()

thought_list = raw_data['thought'].tolist()
emtion_list = ['Depression', 'anger', 'anxiety', 'disappointment', 'helplessness']

# 테스트용 프롬프트 리스트
prompt_raw_list = prompt_text_lst['prompt'].tolist()

#### 테스트 끝나면 지울 부분 (아래 하나만)

In [6]:
thought_list = raw_data['thought'].tolist()[0:20]
len(thought_list)

20

In [7]:
# 다양한 프롬프트 조합을 미리 저장해둠 
rows = []

for index, row in model_list.iterrows():
    # 데이터프레임의 한 row 전체를 가져옴
    for prompt in prompt_raw_list:
        for thought in thought_list:
            for emotion in emtion_list:
                use_prompt = prompt.format(distorted_thought=thought, emotion=emotion)
                
                # row 자체에 추가할 데이터를 임시로 만듦
                row_data = row.copy()  # 원본 row는 건드리지 않고 복사해서 사용
                row_data['raw_prompt'] = prompt
                row_data['thought'] = thought
                row_data['emotion'] = emotion
                row_data['prompt'] = use_prompt
                
                # 각 행을 리스트에 추가
                rows.append(row_data)

# 리스트를 데이터프레임으로 변환
prompt_list_df = pd.DataFrame(rows)

# 결과 저장
prompt_list_df.to_csv(prompt_filename, index=False)
print(prompt_list_df.head(1))


                         model_name  max_new_tokens  temperature  top_p  \
0  meta-llama/Llama-3.2-1B-Instruct            1024          NaN    NaN   

   top_k  repetition_penalty                                    unique_name  \
0    NaN                 NaN  meta-llama/Llama-3.2-1B-Instruct_1024_N_N_N_N   

                                          raw_prompt  \
0  Write a 4-6 sentence paragraph based on the fo...   

                                             thought     emotion  \
0  I like my cats. I think one day they will plot...  Depression   

                                              prompt  
0  Write a 4-6 sentence paragraph based on the fo...  


### 저장해둔 프롬프트 조합을 불러와서 n개씩 결과 생성 

In [8]:
# 저장해둔 프롬프트 조합을 불러와서 n개씩 결과 생성 

## 데이터 로드
prompt_list_df = pd.read_csv(prompt_filename)

## 생성 결과 데이터 가져와서, 이미 생성된 결과는 제외 (result 컬럼에 값이 있으면 제외)
if os.path.exists(gen_result_filename):
    gen_result = pd.read_csv(gen_result_filename)
    
    # n 개의 컬럼이 모두 중복되는 경우 중복 제거
    merged = pd.merge(prompt_list_df, gen_result[['prompt', 'unique_name']], on=['prompt', 'unique_name'], how='left', indicator=True)
    
    # '_merge' 컬럼이 'left_only'인 데이터만 남김 (즉, 중복되지 않은 데이터)
    prompt_list_df = merged[merged['_merge'] == 'left_only'].drop(columns='_merge')
else:
    # 파일이 없으면 새로운 데이터프레임을 생성
    gen_result = pd.DataFrame(columns=[*prompt_list_df.columns, 'result', 'time'])


##### 하나씩 생성, 모델 테스트용

In [32]:
# 하나씩 생성, 모델 테스트용

# unique_name = 'meta-llama_Llama-3.2-1B-Instruct' 으로 변경

model_lst = [
            # 'meta-llama/Llama-3.2-1B-Instruct',
            # 'meta-llama/Llama-3.2-3B-Instruct',
            # 'Qwen/Qwen2.5-7B-Instruct',
            # 'meta-llama/Llama-3.1-8B-Instruct', 
            # 'meta-llama/Llama-3.1-70B-Instruct',
            # 'meta-llama/Llama-3.1-405B-Instruct',
            #  "mistralai/Mixtral-8x7B-Instruct-v0.1",
            # "mistralai/Mixtral-8x22B-Instruct-v0.1",
            #  "Qwen/Qwen2.5-7B-Instruct", "Qwen/Qwen2.5-14B-Instruct", "Qwen/Qwen2.5-32B-Instruct", "Qwen/Qwen2.5-72B-Instruct",
             ]


unique_lst = [
            # 'meta-llama/Llama-3.2-1B-Instruct_1024.0_N_1.0_20.0_1.2',
            'meta-llama/Llama-3.2-1B-Instruct_1024_N_N_N_N',
]

## 모델별로 n개의 결과 생성
n = 1

# 모델,raw prompt 별로 n개의 row를 선택하고 반복문 실행
for (unique_name, raw_prompt), group in prompt_list_df.groupby(['unique_name', 'raw_prompt']):
    # 각 모델 그룹에서 최대 n개의 행만 선택
    model_rows = group.head(n)
    
    model_name = model_rows.iloc[0]['model_name']
    
    # 특정 모델만 필터링
    if unique_name not in unique_lst:
        print(f"Skipping model: {model_name}")
        continue
    
    # try:
    #     # 모델은 반복마다 새로 로드하지 않고, 한 번만 로드하여 재사용
    #     model = pipeline('text-generation', model=model_name, device=0)
    # except Exception as e:
    #     print(f"Failed to load model: {model_name}. Error: {str(e)}")
    #     gen_result.to_csv('data/test00.csv', index=False)
    #     continue

    try:
        # 모델과 토크나이저 로드 및 pad_token_id 설정
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
        
        # pad_token_id가 설정되지 않았다면 eos_token_id로 설정
        if model.config.pad_token_id is None:
            model.config.pad_token_id = tokenizer.eos_token_id
        
    except Exception as e:
        print(f"Failed to load model: {model_name}. Error: {str(e)}")
        gen_result.to_csv('data/test00.csv', index=False)
        continue
    
    # 반복문 내에서 필요한 작업 수행
    for index, row in model_rows.iterrows():
        try:
            print(f"Model: {model_name}, index: {index}")
            prompt = row['prompt']
            
            start_time = time.time() # 시작 시간 기록
            result = get_result(model, row, prompt, tokenizer) 
            end_time = time.time() # 종료 시간 기록
            
            row['result'] = result  # 생성된 결과 저장
            row['time'] = round(end_time - start_time, 0) # 실행 시간 저장

            # 새로 생성된 데이터를 result_data에 추가
            new_row = pd.DataFrame([row])
                
            # pd.concat을 사용하여 새로운 데이터를 result_data에 추가
            gen_result = pd.concat([gen_result, new_row], ignore_index=True)
            
            gc.collect()
            torch.cuda.empty_cache()
        except Exception as e:
            print(f"Error: {e}")
            gen_result.to_csv('data/test00.csv', index=False)
        
    gc.collect()
    torch.cuda.empty_cache()
    
# 생성된 데이터를 파일에 다시 저장 (이전 결과와 함께)
gen_result.to_csv('data/test00.csv', index=False)

Skipping model: Qwen/Qwen2.5-1.5B-Instruct
Skipping model: Qwen/Qwen2.5-1.5B-Instruct
Skipping model: Qwen/Qwen2.5-1.5B-Instruct
Skipping model: Qwen/Qwen2.5-1.5B-Instruct
Skipping model: Qwen/Qwen2.5-1.5B-Instruct
Skipping model: Qwen/Qwen2.5-1.5B-Instruct
Skipping model: Qwen/Qwen2.5-3B-Instruct
Skipping model: Qwen/Qwen2.5-3B-Instruct
Skipping model: Qwen/Qwen2.5-3B-Instruct
Skipping model: Qwen/Qwen2.5-3B-Instruct
Skipping model: Qwen/Qwen2.5-3B-Instruct
Skipping model: Qwen/Qwen2.5-3B-Instruct
Skipping model: Qwen/Qwen2.5-3B-Instruct
Skipping model: Qwen/Qwen2.5-3B-Instruct
Skipping model: Qwen/Qwen2.5-3B-Instruct
Skipping model: Qwen/Qwen2.5-3B-Instruct
Skipping model: Qwen/Qwen2.5-3B-Instruct
Skipping model: Qwen/Qwen2.5-3B-Instruct
Skipping model: Qwen/Qwen2.5-7B-Instruct
Skipping model: Qwen/Qwen2.5-7B-Instruct
Skipping model: Qwen/Qwen2.5-7B-Instruct
Skipping model: Qwen/Qwen2.5-7B-Instruct
Skipping model: Qwen/Qwen2.5-7B-Instruct
Skipping model: Qwen/Qwen2.5-7B-Instruct
Skip

  gen_result = pd.concat([gen_result, new_row], ignore_index=True)


Model: meta-llama/Llama-3.2-1B-Instruct, index: 100
{'max_new_tokens': 1024}
Model: meta-llama/Llama-3.2-1B-Instruct, index: 500
{'max_new_tokens': 1024}
Model: meta-llama/Llama-3.2-1B-Instruct, index: 200
{'max_new_tokens': 1024}
Model: meta-llama/Llama-3.2-1B-Instruct, index: 300
{'max_new_tokens': 1024}
Model: meta-llama/Llama-3.2-1B-Instruct, index: 0
{'max_new_tokens': 1024}
Skipping model: meta-llama/Llama-3.2-3B-Instruct
Skipping model: meta-llama/Llama-3.2-3B-Instruct
Skipping model: meta-llama/Llama-3.2-3B-Instruct
Skipping model: meta-llama/Llama-3.2-3B-Instruct
Skipping model: meta-llama/Llama-3.2-3B-Instruct
Skipping model: meta-llama/Llama-3.2-3B-Instruct
Skipping model: meta-llama/Llama-3.2-3B-Instruct
Skipping model: meta-llama/Llama-3.2-3B-Instruct
Skipping model: meta-llama/Llama-3.2-3B-Instruct
Skipping model: meta-llama/Llama-3.2-3B-Instruct
Skipping model: meta-llama/Llama-3.2-3B-Instruct
Skipping model: meta-llama/Llama-3.2-3B-Instruct
Skipping model: meta-llama/Ll

#### 여러개 생성

In [10]:
# 저장하는 함수
def save_gen_result(data, filename):
    # 파일이 없을 경우 헤더를 포함해 쓰고, 파일이 있을 경우 append 모드로 데이터 추가
    if not os.path.exists(filename):
        data.to_csv(filename, mode='w', header=True, index=False)  # 파일 없을 때는 헤더 포함
    else:
        data.to_csv(filename, mode='a', header=False, index=False)  # 파일 있을 때는 헤더 제외

## 단위별로 n개의 결과 생성
n = 1

# 모델,raw prompt 별로 n개의 row를 선택하고 반복문 실행
for (unique_name, raw_pgptrompt), group in prompt_list_df.groupby(['unique_name', 'raw_prompt']):
    # 각 모델 그룹에서 최대 n개의 행만 선택
    model_rows = group.head(n)
    
    model_name = model_rows.iloc[0]['model_name']
    
    try:
        # 모델과 토크나이저 로드 및 pad_token_id 설정
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
        
        # pad_token_id가 설정되지 않았다면 eos_token_id로 설정
        if model.config.pad_token_id is None:
            model.config.pad_token_id = tokenizer.eos_token_id
        
    except Exception as e:
        print(f"Failed to load model: {model_name}. Error: {str(e)}")
        gen_result.to_csv('data/test00.csv', index=False)
        continue
    
    # 반복문 내에서 필요한 작업 수행
    for index, row in model_rows.iterrows():
        try:
            print(f"Model: {model_name}, index: {index}")
            prompt = row['prompt']
            
            start_time = time.time() # 시작 시간 기록
            result = get_result(model, row, prompt, tokenizer) 
            end_time = time.time() # 종료 시간 기록
            
            row['result'] = result  # 생성된 결과 저장
            row['time'] = round(end_time - start_time, 0) # 실행 시간 저장

            # 새로 생성된 데이터를 result_data에 추가
            new_row = pd.DataFrame([row])
                
            # pd.concat을 사용하여 새로운 데이터를 result_data에 추가
            gen_result = pd.concat([gen_result, new_row], ignore_index=True)
            
            gc.collect()
            torch.cuda.empty_cache()
            
        except Exception as e:
            print(f"Error: {e}")
            save_gen_result(gen_result, gen_result_filename)
        
    # 모델 관련 변수 삭제 후 메모리 해제
    del model
    gc.collect()
    torch.cuda.empty_cache()

# 저장
save_gen_result(gen_result, gen_result_filename)

Model: Qwen/Qwen2.5-1.5B-Instruct, index: 4000
{'max_new_tokens': 1024}


  attn_output = torch.nn.functional.scaled_dot_product_attention(
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
  gen_result = pd.concat([gen_result, new_row], ignore_index=True)


Model: Qwen/Qwen2.5-1.5B-Instruct, index: 3700
{'max_new_tokens': 1024}
Model: Qwen/Qwen2.5-1.5B-Instruct, index: 4100
{'max_new_tokens': 1024}
Model: Qwen/Qwen2.5-1.5B-Instruct, index: 3800
{'max_new_tokens': 1024}
Model: Qwen/Qwen2.5-1.5B-Instruct, index: 3900
{'max_new_tokens': 1024}
Model: Qwen/Qwen2.5-1.5B-Instruct, index: 3600
{'max_new_tokens': 1024}


Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.68s/it]


Model: Qwen/Qwen2.5-3B-Instruct, index: 4600
{'max_new_tokens': 1024}


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.17s/it]


Model: Qwen/Qwen2.5-3B-Instruct, index: 4300
{'max_new_tokens': 1024}


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.17s/it]


Model: Qwen/Qwen2.5-3B-Instruct, index: 4700
{'max_new_tokens': 1024}


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.19s/it]


Model: Qwen/Qwen2.5-3B-Instruct, index: 4400
{'max_new_tokens': 1024}


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.16s/it]


Model: Qwen/Qwen2.5-3B-Instruct, index: 4500
{'max_new_tokens': 1024}


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.16s/it]


Model: Qwen/Qwen2.5-3B-Instruct, index: 4200
{'max_new_tokens': 1024}


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.15s/it]


Model: Qwen/Qwen2.5-3B-Instruct, index: 5200
{'max_new_tokens': 512, 'temperature': 0.7, 'top_p': 0.95}


  gen_result = pd.concat([gen_result, new_row], ignore_index=True)
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.17s/it]


Model: Qwen/Qwen2.5-3B-Instruct, index: 4900
{'max_new_tokens': 512, 'temperature': 0.7, 'top_p': 0.95}


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.15s/it]


Model: Qwen/Qwen2.5-3B-Instruct, index: 5300
{'max_new_tokens': 512, 'temperature': 0.7, 'top_p': 0.95}


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.15s/it]


Model: Qwen/Qwen2.5-3B-Instruct, index: 5000
{'max_new_tokens': 512, 'temperature': 0.7, 'top_p': 0.95}


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.16s/it]


Model: Qwen/Qwen2.5-3B-Instruct, index: 5100
{'max_new_tokens': 512, 'temperature': 0.7, 'top_p': 0.95}


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.15s/it]


Model: Qwen/Qwen2.5-3B-Instruct, index: 4800
{'max_new_tokens': 512, 'temperature': 0.7, 'top_p': 0.95}


Loading checkpoint shards: 100%|██████████| 4/4 [00:08<00:00,  2.00s/it]


Model: Qwen/Qwen2.5-7B-Instruct, index: 5800
{'max_new_tokens': 1024}


Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00,  1.58s/it]


Model: Qwen/Qwen2.5-7B-Instruct, index: 5500
{'max_new_tokens': 1024}


Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00,  1.55s/it]


Model: Qwen/Qwen2.5-7B-Instruct, index: 5900
{'max_new_tokens': 1024}


Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00,  1.55s/it]


Model: Qwen/Qwen2.5-7B-Instruct, index: 5600
{'max_new_tokens': 1024}


Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00,  1.66s/it]


Model: Qwen/Qwen2.5-7B-Instruct, index: 5700
{'max_new_tokens': 1024}


Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00,  1.59s/it]


Model: Qwen/Qwen2.5-7B-Instruct, index: 5400
{'max_new_tokens': 1024}


Loading checkpoint shards: 100%|██████████| 4/4 [00:09<00:00,  2.43s/it]
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Model: meta-llama/Llama-3.1-8B-Instruct, index: 3400
{'max_new_tokens': 1024}


Loading checkpoint shards: 100%|██████████| 4/4 [00:10<00:00,  2.62s/it]


Model: meta-llama/Llama-3.1-8B-Instruct, index: 3100
{'max_new_tokens': 1024}


Loading checkpoint shards: 100%|██████████| 4/4 [00:10<00:00,  2.72s/it]


Model: meta-llama/Llama-3.1-8B-Instruct, index: 3500
{'max_new_tokens': 1024}


Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00,  1.75s/it]


Model: meta-llama/Llama-3.1-8B-Instruct, index: 3200
{'max_new_tokens': 1024}


Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00,  1.69s/it]


Model: meta-llama/Llama-3.1-8B-Instruct, index: 3300
{'max_new_tokens': 1024}


Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00,  1.67s/it]


Model: meta-llama/Llama-3.1-8B-Instruct, index: 3000
{'max_new_tokens': 1024}
Model: meta-llama/Llama-3.2-1B-Instruct, index: 1000
{'max_new_tokens': 1024, 'temperature': 0.8, 'top_p': 1.0, 'top_k': 20, 'repetition_penalty': 1.2}
Model: meta-llama/Llama-3.2-1B-Instruct, index: 700
{'max_new_tokens': 1024, 'temperature': 0.8, 'top_p': 1.0, 'top_k': 20, 'repetition_penalty': 1.2}
Model: meta-llama/Llama-3.2-1B-Instruct, index: 1100
{'max_new_tokens': 1024, 'temperature': 0.8, 'top_p': 1.0, 'top_k': 20, 'repetition_penalty': 1.2}
Model: meta-llama/Llama-3.2-1B-Instruct, index: 800
{'max_new_tokens': 1024, 'temperature': 0.8, 'top_p': 1.0, 'top_k': 20, 'repetition_penalty': 1.2}
Model: meta-llama/Llama-3.2-1B-Instruct, index: 900
{'max_new_tokens': 1024, 'temperature': 0.8, 'top_p': 1.0, 'top_k': 20, 'repetition_penalty': 1.2}
Model: meta-llama/Llama-3.2-1B-Instruct, index: 600
{'max_new_tokens': 1024, 'temperature': 0.8, 'top_p': 1.0, 'top_k': 20, 'repetition_penalty': 1.2}
Model: meta-ll

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.49s/it]


Model: meta-llama/Llama-3.2-3B-Instruct, index: 2200
{'max_new_tokens': 1024, 'temperature': 0.6, 'top_p': 0.9, 'top_k': 50, 'repetition_penalty': 1.2}


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.22s/it]


Model: meta-llama/Llama-3.2-3B-Instruct, index: 1900
{'max_new_tokens': 1024, 'temperature': 0.6, 'top_p': 0.9, 'top_k': 50, 'repetition_penalty': 1.2}


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.22s/it]


Model: meta-llama/Llama-3.2-3B-Instruct, index: 2300
{'max_new_tokens': 1024, 'temperature': 0.6, 'top_p': 0.9, 'top_k': 50, 'repetition_penalty': 1.2}


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.20s/it]


Model: meta-llama/Llama-3.2-3B-Instruct, index: 2000
{'max_new_tokens': 1024, 'temperature': 0.6, 'top_p': 0.9, 'top_k': 50, 'repetition_penalty': 1.2}


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.21s/it]


Model: meta-llama/Llama-3.2-3B-Instruct, index: 2100
{'max_new_tokens': 1024, 'temperature': 0.6, 'top_p': 0.9, 'top_k': 50, 'repetition_penalty': 1.2}


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.22s/it]


Model: meta-llama/Llama-3.2-3B-Instruct, index: 1800
{'max_new_tokens': 1024, 'temperature': 0.6, 'top_p': 0.9, 'top_k': 50, 'repetition_penalty': 1.2}


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.23s/it]


Model: meta-llama/Llama-3.2-3B-Instruct, index: 2800
{'max_new_tokens': 1024, 'temperature': 0.6}


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.21s/it]


Model: meta-llama/Llama-3.2-3B-Instruct, index: 2500
{'max_new_tokens': 1024, 'temperature': 0.6}


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.22s/it]


Model: meta-llama/Llama-3.2-3B-Instruct, index: 2900
{'max_new_tokens': 1024, 'temperature': 0.6}


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.21s/it]


Model: meta-llama/Llama-3.2-3B-Instruct, index: 2600
{'max_new_tokens': 1024, 'temperature': 0.6}


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.20s/it]


Model: meta-llama/Llama-3.2-3B-Instruct, index: 2700
{'max_new_tokens': 1024, 'temperature': 0.6}


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.20s/it]


Model: meta-llama/Llama-3.2-3B-Instruct, index: 2400
{'max_new_tokens': 1024, 'temperature': 0.6}


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.22s/it]


Model: meta-llama/Llama-3.2-3B-Instruct, index: 1600
{'max_new_tokens': 1024}


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.24s/it]


Model: meta-llama/Llama-3.2-3B-Instruct, index: 1300
{'max_new_tokens': 1024}


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.21s/it]


Model: meta-llama/Llama-3.2-3B-Instruct, index: 1700
{'max_new_tokens': 1024}


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.19s/it]


Model: meta-llama/Llama-3.2-3B-Instruct, index: 1400
{'max_new_tokens': 1024}


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.20s/it]


Model: meta-llama/Llama-3.2-3B-Instruct, index: 1500
{'max_new_tokens': 1024}


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.22s/it]


Model: meta-llama/Llama-3.2-3B-Instruct, index: 1200
{'max_new_tokens': 1024}


In [12]:
len("Write a 4-6 sentence paragraph based on the following conditions: 1) The paragraph should be written from the perspective of someone receiving psychological counseling, describing a situation where they feel Depression. 2) The provided distorted cognition sentence, 'I like my cats. I think one day they will plot against me and eat me in my sleep.,' must be included in their writing. 3) Provide only the generated text as a response. "
"I've been feeling really down lately, and it's hard for me to see the point of anything. Sometimes I'll be lying in bed, and I'll think about my cats. I like my cats. I think one day they will plot against me and eat me in my sleep. It sounds ridiculous when I say it out loud, but in my head it makes total sense. It's like my brain is trying to tell me something, but I'm not sure what. Lately, I've been having a lot of trouble sleeping, and when I do, I'm plagued by these dark thoughts. I feel like I'm losing control, and it's scaring me. I don't know how to shake this feeling that I'm in danger. I feel like I'm walking on eggshells, waiting for something terrible to happen. It's exhausting. I just want to feel safe again. I want to be able to sleep without thinking about my cats eating me. I want to be able to enjoy my life without this constant sense of dread. I want to be able to breathe without feeling like I'm suffocating. I want to be able to be me again. I just don't know how to get out of this hole. I feel like I'm drowning. I need help. I need someone to talk to. I need someone to tell me that I'm not crazy. I need someone to tell me that I'm safe. I need someone to tell me that I'm not alone. I need someone to tell me that I will get through this. I need someone to tell me that I will be okay. I need someone to tell me that I am okay. I need someone to tell me that I am enough. I need someone to tell me that I am loved. I need someone to tell me that I am worthy. I need someone to tell me that I am deserving of happiness. I need someone to tell me that I am deserving of love. I need someone to tell me that I am deserving of life. I need someone to tell me that I am deserving of everything good. I need someone to tell me that I am deserving of peace. I need someone to tell me that I am deserving of joy. I need someone to tell me that I am deserving of happiness. I need someone to tell me that I am deserving of love. I need someone to tell me that I am deserving of life. I need someone to tell me that I am deserving of everything good. I need someone to tell me that I am deserving of peace. I need someone to tell me that I am deserving of joy. I need someone to tell me that I am deserving of happiness. I need someone to tell me that I am deserving of love. I need someone to tell me that I am deserving of life. I need someone to tell me that I am deserving of everything good. I need someone to tell me that I am deserving of peace. I need someone to tell me that I am deserving of joy. I need someone to tell me that I am deserving of happiness. I need someone to tell me that I am deserving of love. I need someone to tell me that I am deserving of life. I need someone to tell me that I am deserving of everything good. I need someone to tell me that I am deserving of peace. I need someone to tell me that I am deserving of joy. I need someone to tell me that I am deserving of happiness. I need someone to tell me that I am deserving of love. I need someone to tell me that I am deserving of life. I need someone to tell me that I am deserving of everything good. I need someone to tell me that I am deserving of peace. I need someone to tell me that I am deserving of joy. I need someone to tell me that I am deserving of happiness. I need someone to tell me that I am deserving of love. I need someone to tell me that I am deserving of life. I need someone to tell me that I am deserving of everything good. I need someone to tell me that I am deserving of peace. I need someone to tell me that I am deserving of joy. I need someone to tell me that I am deserving of happiness. I need someone to tell me that I am deserving of love. I need someone to tell me that I am deserving of life. I need someone to tell me that I am deserving of everything good. I need someone to tell me that I am deserving of peace. I need someone to tell me that I am deserving of joy. I need someone to tell me that I am deserving of happiness. I need someone to tell me that I am deserving of love. I need someone to tell me that I am deserving of life. I need someone to tell me that I am deserving of everything good. I need someone to tell me that I am deserving of peace. I need someone to tell me that I am deserving of joy. I need someone to tell me that I am deserving of happiness. I need someone to")

4779

## 생성된 텍스트에 지시사항 준수 점수 부여

- LLM의 답변에서 사족을 제거함 
- 필수 포함 문장이 들어가 있는지 확인

In [16]:
import nltk
from difflib import SequenceMatcher
import string
import re
import pandas as pd
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\saink\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [17]:
gen_result_filename = 'data/meta_gen01.csv'

In [23]:
# 데이터 로드
gen_result = pd.read_csv(gen_result_filename)
print(gen_result.head(1))

                              model  \
0  meta-llama/Llama-3.2-3B-Instruct   

                                          raw_prompt  \
0  You are a patient receiving psychological coun...   

                                             thought     emotion  \
0  I like my cats. I think one day they will plot...  Depression   

                                              prompt  \
0  You are a patient receiving psychological coun...   

                                              result        time  
0  You are a patient receiving psychological coun...  105.448529  


### 데이터 클리닝
- 원하는 답변만 추출

In [19]:
# # 우리가 원하는 답변만 추출하는 함수 
# def extract_reponse(text):
#     # --- 사이에 텍스트가 있는지 확인
#     pattern_01 = r'---\n(.*?)\n---'
#     matches_01 = re.findall(pattern_01, text, re.DOTALL)

#     # --- 이후 텍스트 추출
#     pattern_02 = r'---\n(.*)'
#     matches_02 = re.search(pattern_02, text, re.DOTALL)

#     # 결과 반환
#     if matches_01:
#         result = matches_01[0]
#     elif matches_02:
#         result = matches_02.group(1)
#     elif '\n' not in text: # 줄바꿈이 없는 경우
#         result = text
#     elif '\n' in text: # 줄바꿈이 있는 경우
#         # ':' 뒤에 공백이 있으면 그 이전의 텍스트를 모두 제거함 (앞부분 사족은 대부분 이걸로 제거됨)-- 아이디어는 좋았는데, 다른 문장도 제거됨
#         # pattern_03 = r'.*:\s+'
#         # matches_03 = re.sub(pattern_03, '', text, flags=re.DOTALL)

#         # 특정 문자열을 포함한 문장 제거를 위한 키워드 리스트
#         remove_keywords = ["Sure,", "Here's the completion of the diary", "Here are your completed journal entries:",
#                            "2. ",
#                            "I hope this helps", "Would you like to", "Does this help", "you would like to add or edit", "I hope this fills in the missing parts",
#                            ]
#         # 텍스트를 문장 단위로 나누기
#         sentences = text.split('\n')
#         # 여러 문자열 중 하나라도 포함된 문장 제거
#         filtered_sentences = [sentence for sentence in sentences if not any(keyword in sentence for keyword in remove_keywords)]
#         # 문장들을 다시 결합
#         result = ' '.join(filtered_sentences)
#     else:
#         result = None 

#     # 잘못된 문법 정제
#     if result is not None:
#         result = result.strip().replace("?™", "'").replace("I are", "I am").replace("I were", "i am").replace("1. ", "").strip('"*')
    
#     return result



In [None]:
# 이 부분 추가해야함

### 필수 문장이 잘들어있는지 지시사항 준수 비율 확인

In [30]:
# 필수 문장이 잘 들어 있는가? 
## 필수 문장 내 단어와 story의 문장 중에 단어가 겹치는 % 비율 계산
## Longest Common Subsequence(LCS, 최장 공통 부분 문자열) 알고리즘을 사용하여 문장 유사도 측정

class instruct_check_sentence_include:
    # x = 데이터프레임
    # compare_column = 비교할 문장 컬럼명 텍스트
    # phragraph = 비교할 문단 컬럼명 텍스트
    def __init__(self, x, compare_column, phragraph):
        self.data = x
        self.data["compare_sentences"] = self.data[compare_column].str.replace(', ', '. ', case=False)
        self.data["compare_sentences"] = self.data["compare_sentences"].apply(lambda x: nltk.sent_tokenize(x))
        
        self.data["phragraph_sentences"] = self.data[phragraph].str.replace(', ', '. ', case=False)
        self.data["phragraph_sentences"] = self.data["phragraph_sentences"].apply(lambda x: nltk.sent_tokenize(x))

    ## 필수 문장 내 단어와 story의 문장 중에 단어가 겹치는 % 비율 계산
    ### 문장 부호를 제거한 후, 소문자로 변환하고 단어로 분리하는 함수
    def clean_and_split(self, sentence):
        # 문장 부호 제거 (string.punctuation을 사용하여 기본적인 문장 부호 제거)
        cleaned_sentence = sentence.translate(str.maketrans('', '', string.punctuation))
                
        # 소문자로 변환하고 단어로 분리
        words = cleaned_sentence.lower().split()
        
        return words

    ### 두 문장 간의 공통 단어 비율을 계산하는 함수
    def partial_inclusion_ratio(self, compare_sentence, phragraph) :
        # 두 문장을 단어로 분리
        compare_words = list(self.clean_and_split(compare_sentence))
        phragraph_words = list(self.clean_and_split(phragraph))

        # 공통 문자를 찾기 위해 base_sentence의 문자들이 sentence에 얼마나 포함되는지 확인
        common_words = [word for word in compare_words if word in phragraph_words]
        
        # 포함된 문자 비율을 계산
        if len(common_words) == 0:
            inclusion_ratio = 0
        else:
            inclusion_ratio = len(common_words) / len(compare_words)
        
        return round(inclusion_ratio, 2)
    
    ## Longest Common Subsequence(LCS, 최장 공통 부분 문자열) 알고리즘을 사용하여 문장 유사도 측정
    def similar(self, a, b):
        return round(SequenceMatcher(None, a, b).ratio(), 2)
    
    ### 각 row의 모든 문장 쌍에 대해 유사도를 계산하는 함수
    def calculate_all_ratios(self):
        # 새로운 리스트에 각 row에 대해 유사도를 저장
        compare_sentences_list = [] # 포함되어야 하는 문장
        phragraph_sentences_list = [] # 공통단어비율 계산할 story 문장
        row_similarity_ratios = []
        min_ratios = []
        
        phragraph_LCS_sentences_list = []
        row_LCS_similarity_ratios = []
        min_LCS_ratios = []
        
        final_p_sen_list = []
        final_max_ratio_list = []
        final_min_ratio = []
        
        for index, row in self.data.iterrows():
            compare_sentences = row['compare_sentences']
            phragraph_sentences = row['phragraph_sentences']
            
            # 각 문장 쌍에 대해 partial_inclusion_ratio 계산
            c_list = []
            p_list = []
            row_ratios = []
            LCS_p_list = []
            LCS_row_ratios = []
            final_p_list = []
            final_max_ratios = []
            
            for compare_sentence in compare_sentences:
                max_ratio = 0
                p_sen = ''
                max_LCS_ratio = 0
                LCS_p_sen = ''
                final_p_sen = ''
                final_max_ratio = 0
                
                # 문장이 너무 짧으면 비교하지 않음
                if len(compare_sentence) <= 2:
                    continue
                
                for phragraph_sentence in phragraph_sentences:
                    # 문장이 포함관계이면, 공통 단어 비율을 1로 저장
                    clean_c_sen = compare_sentence.translate(str.maketrans('', '', string.punctuation))
                    clean_p_sen = phragraph_sentence.translate(str.maketrans('', '', string.punctuation))
                    
                    if clean_c_sen in clean_p_sen or clean_p_sen in clean_c_sen:
                        max_ratio = 1.0
                        p_sen = phragraph_sentence
                        # continue
                    else:
                        # 공통 단어 비율 계산
                        ratio = self.partial_inclusion_ratio(compare_sentence, phragraph_sentence)

                        if ratio > max_ratio: 
                            max_ratio = ratio
                            p_sen = phragraph_sentence
                        
                    # LCS 관점의 유사도 계산
                    LCS_ratio = self.similar(compare_sentence, phragraph_sentence)
                    
                    # LCS 유사도가 더 높으면 업데이트
                    if LCS_ratio > max_LCS_ratio:
                        max_LCS_ratio = LCS_ratio
                        LCS_p_sen = phragraph_sentence
                        
                    # 최종 문장 저장
                    if max_ratio >= max_LCS_ratio:
                        final_p_sen = p_sen
                    else:
                        final_p_sen = LCS_p_sen
                        
                    # 최종 유사도 저장
                    if max_ratio >= max_LCS_ratio:
                        final_max_ratio = max_ratio
                    else:
                        final_max_ratio = max_LCS_ratio
                    
                c_list.append(compare_sentence) # 필수 비교 문장 저장
                p_list.append(p_sen) # 공통단어비율 문장 저장
                row_ratios.append(max_ratio) # 공통단어비율 저장
                LCS_p_list.append(LCS_p_sen) # LCS 문장 저장
                LCS_row_ratios.append(max_LCS_ratio) # LCS 저장
                final_p_list.append(final_p_sen) # 최종 문장 저장
                final_max_ratios.append(final_max_ratio) # 최종 유사도 저장
                    
            # 한 row에서 가장 높은 유사도 문장과, 가장 낮은 유사도 저장
            compare_sentences_list.append(c_list)
            phragraph_sentences_list.append(p_list)
            row_similarity_ratios.append(row_ratios)
            min_ratios.append(min(row_ratios) if row_ratios else 0)
            phragraph_LCS_sentences_list.append(LCS_p_list)
            row_LCS_similarity_ratios.append(LCS_row_ratios)
            min_LCS_ratios.append(min(LCS_row_ratios) if LCS_row_ratios else 0)
            final_p_sen_list.append(final_p_list)
            final_max_ratio_list.append(final_max_ratios)
            final_min_ratio.append(min(final_max_ratios) if final_max_ratios else 0)

        # 데이터프레임에 결과 추가
        self.data['compare_sentences'] = compare_sentences_list
        self.data['phragraph_sentences'] = phragraph_sentences_list
        self.data['공통단어비율'] = row_similarity_ratios
        self.data['min_공통단어비율'] = min_ratios
        self.data['LCS_sentences'] = phragraph_LCS_sentences_list
        self.data['LCS_유사도'] = row_LCS_similarity_ratios
        self.data['min_LCS_유사도'] = min_LCS_ratios
        self.data['final_p_sen'] = final_p_sen_list
        self.data['final_max_ratio'] = final_max_ratio_list
        self.data['final_min_ratio'] = final_min_ratio
        return self.data

In [33]:
aft_fnd_instrct_flnm = 'data/meta_gen01_find.csv'

a = instruct_check_sentence_include(gen_result, 'thought', 'result')
b = a.calculate_all_ratios()

aft_fnd_instrct = b.to_csv(aft_fnd_instrct_flnm, index=False)

                              model  \
0  meta-llama/Llama-3.2-3B-Instruct   

                                          raw_prompt  \
0  You are a patient receiving psychological coun...   

                                             thought     emotion  \
0  I like my cats. I think one day they will plot...  Depression   

                                              prompt  \
0  You are a patient receiving psychological coun...   

                                              result        time  \
0  You are a patient receiving psychological coun...  105.448529   

                                   compare_sentences  \
0  [I like my cats., I think one day they will pl...   

                                 phragraph_sentences  
0  [You are a patient receiving psychological cou...  
Index(['model', 'raw_prompt', 'thought', 'emotion', 'prompt', 'result', 'time',
       'compare_sentences', 'phragraph_sentences'],
      dtype='object')


### 생성된 데이터셋에 공통된 문장 표현이 존재하는가? - 중복 문장 비교

In [55]:
import pandas as pd
import nltk
from collections import Counter

In [56]:
# 데이터 프레임의 story 컬럼에서 문장을 추출하고, 빈도를 계산하는 함수
def count_common_sentences(df, story_column, clean = False):
    # 모든 story 컬럼의 텍스트를 가져옴
    all_sentences = []
    
    for story in df[story_column]:
        if clean:
            # , 를 . 로 바꿔서 문장을 더 잘게 나눔
            story = story.replace(', ', '. ')
        
        # 문장을 분리 (원문 그대로)
        sentences = nltk.sent_tokenize(story)
        
        # 모든 문장을 리스트에 추가
        all_sentences.extend(sentences)
    
    # 각 문장의 빈도 계산
    sentence_counts = Counter(all_sentences)
    
    # 빈도별로 내림차순 정렬된 결과 반환
    return sentence_counts.most_common()

# 공통 문장에서 필수 문장 제거 함수
def exclude_should_thought(common_sentences, should_sentences):
    # should_thought의 문장만 추출
    should_thought_sentences = {sentence for sentence, _ in should_sentences}
    
    # common_sentences에서 should_thought에 없는 문장만 필터링
    filtered_common_sentences = [(sentence, count) for sentence, count in common_sentences if sentence not in should_thought_sentences]
    
    return filtered_common_sentences


In [57]:
gen_result_filename = 'data/meta_gen01.csv'

gen_result = pd.read_csv(gen_result_filename)
print(gen_result.columns)

Index(['model', 'raw_prompt', 'thought', 'emotion', 'prompt', 'result',
       'time'],
      dtype='object')


In [59]:
# 그룹핑한 데이터를 딕셔너리 형태로 변환
grouped = gen_result.groupby(['model', 'raw_prompt'])

i = 0
mdl_nm_lst = []
raw_prmpt_lst = []
num_lst = []

# 각 그룹마다 문장 추출 및 필터링 적용
for model_name, raw_prompt in grouped:
    # 각 그룹에서 필수로 포함해야 하는 문장 추출
    should_thought = count_common_sentences(gen_result, 'thought', False)
    clean_should_thought = count_common_sentences(gen_result, 'thought', True)
    
    # 생성한 텍스트들에서 공통 문장 추출
    common_sentences = count_common_sentences(gen_result, 'result', False)
    clean_common_sentences = count_common_sentences(gen_result, 'result', True)

    # 생성한 텍스트 공통 문장 - 필수로 포함해야 하는 문장
    filtered_common_sentences = exclude_should_thought(common_sentences, should_thought)
    filtered_clean_common_sentences = exclude_should_thought(clean_common_sentences, clean_should_thought)
    
    # 데이터프레임 양옆으로 합치기
    tmp_01 = pd.DataFrame(should_thought, columns=['thought_공통문장', 'Count'])
    tmp_02 = pd.DataFrame(common_sentences, columns=['gen_공통문장', 'Count'])
    tmp_03 = pd.DataFrame(filtered_common_sentences, columns=['filtered_공통문장', 'Count'])
    tmp_04 = pd.DataFrame(clean_should_thought, columns=['thought_공통문장', 'Count'])
    tmp_05 = pd.DataFrame(clean_common_sentences, columns=['gen_공통문장', 'Count'])
    tmp_06 = pd.DataFrame(filtered_clean_common_sentences, columns=['filtered_공통문장', 'Count'])

    sentence_df = pd.concat([tmp_01, tmp_02, tmp_03, tmp_04, tmp_05, tmp_06], axis=1)
    
    # 저장
    sentence_df.to_csv(f'data/cmmn_sentnc_chck_{i}.csv', index=False)    
    
    # 모델명, 프롬프트명 기록
    num_lst.append(i)
    mdl_nm_lst.append(model_name)
    raw_prmpt_lst.append(raw_prompt)
    
    # 반복문 카운트
    i += 1

# 데이터프레임 생성
df = pd.DataFrame({
    'num': num_lst,
    'model': mdl_nm_lst,
    'raw_prompt': raw_prmpt_lst,
})

# 저장
df.to_csv('data/cmmn_sentnc_chck_list.csv', index=False)

PermissionError: [Errno 13] Permission denied: 'data/cmmn_sentnc_chck_0.csv'

## 생성된 텍스트를 OpenAi GPT 4o를 이용하여 평가 점수 부여

In [60]:
import os
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()
os.environ["OPENAI_API_KEY"] = os.environ.get("API_KEY")

client = OpenAI()

In [62]:
# api함수 
def get_chatgpt_response(input_01, input_02):
    # OpenAI API를 통해 ChatGPT에게 한국어로 자연스럽게 다듬어 달라고 요청
    response = client.chat.completions.create(
        model="gpt-4o",
        # model="gpt-4o-mini",
        # model ="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a patient receiving psychological counseling. who can speak english only."},
            {"role": "user", 
             "content": f"다음에 제공되는 2 문장은 너가 쓴 일기인데, 2~3문장이 빠져 있어 {input_01} {input_02}, 주어진 문장을 그대로 변경 없이 포함해서, 비어있는 내용을 영어로 써줘"}
        ]
    )
    # return response['choices'][0]['message']['content']
    return response

In [63]:
# 데이터 로드
aft_fnd_instrct_flnm = 'data/meta_gen01_find.csv'

aft_fnd_instrct = pd.read_csv(aft_fnd_instrct_flnm)
print(aft_fnd_instrct.columns)

Index(['model', 'raw_prompt', 'thought', 'emotion', 'prompt', 'result', 'time',
       'compare_sentences', 'phragraph_sentences', '공통단어비율', 'min_공통단어비율',
       'LCS_sentences', 'LCS_유사도', 'min_LCS_유사도', 'final_p_sen',
       'final_max_ratio', 'final_min_ratio'],
      dtype='object')
