# 문장 유사도를 여러 방식으로 비교

### 발생한 이슈
- 같은 텍스트를 넣었는데, openai api에서 포함해야 하는 텍스트를 제외하는 일이 발생함 -> 문장유사도를 판단하여, 유사한 문장을 포함하지 않으면 제외하는 방식 사용

In [1]:
import pandas as pd
import nltk
from difflib import SequenceMatcher

# NLTK 패키지에서 Punkt tokenizer를 다운로드 (한 번만 실행)
# nltk.download('punkt')

### 레벤슈타인 거리 방식으로 유사도를 판단

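**레벤슈타인 거리**(Levenshtein distance)는 두 문자열 사이의 차이를 측정하는 방법 중 하나로, 한 문자열을 다른 문자열로 변환하는 데 필요한 편집 작업의 최소 개수를 계산합니다. 이 편집 작업에는 삽입, 삭제, 대체가 포함됩니다. 단, 아래 코드에서 실제로 사용하는 `difflib.SequenceMatcher.ratio()`는 레벤슈타인 거리 자체가 아니라, 두 문자열에서 일치하는 블록을 찾아 `2*M/T`(M: 일치한 문자 수, T: 두 문자열 길이의 합)로 계산하는 0~1 사이의 유사도 값입니다.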

- 아래에서 사용된 데이터는 2개 문장을 이용하여 생성한 것으로, 랜덤 샘플링하여 확인했을때 잘 만들었다고 느껴졌었음
- 문장 유사도가 어느정도 되어야 하는지 확인하는데 사용함
- 결론 : 유사도가 0.6 정도는 돼야 쓸만함 -> 0.6 미만인 문장이 존재한다면 필터링 할 것

In [2]:
# Sentence similarity comparison
def similar(a, b):
    """Return the ``difflib`` similarity ratio between ``a`` and ``b``.

    The value is a float in [0, 1]; 1.0 means the sequences are identical.
    No junk filter is installed (``isjunk`` is ``None``).
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

# Compare one of the "sentences that must be included" against every sentence
# of the generated long text and return the single most similar sentence.
def find_similar_sentences(base_sentence, long_text, threshold=0):
    """Find the sentence of ``long_text`` that is most similar to ``base_sentence``.

    Args:
        base_sentence: The reference sentence to look for.
        long_text: Text that is split into sentences with ``nltk.sent_tokenize``.
        threshold: Minimum similarity a sentence must reach to be accepted.

    Returns:
        A tuple ``(base_sentence, best_sentence, best_similarity)``. When no
        sentence has a similarity that is both greater than 0 and at least
        ``threshold``, the tuple ``("", "", 0)`` is returned instead.
    """
    no_match = ("", "", 0)

    # Split the long text into sentences.
    candidates = nltk.sent_tokenize(long_text)
    if not candidates:
        return no_match

    scores = [similar(base_sentence, candidate) for candidate in candidates]
    top_score = max(scores)

    # The best score must clear the threshold and be strictly positive.
    if not (top_score >= threshold and top_score > 0):
        return no_match

    # list.index returns the first occurrence, so ties keep the earliest sentence.
    return base_sentence, candidates[scores.index(top_score)], top_score

In [3]:
# Load the cleaned/refined CSV (path is relative to the working directory).
raw = pd.read_csv("data/c2d2_refined_0_500_cleaned.csv")
# Preview the first rows; as the last expression of the cell it is displayed.
raw.head()

Unnamed: 0,Num,Scenario,Thought,Label,Refined_Thought,Cleaned_Refined_Thought
0,1,"I'm an introverted person, and I've just arriv...",Are the people in this environment unfriendly?,Overgeneralization,"Sure, I can help with that. Here's a possible ...","I'm an introverted person, and I've just arriv..."
1,2,"Recently, I feel dizzy sometimes when I stand ...",I'm so dizzy. Am I sick? I should probably go ...,No Distortion,"Recently, I feel dizzy sometimes when I stand ...","Recently, I feel dizzy sometimes when I stand ..."
2,3,"I'm walking down the street and feel hungry, b...","I'm tired and there's no place to rest, I'm hu...",Overgeneralization,"I'm walking down the street and feel hungry, b...","I'm walking down the street and feel hungry, b..."
3,4,"Work has been busy lately, but I have caught a...",Why did I catch a cold at this time? I feel so...,No Distortion,"Work has been busy lately, but I have caught a...","Work has been busy lately, but I have caught a..."
4,5,My mom and I are discussing future plans. She ...,"Mom is trying to control my life again, wantin...",Fortune-telling,My mom and I are discussing future plans. She ...,My mom and I are discussing future plans. She ...


In [4]:
# Split the scenario and thought texts into sentence lists with NLTK.
# Disabled alternative (naive split on '. '):
# raw['Scenario_Sentences'] = raw['Scenario'].apply(lambda x: x.split('. '))
raw['Scenario_Sentences'] = raw['Scenario'].apply(nltk.sent_tokenize)
raw['Thought_Sentences'] = raw['Thought'].apply(nltk.sent_tokenize)

# Element-wise list concatenation: scenario sentences followed by thought sentences.
raw['all_Sentences'] = raw['Scenario_Sentences'] + raw['Thought_Sentences']
raw['all_Sentences']

0      [I'm an introverted person, and I've just arri...
1      [Recently, I feel dizzy sometimes when I stand...
2      [I'm walking down the street and feel hungry, ...
3      [Work has been busy lately, but I have caught ...
4      [My mom and I are discussing future plans., Sh...
                             ...                        
495    [Ir parents are dissatisfied with I because I ...
496    [When someone says I look unhappy, it's as if ...
497    [I are pushed to the point of explosion by Ir ...
498    [I were preparing to run for president of the ...
499    [The takeaway order was stolen., Who is so wic...
Name: all_Sentences, Length: 500, dtype: object

In [5]:
# Inspect the sentence split of one sample row.
# Each print is its own statement: joining them with commas builds a tuple of
# None values, which the notebook then echoes as `(None, None, None)`.
print(raw.iloc[2]['Scenario_Sentences'])
print(raw.iloc[2]['Thought_Sentences'])
print(raw.iloc[2]['all_Sentences'])

["I'm walking down the street and feel hungry, but there's no restaurant around."]
["I'm tired and there's no place to rest, I'm hungry and there's no place to eat.", 'Why do unfortunate things always happen to me?']
["I'm walking down the street and feel hungry, but there's no restaurant around.", "I'm tired and there's no place to rest, I'm hungry and there's no place to eat.", 'Why do unfortunate things always happen to me?']


(None, None, None)

In [6]:
# NOTE: disabled procedural draft; the same steps are implemented by
# SentenceSimilarity.calculate_similarity in the following cell.
# # Create the DataFrames that will hold the results
# result_df = pd.DataFrame()
# result_df_2 = pd.DataFrame()

# # For each row, compare the text in column a with the sentences in column b
# for index, row in raw.iterrows():
    
#     base_list = []
#     long_list = []
#     similarity_list = []
    
#     long_text = row['Cleaned_Refined_Thought']
    
#     for base_text in row['all_Sentences']:
#         # print(base_text)
#         base, long, similarity = find_similar_sentences(base_text, long_text)
    
#         base_list.append(base)
#         long_list.append(long)
#         similarity_list.append(similarity)
    
#     # Compute the mean when there are similarities, otherwise use 0
#     if similarity_list:
#         average_similarity = sum(similarity_list) / len(similarity_list)
#     else:
#         average_similarity = 0
        
#     # Build a DataFrame for comparison
#     result = pd.DataFrame({
#         'base': [base_list],
#         'long': [long_list],
#         'similarity': [similarity_list],
#         'average_similarity': average_similarity
#     })
    
#     result_df = pd.concat([result_df, result], ignore_index=True)
    
#     # Build a DataFrame for comparison (version that makes threshold checks easier)
#     result_2 = pd.DataFrame({
#         'base': base_list,
#         'long': long_list,
#         'similarity': similarity_list,
#         'average_similarity': average_similarity
#     })
    
#     result_df_2 = pd.concat([result_df_2, result_2], ignore_index=True)


In [18]:
class SentenceSimilarity:
    """Match each row's required sentences against that row's generated text.

    For every row of ``raw_data`` the sentences listed in the
    ``'all_Sentences'`` column are compared with the text stored in the column
    named by ``story_text_col_nm``. Two result tables are produced:

    * ``result_df``: one row per input row; ``base``, ``long`` and
      ``similarity`` each hold a list with one entry per required sentence.
    * ``result_df_2``: one row per required sentence (easier to compare
      against a threshold).

    Both tables also carry ``average_similarity``, the mean similarity of the
    input row (0 when the row has no sentences).
    """

    def __init__(self, raw_data, story_text_col_nm):
        """Store the input frame and the name of the generated-text column.

        Args:
            raw_data: DataFrame with an ``'all_Sentences'`` column (an iterable
                of sentences per row) and the column ``story_text_col_nm``.
            story_text_col_nm: Name of the column holding the generated text.
        """
        self.raw = raw_data
        self.result_df = pd.DataFrame()
        self.result_df_2 = pd.DataFrame()
        # Kept as an attribute so the matching function can be swapped out.
        self.find_similar_sentences = find_similar_sentences
        self.story_text_col_nm = story_text_col_nm

    def calculate_similarity(self):
        """Compute both result tables and store them on the instance.

        The tables are rebuilt from scratch on every call, so calling this
        method repeatedly does not duplicate rows.
        """
        columns = ['base', 'long', 'similarity', 'average_similarity']
        # Rows are collected in plain lists and turned into DataFrames once at
        # the end; concatenating inside the loop copies the accumulated frame
        # on every iteration.
        per_row_records = []
        per_sentence_records = []

        for _, row in self.raw.iterrows():
            long_text = row[self.story_text_col_nm]

            base_list = []
            long_list = []
            similarity_list = []
            for base_text in row['all_Sentences']:
                base, long, similarity = self.find_similar_sentences(base_text, long_text)
                base_list.append(base)
                long_list.append(long)
                similarity_list.append(similarity)

            # Mean similarity of the row, or 0 when there is nothing to average.
            if similarity_list:
                average_similarity = sum(similarity_list) / len(similarity_list)
            else:
                average_similarity = 0

            # result_df: the whole lists are stored in single cells.
            per_row_records.append({
                'base': base_list,
                'long': long_list,
                'similarity': similarity_list,
                'average_similarity': average_similarity,
            })

            # result_df_2: one record per sentence, for easy threshold checks.
            per_sentence_records.extend(
                (base, long, similarity, average_similarity)
                for base, long, similarity in zip(base_list, long_list, similarity_list)
            )

        self.result_df = pd.DataFrame(per_row_records, columns=columns)
        self.result_df_2 = pd.DataFrame(per_sentence_records, columns=columns)

    def get_results(self):
        """Return ``(result_df, result_df_2)`` as last computed."""
        return self.result_df, self.result_df_2


In [11]:
# 클래스 인스턴스 생성
similarity_checker = SentenceSimilarity(raw, 'Cleaned_Refined_Thought')

# 유사도 계산 실행
similarity_checker.calculate_similarity()

# 결과 가져오기
result_df, result_df_2 = similarity_checker.get_results()

In [13]:
# Save both result tables
result_df.to_csv('data/Sentence_similarity_comparison.csv', index=False)
result_df_2.to_csv('data/Sentence_similarity_comparison_easy.csv', index=False)

# Add the results to the raw DataFrame
# NOTE(review): column assignment aligns on the index; result_df is built with
# ignore_index=True, so this presumably relies on raw having a default 0..n-1 index.
raw['base'] = result_df['base']
raw['long'] = result_df['long']
raw['similarity'] = result_df['similarity']
raw['average_similarity'] = result_df['average_similarity']

# Save the input data together with the similarity columns
raw.to_csv('data/c2d2_0_500_similarity.csv', index=False)

print(result_df)

                                                  base  \
0    [I'm an introverted person, and I've just arri...   
1    [Recently, I feel dizzy sometimes when I stand...   
2    [I'm walking down the street and feel hungry, ...   
3    [Work has been busy lately, but I have caught ...   
4    [My mom and I are discussing future plans., Sh...   
..                                                 ...   
495  [Ir parents are dissatisfied with I because I ...   
496  [When someone says I look unhappy, it's as if ...   
497  [I are pushed to the point of explosion by Ir ...   
498  [I were preparing to run for president of the ...   
499  [The takeaway order was stolen., Who is so wic...   

                                                  long  \
0    [I'm an introverted person, and I've just arri...   
1    [Recently, I feel dizzy sometimes when I stand...   
2    [I'm walking down the street and feel hungry, ...   
3    [Work has been busy lately, but I have caught ...   
4    [My mom 

### meta를 이용해 생성한 데이터도 확인
- 잘 모르겠다!

In [14]:
# Load the second dataset; this rebinds `raw`, replacing the frame loaded earlier.
raw = pd.read_csv("data/meta_refined01.csv")
# Preview the first rows; as the last expression of the cell it is displayed.
raw.head()

Unnamed: 0,persona,pattern,pattern_def,thought,scenario,persona_in_scenario,thought_in_scenario,Refined_Thought
0,i have a pencil thin mustache . i have six tat...,Catastrophizing,Catastrophizing: Giving greater weight to the ...,I like my cats. I think one day they will plot...,i have a pencil thin mustache . i have six tat...,True,True,"Sure, I can help you with that. Here's a recon..."
1,i like visiting art museums in my spare time ....,Overgeneralization,Someone who overgeneralizes makes faulty gener...,"I'm a vegan, and the restaurant served me a di...",i like visiting art museums in my spare time ....,True,True,"Sure, I understand. Here is the completed diar..."
2,i like to train dogs . i like to make cookies ...,Jumping to conclusions: mind reading,Inferring a person's possible or probable (usu...,The cashier at the bagel store messed up my or...,i like to train dogs . i like to make cookies ...,True,True,"Sure, here is a possible continuation of your ..."
3,i like to eat tune . i've two cats . i like to...,Black-and-white or polarized thinking / All or...,Looking at life in all-or-nothing categories. ...,One of my cats is sick so I'll never adopt ano...,i like to eat tune . i've two cats . i like to...,True,True,"Sure, here's the diary entry with additional s..."
4,i like visiting art museums in my spare time ....,Jumping to conclusions: Fortune-telling,Predicting outcomes (usually negative) of events.,My new boyfriend is going to dump me once they...,i like visiting art museums in my spare time ....,True,True,My new boyfriend is going to dump me once they...


In [16]:
# Split each 'thought' text into a list of sentences with NLTK.
raw['all_Sentences'] = raw['thought'].apply(nltk.sent_tokenize)
raw['all_Sentences']

0       [I like my cats., I think one day they will pl...
1       [I'm a vegan, and the restaurant served me a d...
2       [The cashier at the bagel store messed up my o...
3       [One of my cats is sick so I'll never adopt an...
4       [My new boyfriend is going to dump me once the...
                              ...                        
1721    [I have an appointment with Jim's dog Sparky t...
1722    [My performance in the church choir was praise...
1723    [I hiked in the past on the Appalachian trials...
1724    [I have four siblings and one of them was mean...
1725    [I found a new spot to get hamburgers., I'm th...
Name: all_Sentences, Length: 1726, dtype: object

In [19]:
# Create the class instance; 'Refined_Thought' is the column whose text is
# compared against each row's 'all_Sentences'
similarity_checker = SentenceSimilarity(raw, 'Refined_Thought')

# Run the similarity calculation
similarity_checker.calculate_similarity()

# Fetch the results (per-row table and per-sentence table)
result_df, result_df_2 = similarity_checker.get_results()

In [20]:
# Save both result tables
result_df.to_csv('data/meta_Sentence_similarity_comparison.csv', index=False)
# File name fixed from the misspelled 'meat_' prefix so it matches the other
# 'meta_' outputs; update any reader that still expects the old name.
result_df_2.to_csv('data/meta_Sentence_similarity_comparison_easy.csv', index=False)

# Add the results to the raw DataFrame
raw['base'] = result_df['base']
raw['long'] = result_df['long']
raw['similarity'] = result_df['similarity']
raw['average_similarity'] = result_df['average_similarity']

# Save the input data together with the similarity columns
raw.to_csv('data/meta_similarity.csv', index=False)

print(result_df)

                                                   base  \
0     [I like my cats., I think one day they will pl...   
1     [I'm a vegan, and the restaurant served me a d...   
2     [The cashier at the bagel store messed up my o...   
3     [One of my cats is sick so I'll never adopt an...   
4     [My new boyfriend is going to dump me once the...   
...                                                 ...   
1721  [I have an appointment with Jim's dog Sparky t...   
1722  [My performance in the church choir was praise...   
1723  [I hiked in the past on the Appalachian trials...   
1724  [I have four siblings and one of them was mean...   
1725  [I found a new spot to get hamburgers., I'm th...   

                                                   long  \
0     [Here's a reconstructed version of the diary e...   
1     [I'm a vegan, and the restaurant served me a d...   
2     [The cashier at the bagel store messed up my o...   
3     [One of my cats is sick so I'll never adopt an...