In [2]:
import openai
import json
import requests
from openai import OpenAI

import pandas as pd
import glob,os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import re

import warnings
warnings.filterwarnings("ignore")

RANDOM_STATE = 142

In [111]:
def split_by_pages(filepath, encoding='utf-8', page_pattern=None):
    """
    Split an OCR result text file into one string per page.

    A new page starts at every line that matches ``page_pattern`` at its
    beginning. Lines keep their original newline characters and are
    concatenated without any extra separator, so every page (not just the
    last one) is reproduced exactly as it appears in the file.

    filepath : path to the OCR result txt file
    encoding : text encoding used to read the file
    page_pattern : regex (string or compiled pattern) marking the first line
        of a page; when None, the module-level PAGE_PATTERN is used

    return : list of page strings. Text before the first page marker becomes
        its own leading entry, and an empty file yields [''].
    """
    if page_pattern is None:
        # Fall back to the module-level constant the original relied on.
        page_pattern = PAGE_PATTERN
    # Compile once instead of once per line (re.compile accepts an already
    # compiled pattern and returns it unchanged).
    page_start = re.compile(page_pattern)

    pages = []
    current_page = []
    with open(filepath, 'r', encoding=encoding) as f:
        for line in f:
            if page_start.match(line) and current_page:
                # A new page begins: flush what was collected so far.
                pages.append(''.join(current_page))
                current_page = []
            current_page.append(line)
    # Flush the final page (there is no trailing marker to trigger it).
    pages.append(''.join(current_page))
    return pages

def erase_tag(text, tag):
    """
    Strip the opening ``<tag>`` and closing ``</tag>`` markers, keeping the
    text that was between them.

    text : a single string, or a list of strings (e.g. the pages returned by
        split_by_pages)
    tag : tag name to strip; it is interpolated into a regex as-is, so regex
        syntax such as ``p.\\d*`` can be used to match a family of tags

    return : the cleaned string, or a list of cleaned strings when a list
        was passed in
    """
    marker = re.compile(f'<{tag}>|</{tag}>')
    if not isinstance(text, list):
        return marker.sub('', text)
    return [marker.sub('', chunk) for chunk in text]

def extract_text_between_tag(text, tag):
    """
    Collect every piece of text enclosed by ``<tag>`` ... ``</tag>``.

    text : string to search (a list is not accepted by the regex search)
    tag : tag name; it is interpolated into the regex as-is

    return : list of the enclosed texts in order of appearance; the match is
        non-greedy and may span multiple lines
    """
    # DOTALL lets '.' cross newlines so multi-line tag contents are captured.
    finder = re.compile(f'<{tag}>(.*?)</{tag}>', re.DOTALL)
    return finder.findall(text)

def extract_numbers_from_string(text):
    """Return every run of consecutive digits in ``text`` as a list of strings."""
    # \d+ matches one or more digits
    return [found.group() for found in re.finditer(r'\d+', text)]

def remove_xml_tags(text, tag):
    """
    Remove every ``<tag>...</tag>`` element, including the text between the
    opening and closing tag, from the given string.

    :param text: input string to clean
    :param tag: tag name to remove; it is interpolated into the regex as-is
    :return: the string with all matching elements removed
    """
    # Non-greedy so each opening tag pairs with the nearest closing tag.
    # DOTALL lets '.' cross newlines; without it an element whose content
    # spans several lines would be left in place.
    return re.sub(rf'<{tag}>.*?</{tag}>', '', text, flags=re.DOTALL)

In [107]:
class GPT():
    """
    Thin wrapper around the OpenAI client offering chat completion,
    embeddings and G-Eval style scoring of summaries.

    The G-Eval prompt template and the per-metric criteria / steps texts are
    stored as instance attributes by ``__init__``.
    """
    __classname__ = "OpenAI"
    api_key = ''
    client = None
    # First run of digits in a model reply is taken as the score.
    _SCORE_PATTERN = re.compile(r'\d+')

    def __init__(self, api_filepath):
        """
        api_filepath : path to a JSON file containing the key
            'OPENAI_API_KEY'
        """
        with open(api_filepath,'r') as f:
            ak = json.load(f)
        self.api_key = ak['OPENAI_API_KEY']
        self.client = OpenAI(api_key=self.api_key)
        self.EVALUATION_PROMPT_TEMPLATE = """
You will be given one summary written for an article. Your task is to rate the summary on one metric.
Please make sure you read and understand these instructions very carefully. 
Please keep this document open while reviewing, and refer to it as needed.

Evaluation Criteria:

{criteria}

Evaluation Steps:

{steps}

Source Text:

{document}

Summary:

{summary}

Evaluation Form : 'INTEGER SCORE ONLY'

"""
        # A "- {metric_name}" line is intentionally not part of the template;
        # get_geval_score still passes metric_name so it can be re-enabled.
        self.RELEVANCY_SCORE_CRITERIA = """
Relevance(1-5) - selection of important content from the source. \
The summary should include only important information from the source document. \
Annotators were instructed to penalize summaries which contained redundancies and excess information.
"""
        self.RELEVANCY_SCORE_STEPS = """
1. Read the summary and the source document carefully.
2. Compare the summary to the source document and identify the main points of the article.
3. Assess how well the summary covers the main points of the article, and how much irrelevant or redundant information it contains.
4. Assign a relevance score from 1 to 5.
"""
        # The line continuation after "to a" needs the trailing space,
        # otherwise the prompt reads "to acoherent body".
        self.COHERENCE_SCORE_CRITERIA = """
Coherence(1-5) - the collective quality of all sentences. \
We align this dimension with the DUC quality question of structure and coherence \
whereby "the summary should be well-structured and well-organized. \
The summary should not just be a heap of related information, but should build from sentence to a \
coherent body of information about a topic."
"""
        self.COHERENCE_SCORE_STEPS = """
1. Read the article carefully and identify the main topic and key points.
2. Read the summary and compare it to the article. Check if the summary covers the main topic and key points of the article,
and if it presents them in a clear and logical order.
3. Assign a score for coherence on a scale of 1 to 5, where 1 is the lowest and 5 is the highest based on the Evaluation Criteria.
"""
        self.CONSISTENCY_SCORE_CRITERIA = """
Consistency(1-5) - the factual alignment between the summary and the summarized source. \
A factually consistent summary contains only statements that are entailed by the source document. \
Annotators were also asked to penalize summaries that contained hallucinated facts.
"""
        self.CONSISTENCY_SCORE_STEPS = """
1. Read the article carefully and identify the main facts and details it presents.
2. Read the summary and compare it to the article. Check if the summary contains any factual errors that are not supported by the article.
3. Assign a score for consistency based on the Evaluation Criteria.
"""
        self.FLUENCY_SCORE_CRITERIA = """
Fluency(1-3): the quality of the summary in terms of grammar, spelling, punctuation, word choice, and sentence structure.
1: Poor. The summary has many errors that make it hard to understand or sound unnatural.
2: Fair. The summary has some errors that affect the clarity or smoothness of the text, but the main points are still comprehensible.
3: Good. The summary has few or no errors and is easy to read and follow.
"""
        self.FLUENCY_SCORE_STEPS = """
Read the summary and evaluate its fluency based on the given criteria. Assign a fluency score from 1 to 3.
"""

    def get_chat_completion(self, msg, model='gpt-4o-mini', temperature = 0):
        """
        Send a chat request and return the text of the first choice.

        msg : list of chat messages, e.g. [{'role': 'user', 'content': ...}]
        model : chat model name
        temperature : sampling temperature

        return : content string of the first returned choice
        """
        response = self.client.chat.completions.create(
            model = model,
            messages = msg,
            temperature = temperature
        )
        return response.choices[0].message.content

    def get_embedding(self, sentence, model="text-embedding-3-small"):
        '''
        Return the embedding of a single sentence.

        Author's pricing note: text-embedding-3-small = $0.02 / 1M tokens;
        a text-heavy pdf is roughly 5,000 tokens, i.e. about $0.02
        (25~30 KRW) per 200 pdfs.

        sentence : one sentence
        model : embedding model name

        return : embedding of the sentence (author's note: output
            dimension = 1536 for the default model)
        '''
        return self.client.embeddings.create(input = sentence, model=model).data[0].embedding

    def get_geval_score(
        self, document: str, summary: str, model: str = 'gpt-4o-mini', n_sampling: int = 20
    ):
        '''
        Score a summary against its source document with G-Eval.

        For every metric the model is sampled ``n_sampling`` times in a single
        request and the parsed integer scores are averaged. Replies that
        contain no number are ignored for that metric only; they never reduce
        the number of samples requested for the following metrics.

        document : source document
        summary : summary text to evaluate
        model : chat model used as the judge
        n_sampling : number of samples requested per metric

        return : list [relevance, coherence, consistency, fluency] of mean
            scores; a metric with no parsable reply gets 0.0
        '''
        evaluation_metrics = {
            "Relevance": (self.RELEVANCY_SCORE_CRITERIA, self.RELEVANCY_SCORE_STEPS),
            "Coherence": (self.COHERENCE_SCORE_CRITERIA, self.COHERENCE_SCORE_STEPS),
            "Consistency": (self.CONSISTENCY_SCORE_CRITERIA, self.CONSISTENCY_SCORE_STEPS),
            "Fluency": (self.FLUENCY_SCORE_CRITERIA, self.FLUENCY_SCORE_STEPS)
        }
        scores = []
        for evaluation_type, (criteria, steps) in evaluation_metrics.items():
            prompt = self.EVALUATION_PROMPT_TEMPLATE.format(
                criteria=criteria,
                steps=steps,
                metric_name=evaluation_type,
                document=document,
                summary=summary
            )
            # The paper configures GPT-4 with n = 20, temperature = 1, top_p = 1.
            response = self.client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                temperature=1,
                max_tokens=5,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0,
                n = n_sampling
            )

            total = 0
            valid_samples = 0  # per-metric count; n_sampling itself is never modified
            for choice in response.choices:
                # content may be None (e.g. an empty reply); treat it as unparsable
                res = (choice.message.content or '').strip()
                print(res)
                found = self._SCORE_PATTERN.search(res)
                if found is None:
                    # No score in this reply: leave it out of the average.
                    continue
                total += int(found.group())
                valid_samples += 1
            scores.append(total / valid_samples if valid_samples else 0.0)
        return scores

In [108]:
# Shared client used by the evaluation cells below; the JSON file must
# contain the key 'OPENAI_API_KEY'.
gpt = GPT(api_filepath='../assets/openai_api_key.json')

In [25]:
# Smoke test of the chat endpoint.
# NOTE(review): get_chat_completion returns only the first choice's text, but
# the stored output below is a full ChatCompletion object - it looks like it
# was captured with an earlier version of the method; confirm and re-run.
gpt.get_chat_completion(msg=[{'role':'user','content':'Answer random integer number between 1 to 5'}])

ChatCompletion(id='chatcmpl-9uZIUkZfsm6U7BjedXpNTdxGNDgrx', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Sure! Here’s a random integer between 1 and 5: **3**.', role='assistant', function_call=None, tool_calls=None, refusal=None)), Choice(finish_reason='stop', index=1, logprobs=None, message=ChatCompletionMessage(content='Sure! Here’s a random integer between 1 and 5: **3**.', role='assistant', function_call=None, tool_calls=None, refusal=None)), Choice(finish_reason='stop', index=2, logprobs=None, message=ChatCompletionMessage(content='Sure! Here’s a random integer between 1 and 5: **3**.', role='assistant', function_call=None, tool_calls=None, refusal=None)), Choice(finish_reason='stop', index=3, logprobs=None, message=ChatCompletionMessage(content='Sure! Here’s a random integer between 1 and 5: **3**.', role='assistant', function_call=None, tool_calls=None, refusal=None)), Choice(finish_reason='stop', index=4, logprobs=None, me

In [109]:
# Load documents and summaries, then score every summary with G-Eval
ROOT = 'bitamin_auto_readme_generator'
# Everything in the current working directory up to and including ROOT.
root_absdir = os.getcwd().split(ROOT)[0]+ROOT

doc_dir = os.path.join(root_absdir,'data','object_detection','output','ocr_samples_txt')
# os.listdir returns entries in arbitrary order; sort for a reproducible run.
doc_files = sorted(os.listdir(doc_dir))

summ_dir = os.path.join(root_absdir,'data','text_summarization','output','cluster_n_summary_temp1')

score_columns = ['pdf','relevance','coherence','consistency','fluency','summ_length','doc_summ_length_ratio']
score_rows = []
scores_df = pd.DataFrame(columns=score_columns)
for filename in doc_files:
    print(filename)
    # The summary is expected under the same file name as the document.
    summ_path = os.path.join(summ_dir, filename)
    if not os.path.exists(summ_path):
        print("⚠️Summary doesn't exist")
        break  # stops the whole run at the first missing summary

    with open(os.path.join(doc_dir, filename), 'r', encoding='utf-8') as f:
        document = f.read()
    with open(summ_path, 'r', encoding = 'utf-8') as f:
        summary = f.read()

    # preprocess
    # Strip the page markers (<p...> / </p...>) but keep the page text.
    document = erase_tag(document, r'p.\d*')
    # Drop the subject / team / index elements together with their content ...
    for tag in ['subject','team','index']:
        summary = remove_xml_tags(summary, tag)
    # ... then strip every remaining tag marker, keeping the enclosed text.
    summary = erase_tag(summary,'[^>]+')

    # Evaluate
    rel, coh, cons, flu =gpt.get_geval_score(document, summary, model='gpt-4o-mini')

    # Summary length as a percentage of the document length (NaN for an empty document).
    length_ratio = round(len(summary)/len(document)*100,2) if document else float('nan')
    score_rows.append([filename.replace('.txt',''),rel,coh,cons,flu,len(summary),length_ratio])
    # Rebuild after every file so partial results survive an interrupted run
    # and the index runs 0..n-1 instead of repeating 0 for every row.
    scores_df = pd.DataFrame(score_rows, columns=score_columns)
    

arima_text.txt
4
5
4
4
4
4
4
4
4
5
5
4
4
4
4
4
4
4
5
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
5
4
4
4
5
4
4
4
4
4
4
4
4
4
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
asiancup_text.txt
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
5
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
5
4
4
4
4
4
4
4
4
4
4
5
4
4
4
4
4
4
4
4
4
5
4
4
4
3
3
3
3
3
3
3
3
3
3
3
3
2
3
3
3
3
3
3
3
barbot_text.txt
4
5
4
4
5
4
4
4
4
4
5
4
4
4
4
4
4
4
4
4
4
5
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
5
4
4
4
4
4
4
5
4
5
5
4
4
4
4
4
4
4
4
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
2
3
3
3
blind_text.txt
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
5
5
4
4
4
4
4
5
4
4
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
braintumor_text.txt
4
4
4
4
4
4
4
4
4
4
5
4
4
4
5
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
5
4
4
4
4
4
4
4
4
4
4
4
4
3
3
3
3
3
3
3
3
2
3
2
3
2
2
3
2
3
2
3
2
disease_text.txt
4
4
5
4
4
5
4
4
4
4
4
4
4
4
5
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
5
5
5
4
4
5

In [110]:
scores_df

Unnamed: 0,pdf,relevance,coherence,consistency,fluency,summ_length,doc_summ_length_ratio
0,arima_text,4.2,4.0,4.1,3.0,2182,73.74
0,asiancup_text,4.05,4.05,4.1,2.95,2090,70.82
0,barbot_text,4.15,4.05,4.2,2.95,2767,47.59
0,blind_text,4.0,4.0,4.15,3.0,1276,51.79
0,braintumor_text,4.1,4.0,4.05,2.65,1331,98.23
0,disease_text,4.15,4.0,4.35,2.05,1654,44.21
0,energy_text,4.25,4.0,4.3,3.0,1407,19.47
0,hangang_text,4.1,3.95,4.15,2.65,1789,86.63
0,insideout_text,4.25,4.05,4.4,2.95,1530,33.56
0,interior_text,4.45,4.1,4.2,3.0,1235,69.23


In [None]:
# Load documents and summaries, then score every summary with G-Eval
ROOT = 'bitamin_auto_readme_generator'
# Everything in the current working directory up to and including ROOT.
root_absdir = os.getcwd().split(ROOT)[0]+ROOT

doc_dir = os.path.join(root_absdir,'data','object_detection','output','ocr_samples_txt')
# os.listdir returns entries in arbitrary order; sort for a reproducible run.
doc_files = sorted(os.listdir(doc_dir))

summ_dir = os.path.join(root_absdir,'data','text_summarization','output','cluster_n_summary_temp0')

score_columns = ['pdf','relevance','coherence','consistency','fluency','summ_length','doc_summ_length_ratio']
score_rows = []
scores_df = pd.DataFrame(columns=score_columns)
for filename in doc_files:
    # The summary is expected under the same file name as the document.
    summ_path = os.path.join(summ_dir, filename)
    if not os.path.exists(summ_path):
        print("⚠️Summary doesn't exist")
        break  # stops the whole run at the first missing summary

    with open(os.path.join(doc_dir, filename), 'r', encoding='utf-8') as f:
        document = f.read()
    with open(summ_path, 'r', encoding = 'utf-8') as f:
        summary = f.read()

    # preprocess
    # Strip the page markers (<p...> / </p...>) but keep the page text.
    document = erase_tag(document, r'p.\d*')
    # Drop the subject / team / index elements together with their content.
    for tag in ['subject','team','index']:
        summary = remove_xml_tags(summary, tag)

    # Evaluate
    rel, coh, cons, flu =gpt.get_geval_score(document, summary, model='gpt-4o-mini')

    # Summary length as a percentage of the document length (NaN for an empty document).
    length_ratio = round(len(summary)/len(document)*100,2) if document else float('nan')
    score_rows.append([filename.replace('.txt',''),rel,coh,cons,flu,len(summary),length_ratio])
    # Rebuild after every file so partial results survive an interrupted run
    # and the index runs 0..n-1 instead of repeating 0 for every row.
    scores_df = pd.DataFrame(score_rows, columns=score_columns)
    

In [None]:
scores_df

In [None]:
# Load documents and summaries, then score every summary with G-Eval
ROOT = 'bitamin_auto_readme_generator'
# Everything in the current working directory up to and including ROOT.
root_absdir = os.getcwd().split(ROOT)[0]+ROOT

doc_dir = os.path.join(root_absdir,'data','object_detection','output','ocr_samples_txt')
summ_dir = os.path.join(root_absdir,'data','text_summarization','output','method3')

# Documents and summaries are paired by position, so both listings are sorted:
# os.listdir returns entries in arbitrary order, which could silently pair a
# document with the wrong summary.
# NOTE(review): this assumes summ_dir holds exactly one summary per selected
# document and that their names sort in the same order - confirm.
doc_files = sorted(os.listdir(doc_dir))
summ_files = sorted(os.listdir(summ_dir))

selected_docs = [name for name in doc_files
                 if name in ['lier-detector_text.txt','netflix_text.txt','webtoon_text.txt']]

score_columns = ['pdf','relevance','coherence','consistency','fluency','summ_length','doc_summ_length_ratio']
score_rows = []
scores_df = pd.DataFrame(columns=score_columns)
for i, filename in enumerate(selected_docs):
    if i >= len(summ_files):
        # Fewer summaries than documents: report it instead of raising IndexError.
        print("⚠️Summary doesn't exist")
        break
    summ_file = summ_files[i]

    with open(os.path.join(doc_dir, filename), 'r', encoding='utf-8') as f:
        document = f.read()
    with open(os.path.join(summ_dir, summ_file), 'r', encoding = 'utf-8') as f:
        summary = f.read()

    # preprocess
    # Strip the page markers (<p...> / </p...>) but keep the page text.
    document = erase_tag(document, r'p.\d*')

    # Evaluate
    rel, coh, cons, flu =gpt.get_geval_score(document, summary, model='gpt-4o-mini')

    # Summary length as a percentage of the document length (NaN for an empty document).
    length_ratio = round(len(summary)/len(document)*100,2) if document else float('nan')
    score_rows.append([filename.replace('.txt',''),rel,coh,cons,flu,len(summary),length_ratio])
    # Rebuild after every file so partial results survive an interrupted run
    # and the index runs 0..n-1 instead of repeating 0 for every row.
    scores_df = pd.DataFrame(score_rows, columns=score_columns)
    

In [None]:
scores_df

In [None]:
# Load documents and summaries, then score every summary with G-Eval
ROOT = 'bitamin_auto_readme_generator'
# Everything in the current working directory up to and including ROOT.
root_absdir = os.getcwd().split(ROOT)[0]+ROOT

doc_dir = os.path.join(root_absdir,'data','object_detection','output','ocr_samples_txt')
summ_dir = os.path.join(root_absdir,'data','text_summarization','output','method2')

# Documents and summaries are paired by position, so both listings are sorted:
# os.listdir returns entries in arbitrary order, which could silently pair a
# document with the wrong summary.
# NOTE(review): this assumes summ_dir holds exactly one summary per selected
# document and that their names sort in the same order - confirm.
doc_files = sorted(os.listdir(doc_dir))
summ_files = sorted(os.listdir(summ_dir))

selected_docs = [name for name in doc_files
                 if name in ['lier-detector_text.txt','netflix_text.txt','webtoon_text.txt']]

score_columns = ['pdf','relevance','coherence','consistency','fluency','summ_length','doc_summ_length_ratio']
score_rows = []
scores_df = pd.DataFrame(columns=score_columns)
for i, filename in enumerate(selected_docs):
    if i >= len(summ_files):
        # Fewer summaries than documents: report it instead of raising IndexError.
        print("⚠️Summary doesn't exist")
        break
    summ_file = summ_files[i]

    with open(os.path.join(doc_dir, filename), 'r', encoding='utf-8') as f:
        document = f.read()
    with open(os.path.join(summ_dir, summ_file), 'r', encoding = 'utf-8') as f:
        summary_lines = f.readlines()

    # preprocess
    # Strip the page markers (<p...> / </p...>) but keep the page text.
    document = erase_tag(document, r'p.\d*')
    # Lines are joined with a single space (each line keeps its own newline).
    summary = ' '.join(summary_lines)
    # Drop <nan>...</nan> elements that open and close on the same line.
    summary = re.sub(r'<nan>.*?</nan>', '', summary)

    # Evaluate
    rel, coh, cons, flu =gpt.get_geval_score(document, summary, model='gpt-4o-mini')

    # Summary length as a percentage of the document length (NaN for an empty document).
    length_ratio = round(len(summary)/len(document)*100,2) if document else float('nan')
    score_rows.append([filename.replace('.txt',''),rel,coh,cons,flu,len(summary),length_ratio])
    # Rebuild after every file so partial results survive an interrupted run
    # and the index runs 0..n-1 instead of repeating 0 for every row.
    scores_df = pd.DataFrame(score_rows, columns=score_columns)
    

In [None]:
scores_df

In [None]:
# Load documents and summaries, then score every summary with G-Eval
ROOT = 'bitamin_auto_readme_generator'
# Everything in the current working directory up to and including ROOT.
root_absdir = os.getcwd().split(ROOT)[0]+ROOT

doc_dir = os.path.join(root_absdir,'data','object_detection','output','ocr_samples_txt')
summ_dir = os.path.join(root_absdir,'data','text_summarization','output','method1')

# Documents and summaries are paired by position, so both listings are sorted:
# os.listdir returns entries in arbitrary order, which could silently pair a
# document with the wrong summary.
# NOTE(review): this assumes summ_dir holds exactly one summary per selected
# document and that their names sort in the same order - confirm.
doc_files = sorted(os.listdir(doc_dir))
summ_files = sorted(os.listdir(summ_dir))

selected_docs = [name for name in doc_files
                 if name in ['lier-detector_text.txt','netflix_text.txt','webtoon_text.txt']]

score_columns = ['pdf','relevance','coherence','consistency','fluency','summ_length','doc_summ_length_ratio']
score_rows = []
scores_df = pd.DataFrame(columns=score_columns)
for i, filename in enumerate(selected_docs):
    if i >= len(summ_files):
        # Fewer summaries than documents: report it instead of raising IndexError.
        print("⚠️Summary doesn't exist")
        break
    summ_file = summ_files[i]

    with open(os.path.join(doc_dir, filename), 'r', encoding='utf-8') as f:
        document = f.read()
    with open(os.path.join(summ_dir, summ_file), 'r', encoding = 'utf-8') as f:
        summary = f.read()

    # preprocess
    # Strip the page markers (<p...> / </p...>) but keep the page text.
    document = erase_tag(document, r'p.\d*')

    # Evaluate
    rel, coh, cons, flu =gpt.get_geval_score(document, summary, model='gpt-4o-mini')

    # Summary length as a percentage of the document length (NaN for an empty document).
    length_ratio = round(len(summary)/len(document)*100,2) if document else float('nan')
    score_rows.append([filename.replace('.txt',''),rel,coh,cons,flu,len(summary),length_ratio])
    # Rebuild after every file so partial results survive an interrupted run
    # and the index runs 0..n-1 instead of repeating 0 for every row.
    scores_df = pd.DataFrame(score_rows, columns=score_columns)
    

In [3]:
# Display a saved G-Eval score table; judging by the file name this is
# presumably the cluster_n_summary_temp0 run - confirm.
pd.read_csv('../../data/text_summarization/output/g-evals/g-eval_cluster_n_summary_temp0_1754.csv')

Unnamed: 0,pdf,relevance,coherence,consistency,fluency,summ/doc_ratio
0,arima_text,4.0,4.15,4.1,2.55,70.6
1,asiancup_text,4.2,4.25,4.65,2.1,58.01
2,barbot_text,4.1,4.1,4.2,3.0,31.85
3,blind_text,4.05,4.1,4.15,2.75,45.9
4,braintumor_text,4.05,4.0,4.75,2.0,104.8
5,cartoon_text,4.05,4.0,4.0,2.1,52.62
6,disease_text,4.05,4.05,4.4,2.05,44.67
7,energy_text,4.5,4.15,4.85,3.0,20.15
8,hangang_text,4.3,4.157895,4.421053,2.0,86.63
9,insideout_text,4.0,4.0,4.15,2.6,26.78


In [4]:
# Display a saved G-Eval score table; judging by the file name this is
# presumably the cluster_n_summary_temp1 run - confirm.
pd.read_csv('../../data/text_summarization/output/g-evals/g-eval_cluster_n_summary_temp1_1756.csv')

Unnamed: 0,pdf,relevance,coherence,consistency,fluency,summ/doc_ratio
0,arima_text,4.25,4.2,4.25,3.0,86.72
1,asiancup_text,4.3,4.1,4.55,2.0,53.44
2,barbot_text,4.1,4.0,4.4,3.0,45.99
3,blind_text,4.1,4.2,4.85,2.95,57.59
4,braintumor_text,4.25,4.0,4.65,2.0,91.29
5,cartoon_text,4.0,4.0,4.25,2.0,43.75
6,disease_text,4.0,4.05,4.1,2.0,39.72
7,energy_text,4.05,4.05,4.4,2.65,18.68
8,hangang_text,4.2,4.2,4.5,2.05,101.26
9,insideout_text,4.1,4.1,4.45,2.95,32.44
