In [None]:
# Description: This script filters and summarizes Korean National Assembly speeches related to hydrogen using GPT-4o. It exports a clean Excel file with essential claims only.

import pandas as pd
import openai
import re
from tqdm import tqdm
from google.colab import files

# ✅ 1. Upload and load OpenAI API key
print("🔑 OpenAI API 키(.txt) 파일을 업로드하세요.")  # Upload your OpenAI API key file
uploaded = files.upload()
api_key_file = list(uploaded.keys())[0]

# "utf-8-sig" decodes plain UTF-8/ASCII as usual but also drops a leading
# byte-order mark (some Windows editors write one). str.strip() does not
# remove a BOM, so without this it would end up inside the API key.
with open(api_key_file, "r", encoding="utf-8-sig") as f:
    openai.api_key = f.read().strip()

# ✅ 2. Upload National Assembly speech data
print("📂 회의록 데이터(.xlsx) 파일을 업로드하세요.")  # Upload National Assembly data
uploaded = files.upload()

# ✅ 3. Load data
# The first key of the upload result is used as the path of the workbook to read.
file_path = list(uploaded)[0]
df = pd.read_excel(file_path)

# ✅ 4. Column renaming and selection
# Source header -> header used by the rest of the script. As written every key
# equals its value, so the rename leaves the headers unchanged.
column_mapping = {
    "키워드": "키워드",
    "회의번호": "회의번호",
    "회의구분": "회의구분",
    "위원회": "위원회",
    "연도": "연도",
    "안건": "안건",
    "발언자": "발언자",
    "발언내용": "발언내용"
}
# The columns to keep are exactly the mapping's target names, in the same order.
required_columns = list(column_mapping.values())
df = df.rename(columns=column_mapping)[required_columns]

# ✅ 5. Hydrogen relevance classification using GPT-4o
def is_hydrogen_related(text):
    """Ask GPT-4o whether a speech is related to hydrogen topics.

    Args:
        text: Speech content to classify.

    Returns:
        The model's reply with surrounding whitespace stripped (the prompt
        asks for 'YES' or 'NO'). Returns "NO" without calling the API when
        ``text`` is not a non-blank string, and also when the API call fails.
    """
    # Blank spreadsheet cells typically load as NaN/None; interpolating them
    # would send the literal "nan"/"None" to the model as a speech.
    if not isinstance(text, str) or not text.strip():
        return "NO"

    try:
        # NOTE: the "#" lines inside the prompt are part of the string sent to
        # the model, not Python comments.
        prompt = f"""
        다음 국회의원 발언이 다음 키워드 중 하나라도 관련이 있는지 판단해줘: '수소', '연료전지', '수소차', '수소경제'.
        # Determine whether the following National Assembly remark is related to: 'hydrogen', 'fuel cell', 'hydrogen vehicle', or 'hydrogen economy'.

        - 단, 키워드가 포함되지 않더라도 맥락적으로 관련이 있다면 'YES'라고 해줘.
        # Even if the keywords are not explicitly mentioned, reply "YES" if contextually related.

        - 관련이 없으면 'NO'라고만 답변해줘.
        # If clearly unrelated, reply only "NO".

        발언 내용: {text}
        # Speech content
        """
        response = openai.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}]
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Best-effort: any failure is reported and the speech is treated as
        # unrelated so that one bad call does not abort the whole run.
        print(f"❌ GPT 호출 중 오류 발생: {e}")
        return "NO"

# ✅ 6. Apply GPT classification
df["GPT_검증"] = df["발언내용"].apply(is_hydrogen_related)

# ✅ 7. Filter speeches marked as hydrogen-related
# .copy() makes the filtered frame independent of df, so adding a column to it
# later does not trigger pandas' SettingWithCopyWarning.
df_filtered = df[df["GPT_검증"].str.contains("YES", case=False, na=False)].copy()

# ✅ 8. Summarize relevant speeches using GPT-4o
def summarize_text(text):
    """Summarize a National Assembly speech with GPT-4o.

    Args:
        text: Speech content to summarize.

    Returns:
        The model's summary with surrounding whitespace stripped. ``text`` is
        returned unchanged, without calling the API, when it is not a
        non-blank string, and also when the API call fails.
    """
    # Nothing to summarize: skip the API call rather than send a blank or
    # "nan"/"None" speech and get back a made-up summary.
    if not isinstance(text, str) or not text.strip():
        return text

    try:
        # NOTE: the "#" lines inside the prompt are part of the string sent to
        # the model, not Python comments.
        prompt = f"""
        다음 국회 발언을 핵심 내용만 유지하면서 요약해줘.
        # Please summarize the following National Assembly remark while preserving only the core message.

✅ **요약 원칙**
# ✅ Summarization Principles

- 발언자의 주요 주장(정책, 기술, 비용, 제도 개선 등)을 포함할 것.
# Include key claims such as policies, technologies, cost, or institutional suggestions.

- 불필요한 감탄사, 인사말, 반복 표현, 연결어는 제거할 것.
# Remove unnecessary expressions, greetings, repeated words, and conjunctions.

- 단정적인 표현은 피하고, 원문의 불확실성은 그대로 유지할 것.
# Avoid strong assertions and retain any uncertainty from the original text.

- 새로운 정보를 추가하지 말 것.
# Do not add any information not in the original speech.

- 논점을 변경하지 말 것.
# Do not shift the original focus or topic.

- 문장 구조를 유지하되, 의미를 압축할 것.
# Keep sentence structure where possible but compress the meaning.

- 길이는 제한하지 않되, 핵심 내용을 간결하게 정리할 것.
# Don’t artificially limit length — just keep it clear and essential.

💡 예시
# 💡 Example

📌 원문: "도시계획에는 탄소중립과 친환경 에너지를 고려해야 한다."
# "Urban planning must consider carbon neutrality and green energy."

📌 요약: "도시계획에는 탄소중립과 친환경 에너지 고려 필요."
# Summary: "Urban planning requires consideration of carbon neutrality and green energy."

---

발언 내용: {text}
# Speech content to summarize
        """
        response = openai.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}]
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Best-effort: report the failure and fall back to the original speech
        # so the row is kept rather than lost.
        print(f"❌ GPT 요약 중 오류 발생: {e}")
        return text

# ✅ 9. Run summarization with progress bar
tqdm.pandas()  # registers DataFrame/Series.progress_apply
# Make sure df_filtered is an independent frame before adding a column, so
# pandas does not emit SettingWithCopyWarning on the assignment below.
df_filtered = df_filtered.copy()
df_filtered["요약된 발언"] = df_filtered["발언내용"].progress_apply(summarize_text)

# ✅ 10. Drop unnecessary columns
# The raw speech and the YES/NO verdict are not exported; only the summary is kept.
df_filtered = df_filtered.drop(columns=["발언내용", "GPT_검증"])

# ✅ 11. Save and download result
output_file = "gpt4o_hydrogen_related_speech_summary.xlsx"
df_filtered.to_excel(output_file, index=False, engine="openpyxl")
files.download(output_file)

print(f"✅ 전처리 완료! 다운로드 링크: {output_file}")


🔑 OpenAI API 키(.txt) 파일을 업로드하세요.


Saving 가영 API for 논문.txt to 가영 API for 논문.txt
📂 회의록 데이터(.xlsx) 파일을 업로드하세요.


Saving 수소_연료전지_통합_2024만_샘플 제외.xlsx to 수소_연료전지_통합_2024만_샘플 제외.xlsx


100%|██████████| 205/205 [10:31<00:00,  3.08s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered["요약된 발언"] = df_filtered["발언내용"].progress_apply(summarize_text)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ 전처리 완료! 다운로드 링크: processed_national_assembly_speeches.xlsx
