<a href="https://colab.research.google.com/github/seulmi0827/fininsight/blob/main/JACE/refine_sentence.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the CSV paths used by later cells
# all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the Python packages imported below: kss (split_sentences),
# flashtext (KeywordProcessor) and pandarallel (parallel_apply).
!pip install -q kss flashtext
!pip install -q pandarallel

In [None]:
# Install the system toolchain (g++, OpenJDK 8, curl) and the KoNLPy / MeCab
# Python packages, then run KoNLPy's mecab.sh installer script.
# NOTE(review): the last line pipes a script downloaded from GitHub straight
# into bash — pin a commit or review the script if reproducibility matters.
!apt-get update
!apt-get install g++ openjdk-8-jdk -y
!pip install konlpy
!pip install mecab-python
!apt-get install curl -y
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

In [None]:
from typing import List
import pandas as pd
import re
from kss import split_sentences
from flashtext import KeywordProcessor
from pandarallel import pandarallel
from konlpy.tag import Mecab

In [None]:
def expand_aspect_terms(aspect_terms: List[str]) -> List[str]:
    """Return the spelling variants of every aspect term, without duplicates.

    For each term two variants are produced:

    * the term with underscores replaced by spaces, and
    * that same string with all spaces removed.

    A term that contains neither underscores nor spaces therefore maps to
    itself. Variants that are empty or whitespace-only are discarded, because
    downstream they would turn into a pattern that matches almost any text.

    Args:
        aspect_terms: Raw aspect terms; words may be joined by ``_`` or spaces.

    Returns:
        The unique variants in sorted order, so the result is reproducible
        from run to run (plain set iteration order is not).
    """
    expanded = set()

    for term in aspect_terms:
        spaced = term.replace('_', ' ')  # underscores become spaces
        expanded.add(spaced)
        expanded.add(spaced.replace(' ', ''))  # fully joined form

    return sorted(variant for variant in expanded if variant.strip())


def _strip_particles(sentence, pos_tagged):
    """Return ``sentence`` with every particle morpheme cut out.

    Args:
        sentence: The original sentence text.
        pos_tagged: ``(surface, tag)`` pairs for ``sentence`` in reading
            order, as returned by ``Mecab().pos(sentence)``.

    Returns:
        The sentence with each morpheme whose tag starts with ``"J"`` removed.

    Every morpheme is located in order, not only the particles. Otherwise a
    particle's surface form could be matched inside a neighbouring word
    (e.g. the particle "가" followed by the noun "가격") and the wrong
    character would be deleted.
    """
    pieces = []
    scan_from = 0  # where the search for the next morpheme starts
    copied_to = 0  # text before this index is already kept or dropped
    for surface, tag in pos_tagged:
        start = sentence.find(surface, scan_from)
        if start == -1:
            # assumes surfaces are verbatim substrings of the sentence; if
            # one is not, leave the text at that spot untouched
            continue
        end = start + len(surface)
        if tag.startswith("J"):
            pieces.append(sentence[copied_to:start])
            copied_to = end
        scan_from = end
    pieces.append(sentence[copied_to:])
    return "".join(pieces)


def extract_aspect_sentences(df, aspect_terms):
    """Find the sentences in ``df['content']`` that mention an aspect term.

    Each document is split into sentences with KSS, particles are removed
    from every sentence, and the stripped sentence is matched against every
    expanded aspect term on word boundaries. Documents are processed in
    parallel through pandarallel.

    Args:
        df: DataFrame with a ``content`` column holding the document text.
        aspect_terms: Raw aspect terms; they are expanded with
            ``expand_aspect_terms`` before matching.

    Returns:
        A DataFrame with the columns ``original_sentence``,
        ``processed_sentence`` and ``term``, one row per (sentence, term)
        match. The columns are present even when nothing matched.

    A document that raises during processing is reported on stdout and
    contributes the rows collected before the failure (best effort).
    """
    pandarallel.initialize(progress_bar=True, verbose=1)

    expanded_terms = expand_aspect_terms(aspect_terms)

    # Cheap pre-filter: only sentences containing some keyword reach the
    # per-term regular expressions below.
    keyword_processor = KeywordProcessor()
    for term in expanded_terms:
        keyword_processor.add_keyword(term)

    # One whole-word pattern per expanded term
    pattern_dict = {
        term: re.compile(rf'\b{re.escape(term)}\b') for term in expanded_terms
    }

    def process_content(content):
        results = []
        try:
            # Build the tagger once per document instead of once per sentence
            mecab = Mecab()

            for sentence in split_sentences(content):
                processed_sentence = _strip_particles(sentence, mecab.pos(sentence))

                if not keyword_processor.extract_keywords(processed_sentence):
                    continue
                for term, pattern in pattern_dict.items():
                    if pattern.search(processed_sentence):
                        results.append({'original_sentence': sentence, 'processed_sentence': processed_sentence, 'term': term})
        except Exception as e:
            # Best effort: report the failure and keep what was collected
            print(f"오류 발생: {e}")
        return results

    # Run over all documents in parallel
    all_results = df['content'].parallel_apply(process_content)

    # Flatten the per-document lists into one table; naming the columns
    # explicitly keeps the schema stable when there are no matches at all
    flattened_results = [item for sublist in all_results for item in sublist]
    return pd.DataFrame(
        flattened_results,
        columns=['original_sentence', 'processed_sentence', 'term'],
    )

In [None]:
# 1. Load the aspect terms from both annotation sheets (column '단어')
aspect_df = pd.read_csv('/content/drive/MyDrive/fin/ABSA/사회_both_1.csv')
aspect_terms1 = aspect_df['단어'].dropna().astype(str).tolist()
aspect_df_add = pd.read_csv('/content/drive/MyDrive/fin/ABSA/사회_disagreement_gpt_공유용.csv')
aspect_terms2 = aspect_df_add['단어'].dropna().astype(str).tolist()
aspect_terms = [*aspect_terms1, *aspect_terms2]
print(f"로드된 aspect terms: {len(aspect_terms)}개", end="\n\n")

# 2. Load the articles; missing bodies become empty strings
df = pd.read_csv('/content/drive/MyDrive/fin/전남소멸_10000.csv', encoding='utf-8')
df["content"] = df["content"].fillna("").astype(str)
print(df['content'])
print(f"로드된 기사: {len(df)}개", end="\n\n")

# 3. Extract the matching sentences (runs in parallel)
print("문장 추출 시작...")
result_df = extract_aspect_sentences(df, aspect_terms)

# 4. Save the result
result_df.to_csv(
    '/content/drive/MyDrive/fin/0420_사회_전남소멸_aspect_sentences.csv',
    index=False,
    encoding='utf-8',
)
print(f"추출된 문장: {len(result_df)}개")

# 5. Show a sample of the result
print("\n결과 샘플:", result_df.head(), sep="\n")

In [None]:
# Count the distinct sentences matched by each term. The extraction result
# stores the sentence text in 'original_sentence' (it has no
# 'target_sentence' column).
term_counts = result_df.groupby('term')['original_sentence'].nunique().reset_index()
term_counts.columns = ['term', 'sentence_count']

# Sort by sentence count, descending
term_counts = term_counts.sort_values('sentence_count', ascending=False)

# Save the result
# NOTE(review): another cell in this notebook writes to this same path, so
# the later run overwrites the earlier one — confirm that is intended.
term_counts.to_csv('/content/drive/MyDrive/fin/term_sentence_counts.csv', index=False, encoding='utf-8')

# Print the top 10 terms (head() alone would show only 5)
print("상위 10개 term:")
print(term_counts.head(10))


In [None]:
# 1. Load the aspect terms from both annotation sheets (column '단어')
aspect_df = pd.read_csv('/content/drive/MyDrive/fin/ABSA/사회_both_1.csv')
aspect_terms1 = aspect_df['단어'].dropna().astype(str).tolist()
aspect_df_add = pd.read_csv('/content/drive/MyDrive/fin/ABSA/사회_disagreement_gpt_공유용.csv')
aspect_terms2 = aspect_df_add['단어'].dropna().astype(str).tolist()
aspect_terms = [*aspect_terms1, *aspect_terms2]
print(f"로드된 aspect terms: {len(aspect_terms)}개", end="\n\n")

# 2. Load the articles; missing bodies become empty strings
df = pd.read_csv('/content/drive/MyDrive/fin/경북산불.csv', encoding='utf-8')
df["content"] = df["content"].fillna("").astype(str)
print(df['content'])
print(f"로드된 기사: {len(df)}개", end="\n\n")

# 3. Extract the matching sentences (runs in parallel)
print("병렬 문장 추출 시작...")
result_df = extract_aspect_sentences(df, aspect_terms)
print()

# 4. Save the result
result_df.to_csv(
    '/content/drive/MyDrive/fin/0420_사회_경북산불_aspect_sentences.csv',
    index=False,
    encoding='utf-8',
)
print(f"추출된 문장: {len(result_df)}개")

# 5. Show a sample of the result
print("\n결과 샘플:", result_df.head(), sep="\n")

In [None]:
# Count the distinct sentences matched by each term. The extraction result
# stores the sentence text in 'original_sentence' (it has no
# 'target_sentence' column).
term_counts = result_df.groupby('term')['original_sentence'].nunique().reset_index()
term_counts.columns = ['term', 'sentence_count']

# Sort by sentence count, descending
term_counts = term_counts.sort_values('sentence_count', ascending=False)

# Save the result
# NOTE(review): another cell in this notebook writes to this same path, so
# the later run overwrites the earlier one — confirm that is intended.
term_counts.to_csv('/content/drive/MyDrive/fin/term_sentence_counts.csv', index=False, encoding='utf-8')

# Print the top 10 terms (head() alone would show only 5)
print("상위 10개 term:")
print(term_counts.head(10))