In [154]:
import csv
import functools
import re

import kss
import openpyxl
import torch
from konlpy.tag import Komoran
from konlpy.tag import Okt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModel

In [155]:
def preprocess_text(text):
    """Strip special characters from ``text`` and split it into morphemes.

    Args:
        text: The raw input string.

    Returns:
        The list of tokens produced by ``Okt.morphs`` for the cleaned text.
    """
    # Keep only Hangul syllables, Hangul jamo, ASCII letters, digits and
    # whitespace. The pattern is a raw string because '\s' inside a plain
    # string literal is an invalid escape sequence.
    cleaned = re.sub(r'[^가-힣ㄱ-ㅎㅏ-ㅣa-zA-Z0-9\s]', '', text)

    # Build the morphological analyzer once and keep it on the function
    # object, instead of constructing a new one on every call.
    okt = getattr(preprocess_text, "_okt", None)
    if okt is None:
        okt = preprocess_text._okt = Okt()

    # Tokenize into morphemes and return them as a list
    return okt.morphs(cleaned)

In [156]:
def remove_stopwords(tokens, stopwords):
    """Return the tokens that are not listed in ``stopwords``.

    Args:
        tokens: Iterable of tokens to filter.
        stopwords: Container of tokens to drop (membership is tested with ``in``).

    Returns:
        A new list with the surviving tokens in their original order.
    """
    kept = []
    for candidate in tokens:
        # Skip anything that appears in the stopword container
        if candidate in stopwords:
            continue
        kept.append(candidate)
    return kept


In [157]:
def komoran_tokenizer(sent):
    """Tokenize ``sent`` into morphemes with Komoran.

    Newline, tab and carriage-return characters are replaced by a space
    before analysis, so that the words on either side of a line break are
    not glued into a single token.

    Args:
        sent: The text to tokenize.

    Returns:
        The list of morphemes returned by ``Komoran.morphs``.
    """
    # Map each of '\n', '\t', '\r' to a single space
    flattened = sent.translate(str.maketrans('\n\t\r', '   '))

    # Build the analyzer once and keep it on the function object, instead
    # of constructing a new one on every call.
    komoran = getattr(komoran_tokenizer, "_komoran", None)
    if komoran is None:
        komoran = komoran_tokenizer._komoran = Komoran()

    return komoran.morphs(flattened)

In [158]:
@functools.lru_cache(maxsize=None)
def _load_kcbert(model_name="beomi/kcbert-base"):
    """Load the tokenizer and model for ``model_name`` once and cache them."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model.eval()  # inference only
    return tokenizer, model


def summarize_text(text, num_sentences=3):
    """Build an extractive summary of ``text``.

    The text is split into sentences with ``kss``. Each sentence is embedded
    by mean-pooling the model's last hidden state over its real (non-padding)
    tokens, and scored by the sum of the embedding's components. The
    ``num_sentences`` highest-scoring sentences are joined with spaces, in
    descending score order.

    Args:
        text: The text to summarize.
        num_sentences: Maximum number of sentences to keep.

    Returns:
        The summary string, or the placeholder ``"NAN"`` when ``text`` is
        empty, whitespace-only, ``None``, or yields no sentences.
    """
    if not text or not text.strip():
        return "NAN"

    # Sentence splitting
    sentences = kss.split_sentences(text)
    if not sentences:
        return "NAN"

    # Tokenizer and model are loaded on first use only, not once per call
    tokenizer, model = _load_kcbert()

    # Tokenize the sentences as one padded batch and run the model
    inputs = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():  # no gradients are needed for scoring
        outputs = model(**inputs)
    hidden = outputs.last_hidden_state

    # Mean-pool over real tokens only; padding positions are masked out so a
    # sentence's score does not depend on how much padding the batch added.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)
    sentence_embeddings = (hidden * mask).sum(dim=1) / token_counts

    # Per-sentence score
    sentence_scores = sentence_embeddings.sum(dim=1)
    top_indices = sentence_scores.argsort(descending=True)[:num_sentences].tolist()

    # Pick the top-scoring sentences
    return ' '.join(sentences[i] for i in top_indices)


In [162]:
def extract_keywords(text, top_k=5, stopwords=None,
                     stopwords_path=r"C:\Users\Minho\Desktop\project\stopwords_kr.txt"):
    """Return the most frequent multi-character, non-stopword tokens of ``text``.

    Tokens come from ``komoran_tokenizer`` via ``CountVectorizer``. Candidates
    are ranked by descending count; ties keep the vectorizer's feature order.

    Args:
        text: The text to analyse.
        top_k: Maximum number of keywords to return. Values <= 0 yield ``[]``.
        stopwords: Optional iterable of words to exclude. When given, the
            stopword file is not read.
        stopwords_path: UTF-8 file with one stopword per line, read only when
            ``stopwords`` is ``None``.

    Returns:
        A list of at most ``top_k`` keywords. For backward compatibility an
        empty ``text`` returns ``""`` (an empty string, not a list). Text that
        produces no tokens returns ``[]``.
    """
    if text == "":
        return ""
    if top_k <= 0:
        # The length check below only fires after an append, so a
        # non-positive top_k would otherwise return every keyword.
        return []

    if stopwords is None:
        with open(stopwords_path, "r", encoding="utf-8") as file:
            stopwords = {line.strip() for line in file}
    else:
        stopwords = set(stopwords)

    # Count token frequencies with CountVectorizer. token_pattern=None
    # because the default pattern is unused when a tokenizer is supplied.
    vectorizer = CountVectorizer(tokenizer=komoran_tokenizer, token_pattern=None)
    try:
        count_matrix = vectorizer.fit_transform([text])
    except ValueError:
        # The vectorizer raises ValueError when the vocabulary is empty,
        # e.g. when the text yields no tokens.
        return []

    # Pair each vocabulary entry with its count in the single document
    feature_names = vectorizer.get_feature_names_out()
    count_scores = count_matrix.toarray()[0]
    ranked = sorted(zip(feature_names, count_scores), key=lambda pair: pair[1], reverse=True)

    # Take the top_k most frequent entries, skipping one-character tokens
    # and stopwords
    keywords = []
    for keyword, _count in ranked:
        if len(keyword) > 1 and keyword not in stopwords:
            keywords.append(keyword)
            if len(keywords) == top_k:
                break

    return keywords

In [160]:
# Text reading function
def read_csv_column(file_path, column_index, encoding='utf-8-sig'):
    """Read one column of a CSV file into a list of strings.

    Args:
        file_path: Path of the CSV file.
        column_index: Index of the column to extract. Negative values count
            from the end of each row.
        encoding: Text encoding of the file. The default ``'utf-8-sig'``
            reads plain UTF-8 and also drops a leading byte-order mark, so
            the mark does not end up inside the first cell.

    Returns:
        The cell values of the requested column, in file order. Rows that do
        not have that column (including blank rows) are skipped, so the list
        can be shorter than the number of rows in the file.
    """
    values = []
    # newline='' lets the csv module handle line endings itself, so line
    # breaks inside quoted fields are returned unchanged.
    with open(file_path, 'r', encoding=encoding, newline='') as file:
        for row in csv.reader(file):
            # Bounds check that is valid for negative indices and empty rows
            if -len(row) <= column_index < len(row):
                values.append(row[column_index])
    return values

In [165]:
# Main driver: read one CSV column, then summarize and keyword-tag each entry
file_path = r'C:\Users\Minho\Desktop\project\missonf.csv'  # path of the input CSV file
column_index = 15  # index of column P (0-based)

column_strings = read_csv_column(file_path, column_index)

# Workbook that receives the original text, its summary and its keywords
wb = openpyxl.Workbook()
ws = wb.active
ws.title = "Summary"

# Entries 0 and 1 are skipped; entry `index` is written to sheet row `index + 1`
for index, original_text in enumerate(column_strings[2:], start=2):
    # Split into sentences and join them back with single spaces
    text = ' '.join(kss.split_sentences(original_text))

    summary_text = summarize_text(text)
    keywords = extract_keywords(text)

    # Columns 1..3: original text, summary, comma-separated keywords
    row_values = (original_text, summary_text, ", ".join(keywords))
    for column, value in enumerate(row_values, start=1):
        ws.cell(row=index + 1, column=column, value=value)

output_file_path = "summary_with_keywords.xlsx"
wb.save(output_file_path)
wb.close()

print("요약된 텍스트와 핵심어가 %s 파일에 저장되었습니다." % output_file_path)

Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertModel: ['cls.predictions.deco

요약된 텍스트와 핵심어가 summary_with_keywords.xlsx 파일에 저장되었습니다.
