<a href="https://colab.research.google.com/github/seeedata/BADA/blob/main/%EB%85%B8%ED%8A%B8%EB%B6%81%20%EC%8B%9C%EC%9E%A5%20%ED%8F%AC%EC%A7%80%EC%85%94%EB%8B%9D%20-%20M%EC%82%AC%EB%A5%BC%20%EC%A4%91%EC%8B%AC%EC%9C%BC%EB%A1%9C/M%EC%82%AC%20%EB%85%B8%ED%8A%B8%EB%B6%81%20%EB%A6%AC%EB%B7%B0%20%EA%B0%90%EC%A0%95%EB%B6%84%EC%84%9D(LDA).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from konlpy.tag import Okt
from tqdm import tqdm
import csv
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from collections import Counter



In [None]:
# Extract the nouns and adjectives from a piece of text.
def get_nouns(tokenizer, sentence):
    """Return the noun/adjective tokens of ``sentence`` that are longer than one character.

    Despite the name, adjectives are kept as well as nouns.

    Args:
        tokenizer: Object exposing ``pos(text)`` that yields ``(token, tag)`` pairs.
        sentence: Text to tag.

    Returns:
        List of token strings, in the order the tokenizer produced them.
    """
    wanted_tags = ['Noun', 'Adjective']
    kept = []
    for token, tag in tokenizer.pos(sentence):
        # Skip other parts of speech and single-character tokens.
        if tag not in wanted_tags or len(token) <= 1:
            continue
        kept.append(token)
    return kept

# Tokenize the text column of a data frame.
def tokenize(df):
    """Tokenize every entry of ``df['text']`` into its nouns and adjectives.

    Args:
        df: Data frame with a ``'text'`` column; each value is converted with ``str``.

    Returns:
        A list with one token list per row, in row order. A row may yield
        an empty list when no token passes the filter in ``get_nouns``.
    """
    tokenizer = Okt()  # one tagger instance, reused for every row
    processed_data = []
    for sent in tqdm(df['text']):
        # Turn line breaks into spaces instead of deleting them; deleting
        # them glues the last word of one line onto the first word of the
        # next, producing fused tokens that the tagger cannot split.
        sentence = str(sent).replace("\n", " ").strip()
        processed_data.append(get_nouns(tokenizer, sentence))
    return processed_data

# Persist the tokenized documents to a file.
def save_processed_data(processed_data, title):
    """Write each token list as one CSV row to ``tokenized_data_<title>``.

    Args:
        processed_data: Iterable of token lists; each becomes one row.
        title: Suffix appended to ``tokenized_data_`` to form the file name
            (relative to the current working directory).
    """
    out_path = "tokenized_data_" + title
    # newline="" lets the csv module control line endings itself.
    with open(out_path, 'w', newline="", encoding='utf-8') as handle:
        csv.writer(handle).writerows(processed_data)

# Script entry point: load the reviews, split them by rating, tokenize and save.
if __name__ == '__main__':
    # Load the data and give the two columns fixed names.
    df = pd.read_csv(r'C:\Users\ghkd1\OneDrive\문서\카카오톡 받은 파일\review_naver최종.csv')
    df.columns = ['rating', 'text']
    # Drop rows missing either the rating or the text.
    df.dropna(how='any', inplace=True)

    # Reviews rated 4 or 5 form the high-rating group.
    high_rating_df = df[df['rating'].isin(['평점4', '평점5'])]
    high_rating_data = tokenize(high_rating_df)
    save_processed_data(high_rating_data, 'naver_review_high_rating')

    # Reviews rated 1, 2 or 3 form the low-rating group. The third label
    # must be '평점3': listing '평점4' here put 4-star reviews in both
    # groups and left 3-star reviews out of the analysis entirely.
    low_rating_df = df[df['rating'].isin(['평점1', '평점2', '평점3'])]
    low_rating_data = tokenize(low_rating_df)
    save_processed_data(low_rating_data, 'naver_review_low_rating')

100%|████████████████████████████████████████████████████████████████████████████████| 676/676 [00:08<00:00, 77.15it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 45/45 [00:00<00:00, 210.84it/s]


In [None]:
# Drop documents that ended up with no tokens.
def remove_empty_docs(data):
    """Return a new list holding only the truthy (non-empty) documents of ``data``."""
    # filter(None, ...) keeps exactly the items that evaluate as true.
    return list(filter(None, data))
# High-rating reviews: drop empty documents, then build the dictionary
# and the bag-of-words corpus from what remains.
high_rating_data_clean = remove_empty_docs(high_rating_data)
positive_dictionary = Dictionary(high_rating_data_clean)
positive_corpus = list(map(positive_dictionary.doc2bow, high_rating_data_clean))

# Low-rating reviews: same steps, with their own separate dictionary.
low_rating_data_clean = remove_empty_docs(low_rating_data)
negative_dictionary = Dictionary(low_rating_data_clean)
negative_corpus = list(map(negative_dictionary.doc2bow, low_rating_data_clean))

In [None]:
# Fixed seed so that repeated runs of this cell produce the same topics;
# without it LDA initialisation is random and the output is not reproducible.
LDA_SEED = 42

# NOTE(review): num_topics is not passed, so the library default applies
# (100 according to the gensim documentation). That looks high for corpora
# of this size -- confirm and set it explicitly if a smaller number is intended.

# Train the LDA model on the high-rating (positive) reviews.
positive_model = LdaModel(
    corpus=positive_corpus,
    id2word=positive_dictionary,
    random_state=LDA_SEED,
)

# Train the LDA model on the low-rating (negative) reviews.
negative_model = LdaModel(
    corpus=negative_corpus,
    id2word=negative_dictionary,
    random_state=LDA_SEED,
)

# Print every topic of the positive model with its top words.
print("긍정적 리뷰의 주제:")
for idx, topic in positive_model.print_topics(-1):
    print(f'Topic: {idx} \nWords: {topic}')

# Print every topic of the negative model with its top words.
print("\n부정적 리뷰의 주제:")
for idx, topic in negative_model.print_topics(-1):
    print(f'Topic: {idx} \nWords: {topic}')

긍정적 리뷰의 주제:
Topic: 0 
Words: 0.033*"서피스" + 0.029*"휴대" + 0.021*"좋습니다" + 0.021*"지름신" + 0.021*"처음" + 0.021*"좀더" + 0.021*"보태" + 0.019*"구매" + 0.015*"좋고" + 0.014*"매우"
Topic: 1 
Words: 0.000*"여기저기" + 0.000*"결재" + 0.000*"입니다요" + 0.000*"코드" + 0.000*"편했거든요" + 0.000*"사지" + 0.000*"신뢰" + 0.000*"어렵지만" + 0.000*"있는데전혀" + 0.000*"도우"
Topic: 2 
Words: 0.035*"만족" + 0.027*"배송" + 0.025*"서피스" + 0.022*"노트북" + 0.022*"입니다" + 0.018*"휴대" + 0.018*"대학생" + 0.018*"선물" + 0.018*"성도" + 0.018*"좋고"
Topic: 3 
Words: 0.022*"서피스" + 0.022*"생각" + 0.017*"고민" + 0.017*"정말" + 0.015*"노트북" + 0.014*"장점" + 0.012*"모델" + 0.012*"성능" + 0.010*"차이" + 0.009*"프로"
Topic: 4 
Words: 0.061*"구입" + 0.051*"모델" + 0.021*"사용" + 0.021*"입니다" + 0.021*"조금" + 0.021*"태블릿" + 0.021*"같습니다" + 0.021*"성능" + 0.021*"노트북" + 0.010*"필요하고"
Topic: 5 
Words: 0.017*"제품" + 0.016*"정말" + 0.015*"좋은" + 0.015*"생각" + 0.015*"차이" + 0.014*"가격" + 0.012*"보드" + 0.012*"세로" + 0.011*"서피스" + 0.011*"입니다"
Topic: 6 
Words: 0.023*"키보" + 0.023*"사용" + 0.023*"기대했던만큼" + 0.023*"노트북" + 0.023*"퀄리티" +

In [None]:
# Count how often each token occurs across the high-rating reviews.
all_words = []
for doc in high_rating_data_clean:
    all_words.extend(doc)
word_counts = Counter(all_words)

# Show the 30 most frequent tokens together with their counts.
most_common_words = word_counts.most_common(30)
for word, count in most_common_words:
    print(f'Word: {word}, Frequency: {count}')

Word: 사용, Frequency: 277
Word: 서피스, Frequency: 203
Word: 구매, Frequency: 161
Word: 배송, Frequency: 161
Word: 노트북, Frequency: 148
Word: 제품, Frequency: 127
Word: 입니다, Frequency: 111
Word: 프로, Frequency: 93
Word: 태블릿, Frequency: 89
Word: 가격, Frequency: 80
Word: 좋아요, Frequency: 79
Word: 좋습니다, Frequency: 76
Word: 윈도우, Frequency: 74
Word: 좋은, Frequency: 74
Word: 생각, Frequency: 72
Word: 보드, Frequency: 67
Word: 만족합니다, Frequency: 65
Word: 있습니다, Frequency: 64
Word: 성능, Frequency: 62
Word: 빠르고, Frequency: 62
Word: 정말, Frequency: 62
Word: 고민, Frequency: 59
Word: 휴대, Frequency: 58
Word: 좋네요, Frequency: 58
Word: 작업, Frequency: 51
Word: 서피스프로, Frequency: 49
Word: 아주, Frequency: 49
Word: 빠른, Frequency: 45
Word: 업무, Frequency: 43
Word: 매우, Frequency: 42


In [None]:
# Count how often each token occurs across the low-rating reviews.
all_words_negative = []
for doc in low_rating_data_clean:
    all_words_negative.extend(doc)
word_counts_negative = Counter(all_words_negative)

# Show the 20 most frequent tokens together with their counts.
most_common_words_negative = word_counts_negative.most_common(20)
for word, count in most_common_words_negative:
    print(f'Word: {word}, Frequency: {count}')

Word: 사용, Frequency: 25
Word: 서피스, Frequency: 22
Word: 제품, Frequency: 19
Word: 노트북, Frequency: 16
Word: 구매, Frequency: 15
Word: 입니다, Frequency: 14
Word: 프로, Frequency: 9
Word: 다만, Frequency: 8
Word: 보드, Frequency: 7
Word: 정도, Frequency: 6
Word: 좋습니다, Frequency: 6
Word: 같아요, Frequency: 6
Word: 가격, Frequency: 6
Word: 좋아요, Frequency: 6
Word: 문제, Frequency: 6
Word: 터치, Frequency: 6
Word: 배송, Frequency: 6
Word: 태블릿, Frequency: 5
Word: 휴대, Frequency: 5
Word: 윈도우, Frequency: 5


In [None]:
# Collect the top words of every topic in a model.
def get_top_words(model, num_words=10):
    """Return the top ``num_words`` words of each topic as one flat list.

    The words are read from the structured ``show_topics(formatted=False)``
    result rather than parsed out of the formatted topic string, so tokens
    that contain ``+``, ``*`` or ``"`` are returned intact.

    Args:
        model: Topic model exposing ``show_topics(num_topics, num_words,
            formatted)``, e.g. a gensim ``LdaModel``.
        num_words: How many words to take from each topic.

    Returns:
        List of words, topic by topic; a word appears once for every topic
        that lists it.
    """
    # num_topics=-1 requests every topic of the model.
    topics = model.show_topics(num_topics=-1, num_words=num_words, formatted=False)
    return [word for _, word_weights in topics for word, _ in word_weights]

# Pull the per-topic top words out of the positive and the negative model.
top_words_positive = get_top_words(positive_model, 10)
top_words_negative = get_top_words(negative_model, 10)

# Tally how many times each word shows up among the topic top words:
# both models together, then each model on its own.
combined_word_counts = Counter(top_words_positive + top_words_negative)
positive_word_counts = Counter(top_words_positive)
negative_word_counts = Counter(top_words_negative)

# Print the ten most frequent words of each tally under its heading.
for header, counts in (
    ("전체 데이터셋에서 일관성 있는 상위 10개 단어:", combined_word_counts),
    ("\n긍정적 리뷰에서 상위 10개 단어:", positive_word_counts),
    ("\n부정적 리뷰에서 상위 10개 단어:", negative_word_counts),
):
    print(header)
    for word, count in counts.most_common(10):
        print(f'Word: {word}, Frequency: {count}')

전체 데이터셋에서 일관성 있는 상위 10개 단어:
Word: 사용, Frequency: 86
Word: 터치, Frequency: 67
Word: 정확도, Frequency: 62
Word: 테스트, Frequency: 61
Word: 서피스, Frequency: 60
Word: 환경, Frequency: 58
Word: 향상, Frequency: 58
Word: 필요할거고, Frequency: 58
Word: 용감, Frequency: 58
Word: 재생, Frequency: 57

긍정적 리뷰에서 상위 10개 단어:
Word: 사용, Frequency: 64
Word: 서피스, Frequency: 48
Word: 구매, Frequency: 41
Word: 배송, Frequency: 41
Word: 노트북, Frequency: 39
Word: 제품, Frequency: 31
Word: 입니다, Frequency: 29
Word: 프로, Frequency: 24
Word: 태블릿, Frequency: 20
Word: 좋습니다, Frequency: 19

부정적 리뷰에서 상위 10개 단어:
Word: 터치, Frequency: 63
Word: 정확도, Frequency: 62
Word: 테스트, Frequency: 61
Word: 향상, Frequency: 58
Word: 필요할거고, Frequency: 58
Word: 용감, Frequency: 58
Word: 재생, Frequency: 57
Word: 환경, Frequency: 56
Word: 가량, Frequency: 54
Word: 만족스럽니다, Frequency: 51
