In [1]:
sentences = [
    "오늘 날씨가 좋아서 나들이 가고 싶다.",
    "이 영화는 정말 재미있었어요.",
    "맛있는 음식을 먹으러 갈까요?",
    "운동을 하면 건강에 좋아지는 것 같아요.",
    "공부하기 싫어서 미루고 있어요.",
    "여행 계획을 세우고 있는데 어디로 갈까요?",
    "좋은 책을 읽으면 마음이 편안해져요.",
    "오늘은 친구들과 만나서 재미있게 놀았어요.",
    "새로운 언어를 배우는 것은 어려워도 흥미로워요.",
    "주말에 가족들과 함께 시간을 보내기로 했습니다."
]

## 자연어 전처리

In [4]:
# 불용어 리스트 생성 (예시)
stopwords = ['가', '고', '을', '를', '이', '는']
from konlpy.tag import Okt

# Okt 형태소 분석기 인스턴스 생성
okt = Okt()

# 토크나이징 함수 정의
def tokenizer(raw, pos=["Noun","Alpha","Verb","Number"], stopword=stopwords):
    return [
        word for word, tag in okt.pos(
            raw, 
            norm=True,   # normalize 그랰ㅋㅏ -> 그래ㅋㅋ
            stem=True    # stemming 바뀌나->바뀌다
            )
            if len(word) > 1 and tag in pos and word not in stopword

    ]

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(tokenizer=tokenizer, use_idf=True)
features = vectorizer.fit_transform(sentences)
features.toarray()
# 디션너리가 만들어짐 

array([[0.46015789, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.46015789, 0.46015789, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.46015789, 0.        , 0.        ,
        0.        , 0.        , 0.39117625, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.70710678, 0.        , 0.        , 0.        ,
        0.        , 0.70710678, 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.51519219, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
      

In [8]:
from sklearn.decomposition import LatentDirichletAllocation
lda_model = LatentDirichletAllocation(n_components=3, random_state=111) #인스턴스화 #n_components 토픽의 갯수
lda_model.fit(features) #교육

In [14]:
dictionary_list = vectorizer.get_feature_names_out()

In [17]:
list(zip(dictionary_list, lda_model.components_[0],lda_model.components_[1],lda_model.components_[2])) # concaterate/ zip은 1차원 행렬만 가능

[('가다', 0.33440990731900777, 0.7911909510356125, 0.3345570337725623),
 ('가족', 0.7992672461905107, 0.3346767659914496, 0.33470187011689095),
 ('갈다', 0.33449535961542703, 0.33462884482675337, 1.2372442329094913),
 ('건강', 0.8608367917667525, 0.33482318379868886, 0.33485083017390127),
 ('계획', 0.33438974543108546, 0.3345120091769126, 0.7912561375191847),
 ('공부', 0.9557349114060001, 0.3350367319252789, 0.33506823627416454),
 ('나들이', 0.33440990731900777, 0.7911909510356125, 0.3345570337725623),
 ('날씨', 0.33440990731900777, 0.7911909510356125, 0.3345570337725623),
 ('놀다', 0.33452947443194514, 0.8490693004874539, 0.33469212851253577),
 ('마음', 0.33498097590829673, 0.3351700412752131, 1.0369557640030314),
 ('만나다', 0.33452947443194514, 0.8490693004874539, 0.33469212851253577),
 ('먹다', 0.334677720424076, 0.33483168273932207, 0.9365339155705361),
 ('미루다', 0.9557349114060001, 0.33503673192527883, 0.33506823627416454),
 ('배우다', 1.0362219540441318, 0.335423042225978, 0.3354617849164308),
 ('보내다', 0.799

In [27]:
import pandas as pd
## 상위 단어 추출 
## 0 확률 1은 dictionary
topics_list = list()
for topic in lda_model.components_:
    df_datas = [topic, dictionary_list]
    df_topics = pd.DataFrame(data=df_datas)
    df_topics= df_topics.T
    df_topics = df_topics.sort_values(0, ascending=False)
    # print(df_topics[:3])
    topics_text = ' '.join(df_topics[1].values[:4])# 시리즈 형식으로 출력 get values from series / index 
    print(topics_text)
    topics_list.append(topics_text)
' '.join(df_topics[1].values[:4]) #시리즈를 하나에 문장 생성
topics_list_add = [['Topic0', 'Topic1', 'Topic2'],topics_list]
df_topics_keywords = pd.DataFrame(topics_list_add)
df_topics_keywords.T

하다 언어 배우다 공부
오늘 영화 정말 놀다
갈다 읽다 마음 먹다


Unnamed: 0,0,1
0,Topic0,하다 언어 배우다 공부
1,Topic1,오늘 영화 정말 놀다
2,Topic2,갈다 읽다 마음 먹다


In [31]:
topics_output = lda_model.transform(features)
topics_output # 토픽별 점수

array([[0.10462299, 0.7905512 , 0.10482582],
       [0.13946382, 0.72088285, 0.13965333],
       [0.12347941, 0.12362429, 0.7528963 ],
       [0.77339137, 0.11328861, 0.11332002],
       [0.75164634, 0.12416304, 0.12419063],
       [0.10458222, 0.10474923, 0.79066855],
       [0.13943843, 0.13959605, 0.72096551],
       [0.11268585, 0.77443415, 0.11288   ],
       [0.72035161, 0.13980797, 0.13984042],
       [0.78957076, 0.10519796, 0.10523128]])

In [30]:
df_topics_score = pd.DataFrame(topics_output)
df_topics_score

Unnamed: 0,0,1,2
0,0.104623,0.790551,0.104826
1,0.139464,0.720883,0.139653
2,0.123479,0.123624,0.752896
3,0.773391,0.113289,0.11332
4,0.751646,0.124163,0.124191
5,0.104582,0.104749,0.790669
6,0.139438,0.139596,0.720966
7,0.112686,0.774434,0.11288
8,0.720352,0.139808,0.13984
9,0.789571,0.105198,0.105231


In [37]:
import numpy as np
df_topics_score['dominant_topic_number']=np.argmax(topics_output, axis=1)

In [38]:
df_topics_score

Unnamed: 0,0,1,2,dominant_topic_number
0,0.104623,0.790551,0.104826,1
1,0.139464,0.720883,0.139653,1
2,0.123479,0.123624,0.752896,2
3,0.773391,0.113289,0.11332,0
4,0.751646,0.124163,0.124191,0
5,0.104582,0.104749,0.790669,2
6,0.139438,0.139596,0.720966,2
7,0.112686,0.774434,0.11288,1
8,0.720352,0.139808,0.13984,0
9,0.789571,0.105198,0.105231,0


In [39]:
df_topics_score['sentences']=sentences

In [40]:
df_topics_score

Unnamed: 0,0,1,2,dominant_topic_number,sentences
0,0.104623,0.790551,0.104826,1,오늘 날씨가 좋아서 나들이 가고 싶다.
1,0.139464,0.720883,0.139653,1,이 영화는 정말 재미있었어요.
2,0.123479,0.123624,0.752896,2,맛있는 음식을 먹으러 갈까요?
3,0.773391,0.113289,0.11332,0,운동을 하면 건강에 좋아지는 것 같아요.
4,0.751646,0.124163,0.124191,0,공부하기 싫어서 미루고 있어요.
5,0.104582,0.104749,0.790669,2,여행 계획을 세우고 있는데 어디로 갈까요?
6,0.139438,0.139596,0.720966,2,좋은 책을 읽으면 마음이 편안해져요.
7,0.112686,0.774434,0.11288,1,오늘은 친구들과 만나서 재미있게 놀았어요.
8,0.720352,0.139808,0.13984,0,새로운 언어를 배우는 것은 어려워도 흥미로워요.
9,0.789571,0.105198,0.105231,0,주말에 가족들과 함께 시간을 보내기로 했습니다.


In [42]:
from sklearn.model_selection import GridSearchCV
# GridSearch를 사용하여 최적의 토픽 수 탐색
params = {'n_components': [3, 5, 7, 9]}  # 탐색할 토픽 수의 범위
lda = LatentDirichletAllocation()
grid_search = GridSearchCV(lda, param_grid=params, cv=3, n_jobs=-1)
grid_search.fit(features)
# 각 n_components에 대한 결과 출력
print("각 n_components 값에 대한 토픽 수(값이 클 수록 좋음):")
for n, topic in zip(params['n_components'], grid_search.cv_results_['mean_test_score']):
    print(f"n_components={n}: {topic}")

각 n_components 값에 대한 토픽 수(값이 클 수록 좋음):
n_components=3: -45.45857968294374
n_components=5: -61.271860230338596
n_components=7: -77.16427533028543
n_components=9: -91.02105220932259
