# 토픽모델링 (Topic Modeling)

# 0 환경

In [None]:
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf

# 런타임 재시작 !

In [None]:
import matplotlib.pyplot as plt
plt.rc('font', family='NanumBarunGothic')
plt.rcParams['axes.unicode_minus'] = False

import warnings 
warnings.filterwarnings('ignore')

In [None]:
import nltk
nltk.download('stopwords')

# 1 잠재의미분석 (Latent Semantic Analysis LSA)

## 1.1 직접 구현

### 1) 직접 구현

In [None]:
import numpy as np
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import randomized_svd

class LSA:
    """Latent Semantic Analysis (LSA) over whitespace-tokenised documents.

    Builds a term-document count matrix, factorises it with a full SVD and
    keeps the first ``topic_num`` singular directions to derive term vectors,
    document vectors, a low-rank reconstruction and cosine-similarity tables.

    Args:
        doc_ls: Sequence of documents; each document is a single string whose
            terms are separated by whitespace.
        topic_num: Number of leading singular directions (topics) to keep.

    Attributes:
        term2idx, idx2term: term <-> row index of the term-document matrix.
        doc2idx, idx2doc: document string <-> index.
        tdm: Term-document count matrix, shape (n_terms, n_docs).
        U, s, VT: Factors returned by ``np.linalg.svd(tdm)``.
        term_mat: First ``topic_num`` columns of ``U``.
        doc_mat: First ``topic_num`` rows of ``VT``.
        term_doc_mat: Rank-``topic_num`` reconstruction of ``tdm``.
        term_sim, doc_sim: Pairwise cosine similarities of terms / documents.
    """

    def __init__(self, doc_ls, topic_num):
        self.doc_ls = doc_ls
        self.topic_num = topic_num
        self.term2idx, self.idx2term = self.toIdxDict(' '.join(doc_ls).split())
        self.doc2idx, self.idx2doc = self.toIdxDict(doc_ls)

        self.tdm = self.TDM(doc_ls)
        self.U, self.s, self.VT = self.SVD(self.tdm)

        self.term_mat = self.TermVectorMatrix(self.U, self.s, topic_num)
        self.doc_mat = self.DocVectorMatrix(self.s, self.VT, topic_num)
        self.term_doc_mat = self.TermDocVectorMatrix(self.U, self.s, self.VT, topic_num)

        self.term_sim = self.TermSimilarityMatrix(self.term_mat)
        self.doc_sim = self.DocSimilarityMartrix(self.doc_mat)

    def toIdxDict(self, ls):
        """Return ``(value -> index, index -> value)`` dicts for ``ls``.

        Indices are assigned in order of first appearance. Plain dicts are
        returned so that looking up an unknown value raises ``KeyError``
        instead of silently registering a new index.
        """
        any2idx = {}
        idx2any = {}

        for item in ls:
            if item not in any2idx:
                idx = len(any2idx)
                any2idx[item] = idx
                idx2any[idx] = item

        print(idx2any)
        return any2idx, idx2any

    def TDM(self, doc_ls):
        """Build the term-document count matrix (rows: terms, columns: docs).

        The result is also stored on ``self.tdm``.

        Raises:
            KeyError: If a document contains a term missing from ``term2idx``.
        """
        tdm = np.zeros([len(self.term2idx), len(doc_ls)])

        for doc_idx, doc in enumerate(doc_ls):
            for term in doc.split():
                # Look up the term's row and count one occurrence.
                tdm[self.term2idx[term], doc_idx] += 1
        self.tdm = tdm

        return tdm

    def SVD(self, tdm):
        """Return ``(U, s, VT)`` from a full singular value decomposition."""
        U, s, VT = np.linalg.svd(tdm, full_matrices=True)
        return U, s, VT

    def TopicModeling(self, count=3):
        """Print, for each topic, the ``count`` terms with the largest weights.

        The weights are the signed entries of the topic's column of ``U``.
        """
        for i in range(self.topic_num):
            score = self.U[:, i:i+1].T
            print(score)
            # Indices of the terms ordered by descending weight.
            sorted_index = np.argsort(-score)

            a = [(self.idx2term[j], score[0, j].round(3))
                 for j in sorted_index[0, :count]]

            print("Topic {} - {}".format(i+1, a))

    def vectorSimilarity(self, matrix):
        """Return the cosine-similarity matrix between the columns of ``matrix``."""
        # cosine_similarity compares rows, so put one vector per row. The input
        # is converted to a plain ndarray because scikit-learn rejects np.matrix.
        return cosine_similarity(np.asarray(matrix).T)

    def GetTermVector(self, term):
        """Print and return the (1, topic_num) vector of ``term``."""
        idx = self.term2idx[term]
        vec = self.term_mat[idx:idx+1, :]
        print('{} = {}'.format(term, vec))
        return vec

    def GetDocVector(self, doc):
        """Print and return the (1, topic_num) vector of document ``doc``."""
        idx = self.doc2idx[doc]
        vec = self.doc_mat.T[idx:idx+1, :]
        print('{} = {}'.format(doc, vec))
        return vec

    def Compression(self, round_num=0):
        """Print the original matrix next to its low-rank reconstruction."""
        print(self.tdm)
        print(self.term_doc_mat.round(round_num))

    def TermVectorMatrix(self, u, s, topic_num):
        """Return the first ``topic_num`` columns of ``u`` (one row per term).

        ``s`` is accepted for interface compatibility; the vectors are not
        scaled by the singular values.
        """
        return np.array(u[:, :topic_num])

    def DocVectorMatrix(self, s, vt, topic_num):
        """Return the first ``topic_num`` rows of ``vt`` (one column per doc).

        ``s`` is accepted for interface compatibility and is not used.
        """
        return np.array(vt[:topic_num, :])

    def TermDocVectorMatrix(self, u, s, vt, topic_num):
        """Return the rank-``topic_num`` reconstruction ``U_k diag(s_k) VT_k``."""
        # Multiplying the columns of U_k by s_k equals U_k @ diag(s_k).
        return (u[:, :topic_num] * s[:topic_num]) @ vt[:topic_num, :]

    def TermSimilarityMatrix(self, termVectorMatrix):
        """Return pairwise cosine similarities between term vectors (rows)."""
        return self.vectorSimilarity(termVectorMatrix.T)

    def GetTermSimilarity(self, term1, term2):
        """Print and return the cosine similarity of two known terms."""
        sim = self.term_sim[self.term2idx[term1], self.term2idx[term2]]
        print("({},{}) term similarity = {}".format(term1, term2, sim))
        return sim

    def DocSimilarityMartrix(self, docVectorMatrix):
        """Return pairwise cosine similarities between doc vectors (columns)."""
        return self.vectorSimilarity(docVectorMatrix)

    def GetDocSimilarity(self, doc1, doc2):
        """Print and return the cosine similarity of two known documents."""
        sim = self.doc_sim[self.doc2idx[doc1], self.doc2idx[doc2]]
        print("('{}','{}') doc similarity = {}".format(doc1, doc2, sim))
        return sim

In [None]:
# Toy corpus: each document is one string of whitespace-separated terms.
doc_ls = [
    '바나나 사과 포도 포도 짜장면',
    '사과 포도',
    '포도 바나나',
    '짜장면 짬뽕 탕수육',
    '볶음밥 탕수육',
    '짜장면 짬뽕',
    '라면 스시',
    '스시 짜장면',
    '가츠동 스시 소바',
    '된장찌개 김치찌개 김치',
    '김치 된장 짜장면',
    '비빔밥 김치'
]
# Fit LSA with 4 topics, then print the 3 highest-weighted terms per topic.
lsa = LSA(doc_ls, 4)
print('== 토픽 모델링 ==')
lsa.TopicModeling(3)

In [None]:
# Display the term-document matrix stored on the LSA instance.
lsa.tdm

In [None]:
np.linalg.svd?

In [None]:
# Toy corpus: each document is one string of whitespace-separated terms.
doc_ls = [
    '바나나 사과 포도 포도 짜장면',
    '사과 포도',
    '포도 바나나',
    '짜장면 짬뽕 탕수육',
    '볶음밥 탕수육',
    '짜장면 짬뽕',
    '라면 스시',
    '스시 짜장면',
    '가츠동 스시 소바',
    '된장찌개 김치찌개 김치',
    '김치 된장 짜장면',
    '비빔밥 김치'
]
lsa = LSA(doc_ls, 4)
# NOTE(review): X is not used again in this cell; the constructor has already built lsa.tdm.
X = lsa.TDM(doc_ls)
# Top 4 terms of each topic.
print('== 토픽 모델링 ==')
lsa.TopicModeling(4)
# Term vectors of individual words.
print('\n== 단어 벡터 ==')
lsa.GetTermVector('사과')
lsa.GetTermVector('짜장면')
# Cosine similarity between pairs of words.
print('\n== 단어 유사도 ==')
lsa.GetTermSimilarity('사과','바나나')
lsa.GetTermSimilarity('사과','짜장면')
lsa.GetTermSimilarity('포도','짜장면')
lsa.GetTermSimilarity('사과','스시')
# Document vectors; the argument must be a document string from doc_ls.
print('\n== 문서 벡터 ==')
lsa.GetDocVector('사과 포도')
lsa.GetDocVector('짜장면 짬뽕')
# Cosine similarity between pairs of documents.
print('\n== 문서 유사도 ==')
lsa.GetDocSimilarity('사과 포도', '포도 바나나')
lsa.GetDocSimilarity('사과 포도', '라면 스시')

### 2) 실습 템플릿

In [None]:
class LSA :
    """Exercise skeleton for the LSA class; every method is left to implement."""

    def __init__(self, doc_ls, topic_num):
        """TODO: store the inputs and build every matrix used below."""

    def TDM(self, doc_ls):
        """TODO: build the term-document matrix (TDM)."""

    def SVD(self, tdm):
        """TODO: run a singular value decomposition of the TDM into U, s, Vt."""

    def TopicModeling(self) :
        """TODO: print the main keywords of each topic."""

    def TermVectorMatrix(self, u, s):
        """TODO: build the term-vector matrix, dot(U, s)."""

    def DocVectorMatrix(self, s, vt):
        """TODO: build the document-vector matrix, dot(s, Vt).T."""

    def GetTermVector(self, term):
        """TODO: return the term vector for the given keyword."""

    def GetDocVector(self, doc):
        """TODO: return the document vector for the given document."""

    def TermDocVectorMatrix(self, u, s, vt):
        """TODO: build the term-document vector matrix."""

    def TermSimilarityMatrix(self, term_vec_matrix):
        """TODO: return the matrix of cosine similarities between term vectors."""

    def GetTermSimilarity(self, term1, term2):
        """TODO: return the cosine similarity of the two given terms."""

    def DocSimilarityMartrix(self, doc_vec_matrix):
        """TODO: return the matrix of cosine similarities between document vectors."""

    def GetDocSimilarity(self, doc1, doc2):
        """TODO: return the cosine similarity of the two given documents."""

In [None]:
# Toy corpus for the exercise: one whitespace-separated string per document.
# The dish name is spelled '탕수육', matching the other corpora in this notebook.
doc_ls = [
    '바나나 사과 포도 포도',
    '사과 포도',
    '포도 바나나',
    '짜장면 짬뽕 탕수육',
    '볶음밥 탕수육',
    '짜장면 짬뽕',
    '라면 스시',
    '스시',
    '가츠동 스시 소바',
    '된장찌개 김치찌개 김치',
    '김치 된장',
    '비빔밥 김치'
]

# Calls the completed exercise class is expected to support.
lsa = LSA(doc_ls, 3)
lsa.TopicModeling()
lsa.GetTermSimilarity('사과','바나나')
lsa.GetTermSimilarity('사과','짜장면')
lsa.GetDocSimilarity('사과 포도', '포도 바나나')
lsa.GetDocSimilarity('사과 포도', '라면 스시')

## 1.2 sklearn 활용

### 1) 토픽모델링

In [None]:
# Toy corpus: each document is one string of whitespace-separated terms.
doc_ls = [
    '바나나 사과 포도 포도 짜장면',
    '사과 포도',
    '포도 바나나',
    '짜장면 짬뽕 탕수육',
    '볶음밥 탕수육',
    '짜장면 짬뽕',
    '라면 스시',
    '스시 짜장면',
    '가츠동 스시 소바',
    '된장찌개 김치찌개 김치',
    '김치 된장 짜장면',
    '비빔밥 김치'
]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Number of LSA topics (SVD components) to keep.
n_topic= 4

# Turn the corpus into a TF-IDF matrix.
tfidf_vect = TfidfVectorizer(max_features= 1000, max_df = 0.5, smooth_idf=True)
tfidf = tfidf_vect.fit_transform(doc_ls)
# Reduce the TF-IDF matrix to n_topic components; u_sigma holds the transformed documents.
svd = TruncatedSVD(n_components=n_topic, algorithm='randomized', n_iter=100)
u_sigma = svd.fit_transform(tfidf)
# Display the fitted components (one row per topic).
svd.components_

In [None]:
# Vocabulary learned by the TF-IDF vectorizer; position i names column i of
# the TF-IDF matrix. get_feature_names() was removed in scikit-learn 1.2,
# so use its replacement get_feature_names_out().
vocab = tfidf_vect.get_feature_names_out()
# Number of top-weighted terms to print per topic.
n = 3
for idx, topic in enumerate(svd.components_):
    print("Topic %d:" % (idx), [(vocab[i], topic[i].round(5)) for i in topic.argsort()[:-n - 1:-1]])

### 2) 단어벡터

In [None]:
# Display the transposed components: one row per vocabulary term.
svd.components_.T

In [None]:
# Word vectors: print each vocabulary term next to its row of components_.T.
for term, vector in zip(vocab, svd.components_.T):
    print("{} : {}".format(term, vector))

In [None]:
import numpy as np
from numpy import dot
from numpy.linalg import norm

def calc_similarity_matrix(vectors) :
    """Return the pairwise cosine-similarity matrix of ``vectors``.

    Args:
        vectors: Sequence of equal-length numeric vectors (a 2-D array or a
            list of lists), one vector per row.

    Returns:
        A symmetric ``(n, n)`` array rounded to 4 decimals, where entry
        ``[i, j]`` is the cosine similarity of vectors ``i`` and ``j``.
        A zero vector has similarity 0.0 with every vector instead of NaN.
    """
    if len(vectors) == 0 :
        return np.zeros((0, 0))

    mat = np.asarray(vectors, dtype=float)
    norms = np.linalg.norm(mat, axis=1)
    # Divide zero vectors by 1 so they stay all-zero and never produce NaN.
    unit = mat / np.where(norms == 0, 1.0, norms)[:, None]

    return (unit @ unit.T).round(4)

In [None]:
import seaborn as sns

def visualize_similarity(similarity_matrix, labels=None) :
    """Draw the lower triangle of a similarity matrix as an annotated heatmap.

    Args:
        similarity_matrix: Square 2-D array of pairwise similarities.
        labels: Optional tick labels, one per row/column. When omitted,
            seaborn chooses the tick labels automatically.
    """
    # np.triu keeps the diagonal, so the diagonal and the upper triangle are
    # hidden. The builtin bool replaces np.bool, which NumPy 1.24 removed.
    mask = np.triu(np.ones_like(similarity_matrix, dtype=bool))
    tick_labels = 'auto' if labels is None else labels
    plt.rcParams['figure.figsize'] = [8, 6]
    sns.heatmap(similarity_matrix, mask=mask,
                xticklabels=tick_labels, yticklabels=tick_labels,
                annot=True, fmt=".2f", annot_kws={'size':8}, cmap='coolwarm')

In [None]:
# Use the transposed SVD components as word vectors and plot their pairwise similarity.
word_vectors = svd.components_.T
word_similarity_matrix = calc_similarity_matrix(word_vectors)
visualize_similarity(word_similarity_matrix)

### 3) 문서벡터

In [None]:
# Divide the transformed documents by the singular values to get document vectors,
# then plot their pairwise similarity.
doc_vectors = u_sigma/svd.singular_values_
doc_similarity_matrix = calc_similarity_matrix(doc_vectors)
visualize_similarity(doc_similarity_matrix)

### 4) 벡터 시각화

- manifold.TSNE() : t-SNE(t-distributed Stochastic Neighbor Embedding, t-분포 확률적 이웃 임베딩). 고차원 벡터를 2차원으로 축소해 시각화할 때 쓰는 차원 축소 기법의 하나

In [None]:
# Vectors to plot and the word that labels each of them.
vectors = word_vectors
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
labels = tfidf_vect.get_feature_names_out()

In [None]:
from sklearn.manifold import TSNE
import numpy as np

def visualize_vectors(vectors, labels):
    """Embed ``vectors`` in 2-D with t-SNE and draw a labelled scatter plot.

    Args:
        vectors: Array-like passed to ``TSNE.fit_transform``.
        labels: One annotation per vector, in the same order.
    """
    reducer = TSNE(n_components=2, random_state=0, n_iter=10000, perplexity=2)
    np.set_printoptions(suppress=True)
    coords = reducer.fit_transform(vectors)
    xs, ys = coords[:, 0], coords[:, 1]

    plt.figure(figsize=(10, 6))
    plt.scatter(xs, ys, c='orange', edgecolors='r')
    # Anchor each label one data unit up and to the right of its point.
    for text, px, py in zip(labels, xs, ys):
        plt.annotate(text, xy=(px+1, py+1), xytext=(0, 0), textcoords='offset points')

In [None]:
# Plot the word vectors in 2-D, annotated with their labels.
visualize_vectors(vectors, labels)

### 5) 파이프라인 사용

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def my_tokenizer(text):
    """Split ``text`` on whitespace and drop single-character tokens."""
    tokens = []
    for word in text.split():
        if len(word) > 1:
            tokens.append(word)
    return tokens
             
# LSA as one pipeline: token counts -> TF-IDF weighting -> truncated SVD with n_topic components.
lsa_pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer = my_tokenizer)),
    ('tfidf', TfidfTransformer(smooth_idf=True)),
    ('lsa', TruncatedSVD(n_components=n_topic, algorithm='randomized', n_iter=100)), 
])

# Fit every step on the corpus.
lsa_pipeline.fit(doc_ls)

In [None]:
# Pull the fitted SVD and count-vectorizer steps out of the pipeline.
# NOTE(review): this rebinds `lsa`, which earlier cells used for an LSA instance.
lsa = lsa_pipeline.named_steps['lsa']
count_vect = lsa_pipeline.named_steps['vect']

In [None]:
# Vocabulary learned by the count vectorizer. get_feature_names() was removed
# in scikit-learn 1.2, so use its replacement get_feature_names_out().
vocab = count_vect.get_feature_names_out()

def get_topics(components, feature_names, n=3):
    """Print the ``n`` highest-weighted features of every topic.

    Args:
        components: Iterable of 1-D NumPy arrays, one weight vector per topic.
        feature_names: Indexable mapping a weight position to its feature name.
        n: Number of features to show per topic.
    """
    for idx, topic in enumerate(components):
        # Positions of the n largest weights, largest first.
        top_positions = topic.argsort()[:-n - 1:-1]
        pairs = [(feature_names[i], topic[i].round(5)) for i in top_positions]
        print("Topic %d:" % (idx), pairs)
# Print the top terms of each topic found by the pipeline's SVD step.
get_topics(lsa.components_,vocab)

## 1.3 gensim 활용

### 1) 토픽모델링

In [None]:
# Toy corpus for the gensim examples: one whitespace-separated string per document.
# The dish name is spelled '탕수육', matching the other corpora in this notebook.
docs = [
    '바나나 사과 포도 포도',
    '사과 포도',
    '포도 바나나',
    '짜장면 짬뽕 탕수육',
    '볶음밥 탕수육',
    '짜장면 짬뽕',
    '라면 스시',
    '스시',
    '가츠동 스시 소바',
    '된장찌개 김치찌개 김치',
    '김치 된장',
    '비빔밥 김치'
]

In [None]:
# Tokenise every document on whitespace, then show the first token list.
doc_ls = [doc.split() for doc in docs]
doc_ls[0]

In [None]:
from gensim import corpora
from gensim.models import LsiModel
from gensim.models import TfidfModel

# Number of LSI topics (dimensions).
n_dim = 4

# Map tokens to ids and convert each document to a bag-of-words vector.
id2word = corpora.Dictionary(doc_ls)
corpus_TDM = [id2word.doc2bow(text) for text in doc_ls]
tfidf = TfidfModel(corpus_TDM) #train
corpus_TFIDF = tfidf[corpus_TDM] #predict
# Fit LSI on the TF-IDF weighted corpus.
model_LSA = LsiModel(corpus_TFIDF, id2word=id2word, num_topics=n_dim)

# Print n_dim topics with 3 words each.
for top in model_LSA.print_topics(n_dim, 3):
    print(top)

In [None]:
# Display the model's projection matrix u (indexed by token id in later cells).
model_LSA.projection.u

In [None]:
from gensim.matutils import sparse2full
# Project the TF-IDF corpus, i.e. the same weighting the model was trained on,
# rather than the raw count corpus.
corpus_VT = model_LSA[corpus_TFIDF]
# Expand each sparse topic vector into a dense list of n_dim values.
VT = [sparse2full(doc_vector, n_dim).tolist() for doc_vector in corpus_VT]
VT

### 2) 단어벡터

In [None]:
# Print every dictionary token next to its row of the projection matrix.
for i in id2word.keys() :
    print("{} : {}".format(id2word[i], model_LSA.projection.u[i]))

In [None]:
# Use the rows of the projection matrix as word vectors and plot their pairwise similarity.
word_vectors = model_LSA.projection.u
word_similarity_matrix = calc_similarity_matrix(word_vectors)
visualize_similarity(word_similarity_matrix)

### 3) 문서벡터

In [None]:
from gensim.matutils import sparse2full
# Project the TF-IDF corpus, i.e. the same weighting the model was trained on.
corpus_V = model_LSA[corpus_TFIDF]
# Iterate the projection computed in this cell (not a variable left over from
# another cell) and expand each sparse vector to n_dim dense values.
V = [sparse2full(doc_vector, n_dim).tolist() for doc_vector in corpus_V]

doc_vectors = V
doc_similarity_matrix = calc_similarity_matrix(doc_vectors)
visualize_similarity(doc_similarity_matrix)

### 4) 벡터시각화

In [None]:
vectors = word_vectors
# Label each point with the word itself rather than its integer token id.
labels = [id2word[k] for k in id2word.keys()]

visualize_vectors(vectors, labels)

# 2 잠재디리클레할당(LDA, Latent Dirichlet Allocation)

## 2.1 직접구현

### 1) 직접 구현

In [None]:
import random
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from collections import defaultdict

class LDA:
    """Toy Latent Dirichlet Allocation with deterministic topic re-assignment.

    Every token first receives a random topic. Tokens are then revisited one
    at a time and moved to the topic with the highest conditional probability
    (an argmax rather than a random draw) until a full pass over the corpus
    changes no assignment.

    Args:
        docs: Sequence of raw document strings.
        topic_num: Number of topics.
        alpha: Smoothing added to document-topic counts.
        beta: Smoothing added to topic-term counts.

    NOTE: assumes ``alpha > 0`` and ``beta > 0``; with zero smoothing an
    empty topic or document count leads to a 0/0 division.
    """

    def __init__(self, docs, topic_num, alpha=0.1, beta=0.001):
        self.alpha = alpha
        self.beta = beta
        self.k = topic_num
        self.docs = docs

    def RandomlyAssignTopic(self, docs):
        """Tokenise ``docs`` and give every kept token a random topic.

        Tokens are lower-cased, filtered against the English stop-word list
        (plus ``','``) and lemmatised.

        Returns:
            ``(assignments, t2i, i2t)`` where ``assignments`` maps
            ``(doc index, term id, running token position)`` to a topic, and
            ``t2i`` / ``i2t`` map term <-> term id.
        """
        dic = {}
        t2i = {}
        i2t = {}
        w = 0  # running token position over the whole corpus; keeps keys unique

        wnl = WordNetLemmatizer()
        stopword = set(stopwords.words('english'))
        stopword.add(',')

        for d, doc in enumerate(docs):
            for raw in word_tokenize(doc):
                lowered = raw.lower()
                # Compare the lower-cased form so capitalised stop words are dropped too.
                if lowered in stopword:
                    continue
                token = wnl.lemmatize(lowered)
                idx = t2i.setdefault(token, len(t2i))
                i2t[idx] = token
                dic[(d, idx, w)] = random.randint(0, self.k - 1)
                w += 1

        print(dic)
        return dic, t2i, i2t

    def CountDocTopic(self, t2i):
        """Count the current assignments in ``self.term_topic``.

        Returns:
            ``(docs, terms)``: raw count arrays of shape ``(k, n_docs)`` and
            ``(k, n_terms)``. Smoothing is not included in the returned
            counts; it is applied when probabilities are computed.
        """
        docs = np.zeros((self.k, len(self.docs)))
        terms = np.zeros((self.k, len(t2i)))

        for (d, n, w), topic in self.term_topic.items():
            docs[topic, d] += 1
            terms[topic, n] += 1

        # The trace shows the smoothed counts (count + alpha / count + beta).
        print("단어 토픽별 빈도")
        print((terms + self.beta).round(2))
        print("문서 토픽별 빈도")
        print((docs + self.alpha).round(1))

        return docs, terms

    def IterateAssignTopic(self, docs, terms, i2t, max_iter=100):
        """Re-assign tokens to their most probable topic until nothing changes.

        Args:
            docs: Raw document-topic counts, shape ``(k, n_docs)``; updated in place.
            terms: Raw topic-term counts, shape ``(k, n_terms)``; updated in place.
            i2t: Term id -> term, used for the printed trace.
            max_iter: Upper bound on the number of passes over the corpus.

        Returns:
            The final raw topic-term count array.
        """
        alpha, beta = self.alpha, self.beta
        vocab_size = terms.shape[1]

        for _ in range(max_iter):
            # Snapshot (not an alias) so the convergence check below is real.
            prev = dict(self.term_topic)

            for (d, n, w) in self.term_topic:
                old = self.term_topic[(d, n, w)]
                # Take the current token out of the counts before scoring it.
                docs[old, d] -= 1
                terms[old, n] -= 1

                print()
                print("{}(d:{}, n:{}, w:{}) = topic:{}".format(i2t[n], d, n, w, old))
                print("문서 토픽별 빈도")
                print((docs + alpha).round(1))
                print("단어 토픽별 빈도")
                print((terms + beta).round(3))

                doc_weight = docs[:, d] + alpha
                doc_total = doc_weight.sum()
                term_weight = terms[:, n] + beta
                topic_total = terms.sum(axis=1) + beta * vocab_size

                best, best_prob = 0, 0
                for t in range(self.k):
                    p_t_d = doc_weight[t] / doc_total
                    p_w_t = term_weight[t] / topic_total[t]
                    prob = p_t_d * p_w_t

                    if best_prob < prob:
                        best, best_prob = t, prob

                    print("topic {} 일 확률 = {}/{} * {}/{} = {} * {} = {}".format(t
                        , doc_weight[t].round(1), doc_total.round(1)
                        , term_weight[t].round(3), topic_total[t].round(3)
                        , p_t_d.round(3), p_w_t.round(4), prob.round(4)))

                # Assign the most probable topic and put the token back.
                self.term_topic[(d, n, w)] = best
                docs[best, d] += 1
                terms[best, n] += 1

                print("할당된 토픽:{}".format(best))
                print("="*50)

            if prev == self.term_topic:
                break

        return terms

    def TopicModeling(self, count=3):
        """Run the whole procedure and print the top ``count`` terms per topic."""
        self.term_topic, t2i, i2t = self.RandomlyAssignTopic(self.docs)
        docs, terms = self.CountDocTopic(t2i)
        terms = self.IterateAssignTopic(docs, terms, i2t)

        # Smoothed, row-normalised term distribution of each topic.
        smoothed = terms + self.beta
        score = smoothed / smoothed.sum(axis=1, keepdims=True)

        for i in range(self.k):
            print("\nTopic {}".format(i+1))
            sorted_index = np.flip(np.argsort(score[i]), 0)[:count]
            for j in sorted_index:
                print("({}={})".format(i2t[j], score[i, j].round(3)), end=' ')


In [None]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
# Small English corpus for the hand-written LDA.
doc_ls = [
    "Cute kitty",
    "Eat rice or cake",
    "Kitty and hamster",
    "Eat bread",
    "Rice, bread and cake",
    "Cute hamster eats bread and cake"
]

# Fit 2 topics and print the top 5 terms of each.
lda = LDA(doc_ls, 2)
lda.TopicModeling(5)

### 2) 실습 템플릿

In [None]:
class LDA :
    """Exercise skeleton for the LDA class; only the constructor is provided."""

    def __init__(self, doc_ls, topic_num, alpha = 0.1, beta = 0.001):
        """Store the smoothing parameters and the number of topics."""
        self.k = topic_num
        self.alpha, self.beta = alpha, beta

    def RandomlyAssignTopic(self, doc_ls):
        """TODO: assign a random topic to every token."""

    def IterateAssignTopic(self) :
        """TODO: iteratively re-assign the topic of every token."""

    def TopicModeling(self) :
        """TODO: print the main keywords of each topic."""

## 2.2  sklearn 활용

### 1) 토픽모델링 (파이프라인 미사용)

- decomposition.LatentDirichletAllocation() : LDA 모델

In [None]:
!pip install pyLDAvis

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

#뉴스 다운로드 및 전처리
def get_news(apply_split=True, documents=None, stop_words=None) :
    """Return the cleaned documents of a text corpus (20 Newsgroups by default).

    Cleaning steps: replace every non-letter with a space, drop words of
    3 characters or fewer, lower-case, split on whitespace and remove stop
    words.

    Args:
        apply_split: When true, each document is returned as a list of
            tokens; otherwise as one space-joined string.
        documents: Optional iterable of raw document strings. When omitted,
            the 20 Newsgroups training set is downloaded with headers,
            footers and quotes removed.
        stop_words: Optional iterable of words to remove. When omitted, the
            NLTK English stop-word list is used.

    Returns:
        A pandas Series with one cleaned document per input document.
    """
    if documents is None :
        dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                                     remove=('headers', 'footers', 'quotes'))
        documents = dataset.data
    if stop_words is None :
        stop_words = stopwords.words('english')
    # A set makes the per-token membership test O(1).
    stop_words = set(stop_words)

    news_df = pd.DataFrame({'document':documents})
    # regex=True is required: pandas >= 2.0 treats the pattern literally by default.
    news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ", regex=True)
    news_df['clean_doc'] = news_df['clean_doc'].apply(
        lambda x: ' '.join(w for w in x.split() if len(w) > 3))
    news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())
    tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split())

    filtered = tokenized_doc.apply(
        lambda tokens: [item for item in tokens if item not in stop_words])

    if apply_split :
        return filtered
    return filtered.apply(' '.join)

In [None]:
# Tokenise on whitespace.
def my_tokenizer(text):
    """Return the whitespace-separated tokens of ``text``."""
    tokens = text.split()
    return tokens

# apply_split=False: each cleaned document comes back as one space-joined string.
tokenized_docs = get_news(False)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Vectorise the cleaned documents with TF-IDF, splitting on whitespace.
tfidf_vect = TfidfVectorizer(tokenizer = my_tokenizer)
tfidf = tfidf_vect.fit_transform(tokenized_docs)
# 20-topic LDA trained with the online learning method.
lda = LatentDirichletAllocation(n_components=20, 
                                max_iter=20, 
                                learning_method='online', 
                                random_state=100)

# Fit the model and keep the transformed documents.
lda_output = lda.fit_transform(tfidf)

In [None]:
#!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.sklearn

# Enable inline rendering, then build and show the interactive topic view
# from the fitted LDA model, the TF-IDF matrix and its vectorizer.
pyLDAvis.enable_notebook()
vis = pyLDAvis.sklearn.prepare(lda, tfidf, tfidf_vect, mds='tsne')
pyLDAvis.display(vis)

### 2) 토픽모델링 (파이프라인 사용)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

tokenized_docs = get_news(False)
# Build the pipeline: TF-IDF vectorizer followed by a 20-topic LDA.
lda_pipeline = Pipeline([
    ('tfidf_vect', TfidfVectorizer(tokenizer = my_tokenizer)),
    ('lda', LatentDirichletAllocation(n_components=20, 
                                      max_iter=20, 
                                      learning_method='online', 
                                      random_state=100))])

# Fit both steps on the cleaned documents.
lda_pipeline.fit(tokenized_docs)

In [None]:
!pip install pyLDAvis

In [None]:
#!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.sklearn

# Reuse the steps fitted by lda_pipeline.fit(). The vectorizer is already
# fitted, so only transform the documents instead of fitting it a second time.
tfidf_vect = lda_pipeline.named_steps['tfidf_vect']
tfidf = tfidf_vect.transform(tokenized_docs)
lda = lda_pipeline.named_steps['lda']

vis = pyLDAvis.sklearn.prepare(lda, tfidf, tfidf_vect, mds='tsne')
pyLDAvis.display(vis)

### 3) 하이퍼파라미터 튜닝

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

def tuning_hyperparams(train_data, n_jobs=1) :
    """Grid-search a TF-IDF + LDA pipeline and return the best fitted pipeline.

    Args:
        train_data: Documents passed to ``GridSearchCV.fit``.
        n_jobs: Number of parallel jobs used by the grid search.

    Returns:
        ``best_estimator_`` of the fitted grid search.
    """
    pipeline = Pipeline(steps=[
        ('tfidf_vect', TfidfVectorizer(tokenizer=my_tokenizer)),
        ('lda', LatentDirichletAllocation(max_iter=10, random_state=100)),
    ])

    # Candidate values, keyed as <step name>__<parameter>.
    param_grid = {
        'tfidf_vect__ngram_range': [(1, 1), (1, 2)],
        'tfidf_vect__use_idf': (True, False),
        'lda__n_components': [10, 20],
    }

    search = GridSearchCV(pipeline, param_grid, n_jobs=n_jobs)
    search.fit(train_data)

    print("Best score: {0}".format(search.best_score_))
    print("Best parameters set:")
    winner = search.best_estimator_
    params = winner.get_params()
    for name in sorted(params):
        print("\t{0}: {1}".format(name, params[name]))
    return winner

# Run the grid search on the cleaned documents and keep the best pipeline.
lda_pipeline = tuning_hyperparams(tokenized_docs)

In [None]:
#!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.sklearn

# Reuse the steps of the tuned pipeline. Its vectorizer is already fitted,
# so only transform the documents instead of fitting it a second time.
tfidf_vect = lda_pipeline.named_steps['tfidf_vect']
tfidf = tfidf_vect.transform(tokenized_docs)
lda = lda_pipeline.named_steps['lda']

vis = pyLDAvis.sklearn.prepare(lda, tfidf, tfidf_vect, mds='tsne')
pyLDAvis.display(vis)

## 2.3 gensim 활용

### 1) 토픽모델링

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
from gensim import corpora
from gensim.models import LdaModel, TfidfModel

# Default apply_split=True: each document is a list of tokens.
tokenized_docs = get_news()
# Map tokens to ids, build the bag-of-words corpus and weight it with TF-IDF.
id2word = corpora.Dictionary(tokenized_docs)
corpus_TDM = [id2word.doc2bow(doc) for doc in tokenized_docs]
tfidf = TfidfModel(corpus_TDM)
corpus_TFIDF = tfidf[corpus_TDM]

# Number of topics.
n = 20
lda = LdaModel(corpus=corpus_TFIDF,
               id2word=id2word,
               num_topics=n, 
               random_state=100)

# Print the topics found by the model.
for t in lda.print_topics():
    print(t)

In [None]:
# Rebuild the bag-of-words and TF-IDF corpora from tokenized_docs
# (the same statements as in the training cell).
corpus_TDM = [id2word.doc2bow(doc) for doc in tokenized_docs]
tfidf = TfidfModel(corpus_TDM)
corpus_TFIDF = tfidf[corpus_TDM]

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Enable inline rendering, then build and show the interactive topic view
# from the gensim LDA model, the TF-IDF corpus and the dictionary.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda, corpus_TFIDF, id2word, mds='tsne')
pyLDAvis.display(vis)

### 2) 하이퍼파라미터 튜닝

- models.coherencemodel.CoherenceModel() : 토픽의 응집도(coherence)를 계산하는 모델. 토픽 개수별 응집도를 비교해 LDA의 최적 토픽 개수를 고르는 데 사용

In [None]:
from gensim import corpora
from gensim.models import LdaModel, TfidfModel
from gensim.models.coherencemodel import CoherenceModel

def compute_coherence_perplexity(tokenized_docs, end, start=2, step=3) :
    """Train one LDA model per topic count and score each of them.

    Topic counts are taken from ``range(start, end, step)``. Every model is
    trained on the TF-IDF weighted corpus and scored with c_v coherence and
    with ``LdaModel.log_perplexity``; one summary line per model is printed.

    Args:
        tokenized_docs: Documents as lists of tokens.
        end: Exclusive upper bound of the topic-count range.
        start: First topic count to try.
        step: Increment between topic counts.

    Returns:
        ``(corpus_TFIDF, id2word, model_list, coherence_values,
        perplexity_values)``; the three lists are ordered like the range.
    """
    id2word = corpora.Dictionary(tokenized_docs)
    bow_corpus = [id2word.doc2bow(tokens) for tokens in tokenized_docs]
    corpus_TFIDF = TfidfModel(bow_corpus)[bow_corpus]

    # One (topic count, model, coherence, log-perplexity) record per run.
    runs = []
    for num_topics in range(start, end, step):
        model = LdaModel(corpus_TFIDF, num_topics=num_topics, id2word = id2word)
        scorer = CoherenceModel(model=model,
                                texts=tokenized_docs,
                                dictionary=id2word,
                                coherence='c_v')
        coherence = scorer.get_coherence()
        perplexity = model.log_perplexity(corpus_TFIDF)
        runs.append((num_topics, model, coherence, perplexity))

    for t, _, c, p in runs :
        print("topic_n={}, coherence : {}, perplexity : {}".format(t,c,p))

    model_list = [run[1] for run in runs]
    coherence_values = [run[2] for run in runs]
    perplexity_values = [run[3] for run in runs]
    return corpus_TFIDF, id2word, model_list, coherence_values, perplexity_values

In [None]:
from gensim import corpora
from gensim.models import LdaModel, TfidfModel

tokenized_docs = get_news()
# Try the topic counts in range(10, 30, 10), i.e. 10 and 20.
corpus, id2word, model_list, coherence_values, perplexity_values = compute_coherence_perplexity(tokenized_docs, start=10, end=30, step=10)

In [None]:
# Pick the first trained model, i.e. the one for the smallest topic count tried.
lda_model = model_list[0]

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Enable inline rendering, then build and show the interactive topic view
# for the selected model.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word, mds='tsne')
pyLDAvis.display(vis)