# AI 전문가 교육과정 실습 2 - part Final

***
### NLP응용: 토픽 추출
Applied Natural Language Processing: Topic Modeling

강사: 차미영 교수 (카이스트 전산학부)    
조교: 신민기, 정현규 (카이스트 전산학부)

실습 담당: 신민기 (mingi.shin@kaist.ac.kr)

# LDA on Korean Data

한국어 데이터를 처리하기 위해 konlpy, openjdk, JPype, mecab 등을 설치한다.

In [None]:
!pip install pyLDAvis==3.2.2
!pip install konlpy

In [None]:
!apt-get update
!apt-get install g++ openjdk-8-jdk
!pip3 install konlpy JPype1-py3
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)
!pip3 install mecab-python

In [None]:
import glob
import pandas as pd
import matplotlib.pyplot as plt

import re
from konlpy.tag import Mecab
from wordcloud import WordCloud

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from pprint import pprint

import pyLDAvis
import pyLDAvis.gensim
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer

import warnings
warnings.filterwarnings(action='ignore')

In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Data load

데이터: 코로나 바이러스와 관련된 키워드를 언급한 한국어 트위터 데이터

먼저 zip 파일을 다운 받고, "세션 저장소에 업로드" 기능을 이용해 업로드해주세요.

In [None]:
# 압축 해제
!unzip ./data_topic.zip

In [None]:
file_list = glob.glob("./data/*.json")

In [None]:
total_df = pd.read_json(file_list[0], lines=True)
total_df.reset_index(inplace = True)
original_df = total_df.copy()

In [None]:
total_df.head(3)

In [None]:
total_df.describe()

# Pre-processing function

In [None]:
# Basic Cleaning Text Function
# Ordered (pattern, replacement) steps applied by CleanText.
# Order matters: retweet headers and mentions are deleted outright first, URLs
# are stripped while their punctuation is still intact, and the broad
# character-class passes run last.
_CLEAN_STEPS = (
    # Retweet header, e.g. "RT @user: "
    (re.compile(r'RT @[\w_]+: '), ''),
    # Mentions, e.g. "@user"
    (re.compile(r'@[\w_]+'), ''),
    # URLs that start with http:// or https://
    (re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"), ' '),
    # URLs written without a scheme, e.g. "example.com/path"
    (re.compile(r"[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{2,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)"), ' '),
    # Hashtags made of ASCII letters, digits and underscores
    (re.compile(r'[#]+[0-9a-zA-Z_]+'), ' '),
    # HTML-entity residue (ex. &lt, &gt, etc)
    (re.compile(r'[&]+[a-z]+'), ' '),
    # Everything except digits, ASCII letters, consonant jamo (ㄱ-ㅎ) and Hangul syllables
    (re.compile(r'[^0-9a-zA-Zㄱ-ㅎ가-힣]'), ' '),
    # Numbers (optional step: drop this entry to keep digits)
    (re.compile(r'\d+'), ' '),
    # English letters (optional step: drop this entry to keep ASCII words)
    (re.compile(r'[a-zA-Z]'), ' '),
)


def CleanText(readData):
    """Strip Twitter noise from a raw tweet and return plain Korean text.

    Applies the substitutions in ``_CLEAN_STEPS`` in order (retweet header,
    mentions, URLs, ASCII hashtags, entity residue, special characters, digits,
    English letters) and then collapses every run of whitespace, including
    newlines, into a single space.

    Args:
        readData: Raw tweet text. Any non-string value (for example the NaN
            that pandas uses for a missing cell, or None) is treated as an
            empty tweet instead of raising inside ``re``.

    Returns:
        The cleaned text with single spaces between the remaining words and no
        leading or trailing whitespace; ``''`` when nothing is left.
    """
    if not isinstance(readData, str):
        return ''

    text = readData
    for pattern, replacement in _CLEAN_STEPS:
        text = pattern.sub(replacement, text)

    # split() with no argument also swallows newlines and tabs, so no separate
    # newline replacement is needed before re-joining.
    return ' '.join(text.split())

In [None]:
def preprocessing_mecab(readData):
    """Clean a raw tweet and return the surface forms of its content-bearing morphemes.

    The text is first normalised with ``CleanText`` and then tagged with the
    module-level ``mecab`` tagger. A morpheme is kept only when all of the
    following hold:

    * its POS tag is not a particle, sign, verb ending, auxiliary verb or number;
    * its surface form is not in ``korean_stopwords`` or ``my_korean_stopwords``
      (column "형태");
    * its surface form is longer than one character.

    Args:
        readData: Raw tweet text, passed unchanged to ``CleanText``.

    Returns:
        A list of surface-form strings in their original order.
    """
    #### Clean text
    sentence = CleanText(readData)

    #### Tokenize into (surface, POS tag) pairs
    morphs = mecab.pos(sentence)

    # refer https://konlpy-ko.readthedocs.io/ko/v0.4.3/morph/ for more details
    JOSA = ["JKS", "JKC", "JKG", "JKO", "JKB", "JKV", "JKQ", "JX", "JC"]  # particles
    SIGN = ["SF", "SE", "SSO", "SSC", "SC", "SY"]  # punctuation / symbols
    TERMINATION = ["EP", "EF", "EC", "ETN", "ETM"]  # verb endings (eomi)
    SUPPORT_VERB = ["VX"]  # auxiliary verbs
    NUMBER = ["SN"]
    excluded_tags = set(JOSA + SIGN + TERMINATION + SUPPORT_VERB + NUMBER)

    # Build the stopword lookup once per call. Doing the DataFrame -> list
    # conversion inside the filter would repeat it for every single token and
    # make each membership test a linear scan.
    stopwords = set(korean_stopwords["형태"].tolist())
    stopwords.update(my_korean_stopwords["형태"].tolist())

    # If you want to save only Nouns, use mecab.nouns(sentence) instead.

    # All filters are independent per-token predicates, so one pass suffices.
    return [
        surface
        for surface, tag in morphs
        if tag not in excluded_tags
        and surface not in stopwords
        and len(surface) != 1
    ]

In [None]:
# Korean Stopwords Load
# Tab-separated file; the three columns are named 형태 (surface form),
# 품사 (part of speech) and 비율 (ratio). Only the 형태 column is read by
# preprocessing_mecab.
korean_stopwords = pd.read_csv("./data/korean_stopwords.txt", delimiter='\t', names=["형태", "품사", "비율"])

# Add Custom Korean Stopwords
# Each row is [surface form, POS tag]; some rows give only the surface form.
# NOTE(review): the one-element rows rely on pandas filling the missing 품사
# value when the frame is built — confirm with the installed pandas version.
my_data = [["님", "NNG"], ["들", "XSN"], ["ㅋㅋㄱㅋㄱㅋ", "NNG"],
           ["오늘", "NNG"], ["얘기", "NNG"], ["ㅠㅠ", "NNG"], ["없이", ], ["딱히", ],
           ['ㅋㅋ', ], ['ㅋㅋㅋ', ], ["그런데", ], ["누구", ], ["여기저기", ]]

my_korean_stopwords = pd.DataFrame(my_data, columns = ['형태', '품사'])

# Main
# Shared tagger instance used by preprocessing_mecab via the global name `mecab`.
mecab = Mecab(dicpath="/usr/local/lib/mecab/dic/mecab-ko-dic") # Mecab Dictionary Path

In [None]:
SAMPLE_TEXT = "RT @boxplus01: 美언론, 'abc한국 코로나 확산주범은 신천지와 보수세력' https://t.co/Phq0l48aUm"
print("Before preprocessing : {}".format(SAMPLE_TEXT))
print("After preprocessing : {}".format(preprocessing_mecab(SAMPLE_TEXT)))

In [None]:
total_df['tweet'] = total_df['tweet'].apply(lambda x: preprocessing_mecab(x))

In [None]:
total_df['tweet']

# Find topics

In [None]:
data_lemmatized = total_df['tweet'].tolist()

In [None]:
id2word = corpora.Dictionary(data_lemmatized)
texts = data_lemmatized

corpus = [id2word.doc2bow(text) for text in texts]
print(corpus[:1])

In [None]:
# Human-readable view of the first document's bag-of-words as (token, count)
# pairs. The loop variable is named token_id so the builtin `id` is not shadowed.
[[(id2word[token_id], freq) for token_id, freq in doc_bow] for doc_bow in corpus[:1]]

In [None]:
# Train a 20-topic LDA model on the bag-of-words corpus with a fixed random seed.
# NOTE(review): per gensim's LdaModel documentation, chunksize is the number of
# documents per training chunk, passes is the number of sweeps over the corpus,
# alpha='auto' learns an asymmetric prior from the data, and per_word_topics=True
# makes model[bow] return per-word topic information as well — confirm against
# the installed gensim version.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=2,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

In [None]:
# Compute Perplexity
# NOTE(review): gensim's log_perplexity returns the per-word likelihood bound
# (a negative log value), not the perplexity itself. For the number printed
# here, HIGHER (closer to 0) is better; the perplexity is 2 ** (-bound), for
# which lower is better — confirm against the gensim documentation.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # prints the log-likelihood bound, see note above

# Compute Coherence Score
# c_v coherence is computed from the tokenized texts and the dictionary.
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

# Visualization

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

# Choose the number of topics

In [None]:
from tqdm import tqdm

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """Train one LDA model per candidate topic count and score it with c_v coherence.

    Topic counts are taken from ``range(start, limit, step)``. Every model is
    trained with the same fixed hyper-parameters and random seed, so only the
    number of topics varies between them.

    Args:
        dictionary: gensim Dictionary mapping token ids to tokens; used both to
            train each model and to compute its coherence.
        corpus: Bag-of-words corpus to train on.
        texts: Tokenized documents used by the coherence model.
        limit: Exclusive upper bound for the number of topics.
        start: First number of topics to try.
        step: Increment between successive topic counts.

    Returns:
        ``(model_list, coherence_values)``: two parallel lists holding the
        trained models and their coherence scores, in the order tried.
    """
    coherence_values = []
    model_list = []
    for num_topics in tqdm(range(start, limit, step)):
        # Use the `dictionary` argument rather than a notebook-level global so
        # the model and its coherence score are based on the same vocabulary.
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=num_topics,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=2,
                                                alpha='auto',
                                                per_word_topics=True)

        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

In [None]:
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=2, limit=35, step=6)

In [None]:
# Plot the coherence score against the number of topics.
# start/limit/step must match the values passed to compute_coherence_values so
# that x lines up one-to-one with coherence_values.
limit, start, step = 35, 2, 6
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# legend() expects a sequence of labels; a bare string would be iterated
# character by character, so the label is wrapped in a list.
plt.legend(["coherence_values"], loc='best')
plt.show()

# Finding the dominant topic in each sentence

In [None]:
optimal_model = model_list[5]

In [None]:
optimal_model.per_word_topics = False

In [None]:
# Compute Perplexity
# NOTE(review): gensim's log_perplexity returns the per-word likelihood bound
# (a negative log value), not the perplexity itself. For the number printed
# here, HIGHER (closer to 0) is better; the perplexity is 2 ** (-bound), for
# which lower is better — confirm against the gensim documentation.
print('\nPerplexity: ', optimal_model.log_perplexity(corpus))  # prints the log-likelihood bound, see note above

# Compute Coherence Score
# c_v coherence is computed from the tokenized texts and the dictionary.
coherence_model_lda = CoherenceModel(model=optimal_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=None):
    """Build a per-document table of the dominant topic, its weight and its keywords.

    For every document in ``corpus`` the topic with the highest probability is
    selected (the first one wins on ties). Documents for which the model
    returns no topics contribute no row.

    Args:
        ldamodel: Trained gensim LDA model. Models created with
            ``per_word_topics=True`` are supported; only the document-topic
            part of their output is used.
        corpus: Bag-of-words corpus, one entry per document.
        texts: Original documents; appended as the last column, aligned by
            positional index via ``pd.concat(axis=1)``.

    Returns:
        DataFrame with the columns ``Dominant_Topic`` (int topic id),
        ``Perc_Contribution`` (probability rounded to 4 decimals) and
        ``Topic_Keywords`` (comma-separated top words), followed by one column
        holding ``texts``.
    """
    rows = []
    keywords_by_topic = {}  # show_topic() result per topic id, computed once
    # With per_word_topics enabled the model yields a tuple whose first item is
    # the (topic_id, probability) list; otherwise it yields that list directly.
    has_word_topics = getattr(ldamodel, "per_word_topics", False)

    # Get main topic in each document
    for doc_topics in tqdm(ldamodel[corpus]):
        if has_word_topics:
            doc_topics = doc_topics[0]
        if not doc_topics:
            continue

        # max() returns the first maximal pair, matching a stable descending sort.
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])

        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        rows.append([int(topic_num), round(prop_topic, 4), keywords_by_topic[topic_num]])

    # Build the frame in one go: DataFrame.append was removed in pandas 2.0 and
    # copies the whole frame on every call.
    sent_topics_df = pd.DataFrame(
        rows, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    )

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df


df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=total_df['tweet'])

In [None]:
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

In [None]:
df_dominant_topic.head(50)