# 04. 파이썬을 이용한 토픽모델링(LDA)

* 싸이그래머 / 어바웃 파이썬
* 김무성

# 차례
* 토픽모델링 & LDA 
* Gensim을 이용한 토픽 모델링(LDA) 예제
* 20 Newsgroups 예제
    - DataSet
        - Data Download
        - Exploring the dataset
    - LDA with Gensim
        - Loading and tokenizing the corpus
        - Creating the dictionary, and bag of words corpus
        - Fitting the LDA model
    - Visualizing the model with pyLDAvis   

# 토픽모델링 & LDA
* [1] 텍스트의 통계학: (3) 네 주제를 알라 - http://nullmodel.egloos.com/1958448
* [2] Topic Models : LDA and Correlated Topic Models - https://www.slideshare.net/clauwa/topic-models-lda-and-correlated-topic-models

--------------------------

# Gensim을 이용한 토픽 모델링(LDA) 예제

* [3] Complete Guide to Topic Modeling - https://nlpforhackers.io/topic-modeling/

### 데이터 준비 

#### 영어 예시

In [None]:
import nltk

# Fetch every NLTK resource the cells below rely on, not only the Brown corpus:
#   - 'brown'     : the example documents (nltk.corpus.brown)
#   - 'stopwords' : stopwords.words('english')
#   - 'punkt'     : tokenizer models used by word_tokenize
#   - 'punkt_tab' : name under which newer NLTK releases look up the same models
# nltk.download reports a failure instead of raising, so requesting a resource
# that a given NLTK version does not need is harmless.
for resource in ('brown', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
from nltk.corpus import brown


# One string per Brown corpus file: the file's words joined by single spaces.
data = [' '.join(brown.words(fileid)) for fileid in brown.fileids()]

# Number of documents, followed by a peek at the first five of them.
NO_DOCUMENTS = len(data)
print(NO_DOCUMENTS)
print(data[:5])

#### 한글 예시 (실습)

##### 다음 문서들을 이용해서 데이터를 만들자.
* https://gasazip.com/view.html?no=614736
* https://gasazip.com/1224697
* https://gasazip.com/view.html?no=599082
* https://gasazip.com/view.html?no=645465
* http://gasazip.com/view.html?no=643505
* https://gasazip.com/view.html?no=615362

In [None]:
# -- 코딩

### LDA 모델을 만들자

#### 영어 예시

In [None]:
import re
from gensim import models, corpora
from nltk import word_tokenize
from nltk.corpus import stopwords
 
# Number of topics the LDA model is asked to extract.
NUM_TOPICS = 10
# English stopwords, held in a set: clean_text tests every token for membership,
# which is a constant-time lookup in a set but a linear scan in the original list.
STOPWORDS = set(stopwords.words('english'))
 
def clean_text(text):
    """Lower-case and tokenize *text*, keeping only word-like, non-stopword tokens.

    A token is kept when it is not in ``STOPWORDS`` and starts with at least
    three characters that are ASCII letters or hyphens. ``re.match`` anchors
    only at the beginning of the token, so whatever follows those first three
    characters is not checked.

    Args:
        text: Raw document text.

    Returns:
        The surviving tokens, in their original order.
    """
    # Raw string literal: in a plain literal '\-' is an invalid escape sequence,
    # which modern Python reports with a warning. The compiled pattern is the same.
    return [
        token
        for token in word_tokenize(text.lower())
        if token not in STOPWORDS and re.match(r'[a-zA-Z\-][a-zA-Z\-]{2,}', token)
    ]
 
# gensim works on token lists, so tokenize every document and drop stopwords
tokenized_data = [clean_text(text) for text in data]

# Dictionary: maps each distinct token to an integer id
dictionary = corpora.Dictionary(tokenized_data)

# Bag-of-words corpus: one list of (word_id, count) pairs per document
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_data]

# Inspect the document at index 20: [(word_id, count), ...]
print(corpus[20])
# [(12, 3), (14, 1), (21, 1), (25, 5), (30, 2), (31, 5), (33, 1), (42, 1), (43, 2),  ...

# Train the LDA model on the bag-of-words corpus
lda_model = models.LdaModel(
    corpus=corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
)


#### 한글 예시 (실습)

In [None]:
# -- 코딩

### 추출된 토픽을 뿌려보자

#### 영어 예시

In [None]:

NUM_TOPICS = 10

print("LDA Model:")

for topic_id in range(NUM_TOPICS):
    # Summarize the topic by its 10 most representative words
    topic_summary = lda_model.print_topic(topic_id, 10)
    print("Topic #%s:" % topic_id, topic_summary)

print("=" * 20)

#### 한글 예시 (실습)

In [None]:
# -- 코딩

### 이제 모델을 이용해서, 새 문서(학습에 포함되지 않았던)의 토픽 분포를 파악해보자

#### 영어 예시

In [None]:
# A sentence that was not part of the training data
text = "The economy is working better than ever"

# Same preprocessing as the training documents, then map tokens to dictionary ids
query_tokens = clean_text(text)
bow = dictionary.doc2bow(query_tokens)

# Topic distribution of the new document as (topic_id, weight) pairs
topic_distribution = lda_model[bow]
print(topic_distribution)
# [(0, 0.020005183), (1, 0.020005869), (2, 0.02000626), (3, 0.020005472), (4, 0.020009108), (5, 0.020005926), (6, 0.81994385), (7, 0.020006068), (8, 0.020006327), (9, 0.020005994)]


#### 한글 예시 (실습)
* https://gasazip.com/view.html?no=636135

In [None]:
# -- 코딩

### 새 문서와 유사한 문서들을 바로 찾아보자 - similarity queries using topic models.

#### 영어 예시

In [None]:
from gensim import similarities

# Index the topic-space representation of every training document
lda_index = similarities.MatrixSimilarity(lda_model[corpus])

# Query the index with the topic distribution of the new document.
# The results get their own names: rebinding `similarities` would shadow the
# gensim module imported at the top of this cell, which breaks any later use
# of `similarities.MatrixSimilarity` in the session.
doc_scores = lda_index[lda_model[bow]]
# Rank (document_index, score) pairs from most to least similar
ranked_docs = sorted(enumerate(doc_scores), key=lambda item: -item[1])

# Top most similar documents:
print(ranked_docs[:10])
# [(104, 0.87591344), (178, 0.86124849), (31, 0.8604598), (77, 0.84932965), (85, 0.84843522), (135, 0.84421808), (215, 0.84184396), (353, 0.84038532), (254, 0.83498049), (13, 0.82832891)]

# Show the first 1000 characters of the best-matching document
document_id, similarity = ranked_docs[0]
print(data[document_id][:1000])
 

#### 한글 예시 (실습)

In [None]:
# -- 코딩

-------------------

# 20 Newsgroups 예제

# DataSet
* [4] 20 Newsgroups Dataset - http://qwone.com/~jason/20Newsgroups/

## Data Download

In [None]:
%%bash
# Download and unpack the 20 Newsgroups "bydate" archive into 04_data/,
# skipping the download when the training directory already exists.
mkdir -p 04_data
# Enter the directory created above (the original entered "data", which this
# cell never creates); every later cell reads from 04_data/.
pushd 04_data
if [ -d "20news-bydate-train" ]
then
  echo "The data has already been downloaded..."
else
  wget http://qwone.com/%7Ejason/20Newsgroups/20news-bydate.tar.gz
  tar xfv 20news-bydate.tar.gz
  # The archive is no longer needed once extracted
  rm 20news-bydate.tar.gz
fi
echo "Lets take a look at the groups..."
ls 20news-bydate-train/
popd

## Exploring the dataset

Each group dir has a set of files:

In [None]:
# Long listing of the sci.space training directory, trimmed to its last five lines
ls -lah 04_data/20news-bydate-train/sci.space | tail  -n 5

In [None]:
# Show the first 20 lines of one sci.space post. The option is placed before
# the file name because only GNU head accepts options after the operand.
!head -n 20 04_data/20news-bydate-train/sci.space/61422

# LDA with Gensim

* [5] An Introduction to gensim: "Topic Modelling for Humans" - https://www.slideshare.net/sandinmyjoints/an-introduction-to-gensim-topic-modelling-for-humans

## Loading and tokenizing the corpus

In [None]:
from glob import glob
import re
import string
import funcy as fp
from gensim import models
from gensim.corpora import Dictionary, MmCorpus
import nltk
import pandas as pd

In [None]:
# Deliberately rough patterns -- good enough for a quick tokenizer.
# Anything shaped like an e-mail address (lines are lower-cased before matching).
EMAIL_REGEX = re.compile(r"[a-z0-9\.\+_-]+@[a-z0-9\._-]+\.[a-z]*")
# Any character other than a lower-case letter, a space, an apostrophe or '#'.
FILTER_REGEX = re.compile(r"[^a-z '#]")
# (pattern, replacement) pairs applied in this order: addresses first collapse
# into the "#email" placeholder, then every remaining unwanted character
# becomes a space.
TOKEN_MAPPINGS = [
    (EMAIL_REGEX, "#email"),
    (FILTER_REGEX, ' '),
]

def tokenize_line(line):
    """Split one line of text into normalized tokens.

    The line is lower-cased, each ``(pattern, replacement)`` pair from
    ``TOKEN_MAPPINGS`` is applied in order, and the result is split on
    whitespace.

    Args:
        line: A single line of raw text.

    Returns:
        List of token strings.
    """
    normalized = line.lower()
    for pattern, substitute in TOKEN_MAPPINGS:
        normalized = pattern.sub(substitute, normalized)
    return normalized.split()
    
def tokenize(lines, token_size_filter=2):
    """Tokenize every line and discard short tokens.

    Args:
        lines: Iterable of text lines.
        token_size_filter: A token is kept only when it is strictly longer
            than this many characters.

    Returns:
        Flat list of the kept tokens from all lines, in order of appearance.
    """
    return [
        token
        for line in lines
        for token in tokenize_line(line)
        if len(token) > token_size_filter
    ]
    

def load_doc(filename):
    """Read one newsgroup post and return its metadata, raw lines and tokens.

    Args:
        filename: Path to the post. Its last two components are taken as the
            group directory and the document id.

    Returns:
        dict with the keys ``'group'``, ``'doc'`` (list of raw lines),
        ``'tokens'`` (output of ``tokenize``) and ``'id'``.

    Raises:
        ValueError: If *filename* has fewer than two path components.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    # Path.parts splits on the platform's own separators; a hard-coded '/'
    # mis-parses the paths glob returns on systems that use a different one.
    group, doc_id = Path(filename).parts[-2:]
    # errors='ignore' drops undecodable bytes instead of raising
    with open(filename, errors='ignore') as f:
        doc = f.readlines()
    return {'group': group,
            'doc': doc,
            'tokens': tokenize(doc),
            'id': doc_id}


# Load every training post and index the resulting table by (group, id)
doc_paths = glob('04_data/20news-bydate-train/*/*')
records = [load_doc(path) for path in doc_paths]
docs = pd.DataFrame(records).set_index(['group', 'id'])
docs.head()

## Creating the dictionary, and bag of words corpus

<img src="04_figures/bow.jpg" width=600 />

In [None]:

def nltk_stopwords():
    """Return NLTK's English stopword list as a set."""
    english_words = nltk.corpus.stopwords.words('english')
    return set(english_words)

def prep_corpus(docs, additional_stopwords=frozenset(), no_below=5, no_above=0.5):
    """Build a filtered gensim dictionary and the matching bag-of-words corpus.

    Stopwords are removed from the dictionary first, then ``filter_extremes``
    is applied with the given thresholds and no cap on vocabulary size
    (``keep_n=None``).

    Args:
        docs: Collection of token lists, one per document. It is iterated
            twice (dictionary, then corpus), so it must be re-iterable.
        additional_stopwords: Extra tokens to drop on top of NLTK's English
            stopwords.
        no_below: Forwarded to ``Dictionary.filter_extremes``.
        no_above: Forwarded to ``Dictionary.filter_extremes``.

    Returns:
        ``(dictionary, corpus)`` where ``corpus`` holds one ``doc2bow`` result
        per document.
    """
    print('Building dictionary...')
    dictionary = Dictionary(docs)
    stopwords = nltk_stopwords().union(additional_stopwords)
    # Collect ids only for stopwords that actually occur in the dictionary;
    # a bare token2id.get would hand None to filter_tokens for absent words.
    stopword_ids = [
        dictionary.token2id[word]
        for word in stopwords
        if word in dictionary.token2id
    ]
    dictionary.filter_tokens(bad_ids=stopword_ids)
    dictionary.compactify()
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
    dictionary.compactify()

    print('Building corpus...')
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    return dictionary, corpus


In [None]:
# Build the filtered dictionary and bag-of-words corpus from the 'tokens' column.
# This rebinds `dictionary` and `corpus`, replacing the Brown-corpus versions
# created earlier in the notebook.
dictionary, corpus = prep_corpus(docs['tokens'])

In [None]:
# Persist the bag-of-words corpus and the dictionary under 04_data/ so they
# can be reloaded without rebuilding them.
MmCorpus.serialize('04_data/newsgroups.mm', corpus)
dictionary.save('04_data/newsgroups.dict')

## Fitting the LDA model

In [None]:
%%time
lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=50, passes=10)
                                      
lda.save('04_data/newsgroups_50_lda.model')

In [None]:
# Show the 5 most contributing words for 20 of the model's topics
# (per the original note, the 20 topics are a random selection).
lda.print_topics(num_topics=20, num_words=5)

# Visualizing the model with pyLDAvis


In [None]:
# pyLDAvis 3.x renamed its gensim adapter from `pyLDAvis.gensim` to
# `pyLDAvis.gensim_models`; try the new name first and fall back to the old
# one so the notebook runs with either release line.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis
import pyLDAvis

In [None]:
# Prepare the visualization data from the trained model, its corpus and
# dictionary, then display it in the notebook.
vis_data = gensimvis.prepare(lda, corpus, dictionary)
pyLDAvis.display(vis_data)

# 참고자료
* [1] 텍스트의 통계학: (3) 네 주제를 알라 - http://nullmodel.egloos.com/1958448
* [2] Topic Models : LDA and Correlated Topic Models - https://www.slideshare.net/clauwa/topic-models-lda-and-correlated-topic-models
* [3] Complete Guide to Topic Modeling - https://nlpforhackers.io/topic-modeling/
* [4] 20 Newsgroups Dataset - http://qwone.com/~jason/20Newsgroups/
* [5] An Introduction to gensim: "Topic Modelling for Humans" - https://www.slideshare.net/sandinmyjoints/an-introduction-to-gensim-topic-modelling-for-humans
* [6] Visualizing a Gensim model - http://nbviewer.jupyter.org/github/bmabey/pyLDAvis/blob/master/notebooks/Gensim%20Newsgroup.ipynb