# LDA - Topic Modeling

In [1]:
# Toy corpus: each inner list is one document, given as a list of keyword strings.
documents = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"]
]

In [2]:
from collections import defaultdict
import random

# Smoothing constants (a: added to document-topic counts, b: added to
# topic-term counts) and the number of topics.
a, b = 0.1, 0.1
K = 3

# Seed before drawing so the initial topic assignment is repeatable.
random.seed(0)

docTermTopicMat = []
Vocaburary = []

for d in documents:
    termTopic = []
    for t in d:
        token = t.lower()
        # One [term, topic] pair per token; the topic is drawn from range(K).
        termTopic.append([token, random.randrange(K)])
        Vocaburary.append(token)
    docTermTopicMat.append(termTopic)

# Collapse the collected tokens to the unique terms.
Vocaburary = list(set(Vocaburary))

# M: number of documents, N: number of unique terms.
M, N = len(docTermTopicMat), len(Vocaburary)

In [3]:
# Leftover loop variable from the setup cell: the [term, topic] pairs of the
# last document that was processed.
termTopic

[['libsvm', 0], ['regression', 0], ['support vector machines', 2]]

In [4]:
# Number of unique lower-cased terms across all documents.
len(Vocaburary)

36

In [5]:
# Count tables derived from the current [term, topic] assignments:
#   topicTermMatrix[topic][term] -> how many times `term` is assigned to `topic`
#   docTopicMatrix[doc][topic]   -> how many tokens of document `doc` are
#                                   assigned to `topic`
topicTermMatrix = defaultdict(lambda: defaultdict(int))
docTopicMatrix = defaultdict(lambda: defaultdict(int))

for i, termTopic in enumerate(docTermTopicMat):
    for row in termTopic:
        term, topic = row  # row[0] is the term, row[1] is its topic
        topicTermMatrix[topic][term] += 1
        docTopicMatrix[i][topic] += 1

# theta = distribution of topics within a document
# phi   = distribution of terms within a topic
#
# Quantities the sampler needs:
# 1. Regardless of document, how many times each term is assigned to each topic (+ b)
# 2. Regardless of document, how many times a specific term is assigned to topic k (+ b)
# 3. Across the m documents, how many tokens are assigned to topic k (+ a)

In [7]:
def topicAssign(m, l):
    """Draw a new topic index for term ``l`` in document ``m``.

    Every topic k in range(K) is weighted by
    topicLikelihood(k, l) * docLikelihood(m, k), and one index is drawn with
    probability proportional to its weight using a single random.random() call.

    Parameters
    ----------
    m : document index, passed through to docLikelihood.
    l : term, passed through to topicLikelihood.

    Returns
    -------
    int
        The selected topic index.
    """
    weights = [topicLikelihood(k, l) * docLikelihood(m, k) for k in range(K)]

    # Scale a uniform draw to the total weight, then walk the weights until
    # the running remainder is used up.
    remaining = sum(weights) * random.random()
    for topic, weight in enumerate(weights):
        remaining -= weight
        if remaining <= 0:
            return topic

    # Floating-point round-off can leave a tiny positive remainder after the
    # last subtraction; fall back to the last topic explicitly instead of
    # relying on a loop variable leaking out of an earlier loop.
    return K - 1

In [8]:
def topicLikelihood(k, l):
    """Smoothed share of topic ``k``'s assignments that belong to term ``l``.

    Computed as (count of l in topic k + b) / (total count in topic k + b * N).
    """
    termCounts = topicTermMatrix[k]
    numerator = termCounts[l] + b
    denominator = sum(termCounts.values()) + (b * N)
    return numerator / denominator

In [9]:
def docLikelihood(m, k):
    """Smoothed count of tokens in document ``m`` assigned to topic ``k``.

    The value is count + a and is not normalised.
    """
    topicCounts = docTopicMatrix[m]
    return topicCounts[k] + a

In [10]:
# Sweep the sampler over every token `_iter` times.
_iter = 1000
for _ in range(_iter):
    for i, termTopic in enumerate(docTermTopicMat):
        for row in termTopic:
            term, oldTopic = row

            # Remove the token from both count tables before resampling it.
            topicTermMatrix[oldTopic][term] -= 1
            docTopicMatrix[i][oldTopic] -= 1

            k = topicAssign(i, term)
            row[1] = k  # store the newly drawn topic

            # Put the token back under its new topic.
            topicTermMatrix[k][term] += 1
            docTopicMatrix[i][k] += 1

In [11]:
# Per-topic token counts for document 0.
# NOTE(review): the recorded output contains a negative count (-1). The sweep
# above decrements and re-increments the same token, so a clean top-to-bottom
# run should not go below zero — presumably stale kernel state; re-run from a
# fresh kernel to confirm.
docTopicMatrix[0]

defaultdict(int, {1: -1, 0: 7, 2: 0})

In [12]:
# Term counts per topic after the sweeps.
topicTermMatrix

defaultdict(<function __main__.<lambda>()>,
            {1: defaultdict(int,
                         {'hadoop': -1,
                          'big data': 0,
                          'java': 1,
                          'storm': 0,
                          'cassandra': 0,
                          'nosql': 0,
                          'mongodb': 0,
                          'scipy': 1,
                          'r': 4,
                          'machine learning': 0,
                          'programming languages': 1,
                          'statistics': 3,
                          'probability': 3,
                          'mahout': 0,
                          'neural networks': 0,
                          'deep learning': 0,
                          'artificial intelligence': 0,
                          'python': 3,
                          'hbase': 0,
                          'spark': 0,
                          'postgres': 0,
                          'scikit-learn'

In [13]:
# Topic counts per document after the sweeps.
docTopicMatrix

defaultdict(<function __main__.<lambda>()>,
            {0: defaultdict(int, {1: -1, 0: 7, 2: 0}),
             1: defaultdict(int, {1: 0, 2: 0, 0: 5}),
             2: defaultdict(int, {2: 0, 0: 2, 1: 4}),
             3: defaultdict(int, {1: 5, 2: 0, 0: 0}),
             4: defaultdict(int, {1: 0, 0: 0, 2: 4}),
             5: defaultdict(int, {2: 0, 1: 6, 0: 0}),
             6: defaultdict(int, {1: 4, 2: 0, 0: 0}),
             7: defaultdict(int, {0: 0, 2: 4, 1: 0}),
             8: defaultdict(int, {2: 4, 1: 0, 0: 0}),
             9: defaultdict(int, {0: 4, 2: 0, 1: 0}),
             10: defaultdict(int, {2: 0, 0: 0, 1: 3}),
             11: defaultdict(int, {0: 0, 2: 2, 1: 2}),
             12: defaultdict(int, {0: 0, 2: 0, 1: 3}),
             13: defaultdict(int, {2: 0, 0: 5, 1: 0}),
             14: defaultdict(int, {0: 0, 2: 3, 1: 0})})

In [14]:
# For each topic, show its four highest-count terms.
for k, termList in topicTermMatrix.items():
    print(k, "번째 토픽")
    ranked = list(termList.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    print(ranked[:4])

1 번째 토픽
[('r', 4), ('statistics', 3), ('probability', 3), ('python', 3)]
0 번째 토픽
[('hbase', 3), ('postgres', 2), ('big data', 2), ('hadoop', 2)]
2 번째 토픽
[('regression', 2), ('neural networks', 2), ('artificial intelligence', 2), ('deep learning', 2)]


# 1. 내 수집 데이터(국제, 생활) => 토픽 모델링

In [15]:
from os import listdir
def fileids(path):
    """Return the paths of all entries directly inside ``path``, sorted by name.

    Parameters
    ----------
    path : directory to list. A trailing separator is optional.

    Returns
    -------
    list of str
        ``path`` joined with each entry name. The names are sorted because
        os.listdir returns them in arbitrary order, which would otherwise make
        document indices differ between runs.
    """
    from os.path import join  # local import: only listdir is imported in this cell

    return [join(path, name) for name in sorted(listdir(path))]

In [16]:
def filecontent(file):
    """Read the file at path ``file`` as UTF-8 text and return all of it."""
    with open(file, encoding='utf-8') as handle:
        return handle.read()

In [17]:
from string import punctuation
import re

def makePattern():
    """Build the compiled regular expressions used for text clean-up.

    Returns
    -------
    dict
        Maps a name to a compiled pattern:

        'punc'       - one ASCII punctuation character (string.punctuation)
        'stop'       - a run of 7 or more ASCII letters/digits
        'email'      - an e-mail-like token such as name@host.tld
        'url'        - a dotted host name such as host.tld
        'nonkorean'  - a run of characters other than Hangul syllables and digits
        'whitespace' - a run of 2 or more whitespace characters

    Notes
    -----
    'whitespace' only collapses whitespace. Callers that want non-Korean
    characters removed must apply 'nonkorean' explicitly.
    """
    pattern = dict()

    # Punctuation
    pattern['punc'] = re.compile(r'[{0}]'.format(re.escape(punctuation)))

    # Stop words: long ASCII alphanumeric runs
    pattern['stop'] = re.compile(r'[A-Za-z0-9]{7,}')

    # E-mail. The dot is escaped so it matches a literal '.'; an unescaped '.'
    # also matches spaces and made the pattern swallow the words that follow
    # an address.
    pattern['email'] = re.compile(r'\w{2,}@\w{2,}(?:\.\w{2,})+')

    # Domain. At least one literal '.' is required; with an unescaped,
    # optional '.' the pattern matched any two adjacent words.
    pattern['url'] = re.compile(r'\w{2,}(?:\.\w{2,})+')

    # Anything other than Hangul syllables and digits
    pattern['nonkorean'] = re.compile(r'[^가-힣0-9]+')

    # Whitespace runs (this key was previously bound to the non-Korean pattern)
    pattern['whitespace'] = re.compile(r"\s{2,}")

    return pattern

In [23]:
from collections import defaultdict
from nltk.tokenize import word_tokenize
from konlpy.tag import Komoran
# content = filecontent(fileids('./news_crawl_project/')[-2])
# Shared Komoran analyzer instance (used by indexing) and the compiled
# clean-up patterns (used by punc_stop).
ma = Komoran()
pattern = makePattern()

def punc_stop(file):
    """Clean raw article text before tokenisation.

    Applies, in order, the 'email', 'punc', 'stop', 'nonkorean' and
    'whitespace' patterns from the module-level ``pattern`` dict, replacing
    every match with a single space.

    Parameters
    ----------
    file : str
        The text to clean (the content of a file, not a path).

    Returns
    -------
    str
        The cleaned text.
    """
    # 'nonkorean' is applied explicitly so that stripping non-Korean
    # characters does not depend on which regex the 'whitespace' key is bound
    # to. Applying it before 'whitespace' leaves only single spaces between
    # Hangul/digit runs, so the final step has nothing left to change.
    for name in ('email', 'punc', 'stop', 'nonkorean', 'whitespace'):
        file = pattern[name].sub(' ', file)
    return file

def indexing(file):
    """Collect the nouns found in a cleaned text.

    The text is split with nltk's word_tokenize. Each distinct token is
    analysed once with the module-level Komoran instance ``ma``. Every
    morpheme longer than one character whose tag starts with 'N' is counted.

    Parameters
    ----------
    file : str
        The text to index (the content of a file, not a path).

    Returns
    -------
    defaultdict(int)
        Maps each noun to the number of times it occurred across the analyses
        of the distinct tokens. Repeated tokens are analysed only once.
    """
    nounCounts = defaultdict(int)

    # dict.fromkeys drops repeated tokens while keeping first-seen order, so
    # the nouns are inserted in the same order as before.
    for token in dict.fromkeys(word_tokenize(file)):
        for morph, tag in ma.pos(token):
            if len(morph) > 1 and tag.startswith('N'):
                nounCounts[morph] += 1

    return nounCounts  # nouns only, for now

In [24]:
# Index every article whose file name starts with one of the wanted
# two-character section labels; documents are numbered in listing order.
documentList = defaultdict(lambda: defaultdict(int))
idx = 0
for file in fileids('./news_crawl_project/'):
    section = file.split('/')[-1][:2]
    if section not in ["정치", "경제", "사회"]:
        continue
    documentList[idx] = indexing(punc_stop(filecontent(file)))
    idx += 1
    # Progress message every 100 indexed documents.
    if idx % 100 == 0:
        print(idx, '진행중')

100 진행중
200 진행중
300 진행중
400 진행중
500 진행중
600 진행중
700 진행중


In [25]:
# Number of distinct nouns indexed for document 0.
# NOTE(review): this expression evaluates to an integer, but the recorded
# output below is the noun list itself — the saved output looks stale; re-run
# to confirm.
len(list(documentList[0]))

['앵커',
 '삼성그룹',
 '노조',
 '파괴',
 '의혹',
 '검찰',
 '지금',
 '합병',
 '에버랜드',
 '수사',
 '결과',
 '그룹',
 '지휘부',
 '역할',
 '미래',
 '전략',
 '계열사',
 '노무',
 '담당자',
 '조직',
 '개입',
 '판단',
 '저희',
 '노사',
 '문건',
 '보도',
 '와해',
 '제기',
 '실체',
 '이번',
 '오너',
 '일가',
 '여부',
 '주목',
 '김선미',
 '기자',
 '삼성',
 '설립',
 '공작',
 '당시',
 '간부',
 '미행',
 '술자리',
 '확인',
 '경찰',
 '음주운전',
 '혐의',
 '신고',
 '알코올',
 '농도',
 '수준',
 '체포',
 '실패',
 '사찰',
 '포차',
 '운행',
 '사실',
 '이후',
 '차량',
 '번호',
 '촬영',
 '의뢰',
 '해당',
 '해고',
 '작성',
 '적시',
 '조장희',
 '부위원장',
 '문제',
 '인력',
 '주동자',
 '컨트롤',
 '타워',
 '주도',
 '동참',
 '전인',
 '직원',
 '동원',
 '어용',
 '신고서',
 '서류',
 '대신',
 '언론',
 '대응',
 '방법',
 '복수',
 '회사',
 '교섭',
 '창구',
 '악용',
 '임금',
 '협약',
 '오늘',
 '강경',
 '삼성전자',
 '부사장',
 '이모',
 '전무',
 '재판',
 '영상',
 '디자인',
 '이정',
 '취재',
 '후원',
 '편집',
 '경화',
 '클릭',
 '정신',
 '환자',
 '상담',
 '흉기',
 '의사',
 '사망',
 '운영위',
 '박범계',
 '의원',
 '한마디',
 '축구',
 '천재',
 '사비',
 '망언',
 '한국',
 '아시안컵',
 '탈락',
 '12월',
 '뉴스룸',
 '한번',
 '이어',
 '보기',
 '비트',
 '레전드',
 '콘텐트',
 '기사',
 '저작권법',
 '보호',
 '무단',
 '전재',
 '복사',
 

In [28]:
# Sampler setup for the news corpus: smoothing constants, number of topics,
# random initial topic assignment, and freshly built count tables.
a = 0.1
b = 0.1
K = 3

docTermTopicMat = list()
Vocaburary = list()

random.seed(0)

for d in documentList:
    termTopic = list()
    for t in list(documentList[d]):
        # One [term, topic] pair per distinct noun of the document.
        termTopic.append([t.lower(), random.randrange(K)])
        Vocaburary.append(t.lower())
    docTermTopicMat.append(termTopic)
Vocaburary = list(set(Vocaburary))

M = len(docTermTopicMat)
N = len(Vocaburary)

# Rebuild the count tables for this corpus. Without this they still hold the
# counts of the previous corpus, and the sampling sweep would decrement counts
# that were never incremented, producing negative counts and negative
# sampling weights.
topicTermMatrix = defaultdict(lambda: defaultdict(int))
docTopicMatrix = defaultdict(lambda: defaultdict(int))

for i, termTopic in enumerate(docTermTopicMat):
    for term, topic in termTopic:
        topicTermMatrix[topic][term] += 1
        docTopicMatrix[i][topic] += 1

In [None]:
# `tqdm.tqdm_notebook` is a deprecated wrapper; import the notebook progress
# bar from `tqdm.notebook` and keep the name this cell already uses.
from tqdm.notebook import tqdm as tqdm_notebook

# Sweep the sampler over every token `_iter` times, with a progress bar over
# the sweeps.
_iter = 5
bar_total = tqdm_notebook(range(_iter))

for _ in bar_total:
    for i, termTopic in enumerate(docTermTopicMat):
        for row in termTopic:
            # Remove the token from both count tables before resampling it.
            topicTermMatrix[row[1]][row[0]] -= 1
            docTopicMatrix[i][row[1]] -= 1

            k = topicAssign(i, row[0])

            row[1] = k  # store the newly drawn topic
            topicTermMatrix[row[1]][row[0]] += 1
            docTopicMatrix[i][row[1]] += 1

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

In [None]:
# For each topic, show its four highest-count terms.
for k, termList in topicTermMatrix.items():
    print(k, "번째 토픽")
    byCount = list(termList.items())
    byCount.sort(key=lambda pair: pair[1], reverse=True)
    print(byCount[:4])