## LDA

In [1]:
from collections import Counter
import random

def p_topic_given_document(topic, d, alpha=0.1):
    return ((document_topic_counts[d][topic] + alpha) /
            (document_lengths[d] + K * alpha))

def p_word_given_topic(word, topic, beta=0.1):
    return ((topic_word_counts[topic][word] + beta) /
            (topic_counts[topic] + V * beta))

def topic_weight(d, word, k):
    return p_word_given_topic(word, k) * p_topic_given_document(k, d)

def choose_new_topic(d, word,K):
    return sample_from([topic_weight(d, word, k) for k in range(K)])

def sample_from(weights):
    total = sum(weights)
    rnd = total * random.random()
    for i, w in enumerate(weights):
        rnd -= w
        if rnd <= 0:
            return i

def document_init(documents):
    random.seed(0)
    K=10
    document_topics = [[random.randrange(K) for word in document]
                        for document in documents]
    document_topic_counts = [Counter() for _ in documents]
    topic_word_counts = [Counter() for _ in range(K)]
    topic_counts = [0 for _ in range(K)]
    document_lengths = [len(document) for document in documents]
    distinct_words = set(word for document in documents for word in document)
    V = len(distinct_words)
    D = len(documents)
    return document_topics,document_topic_counts,topic_word_counts,topic_counts,document_lengths,distinct_words,V,D,K

def fun_LDA(documents):
    for d in range(D):
        for word, topic in zip(documents[d], document_topics[d]):
            document_topic_counts[d][topic] += 1
            topic_word_counts[topic][word] += 1
            topic_counts[topic] += 1

    for iter in range(1000):
        for d in range(D):
            for i, (word, topic) in enumerate(zip(documents[d],
                                                  document_topics[d])):
                document_topic_counts[d][topic] -= 1
                topic_word_counts[topic][word] -= 1
                topic_counts[topic] -= 1
                document_lengths[d] -= 1
                new_topic = choose_new_topic(d, word,K)
                document_topics[d][i] = new_topic
                document_topic_counts[d][new_topic] += 1
                topic_word_counts[new_topic][word] += 1
                topic_counts[new_topic] += 1
                document_lengths[d] += 1
    return document_topics,document_topic_counts,topic_word_counts,topic_counts


## 데이터 전처리

In [2]:
import pandas as pd
origin = pd.read_excel('data.xlsx')
origin = origin.iloc[:,2]

import re
def def_pre(origin):
    dialogue_list = []  
    for sentece in origin:
        sentece = re.sub(r'\([^)]*\)', '', str(sentece))
        sentece = re.sub('[.,?/*!"]', '', str(sentece))
        if sentece[:1] =='아'or sentece[:1] =='검':
            dialogue_list.append(sentece[2:])
    return dialogue_list
         
def def_remove_space(dialogue_list):
    strip_senten = []
    for i in dialogue_list:
        strip_senten.append(i.strip())
    return strip_senten
    
def def_remove_space2(strip_senten):
    full_senten = []  
    for i in range(len(strip_senten)):
        if( len(strip_senten[i]) > 0):
            full_senten.append(strip_senten[i])
        else:
            continue
    return full_senten

from konlpy.tag import Kkma
kkma = Kkma()
def def_kkmaNoun(full_senten):
    origin_noun_kkma = []
    for i in range(len(full_senten)):
        nouns = kkma.pos(full_senten[i])
        # print(nouns)
        etc = []
        for word in nouns:
            if (len(word[0])>1) and (word[1] == 'NNG' or word[1] == 'NNP' or word[1] == 'NNB' or word[1] == 'NNM' or word[1] == 'NR' or word[1] == 'NP'):
                etc.append(word[0])
            else:
                continue
        origin_noun_kkma.append(etc)
    return origin_noun_kkma


#데이터 전처리    
dialogue_list = def_pre(origin)
strip_senten = def_remove_space(dialogue_list)
full_senten = def_remove_space2(strip_senten)
# display(full_senten)


#명사 추출
origin_noun_kkma = def_kkmaNoun(full_senten)
origin_noun_kkma2 = def_remove_space2(origin_noun_kkma)
display(origin_noun_kkma2)

[['가족생활', '외할머니', '외할아버지'],
 ['아빠', '모로코', '지금', '울산', '대우', '건설', '계세'],
 ['시간', '내서', '우리'],
 ['할머니', '할아버지', '제주도'],
 ['외할머니', '외할아버지', '우리'],
 ['인형', '놀이'],
 ['엄마', '할머니', '요리', '계세'],
 ['아빠', '대우', '건설', '공장'],
 ['제주도', '사시', '할머니', '할아버지', '생활'],
 ['인형', '친구', '요정', '이름'],
 ['가족'],
 ['가족', '내가', '사랑'],
 ['우리', '가족', '번째', '가족', '번째', '캐릭터', '영어', '선생님'],
 ['지금', '영어', '수업'],
 ['생일날', '23'],
 ['이모', '거래'],
 ['한이', '삼촌'],
 ['삼촌', '저녁', '저녁', '저희'],
 ['상빈', '삼촌', '한국', '사람'],
 ['캐나다', '동안', '한국', '캐나다'],
 ['사실', '캐나다', '사람'],
 ['사실', '외국인', '캐나다', '사람', '친구'],
 ['30'],
 ['성빈', '삼촌', '인형', '지퍼'],
 ['하나', '지퍼'],
 ['하나', '코알라', '신한', '인형'],
 ['하나', '계란', '이름', '계란'],
 ['내가', '매일'],
 ['외할머니', '외할아버지', '엄마', '아빠'],
 ['아빠', '대우', '건설'],
 ['언니', '오빠'],
 ['동생'],
 ['외동딸'],
 ['비밀'],
 ['내가', '시크', '플래쉬', '노트북', '하나', '우리', '비밀', '이야기'],
 ['계란'],
 ['할아버지', '아빠'],
 ['엄마', '엉덩이', '이름', '쓰기', '통과', '시크', '멤버'],
 ['엄마', '삼촌', '안해'],
 ['포기'],
 ['엄마', '할아버지', '드라마'],
 ['요리'],
 ['운전'],
 ['엄마', '보고', '

In [3]:
documents= origin_noun_kkma2
document_topics,document_topic_counts,topic_word_counts,topic_counts,document_lengths,distinct_words,V,D,K = document_init(documents)
document_topics,document_topic_counts,topic_word_counts,topic_counts = fun_LDA(documents)

## 단어의 토픽확률

In [4]:
display(len(document_topic_counts))

def fun_topicRatio(documents):
    topic_word_counts_rat = [[0]*len(topic_word_counts[0]) for i in range(K)]
    for top_num in range(K):
        i = 0
        for keys in topic_word_counts[top_num].keys():
    #         print(topic_word_counts[top_num][keys]/sum(topic_word_counts[top_num].values()))
            topic_word_counts_rat[top_num][i] = round(topic_word_counts[top_num][keys]/sum(topic_word_counts[top_num].values()),3)
            i = i+1
    # display(topic_word_counts_rat)
    from pandas import DataFrame
    import pandas as pd
    print("단어에 대한 topic비율")
    df_rat = pd.DataFrame(topic_word_counts_rat,index = ['topic %d'%i for i in range(K)],
                         columns=[keys for keys in topic_word_counts[0].keys()])
    return df_rat.T
fun_topicRatio(documents)

117

단어에 대한 topic비율


Unnamed: 0,topic 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,topic 7,topic 8,topic 9
외할아버지,0.100,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
가족,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
사랑,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
번째,0.000,0.000,0.000,0.000,0.043,0.000,0.000,0.000,0.000,0.116
코알라,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
매일,0.000,0.000,0.000,0.000,0.109,0.000,0.042,0.000,0.000,0.000
엉덩이,0.000,0.000,0.000,0.024,0.174,0.000,0.000,0.000,0.000,0.000
안해,0.033,0.000,0.000,0.048,0.022,0.000,0.000,0.000,0.000,0.000
드라마,0.000,0.057,0.000,0.095,0.000,0.000,0.000,0.000,0.000,0.000
할머니,0.000,0.000,0.159,0.000,0.000,0.000,0.000,0.000,0.000,0.000


## 단어의 토픽 리스트

In [5]:
def fun_topicList(documents):
    topic_count = [[] for i in range(K)]

    for i in range(len(documents)):
        for j in range(len(documents[i])):
            for k in range(K):
                if document_topics[i][j] == k:
                    topic_count[k].append(documents[i][j])

    for i in range(K):
        topic_count[i] = list(set(topic_count[i]))    

    topic_count = pd.DataFrame(topic_count,index= ['topic %d'%i for i in range(K)])
    return display(topic_count.T)

fun_topicList(documents)

Unnamed: 0,topic 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,topic 7,topic 8,topic 9
0,우리,이야기,각각,고구마,삼촌,계란,사탕,운전,만들기,내서
1,수업,혼란,실로폰,경단,우리,미디어,30,동쪽,토끼,할머니
2,하트,악당,수업,대우,샌드위치,내가,문제,포기,한이,보자기
3,왕관,플래쉬,이야기,해산물,우리집,생각,초콜렛,퀴즈,23,멤버
4,아기,절대,나도,모로코,저희,무엇,치와와,드라마,젤리,계세
5,소리,시크,입학,지퍼,마법,보자기,초콜릿,매일,옛날,엉덩이
6,악당,캐릭터,초등학교,계란,왕관,아이,퀴즈,소풍,부산,통과
7,모래성,리어,어린이집,요리,외동딸,우리,남쪽,선물,그거,다솜
8,도둑,소개,우유,마요네즈,상빈,수학,리얼,가요,생일날,지도
9,안해,언니,오징어,여행,성빈,파라나,프리,팝콘,물기,제주도


## 발화(문서)별 비율- 원본


In [6]:
def fun_docuRatio(documents):
    document_topic_counts_rat = [[0]*K for i in range(len(documents))]

    for doc_num in range(len(documents)):
        for key in document_topic_counts[doc_num].keys():
            document_topic_counts_rat[doc_num][key] = round(document_topic_counts[doc_num][key]/sum(document_topic_counts[doc_num].values())*100,3) 
    # display(document_topic_counts_rat)
    from pandas import DataFrame
    import pandas as pd
    print("문서에 대한 topic비율")
    df_rat = pd.DataFrame(document_topic_counts_rat, columns=['topic %d'%i for i in range(0,K)],index = ['doc %d'%i for i in range(1,len(documents)+1)])
    return display(df_rat)

fun_docuRatio(documents)

문서에 대한 topic비율


Unnamed: 0,topic 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,topic 7,topic 8,topic 9
doc 1,66.667,0.0,33.333,0.000,0.000,0.000,0.0,0.000,0.000,0.000
doc 2,0.000,0.0,0.000,85.714,0.000,0.000,0.0,0.000,0.000,14.286
doc 3,0.000,0.0,33.333,0.000,0.000,33.333,0.0,0.000,0.000,33.333
doc 4,0.000,0.0,0.000,0.000,0.000,0.000,0.0,0.000,0.000,100.000
doc 5,66.667,0.0,33.333,0.000,0.000,0.000,0.0,0.000,0.000,0.000
doc 6,0.000,0.0,100.000,0.000,0.000,0.000,0.0,0.000,0.000,0.000
doc 7,0.000,0.0,0.000,25.000,0.000,0.000,0.0,0.000,0.000,75.000
doc 8,0.000,0.0,0.000,100.000,0.000,0.000,0.0,0.000,0.000,0.000
doc 9,0.000,0.0,0.000,0.000,0.000,20.000,0.0,0.000,0.000,80.000
doc 10,0.000,0.0,25.000,0.000,0.000,0.000,75.0,0.000,0.000,0.000


## 3문장씩 묶기

In [7]:
def bind_document(documents):
    new_documents=[]
    for i in range(len(documents)-2):
        new_documents.append(documents[i]+documents[i+1]+documents[i+2])
    return new_documents

documents_bin = bind_document(documents)