In [None]:
import os
import json
import re
import pandas as pd
import numpy as np
from kiwipiepy import Kiwi
from datetime import datetime

# Data load

In [None]:
def data_load(name):
    """Read ``crawl_result/<name>.json`` and return its ``'data'`` entry as a DataFrame.

    Parameters
    ----------
    name : str
        File stem (without ``.json``) inside the ``crawl_result`` directory,
        resolved relative to the current working directory.

    Returns
    -------
    pandas.DataFrame
        Frame built from the value stored under the top-level ``'data'`` key.
    """
    json_path = f'crawl_result/{name}.json'
    with open(json_path, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)
    return pd.DataFrame(payload['data'])

In [None]:
# Load the two crawled JSON dumps from crawl_result/ (judging by the file names:
# Q&A posts and wine-recommendation posts) into DataFrames.
QA=data_load('wineQ&A_text')
rec=data_load('wine_recommend_text')

In [None]:
# Patterns are compiled once at definition time instead of on every call.
_EMOJI_PATTERN = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                       "]+", flags=re.UNICODE)
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),|]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')


def clean_text(text):
    """Normalise one raw post string for tokenisation.

    Steps, in order:
      1. remove emoji in the four code-point ranges listed above;
      2. replace every http/https URL with the literal token ``url``;
      3. replace each ``(...)`` and each ``[...]`` span with a comma;
      4. replace every run of characters other than Hangul, ASCII letters,
         digits, space and ``!?.,~`` with a single space;
      5. collapse runs of whitespace into a single space.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The cleaned text (not stripped at the ends).
    """
    text = _EMOJI_PATTERN.sub('', text)
    text = _URL_PATTERN.sub('url', text)
    text = re.sub(r'\([^)]*\)', ',', text)
    # Exclude ']' (not ')') inside the brackets; otherwise the match is greedy up to
    # the LAST ']' and swallows all text between two bracketed spans.
    text = re.sub(r'\[[^\]]*\]', ',', text)
    text = re.sub('[^ㄱ-ㅎ ㅏ-ㅣ가-힣A-Za-z0-9!?.,~]+', ' ', text)
    # r'\s+' collapses whitespace runs; the former '[\\s+ *]' was a character class
    # that replaced each single space with a space, i.e. did nothing.
    text = re.sub(r'\s+', ' ', text)
    return text

In [None]:
def cleaning(DF):
    """Add cleaned text columns to ``DF`` in place and return the same frame.

    Columns written:
      * ``clean_content`` - ``clean_text`` applied to column ``본문``
      * ``clean_title``   - ``clean_text`` applied to column ``제목``
      * ``contents``      - ``clean_title`` directly followed by ``clean_content``

    NOTE(review): title and body are concatenated with no separator, so the last
    word of the title is joined to the first word of the body - confirm this is
    intended before tokenising.
    """
    column_pairs = (('본문', 'clean_content'), ('제목', 'clean_title'))
    for raw_column, cleaned_column in column_pairs:
        DF[cleaned_column] = DF[raw_column].apply(clean_text)
    DF['contents'] = DF['clean_title'] + DF['clean_content']
    return DF

In [None]:
# Add the clean_content / clean_title / contents columns to both frames.
# cleaning() writes into the frame it receives and returns it, so REC and rec
# are the same DataFrame object.
QA=cleaning(QA)
REC=cleaning(rec)

In [None]:
# Parse the leading "year.month.day" part of 날짜 into a datetime.
# The three parts are re-joined WITH dots and parsed as '%Y.%m.%d': gluing them
# together and parsing '%Y%m%d' is ambiguous for non-zero-padded values
# (e.g. "2021.1.12" -> "2021112" is read as November 2nd). Parts are stripped so
# "2021. 01. 12." also parses.
QA['date']=QA.날짜.apply(lambda x: datetime.strptime('.'.join(part.strip() for part in x.split('.')[:3]),'%Y.%m.%d'))

In [None]:
# Posts per day: group by the parsed date and take the non-null count of 본문.
Date_count=QA.groupby('date').count().loc[:,'본문']

In [None]:
import chart_studio.plotly.plotly as py
import cufflinks as cf
# Enable cufflinks offline plotting for the .iplot() call in the next cell.
cf.go_offline(connected=True)

In [None]:
# Bar chart of the daily post counts.
Date_count.iplot(kind='bar',colors='Red')

In [None]:
# Variance of the daily post counts.
Date_count.var()

In [None]:
import matplotlib.font_manager as font_manager

# Collect the paths of the TTF fonts matplotlib can find on this system.
font_list = font_manager.findSystemFonts(fontpaths=None, fontext='ttf')

# Total number of fonts found
print(len(font_list)) 

# Show only the first 10 (disabled; note that [:-10] would drop the last 10 instead)
#font_list[:-10]

In [None]:
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc

def show_contents_length(DF, font_path='/System/Library/Fonts/Supplemental/Arial Narrow Bold Italic.ttf'):
    """Print the max and mean length of ``DF.contents`` and plot a length histogram.

    Parameters
    ----------
    DF : pandas.DataFrame
        Frame with a ``contents`` column of strings.
    font_path : str, optional
        Font file used for the plot. If the file does not exist (for example on
        another operating system) the current matplotlib font is kept instead of
        raising.
    """
    if os.path.exists(font_path):
        font_name = font_manager.FontProperties(fname=font_path).get_name()
        rc('font', family=font_name, size=15)
    else:
        rc('font', size=15)
    plt.figure(figsize=(10, 8))

    lengths = [len(text) for text in DF.contents]
    print('컨텐츠의 최대 길이 :', max(lengths))
    # The mean is computed from DF itself; it previously summed the global QA
    # frame, which gave a wrong average for any other frame.
    print('컨텐츠의 평균 길이 :', sum(lengths) / len(lengths))
    plt.hist(lengths, bins=50, color='#d62728')
    plt.xlabel('length of contents')
    plt.ylabel('number of contents')
    plt.show()

In [None]:
# Length distribution of the cleaned Q&A posts.
show_contents_length(QA)

# Tokenizing

In [None]:
def Tokeninzing(DF):
    """Tokenise ``DF['contents']`` with Kiwi and store the result in ``DF['token']``.

    For every analysed morpheme whose surface form is not a stopword:
      * tags containing ``VV`` or ``VA`` contribute ``form + '다'`` (unless that
        is itself a stopword);
      * tags containing ``NNG`` or ``NNP`` contribute the form unchanged;
      * the form ``리딩`` additionally contributes ``브리딩`` and the form
        ``페어`` additionally contributes ``페어링``.

    ``DF`` is modified in place and also returned.
    """
    # A set gives O(1) membership tests. '체계' and '와쌉' are separate entries:
    # a missing comma used to fuse them into the single string '체계와쌉',
    # so '와쌉' was never filtered.
    stopword = {
        '와인', '마시다', '하다', '있다', '어제', '이기',
        '댓글', '대하다', '클릭', '드리다', '체계',
        '글', '답변', '소통', '등업',
        '이렇다', '대부분', '그렇다', '그러다',
        '감사', '되다', '등급', '기본', '안내', '규정',
        '와쌉', '계시다', '사람', '읽다', '음',
        '가능', '가다', '가요', '가져가다', '가지다', ' ㅂ', 'ㅁ', '안녕', '안녕하세요',
    }

    kiwi = Kiwi(num_workers=16)
    kiwi.prepare()

    tokenized_docs = []
    for each_doc in kiwi.analyze(DF['contents'], top_n=1):
        doc_tokens = []
        # each_doc[0][0] is the morpheme list of the single (top_n=1) analysis.
        for each_word in each_doc[0][0]:
            form, tag = each_word[0], each_word[1]
            if form in stopword:
                continue
            if ('VV' in tag) or ('VA' in tag):
                lemma = form + '다'
                if lemma not in stopword:
                    doc_tokens.append(lemma)
            if ('NNG' in tag) or ('NNP' in tag):
                doc_tokens.append(form)
            if form == '리딩':
                doc_tokens.append('브' + form)
            if form == '페어':
                doc_tokens.append(form + '링')
        # Drop empty tokens, as before.
        tokenized_docs.append([token for token in doc_tokens if token])

    DF['token'] = tokenized_docs
    return DF

In [None]:
from soynlp.utils import DoublespaceLineCorpus
from soynlp.word import WordExtractor
from soynlp.tokenizer import MaxScoreTokenizer
from gensim.models import Word2Vec
from soynlp.noun import LRNounExtractor_v2


def soytokenizer(DF):
    """Train soynlp extractors on ``DF.contents`` and build two MaxScore tokenizers.

    Returns
    -------
    (Wtokenizer, Ntokenizer)
        ``Wtokenizer`` is scored by forward cohesion of extracted words
        (0.8 <= cohesion < 1.0); ``Ntokenizer`` is scored by the noun score of
        extracted nouns (0.6 <= score < 1.0).
    """
    sentences = DF.contents

    word_extractor = WordExtractor(min_frequency=2,
        min_cohesion_forward=0.5,
        min_right_branching_entropy=0.0
    )
    word_extractor.train(sentences) # list of str or like
    words = word_extractor.extract()

    noun_extractor = LRNounExtractor_v2(verbose=True)
    # train_extract() trains and then extracts, so a separate train() call
    # beforehand only repeated the training pass.
    nouns = noun_extractor.train_extract(sentences)

    cohesion_score = {
        word: stats.cohesion_forward
        for word, stats in words.items()
        if 0.8 <= stats.cohesion_forward < 1.0
    }
    # Noun results are (frequency, score) named tuples: read the `score` field.
    # The former code indexed the tuple with ['score'], read a non-existent
    # cohesion_forward attribute and truncated the score with int(), so it either
    # raised or selected nothing.
    # NOTE(review): the upper bound (< 1.0) is kept from the original filter and
    # excludes nouns scored exactly 1.0 - confirm that is intended.
    noun_score = {
        word: stats.score
        for word, stats in nouns.items()
        if 0.6 <= stats.score < 1.0
    }

    Wtokenizer = MaxScoreTokenizer(scores=cohesion_score)
    Ntokenizer = MaxScoreTokenizer(scores=noun_score)
    return Wtokenizer,Ntokenizer

In [None]:
# Stricter cleaning for soynlp: keep only Hangul, ASCII letters and spaces
# (digits and punctuation become a single space).
QA['more_clear']=QA.contents.apply(lambda x: re.sub('[^ㄱ-ㅎ ㅏ-ㅣ가-힣A-Za-z]+',' ',x))

In [None]:
# Train the soynlp tokenizers on QA and tokenise the strictly-cleaned text with the
# noun-score tokenizer T. WT is not used in the visible cells.
WT,T=soytokenizer(QA)
QA['soytoken']=QA.more_clear.apply(lambda x: T.tokenize(x))

In [None]:
# Add the Kiwi-based 'token' column to both frames.
QA=Tokeninzing(QA)
REC=Tokeninzing(REC)

In [None]:
# Space-join each token list into one string per document (the input for TFIDF()).
QA['corpus']=QA.token.apply(lambda x: ' '.join(x))
REC['corpus']=REC.token.apply(lambda x: ' '.join(x))

In [None]:
# Character length of each cleaned document.
QA['doc_len']=QA.contents.apply(lambda x: len(x))

In [None]:
# Inspect the soynlp tokens (displays the whole Series).
QA.soytoken

# 1


# 2

In [None]:
def get_text_with_word(word):
    """Count the non-overlapping occurrences of ``ㅠㅠ``, ``ㅜㅜ`` and ``??`` in ``word``.

    Despite its name, ``word`` is the full text to scan. Returns the summed
    number of matches as an int.
    """
    marker_patterns = ('(ㅠㅠ)', '(ㅜㅜ)', r'(\?\?)')
    return sum(len(re.findall(pattern, word)) for pattern in marker_patterns)

In [None]:
# Flag documents containing at least one ㅠㅠ / ㅜㅜ / ?? marker with 'pain'; others get ''.
QA['pain']=QA.contents.apply(lambda x :'pain' if get_text_with_word(x) != 0 else '')

In [None]:
# Subset of QA rows flagged as 'pain'.
PAIN=QA[QA.pain!='']

In [None]:
len(DOCLEN)

In [None]:
DOCLEN=QA[QA.doc_len>165] #2720

In [None]:
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
    
def plot_2d_graph(vocabs, xs, ys, font_path='c:\\windows\\fonts\\nanumbarungothic-yethangul.ttf'):
    """Scatter-plot 2-D points and label each one with its vocabulary entry.

    Parameters
    ----------
    vocabs : sequence of str
        Labels; ``vocabs[i]`` annotates the point ``(xs[i], ys[i])``.
    xs, ys : indexable sequences of numbers
        Point coordinates, parallel to ``vocabs``.
    font_path : str, optional
        Font file used for the labels. If the file does not exist (for example
        on a non-Windows machine) the current matplotlib font is kept instead
        of raising.
    """
    plt.figure(figsize=(20 ,20))
    if os.path.exists(font_path):
        font_name = font_manager.FontProperties(fname=font_path).get_name()
        rc('font', family=font_name)
    rc('font', size=15)
    plt.scatter(xs, ys, marker = 'o')
    for i, v in enumerate(vocabs):
        plt.annotate(v, xy=(xs[i], ys[i]))

def Token2vec(DF,mincount):
    """Train Word2Vec on ``DF.soytoken`` and plot the vocabulary projected to 2-D.

    Parameters
    ----------
    DF : pandas.DataFrame
        Frame whose ``soytoken`` column holds one token list per document.
    mincount : int
        Passed to Word2Vec as ``min_count``.
    """
    w2v = Word2Vec(sentences = DF.soytoken, min_count = mincount, workers = 6, sg = 0)
    keyed_vectors = w2v.wv
    vocabs = list(keyed_vectors.index_to_key)
    embedding_rows = [keyed_vectors[token] for token in vocabs]
    # Reduce the embeddings to two principal components for plotting.
    projected = PCA(n_components=2).fit_transform(embedding_rows)
    plot_2d_graph(vocabs, projected[:,0], projected[:,1])

In [None]:
# Embed the 'pain' posts, keeping only tokens that occur at least 100 times.
Token2vec(PAIN,100)

In [None]:
# Spot-check a single document (index label 7174).
QA.contents[7174]

In [None]:
# Inspect the Kiwi tokens (displays the whole Series).
QA.token

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
# NOTE: TFIDF() is defined in a later cell of this notebook; on a fresh kernel that
# cell has to be run before this one.
tfidv,tfidf=TFIDF(QA)
# random_state fixes the random initialisation so reruns produce the same topics.
lda=LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(tfidf)

In [None]:
def drop_certain_words(corpus, sparse_matrix, drop_words):
    """Remove the given words from a vocabulary and the matching matrix columns.

    Parameters
    ----------
    corpus : array-like of str
        Vocabulary; ``corpus[j]`` names column ``j`` of ``sparse_matrix``.
        Lists are accepted and converted to a NumPy array.
    sparse_matrix : 2-D array or sparse matrix
        Matrix supporting ``matrix[:, index_array]`` column selection.
    drop_words : iterable of str
        Words to remove. Words not present in ``corpus`` are ignored (they used
        to raise ``IndexError``).

    Returns
    -------
    (numpy.ndarray, matrix)
        The reduced vocabulary and the matrix restricted to the kept columns.
    """
    corpus = np.asarray(corpus)
    drop_words_index = set()
    for word in drop_words:
        hits = np.flatnonzero(corpus == word)
        if hits.size:
            # As before, only the first occurrence of the word is dropped.
            drop_words_index.add(int(hits[0]))
    to_keep = np.array(
        [col for col in range(sparse_matrix.shape[1]) if col not in drop_words_index],
        dtype=int,
    )
    return corpus[to_keep], sparse_matrix[:, to_keep]

In [None]:
def display_topics(model, feature_names, no_top_words):
    """Print and return the highest-weighted words of every topic.

    Parameters
    ----------
    model : object
        Fitted topic model exposing ``components_`` (one weight row per topic).
    feature_names : sequence of str
        ``feature_names[j]`` is the word for column ``j`` of ``components_``.
    no_top_words : int
        Number of words to report per topic.

    Returns
    -------
    list of list of str
        For each topic, its top words in descending weight order.
    """
    topics = []
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so walk it backwards to take the largest weights.
        important_words = [feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]
        # The closing quote was escaped before, so the literal text
        # 'Topic %d:" % topic_idx' was printed instead of the topic number.
        print("Topic %d:" % topic_idx)
        print(",".join(important_words))
        topics.append(important_words)
    return topics

In [None]:
# Top 10 words per LDA topic. get_feature_names() no longer exists in current
# scikit-learn releases; get_feature_names_out() is its replacement.
display_topics(lda,tfidv.get_feature_names_out(),10)

In [None]:
# Inspect the joined-token corpus (displayed wrapped in a one-element list).
[QA.corpus]

In [None]:
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc

def TFIDF(DF):
    """Fit a TF-IDF vectorizer on ``DF.corpus`` and transform the same documents.

    Terms that appear in fewer than 10% of the documents are dropped
    (``min_df=0.1``).

    Returns
    -------
    (TfidfVectorizer, matrix)
        The fitted vectorizer and the document-term TF-IDF matrix.
    """
    documents = DF.corpus
    vectorizer = TfidfVectorizer(min_df=0.1)
    vectorizer.fit(documents)
    doc_term_matrix = vectorizer.transform(documents)
    return vectorizer, doc_term_matrix

In [None]:
# Smoke-check: displays the (vectorizer, matrix) tuple returned for QA.
TFIDF(QA)

In [None]:
def closer_look(df, topic_num, content, limit=40):
    """Print ``content`` for the most probable documents of one topic.

    Rows whose ``'topic label'`` equals ``topic_num`` are sorted by
    ``'topic prob'`` in descending order; the first ``limit`` values of the
    ``content`` column (one of ``'topic prob'``, ``'제목'``, ``'본문'``,
    ``'댓글'``) are printed. Returns ``None``.
    """
    shown_columns = ['topic prob', '제목', '본문', '댓글']
    topic_rows = df.loc[df['topic label'] == topic_num, shown_columns]
    ranked = topic_rows.sort_values(by='topic prob', ascending=False)
    print(ranked[content].iloc[:limit])

In [None]:
# NOTE(review): doc_labeling is not defined in the visible part of this notebook;
# it presumably attaches the 'topic label' / 'topic prob' columns that
# closer_look() reads - confirm where it is defined and what arguments it expects.
doc_labeling(QA,TFIDF(QA)[1],)

In [None]:
import pyLDAvis.gensim
from gensim import corpora
import gensim

# gensim

In [None]:
# Build a gensim dictionary and bag-of-words corpus from the Kiwi tokens.
Token=QA.token
dictionary = corpora.Dictionary(Token)
corpus = [dictionary.doc2bow(text) for text in Token]

NUM_TOPICS = 5 # number of LDA topics (k)
# NOTE(review): no random_state is passed, so the fitted topics may differ between runs.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
# `topics` is not used again in the visible cells.
topics = ldamodel.print_topics(num_words=4)

# Prepare and render the interactive pyLDAvis view of the model.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)