In [1]:
import os
import json
import re
import pandas as pd
import numpy as np
from kiwipiepy import Kiwi
from datetime import datetime

In [2]:
def data_load(name):
    """Load one crawl dump into a DataFrame.

    Reads ``crawl_result/<name>.json`` as UTF-8 and builds the frame from the
    value stored under the file's top-level ``'data'`` key.
    """
    json_path = f'crawl_result/{name}.json'
    with open(json_path, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)
    return pd.DataFrame(payload['data'])

In [3]:
# Load the two crawl dumps; data_load reads crawl_result/<name>.json for each name.
QA=data_load('wineQ&A_text')
rec=data_load('wine_recommend_text')

In [4]:
# Patterns used by clean_text, compiled once instead of on every call.
_EMOJI_PATTERN = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "]+", flags=re.UNICODE)
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),|]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
# Whitelist: space, Hangul jamo and syllables, ASCII letters/digits, and ! ? . , ~ [ ]
# The brackets are escaped so that they are members of the class; an unescaped ']'
# closes the class early and the whitelist then matches almost nothing.
_DISALLOWED_PATTERN = re.compile(r'[^ ㄱ-ㅣ가-힣A-Za-z0-9!?.,~\[\]]+')


def clean_text(text):
    """Normalise one raw post string for tokenisation.

    Steps, in order:
      1. remove emoji from the four code-point ranges in ``_EMOJI_PATTERN``;
      2. replace every http(s) URL with the literal word ``url``;
      3. replace ``+`` with ``,``;
      4. replace each run of characters outside the whitelist with one space.

    Step 4 also covers newlines, tabs and ``*``, so no separate whitespace
    substitution is needed afterwards.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example ``None`` or ``NaN`` from a
        missing field) yields an empty string instead of raising ``TypeError``.

    Returns
    -------
    str
        The cleaned text.
    """
    if not isinstance(text, str):
        return ''
    text = _EMOJI_PATTERN.sub('', text)
    # URLs go first: they may contain '+' and other characters rewritten below.
    text = _URL_PATTERN.sub('url', text)
    text = text.replace('+', ',')
    return _DISALLOWED_PATTERN.sub(' ', text)

In [5]:
def cleaning(DF):
    """Add cleaned text columns to ``DF`` and return it.

    Columns written (the frame is modified in place and the same object is
    returned):
      * ``clean_content`` - ``clean_text`` applied to the ``본문`` column
      * ``clean_title``   - ``clean_text`` applied to the ``제목`` column
      * ``contents``      - cleaned title and cleaned body joined by one space
    """
    DF['clean_content'] = DF['본문'].apply(clean_text)
    DF['clean_title'] = DF['제목'].apply(clean_text)
    # Join with a space so the last title word and the first body word do not
    # fuse into a single token.
    DF['contents'] = DF['clean_title'] + ' ' + DF['clean_content']
    return DF

In [6]:
# cleaning() writes its columns onto the frame it receives and returns that same
# frame, so after this cell REC and rec refer to the same object.
QA=cleaning(QA)
REC=cleaning(rec)

In [None]:
# Count posts per calendar day. A day key is the first three dot-separated parts
# of the raw `날짜` string joined together.
day_keys = QA['날짜'].map(lambda raw: ''.join(raw.split('.')[:3]))
# One {day: count} entry per distinct day key. Counting on the normalised key
# (not on the distinct raw strings) means a day is listed exactly once even when
# several raw strings map to it, and the frame is scanned a single time.
D = [{day: int(count)} for day, count in day_keys.value_counts(sort=False).items()]

In [7]:
# Parse each `날짜` string into a datetime: the first three dot-separated parts
# are joined and read with the format %Y%m%d.
QA['date']=QA.날짜.apply(lambda x: datetime.strptime(''.join(x.split('.')[:3]),'%Y%m%d'))

In [8]:
# Number of non-null `본문` values per parsed date.
Date_count = QA.groupby('date')['본문'].count()

In [28]:
# NOTE(review): `plotly.plotly` was moved to the separate `chart_studio` package in
# plotly 4, and `py` is not referenced in any visible cell - confirm this import is
# still needed.
import plotly.plotly as py
import cufflinks as cf
cf.go_offline(connected=True)
# Unfinished labelled-plot call for Date_count: title="number of posts", xTitle="date", yTitle="posts"

In [29]:
# Line chart of the per-date post counts held in Date_count.
# NOTE(review): the saved output of this cell is a ModuleNotFoundError for
# `plotly.validator_cache`, so the chart did not render in the recorded run.
Date_count.iplot(kind='line')

ModuleNotFoundError: No module named 'plotly.validator_cache'

In [None]:
def _keep_morpheme(form, tag):
    """Return the token to keep for one (form, tag) morpheme, or None to drop it."""
    if 'NNG' in tag or 'NNP' in tag:
        return form
    if 'VV' in tag or 'VA' in tag:
        # Append '다' to verb/adjective forms.
        return form + '다'
    return None


def Tokeninzing(DF):
    """Tokenise ``DF['contents']`` with Kiwi and store the result in ``DF['token']``.

    For every document the first analysis candidate is used. Morphemes whose tag
    contains ``NNG`` or ``NNP`` are kept as they are, morphemes whose tag contains
    ``VV`` or ``VA`` are kept with ``'다'`` appended, and everything else is
    dropped.

    Parameters
    ----------
    DF : pandas.DataFrame
        Must have a ``contents`` column of strings.

    Returns
    -------
    pandas.DataFrame
        The same frame, modified in place, with a ``token`` column holding one
        list of strings per row.
    """
    kiwi = Kiwi(num_workers=16)
    # NOTE(review): newer kiwipiepy releases document prepare() as no longer
    # required - confirm against the installed version.
    kiwi.prepare()
    tokens_per_doc = []
    for analysis in kiwi.analyze(DF['contents'], top_n=1):
        # analysis[0][0] is indexed as the morpheme sequence of the first candidate;
        # each morpheme is indexed as (form, tag, ...).
        morphemes = analysis[0][0]
        kept = (_keep_morpheme(morph[0], morph[1]) for morph in morphemes)
        tokens_per_doc.append([token for token in kept if token])
    DF['token'] = tokens_per_doc
    return DF

In [None]:
# Add the `token` column to both frames (Tokeninzing modifies and returns its argument).
QA=Tokeninzing(QA)
REC=Tokeninzing(REC)

In [None]:
# Join each row's token list into one space-separated string; TFIDF() reads this
# `corpus` column.
QA['corpus']=QA.token.apply(lambda x: ' '.join(x))
REC['corpus']=REC.token.apply(lambda x: ' '.join(x))

In [None]:
# Mark a row as 'pain' when its contents contain the string 'ㅠㅠ'; otherwise the
# value is an empty string.
QA['pain']=QA.contents.apply(lambda x :'pain' if len(re.findall('(ㅠㅠ)',x))!=0 else '')

In [None]:
# Subset of QA rows that were marked 'pain'.
P=QA[QA.pain!='']

In [None]:
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc

def show_contents_length(DF):
    """Print length statistics for ``DF.contents`` and plot a histogram of lengths.

    Prints the maximum and the mean character length of the ``contents`` column,
    then shows a 50-bin histogram of those lengths.

    Parameters
    ----------
    DF : pandas.DataFrame
        Must have a ``contents`` column of strings and at least one row
        (``max`` raises ``ValueError`` on an empty column).
    """
    # NOTE(review): the font is loaded from a fixed Windows path, so this raises on
    # machines that do not have that file.
    font_name = font_manager.FontProperties(fname='C:/Windows/Fonts/NanumBarunGothic-YetHangul.ttf').get_name()
    rc('font', family=font_name)
    # Every statistic is computed from the frame that was passed in, not from the
    # global QA.
    lengths = [len(text) for text in DF.contents]
    plt.figure(figsize=(10, 8))
    print('컨텐츠의 최대 길이 :', max(lengths))
    print('컨텐츠의 평균 길이 :', sum(lengths) / len(lengths))
    plt.hist(lengths, bins=50)
    plt.xlabel('length of samples')
    plt.ylabel('number of samples')
    plt.show()
    

In [None]:
# NOTE(review): Token2vec is defined in a later cell, so on a fresh top-to-bottom
# run this call fails unless that cell has been executed first.
# 300 is passed as Token2vec's `mincount` argument.
Token2vec(QA,300)

In [None]:
"# from wordcloud import WordCloud\n",
    "# import matplotlib.pyplot as plt\n",
    "\n",
    "# wordclouds=WordCloud(width=800,height=800,background_color='white',colormap='Greens')\n",
    "# from collections import Counter\n",
    "# count=Counter(text)\n",
    "# fig=plt.figure(figsize=(10,10))\n",
    "# plt.imshow(wordclouds.to_array())\n",
    "# plt.show()"

In [None]:
"def drop_certain_words(corpus, sparse_matrix, drop_words):\n",
    "    drop_words_index = [np.where(corpus == word)[0][0] for word in drop_words]\n",
    "    to_keep = sorted(set(range(sparse_matrix.shape[1])) - set(drop_words_index))\n",
    "    corpus = corpus[to_keep]\n",
    "    sparse_matrix = sparse_matrix[:, to_keep]\n",
    "    return corpus, sparse_matrix"

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
# Fit a 6-topic LDA model on the TF-IDF matrix of the QA corpus.
# NOTE(review): TFIDF is defined in a later cell, so that cell must run before this
# one. No random_state is passed, so the fitted topics may differ between runs.
tfidv, tfidf = TFIDF(QA)
lda = LatentDirichletAllocation(n_components=6)
lda.fit(tfidf)

In [None]:
def display_topics(model, feature_names, no_top_words):
    """Print and return the highest-weighted words of every topic in ``model``.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Each row of ``components_`` is one topic's weight per feature.
    feature_names : sequence of str
        Feature names, indexed like the columns of ``components_``.
    no_top_words : int
        Number of words to report per topic.

    Returns
    -------
    list of list of str
        One list per topic, highest weight first.
    """
    topics = []
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending; the reversed slice takes the last `no_top_words`
        # indices, largest weight first.
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        important_words = [feature_names[i] for i in top_indices]

        print("Topic %d:" % topic_idx)
        print(" ".join(important_words))
        topics.append(important_words)
    return topics

In [None]:
# Vectorise once and reuse both results (calling TFIDF(QA) twice fits the
# vectorizer twice).
qa_vectorizer, qa_tfidf = TFIDF(QA)
# Newer scikit-learn releases provide get_feature_names_out() in place of
# get_feature_names(); use whichever the installed version has.
if hasattr(qa_vectorizer, 'get_feature_names_out'):
    qa_feature_names = qa_vectorizer.get_feature_names_out()
else:
    qa_feature_names = qa_vectorizer.get_feature_names()

# The filtered matrix is bound to `tfidf_dropped` rather than `TFIDF`, so the TFIDF
# function is not shadowed and this cell can be re-run.
words_list, tfidf_dropped = drop_certain_words(
    np.array(qa_feature_names, dtype=str),
    qa_tfidf,
    [
        # '있다' and '댓글' are separate entries; without the comma between them
        # Python concatenates the adjacent literals into one word.
        '와인', '마시다', '하다', '있다',
        '댓글', '안내', '고수', '답변', '소통',
        '이렇다', '대부분', '그렇다', '그러다',
        '와쌉', '계시다', '사람', '읽다',
        '가능', '가다', '가요', '가져가다', '가지다',
    ],
)

In [None]:
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


def TFIDF(DF):
    """Fit a TF-IDF vectorizer on ``DF.corpus`` and transform that same column.

    The vectorizer is built with ``min_df=0.01``.

    Parameters
    ----------
    DF : pandas.DataFrame
        Must have a ``corpus`` column of space-separated token strings.

    Returns
    -------
    tuple
        ``(fitted TfidfVectorizer, TF-IDF matrix of DF.corpus)``.
    """
    tfidv = TfidfVectorizer(min_df=0.01).fit(DF.corpus)
    # Alternative configuration that was tried (top 100 features):
    # TfidfVectorizer(max_features=100, max_df=0.95, min_df=0).fit_transform(DF.corpus)
    tfidf_matrix = tfidv.transform(DF.corpus)
    return tfidv, tfidf_matrix


def Tsne(tfidv, TFIDF, perplexity, n_classes):
    """Project every vocabulary term to 2-D with t-SNE and draw a labelled scatter.

    Parameters
    ----------
    tfidv : fitted vectorizer
        Source of the feature names used as point labels.
    TFIDF : matrix with ``toarray()``
        Document-term matrix; it is transposed so that each row fed to t-SNE is
        one term.
    perplexity : float
        Passed to ``TSNE``.
    n_classes : int
        Currently unused (per-class colouring was sketched but never implemented);
        kept so existing calls keep working.

    Relies on ``plt`` (matplotlib.pyplot) having been imported in an earlier cell.
    """
    # NOTE(review): recent scikit-learn releases rename TSNE's `n_iter` to
    # `max_iter` - confirm against the installed version.
    tsne = TSNE(n_components=2, n_iter=10000, verbose=1, perplexity=perplexity)
    Z = tsne.fit_transform(TFIDF.toarray().T)
    print(Z[0:5])
    print('Top words: ', len(Z))
    # Newer scikit-learn releases provide get_feature_names_out() in place of
    # get_feature_names(); use whichever the installed version has.
    if hasattr(tfidv, 'get_feature_names_out'):
        feature_names = tfidv.get_feature_names_out()
    else:
        feature_names = tfidv.get_feature_names()
    plt.figure(figsize=(10, 8))
    plt.scatter(Z[:, 0], Z[:, 1])
    for i, term in enumerate(feature_names):
        # The label is passed positionally: its keyword name differs between
        # Matplotlib versions (`s` in older ones, `text` in newer ones).
        plt.annotate(term, xy=(Z[i, 0], Z[i, 1]))
    plt.draw()

In [None]:
from gensim.models import Word2Vec
from sklearn.decomposition import PCA


def plot_2d_graph(vocabs, xs, ys):
    """Scatter-plot 2-D points and label each one with its vocabulary word.

    Parameters
    ----------
    vocabs : sequence of str
        Labels; ``vocabs[i]`` annotates the point ``(xs[i], ys[i])``.
    xs, ys : indexable sequences of float
        Point coordinates.

    Relies on ``plt``, ``font_manager`` and ``rc`` having been imported from
    matplotlib in an earlier cell.
    """
    plt.figure(figsize=(20, 20))
    # NOTE(review): the font is loaded from a fixed Windows path, so this raises on
    # machines that do not have that file.
    font_name = font_manager.FontProperties(fname='C:/Windows/Fonts/NanumBarunGothic-YetHangul.ttf').get_name()
    rc('font', family=font_name)
    rc('font', size=15)
    plt.scatter(xs, ys, marker='o')
    for i, v in enumerate(vocabs):
        plt.annotate(v, xy=(xs[i], ys[i]))


def Token2vec(DF, mincount):
    """Train Word2Vec on ``DF.token`` and plot the vocabulary after a 2-D PCA.

    Parameters
    ----------
    DF : pandas.DataFrame
        Must have a ``token`` column holding one list of tokens per row.
    mincount : int
        Passed to Word2Vec as ``min_count``.
    """
    # CBOW (sg=0) with 3-dimensional vectors.
    # NOTE(review): workers=6 and no seed, so results may differ between runs.
    model = Word2Vec(sentences=DF.token, vector_size=3, min_count=mincount, workers=6, sg=0)
    vocabs = list(model.wv.index_to_key)
    word_vocab_list = [model.wv[v] for v in vocabs]
    xys = PCA(n_components=2).fit_transform(word_vocab_list)
    plot_2d_graph(vocabs, xys[:, 0], xys[:, 1])

In [None]:
kiwi = Kiwi(num_workers=16)
# NOTE(review): newer kiwipiepy releases document prepare() as no longer required -
# confirm against the installed version.
kiwi.prepare()

# Forms that are never kept, whatever their tag.
excluded_forms = {'와인', '글', '댓글', '등업', '답변', '질문', '있다', '되다', '안녕', '하다', '마시',
                  '등급', '소통', '감사', '안내', '클릭'}

# Analyse the text column of P. P itself is a DataFrame, and iterating a DataFrame
# yields its column labels, not its rows.
# NOTE(review): `contents` is assumed to be the intended column (it is the one
# Tokeninzing analyses) - confirm.
# A morpheme is kept when its tag is NNG, NNP or SW, its form is not excluded and
# its form is longer than one character; everything else becomes None.
T = [[tt[0] if (tt[1] in ['NNG', 'NNP', 'SW']) and (tt[0] not in excluded_forms) and (len(tt[0]) > 1)
      else None for tt in t[0][0]]
     for t in kiwi.analyze(P['contents'])]

# Drop the None placeholders, leaving one token list per document.
target_title = [[each_word for each_word in each_doc if each_word] for each_doc in T]

In [None]:
import pyLDAvis.gensim
from gensim import corpora
import gensim

In [None]:
# Tokens filtered out of QA.token before the gensim dictionary is built.
stopwords = [
    '와인', '마시다', '하다', '있다',
    '댓글', '안내', '고수', '답변', '소통',
    '이렇다', '대부분', '그렇다', '그러다',
    '와쌉', '계시다', '사람', '읽다',
    '가능', '가다', '가요', '가져가다', '가지다',
]

In [None]:
# One stop-word-free token list per QA row. The list is built from scratch on
# every run, so the cell needs no pre-existing `Token`/`token` variables and does
# not accumulate duplicates when it is re-run.
stopword_set = set(stopwords)
Token = [[tt for tt in t if tt not in stopword_set] for t in QA.token]

# gensim

In [None]:
# Build the gensim dictionary from the filtered token lists and convert every
# document to its bag-of-words representation.
dictionary = corpora.Dictionary(Token)
corpus = [dictionary.doc2bow(text) for text in Token]

NUM_TOPICS = 6 # number of LDA topics (k = 6)
# NOTE(review): no random_state is passed, so the topics may differ between runs.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
# Four-word summary of each topic; assigned here but not displayed by this cell.
topics = ldamodel.print_topics(num_words=4)

# Interactive topic visualisation rendered inline in the notebook.
# NOTE(review): newer pyLDAvis releases expose this module as
# `pyLDAvis.gensim_models` - confirm against the installed version.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)