In [100]:
import sqlite3
import hashlib
import requests
from bs4 import BeautifulSoup
from joblib import Parallel, delayed
import MeCab as mc
import numpy as np
from gensim.models import word2vec, LdaModel, TfidfModel, LdaMulticore
from gensim import corpora
import pickle

In [13]:
# Create the SQLite schema that stores the scraped restaurants and reviews.
# `if not exists` keeps this cell safe to re-run against an existing database
# file; without it a second run raises "table ... already exists".
conn = sqlite3.connect('./tabelog_reviews.sqlite3')
try:
    cur = conn.cursor()
    cur.execute('create table if not exists restaurants(id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT, rate INTEGER, url TEXT)')
    # reviews.id is declared INTEGER PRIMARY KEY so that inserting NULL for it
    # makes SQLite assign a fresh row id instead of storing NULL in every row.
    cur.execute('create table if not exists reviews(id INTEGER PRIMARY KEY, content text, restaurant_id integer, digest BLOB, FOREIGN KEY(restaurant_id) REFERENCES restaurants(id))')
    conn.commit()
finally:
    # Release the file handle even when one of the statements above fails.
    conn.close()

In [29]:
# Walk the first 50 result pages of the steak listing and collect, for every
# restaurant link found, its displayed name, its detail-page URL and the
# rating text shown in the listing.
restaurants, ratings, urls = [], [], []
for page in range(1, 51):
    url = f'https://tabelog.com/rstLst/steak/{page}/?Srt=D&SrtT=rt&sort_mode=1&sk=%E3%82%B9%E3%83%86%E3%83%BC%E3%82%AD&svt=1900&svps=2&select_sort_flg=1'
    # A timeout prevents one stalled connection from hanging the whole cell.
    response = requests.get(url, timeout=5)
    if response.status_code != 200:
        # Do not parse an error page as if it were a listing.
        print(f'page {page}: HTTP {response.status_code}, skipped')
        continue
    html_soup = BeautifulSoup(response.text, 'html.parser')

    page_names, page_urls = [], []
    for item in html_soup.find_all('a', class_='list-rst__rst-name-target cpy-rst-name js-ranking-num'):
        page_names.append(item.text)
        page_urls.append(item.get('href'))
    page_ratings = [
        item.text
        for item in html_soup.find_all('span', class_='c-rating__val c-rating__val--strong list-rst__rating-val')
    ]

    # Names and ratings come from two independent searches and are paired
    # purely by position; warn when the counts differ, because every later
    # name/rating pair would then be shifted.
    if len(page_names) != len(page_ratings):
        print(f'page {page}: {len(page_names)} names but {len(page_ratings)} ratings; pairs may be misaligned')

    restaurants.extend(page_names)
    urls.extend(page_urls)
    ratings.extend(page_ratings)

In [31]:
# Store the scraped listing: one row per restaurant, using its position in
# the scraped lists as the primary key.
conn = sqlite3.connect('./tabelog_reviews.sqlite3')
cur = conn.cursor()

restaurant_rows = (
    (idx, name, rate, url)
    for idx, (name, rate, url) in enumerate(zip(restaurants, ratings, urls))
)
cur.executemany('insert into restaurants values (?,?,?,?)', restaurant_rows)

conn.commit()
conn.close()

In [40]:
# Read back every restaurant's id and detail-page URL as a list of
# (id, url) tuples.
conn = sqlite3.connect('./tabelog_reviews.sqlite3')
cur = conn.execute('select id,url from restaurants')
res = list(cur)
conn.close()

In [48]:
# Suffix appended to a restaurant URL to reach its review-list pages; the
# page number is concatenated directly after "PG=".
tail = 'dtlrvwlst/?smp=0&lc=2&PG='
rev_results = []


def get_review(review):
    """Download one review page and return the text of its comment block.

    Args:
        review: Site-relative path of the review page; it is appended to
            'https://tabelog.com'.

    Returns:
        The text of the first ``div`` with class ``rvw-item__rvw-comment``.
        Any exception (bad argument, network failure, missing element) is
        printed and the function then returns None.
    """
    try:
        page = requests.get('https://tabelog.com' + review, timeout=5)
        soup = BeautifulSoup(page.text, 'html.parser')
        comment = soup.find('div', class_='rvw-item__rvw-comment')
        return comment.text
    except Exception as e:
        print(e)

In [1]:
# Crawl the review-list pages of every restaurant and store each review's
# text together with the restaurant id and an MD5 hex digest of the text.
conn = sqlite3.connect('./tabelog_reviews.sqlite3')
cur = conn.cursor()

# Give up on a restaurant after this many failed pages, so that a page which
# keeps failing cannot trap the loop forever.
MAX_FAILURES_PER_RESTAURANT = 3

try:
    for index, url in res:
        print(index)
        iteration = 1
        failures = 0
        while True:
            # Build the page URL from the unchanged base URL on every pass.
            # Re-assigning `url` itself would append the suffix again each
            # time and request a malformed address from page 2 onwards.
            page_url = url + tail + str(iteration)
            try:
                r = requests.get(page_url, timeout=5)
                if r.status_code != 200:
                    break
                iteration += 1
                review_soup = BeautifulSoup(r.text, 'html.parser')
                review_urls = [item.get('href') for item in review_soup.find_all('a', class_='rvw-simple-item__title-target')]
                if not review_urls:
                    # A page without review links is treated as the end of
                    # this restaurant's reviews; otherwise the loop would
                    # only stop on a non-200 response.
                    break

                results = Parallel(n_jobs=4, verbose=10)([
                    delayed(get_review)(i) for i in review_urls
                ])
                for result in results:
                    if result is None:
                        # get_review returns None when a fetch or parse
                        # failed; there is nothing to store for it.
                        continue
                    cur.execute('insert into reviews values (null,?,?,?)', (result, index, hashlib.md5(result.encode()).hexdigest()))
                # One commit per page keeps already-fetched reviews safe
                # without paying for a commit on every row.
                conn.commit()
            except Exception as e:
                print(e)
                conn.commit()
                failures += 1
                if failures >= MAX_FAILURES_PER_RESTAURANT:
                    break
finally:
    # Close the connection even when the crawl is interrupted.
    conn.close()

In [59]:
def mecab_analysis(texts, tagger=None, target_pos=('名詞', '形容詞')):
    """Tokenise one text with MeCab and keep the words of selected word classes.

    Args:
        texts: The string to analyse (a single text, despite the plural name).
        tagger: Optional object providing ``parse`` and ``parseToNode`` like a
            MeCab ``Tagger``. When omitted, ``mc.Tagger("-Ochasen")`` is
            created on the first call and reused afterwards, so the tagger is
            not rebuilt for every text.
        target_pos: Word classes to keep, compared with the first
            comma-separated field of each node's ``feature`` string. Defaults
            to nouns and adjectives.

    Returns:
        A list with the surface form of every kept node, in text order.
    """
    # NOTE (original author): loading the mecab-ipadic-neologd dictionary did
    # not work, so the default dictionary is used.
    if tagger is None:
        tagger = getattr(mecab_analysis, '_default_tagger', None)
        if tagger is None:
            tagger = mc.Tagger("-Ochasen")
            mecab_analysis._default_tagger = tagger

    # Kept from the original code; presumably the usual mecab-python
    # workaround that avoids garbled `node.surface` values -- TODO confirm.
    tagger.parse('')

    output = []
    node = tagger.parseToNode(texts)
    while node:
        if node.surface != "":  # skip the header and footer nodes
            word_type = node.feature.split(",")[0]
            if word_type in target_pos:
                output.append(node.surface)
        node = node.next
    return output

In [61]:
# Load a previously saved Word2Vec model from disk (presumably trained on
# Wikipedia, judging by the file name -- confirm).
model = word2vec.Word2Vec.load('./model/wiki.model')

In [62]:
# Nearest neighbours of "犬" (dog). Query through `model.wv`: calling
# `most_similar` on the model object itself is deprecated in gensim and
# triggers the warning whose residue is visible in this cell's output.
model.wv.most_similar(positive=['犬'])

  """Entry point for launching an IPython kernel.


[('猫', 0.8706178665161133),
 ('ウサギ', 0.8539745807647705),
 ('ネコ', 0.818726122379303),
 ('キツネ', 0.817758321762085),
 ('ネズミ', 0.8163456320762634),
 ('子犬', 0.814063549041748),
 ('クマ', 0.8122203350067139),
 ('オオカミ', 0.8110953569412231),
 ('ヤギ', 0.8104976415634155),
 ('柴犬', 0.790104329586029)]

In [64]:
def query(sql, params=()):
    """Run one SQL statement against the review database and return all rows.

    Args:
        sql: SQL text to execute.
        params: Optional values bound to the placeholders in ``sql``.
            Defaults to no parameters.

    Returns:
        The list of rows from ``fetchall()``, or None when executing the
        statement raised an exception (the exception is printed).

    No commit is issued before the connection is closed, so this helper is
    only suitable for read statements.
    """
    conn = sqlite3.connect('./tabelog_reviews.sqlite3')
    try:
        return conn.execute(sql, params).fetchall()
    except Exception as e:
        print(e)
        return None
    finally:
        # Cleanup only: a `return` placed inside `finally` would also swallow
        # KeyboardInterrupt and other errors that are not `Exception`s.
        conn.close()

In [65]:
# Fetch the text of every stored review. `query` returns the fetchall() rows,
# so each element is a one-column row, or None if the statement failed.
sql = "select content from reviews"
reviews = query(sql)

In [66]:
# Number of review rows fetched from the database.
len(reviews)

74189

In [74]:
# Unwrap the one-column rows returned by fetchall() and trim the surrounding
# spaces and newlines in a single pass (the chained strip calls stopped at
# the first space/newline alternation).
# The isinstance guard keeps this cell idempotent: on a re-run `reviews`
# already holds plain strings, and indexing those with [0] would silently
# keep only the first character of every review.
reviews = [
    (review[0] if isinstance(review, tuple) else review).strip(' \n')
    for review in reviews
]

In [77]:
# Tokenise every review; the result is one list of kept words per review.
wakati_reviews = list(map(mecab_analysis, reviews))

sizeはベクトルの次元数
min_countは最低出現回数
sgは学習アルゴリズム指定(1: skip-gram、0(既定値): CBOW)

In [79]:
# Train a Word2Vec model on the tokenised reviews.
tabelog_model = word2vec.Word2Vec(
    wakati_reviews,
    sg=1,          # 1 selects the skip-gram algorithm
    size=1000,     # dimensionality of the word vectors
    window=5,      # context window size
    min_count=5,   # minimum number of occurrences for a word to be kept
    iter=1000,     # number of training passes over the corpus
    workers=4,     # number of worker threads
)

In [82]:
# Persist the trained review model so it can be reloaded without retraining.
tabelog_model.save('./model/tabelog_model.model')

In [87]:
# Words closest to the query word "バンビーナ" in the review model. Query
# through `.wv`: calling `most_similar` on the model object itself is
# deprecated in gensim and triggers the warning whose residue is visible in
# this cell's output.
tabelog_model.wv.most_similar(positive=['バンビーナ'])

  """Entry point for launching an IPython kernel.


[('うし', 0.5896492004394531),
 ('ごろ', 0.5586953163146973),
 ('カルボーネ', 0.38192981481552124),
 ('トマトナムル', 0.26702529191970825),
 ('渋谷店', 0.24012988805770874),
 ('五反田', 0.23327815532684326),
 ('ヒルトップ', 0.22916187345981598),
 ('USHIGOROBambina', 0.2236787974834442),
 ('中目黒', 0.21719831228256226),
 ('恵比寿', 0.21376973390579224)]

In [91]:
# Build the token <-> id dictionary from the tokenised reviews and save it in
# gensim's text format.
dictionary = corpora.Dictionary(wakati_reviews)
dictionary.save_as_text('./texts/tabelog_text.dict')

In [92]:
# Convert every tokenised review to its bag-of-words form and write the
# resulting corpus to disk in Matrix Market format.
corpus = list(map(dictionary.doc2bow, wakati_reviews))
corpora.MmCorpus.serialize('./texts/tabelog_text.mm', corpus)

In [96]:
# Fit a TF-IDF model on the bag-of-words corpus, then apply it to that same
# corpus to obtain the TF-IDF-weighted view used for topic modelling.
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [99]:
# Pickle the TF-IDF corpus for later reuse.
# NOTE: only unpickle this file from a trusted location; loading a pickle can
# execute arbitrary code.
with open('./model/tabelog_corpus_tfidf.dump', mode='wb') as f:
    pickle.dump(corpus_tfidf, f)

In [124]:
# Fit an LDA topic model with N_TOPICS topics on the TF-IDF-weighted corpus.
N_TOPICS = 50
lda3 = LdaMulticore(
    corpus=corpus_tfidf,
    id2word=dictionary,
    num_topics=N_TOPICS,
    workers=4,
    chunksize=1000,
    iterations=10000,
    minimum_probability=0.001,
    eval_every=0,
)

In [125]:
# Print a one-line summary per topic: the first 80 characters of the topic's
# word/weight description, followed by an ellipsis.
for topic_id in range(N_TOPICS):
    topic_summary = lda3.print_topic(topic_id)[:80] + '...'
    print('tpc_{0}: {1}'.format(topic_id, topic_summary))

tpc_0: 0.006*"豚カツ" + 0.003*"花火" + 0.002*"笑笑" + 0.002*"初デート" + 0.002*"花火大会" + 0.001*"庶民的...
tpc_1: 0.007*"カツカレー" + 0.002*"デザートブッフェ" + 0.002*"ワッフル" + 0.002*"せんべい" + 0.002*"me." + 0...
tpc_2: 0.009*"ウルフ" + 0.009*"ギャング" + 0.004*"45分" + 0.004*"栄" + 0.003*"どんぶり" + 0.003*"賑わい...
tpc_3: 0.004*"番号" + 0.003*"きしめん" + 0.002*"手狭" + 0.002*"おかわ" + 0.001*"ソバ" + 0.001*"なぁ～っ"...
tpc_4: 0.002*"テンダー" + 0.002*"ワカ" + 0.002*"ヌイ" + 0.001*"しめの" + 0.001*"キーライムパイ" + 0.001*"...
tpc_5: 0.006*"ボーン" + 0.005*"デザートビュッフェ" + 0.002*"柳川" + 0.001*"ラン" + 0.001*"⭕" + 0.001*"苦...
tpc_6: 0.008*"ハンバーグ" + 0.004*"ステーキ" + 0.004*"ランチ" + 0.003*"カレー" + 0.003*"肉" + 0.003*"ソー...
tpc_7: 0.005*"白金" + 0.002*"⚪" + 0.001*"色紙" + 0.001*"てわけ" + 0.001*"あちき" + 0.001*"代々木駅" +...
tpc_8: 0.006*"常陸牛" + 0.005*"牛たん" + 0.004*"シナモン" + 0.004*"六本木ヒルズ" + 0.003*"昼前" + 0.002*"...
tpc_9: 0.014*"ピラフ" + 0.003*"天使" + 0.003*"1380円" + 0.002*"ゴーヤ" + 0.002*"2件" + 0.002*"シェイ...
tpc_10: 0.013*"ビュッフェ" + 0.005*"ステーキソース" + 0.004*"ハラミステーキ" + 0.003*"990円" + 0.003*"分煙" + ..