# PyDataFukuoka#9
うどんMAPをスクレイピングしてGooglePlacesAPIでレビュー取得してダッシュボーディング

In [1]:
import requests
from bs4 import BeautifulSoup
import io
from tqdm import tqdm

import re

import pandas as pd
import numpy as np

import plotly.express as px
import pickle

import googlemaps

## スクレイピング

In [None]:
# Crawl the listing pages of the TNC "udon map" and collect, for each shop
# entry, [page number, position within the page, list of text fragments].
scrp_res = []

for n in tqdm(range(1,86)):
    # The timeout keeps a single stalled connection from hanging the crawl.
    response = requests.get(
        'https://www.tnc.co.jp/store/shop/archives/category/udonmap/page:{}'.format(n),
        timeout=30,
    )
    soup = BeautifulSoup(response.text,'lxml')
    for i in range(1,11):
        haha = soup.select('#contents_bg > section > ul > li:nth-child({}) > div.txt'.format(i))
        if not haha:
            # No i-th entry on this page: report it and carry on.  An explicit
            # check (instead of a bare ``except``) keeps Ctrl-C and real bugs
            # from being silently swallowed.
            print(n, i)
            continue
        t = haha[0].get_text(strip=False)
        # Keep only the text before the first carriage return, then split it
        # on newlines/tabs and drop the empty pieces.
        first_segment = t.split('\r')[0]
        lis_shopinfo = [x for x in re.split('[\n\t]', first_segment) if x]
        scrp_res.append([n, i, lis_shopinfo])

In [None]:
# Tabulate the scraped entries: one row per shop with its page number, its
# position within the page and the list of raw text fragments.
# The column names are given to the constructor so that an empty scrape
# result still produces a (zero-row) frame instead of a length-mismatch error
# when the names are assigned afterwards.
df_scrp_res = pd.DataFrame(scrp_res, columns=['pageNo', "NoInPage", "scrp_info_list"])

In [None]:
def _first_containing(fragments, marker):
    """Return the first fragment that contains *marker*, or '' if none does."""
    return next((e for e in fragments if marker in e), '')


# Split the raw fragment list of each shop into named columns.
# The first fragment is used as the shop name and the last one as the
# 'Akira_ordere' text; the labelled fields are located by their 【...】 label.
# A missing field is stored as '' (a string) so that later string operations
# on these columns do not fail; ``len(x) > 0`` checks still treat it as empty.
df_scrp_res['shopName'] = df_scrp_res['scrp_info_list'].map(lambda x: x[0] if x else '')
df_scrp_res['adress'] = df_scrp_res['scrp_info_list'].map(lambda x: _first_containing(x, "【住所】"))
df_scrp_res['tel'] = df_scrp_res['scrp_info_list'].map(lambda x: _first_containing(x, "【電話】"))
df_scrp_res['biz_hour'] = df_scrp_res['scrp_info_list'].map(lambda x: _first_containing(x, "【営業時間】"))
df_scrp_res['reg_holiday'] = df_scrp_res['scrp_info_list'].map(lambda x: _first_containing(x, "【定休日】"))
df_scrp_res['Akira_ordere'] = df_scrp_res['scrp_info_list'].map(lambda x: x[-1] if x else '')

In [None]:
# Build the search phrase used for geocoding: the shop name followed by the
# address with its "【住所】" label replaced by a space.
# When no address fragment was found the 'adress' cell holds a non-string
# placeholder; in that case only the shop name is used instead of letting
# re.sub raise a TypeError.
df_scrp_res['srchwd'] = df_scrp_res['shopName'] + df_scrp_res['adress'].map(
    lambda x: re.sub('【住所】', ' ', x) if isinstance(x, str) else ''
)

## Places APIの使用

In [None]:
# Load the Google Places API key from a local file.  The context manager
# closes the file handle, and strip() removes the trailing newline that
# editors usually leave at the end of the file.
with open(".placesapikey") as key_file:
    key = key_file.read().strip()

In [None]:
def serchRating(word:str)->dict:
    """Look up a place by a free-text query and return its Place Details.

    The query is geocoded first; the ``place_id`` of the first geocoding hit
    is then used for a Place Details request restricted to the name, rating,
    review count and geometry fields.

    Args:
        word: Search phrase to geocode.

    Returns:
        The ``result`` dict of the Place Details response, or ``np.nan`` when
        the lookup fails for any reason (no geocoding hit, API or transport
        error, unexpected response shape).
    """
    try:
        client = googlemaps.Client(key)
        geocode_result = client.geocode(word)
        detail = client.place(
            geocode_result[0]['place_id'],
            # The API expects a language code, not a language name.
            language="ja",
            fields=['name', 'rating', 'user_ratings_total', 'geometry'],
        )
        res = detail['result']
    except Exception:
        # Deliberately best-effort: one failed lookup must not abort the
        # whole batch.  ``Exception`` (rather than a bare ``except``) still
        # lets KeyboardInterrupt through so a long run can be cancelled.
        res = np.nan
    return res

In [None]:
%%time
# Run the Places lookup once per search phrase and keep whatever it returns
# (one API round trip per row).
search_words = df_scrp_res['srchwd']
df_scrp_res['takendata'] = search_words.map(serchRating)

### 取得したデータの加工

In [None]:
# Flatten the Place Details dicts stored in 'takendata' into plain columns.
# Rows without a lookup result (null 'takendata') are skipped, so the new
# columns stay NaN for them.
has_details = df_scrp_res['takendata'].notnull()
details = df_scrp_res.loc[has_details, 'takendata']

df_scrp_res.loc[has_details, 'takendata_geometry_lat'] = (
    details.map(lambda x: x['geometry']['location']['lat'])
)

df_scrp_res.loc[has_details, 'takendata_geometry_lng'] = (
    details.map(lambda x: x['geometry']['location']['lng'])
)

df_scrp_res.loc[has_details, 'takendata_name'] = (
    details.map(lambda x: x['name'])
)

# 'rating' and 'user_ratings_total' may be missing from a result.  Looking
# the keys up with .get() yields None in that case, instead of guessing their
# presence from the number of keys in the dict.
df_scrp_res.loc[has_details, 'takendata_rating'] = (
    details.map(lambda x: x.get('rating'))
)

df_scrp_res.loc[has_details, 'takendata_user_ratings_total'] = (
    details.map(lambda x: x.get('user_ratings_total'))
)

In [None]:
# Keep only the shops for which a rating was obtained.
has_rating = df_scrp_res['takendata_rating'].notnull()
df_preped = df_scrp_res.loc[has_rating]

In [None]:
# Collapse rows that share both the scraped shop name and the name returned
# by the Places lookup; the first occurrence is the one that is kept.
df_preped_drp_dup = df_preped.drop_duplicates(
    subset=['shopName', 'takendata_name'],
    keep='first',
)

In [None]:

# Table for display: the scraped shop fields plus rating and review count.
disp_columns = [
    'shopName', 'adress', 'tel', 'biz_hour', 'reg_holiday', 'Akira_ordere',
    'takendata_rating', 'takendata_user_ratings_total',
]
df4disp = df_preped_drp_dup[disp_columns].copy()

# Remove the label (or trailing link text) from each text column; cells that
# are empty become None.
for col_name, noise in (
    ('adress', '【住所】'),
    ('tel', '【電話】'),
    ('biz_hour', '【営業時間】'),
    ('reg_holiday', '【定休日】'),
    ('Akira_ordere', '詳しくはこちら'),
):
    df4disp[col_name] = df4disp[col_name].map(
        lambda x, noise=noise: re.sub(noise, '', x) if len(x)>0 else None
    )
df4disp

In [None]:
# First pass at a municipality column: the part of the address before the
# first '市' and before the first '郡', with the prefecture name removed.
# Addresses that are missing (None) are passed through as None instead of
# raising AttributeError on .split().
df4disp['adress_sp1'] = df4disp['adress'].map(
    lambda x: re.sub('福岡県', '', x.split('市')[0].split('郡')[0]) if isinstance(x, str) else None
)

In [None]:
# Inspect the distinct values currently held in 'adress_sp1'.
first_pass = df4disp['adress_sp1']
first_pass.unique()

In [None]:
def _municipality(address):
    """Reduce an address string to its city ('...市') or county ('...郡') name.

    Returns None when *address* is not a string (missing address).
    """
    if not isinstance(address, str):
        return None
    # Cut after the first '市' (re-appending it), then cut at the first '郡',
    # and drop the prefecture name.
    name = (address.split('市')[0] + '市').split('郡')[0]
    name = re.sub('福岡県', '', name)
    # Unify the two spellings of the same county name.
    if name == '粕屋':
        name = '糟屋'
    # Whatever does not end in '市' was cut at '郡', so put the suffix back.
    # endswith() also copes with an empty string, unlike indexing with [-1].
    if not name.endswith('市'):
        name += '郡'
    # '小郡市' contains '郡' itself: it is cut down to '小' above and becomes
    # '小郡' after the suffix is restored, so repair it here.
    if name == '小郡':
        name = '小郡市'
    return name


df4disp['adress_sp1'] = df4disp['adress'].map(_municipality)
df4disp['adress_sp1'].unique().tolist()

## 可視化

In [None]:
# Register the Mapbox access token stored in a local file.  The context
# manager closes the file handle, and strip() drops the trailing newline.
with open(".mapboxtoken") as token_file:
    px.set_mapbox_access_token(token_file.read().strip())

In [None]:
import plotly.express as px

# Map of the shops: marker colour encodes the rating, marker size the number
# of reviews, and the hover label shows the name returned by the lookup.
# (The unused ``px.data.carshare()`` sample-data load was removed.)
fig = px.scatter_mapbox(
    df_preped_drp_dup,
    lat="takendata_geometry_lat",
    lon="takendata_geometry_lng",
    color="takendata_rating",
    size="takendata_user_ratings_total",
    hover_name='takendata_name',
    color_continuous_scale=px.colors.cyclical.IceFire,
    size_max=20,
    zoom=8,
)
fig.show()

In [None]:
# Number of reviews against rating, with a histogram on each margin and the
# scraped shop name as the hover label.
scatter_options = dict(
    x='takendata_user_ratings_total',
    y='takendata_rating',
    marginal_x='histogram',
    marginal_y='histogram',
    hover_name='shopName',
)
px.scatter(data_frame=df_preped_drp_dup, **scatter_options)

In [None]:
# Rating distribution (with a box plot on the margin) restricted to shops
# that have at least `review_floor` reviews.
review_floor = 88
enough_reviews = df_preped_drp_dup['takendata_user_ratings_total'] >= review_floor
px.histogram(
    data_frame=df_preped_drp_dup.loc[enough_reviews],
    x='takendata_rating',
    marginal='box',
)

In [None]:
# # streamlit
# import streamlit as st

# px.set_mapbox_access_token(open(".mapboxtoken").read())

# df_preped_drp_dup['adress'] = df_preped_drp_dup['adress'].map(lambda x: re.sub('【住所】', '', x) if len(x)>0 else None)
# df_preped_drp_dup['tel'] = df_preped_drp_dup['tel'].map(lambda x: re.sub('【電話】', '', x) if len(x)>0 else None)
# df_preped_drp_dup['biz_hour'] = df_preped_drp_dup['biz_hour'].map(lambda x: re.sub('【営業時間】', '', x) if len(x)>0 else None)
# df_preped_drp_dup['reg_holiday'] = df_preped_drp_dup['reg_holiday'].map(lambda x: re.sub('【定休日】', '', x) if len(x)>0 else None)
# df_preped_drp_dup['Akira_ordere'] = df_preped_drp_dup['Akira_ordere'].map(lambda x: re.sub('詳しくはこちら', '', x) if len(x)>0 else None)


# # サイドバー　スライドバー
# n_rating_cutoff = st.sidebar.slider(
#     'レビュー数の閾値を選んでください！',
#    1.0, df_preped_drp_dup['takendata_user_ratings_total'].max(), (88.0, 1300.0))




# # cutoff適用したdf
# df_n_r_cut = (
#     df_preped_drp_dup.loc[(df_preped_drp_dup['takendata_user_ratings_total']>=n_rating_cutoff[0])&
#                                    (df_preped_drp_dup['takendata_user_ratings_total']<=n_rating_cutoff[1]), :]
#     .copy()
#     .rename(columns={'takendata_user_ratings_total':'n_review', 'takendata_rating':'rating_average'})
# )

# # map
# fig_map = px.scatter_mapbox(df_n_r_cut
#                         , lat="takendata_geometry_lat"
#                         , lon="takendata_geometry_lng"
#                         , color="rating_average"
#                         , size="n_review"
#                         , hover_name='takendata_name'
#                         , color_continuous_scale=px.colors.cyclical.IceFire, size_max=20, zoom=8)

# # scatter
# fig_scatter = px.scatter(data_frame=df_n_r_cut
#                         , x='n_review'
#                         , y='rating_average'
#                         , marginal_x='rug'
#                         , marginal_y='rug'
#                         , hover_name='shopName'
#                         )

# # hist
# fig_hist_r = px.histogram(data_frame=df_n_r_cut
#              , x='rating_average'
#              , marginal='box')


# # hist2
# fig_hist_nr = px.histogram(data_frame=df_n_r_cut
#              , x='n_review'
#              , marginal='box')



# st.markdown('# PyData.Fukuoka\#9 \n ##### うどんMAPっていいコーナーだよね')
# st.text('\n \n \n \n')
# left_column, right_column = st.beta_columns(2)
# # left1
# left_column.text('\n \n \n地図')
# left_column.write(fig_map, use_container_width = True)
# # left2
# left_column.text('ratingの分布')
# left_column.write(fig_hist_r)
                              
# #right1
# right_column.text('\n \n \nレビュー数とratingの関係')
# right_column.write(fig_scatter)
# #right2
# right_column.text('レビュー数の分布')
# right_column.write(fig_hist_nr)

# # 表示 cutoff適用したdf
# st.text('うどん店一覧')
# disp_col_name = ['shopName', 'rating_average', 'n_review', 'adress', 'tel', 'biz_hour', 'reg_holiday', 'Akira_ordere']


# st.dataframe(
#     df_n_r_cut[disp_col_name]
#     , width=1200, height=400)