In [1]:
import os
import re
import json
import time
import glob
import shutil
import random
import requests
import aiofiles
import aiohttp
import asyncio
import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import datetime
import random

# Load the shop list; the path is relative to the current working directory.
df = pd.read_excel(r"dianping_sid.xlsx")
# Row-wise copy of the sheet; its length drives the batching loop at the
# bottom of the file.
df_li = df.values.tolist()
# The shop id is taken from the second column of every row.
sid_li = [row[1] for row in df_li]

def get_proxy(timeout=10):
    """Fetch one proxy record from the local proxy-pool service.

    Args:
        timeout: Seconds to wait for the pool service. ``requests`` waits
            indefinitely when no timeout is given, which would stall the
            caller forever if the pool stops answering.

    Returns:
        The decoded JSON body of ``http://127.0.0.1:5010/get/``. The caller
        in this file reads its ``"proxy"`` key.

    Raises:
        requests.RequestException: If the pool cannot be reached or does not
            answer within ``timeout`` seconds.
        ValueError: (or a subclass) if the response body is not valid JSON.
    """
    return requests.get("http://127.0.0.1:5010/get/", timeout=timeout).json()

    
async def _fetch_shop_json(url_1, url_2, proxy, headers):
    """Request both shop endpoints through ``proxy``.

    Returns:
        A ``(data, rank)`` tuple with the decoded JSON bodies of ``url_1`` and
        ``url_2``, or ``None`` when either response status is not 200.
    """
    proxy_url = f'http://{proxy}'
    async with aiohttp.ClientSession() as session:
        async with session.get(url=url_1, proxy=proxy_url, headers=headers) as resp1, \
                session.get(url=url_2, proxy=proxy_url, headers=headers) as resp2:
            if resp1.status != 200 or resp2.status != 200:
                print('webpage status not 200 error')
                return None
            return await resp1.json(), await resp2.json()


async def getPost(sid, max_retries=20):
    """Download the detail and rank JSON for one shop and save them to disk.

    Each attempt asks the proxy pool for a proxy, fetches both endpoints
    through it and, on success, writes ``dianping/{sid}.json``. A failed
    attempt (no proxy, network error, non-200 status, undecodable body) is
    retried with a new proxy.

    Args:
        sid: Shop id as a string; it is concatenated into the request URLs
            and used as the output file name.
        max_retries: Upper bound on attempts. Retries are done in a loop
            rather than by recursion so that a permanently failing shop
            cannot nest coroutines (and open sessions) without limit.

    Returns:
        None. On success the result is the file written to ``dianping/``;
        when every attempt fails a message is printed and nothing is written.

    Raises:
        OSError: If the output file cannot be written. This is not retried
            because a new proxy cannot fix a local disk problem.
    """
    url_web = 'http://m.dianping.com/shop/' + sid
    url_1 = f'https://mapi.dianping.com/mapi/msource/shop.bin?device_system=MACINTOSH&lat=0&lng=0&mtsiReferrer=pages%2Fdetail%2Fdetail%3FshopUuid%3D{sid}%26online%3D1%26shopuuid%3D{sid}%26shopId%3D{sid}%26pageName%3Dshop&online=1&pageName=shop&shopUuid={sid}&shopuuid={sid}&'
    url_2 = f'https://m.dianping.com/wxmapi/shop/rankinfo?device_system=MACINTOSH&shopUuid={sid}&showPos=tag&'
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'}

    loop = asyncio.get_running_loop()

    for _ in range(max_retries):
        payload = None
        try:
            # get_proxy() uses the blocking `requests` library; running it in
            # the default executor keeps the event loop free for other tasks.
            proxy_info = await loop.run_in_executor(None, get_proxy)
            proxy = proxy_info.get("proxy")
            print(proxy)
            if proxy:
                payload = await _fetch_shop_json(url_1, url_2, proxy, headers)
        except Exception:
            # Retry boundary: any proxy/network/decoding failure just means
            # "try again with another proxy".
            payload = None

        if payload is None:
            print(sid, "restarts")
            continue

        data, rank = payload
        content = {'url': url_web, 'sid': sid, 'data': data, 'rank': rank}
        async with aiofiles.open(f"dianping/{sid}.json", mode="w", encoding="utf-8") as f:
            await f.write(json.dumps(content, ensure_ascii=False))
        # A payload without 'shopPowerRate' must not raise here: the file is
        # already saved and raising would trigger a pointless re-download.
        rate = data.get('shopPowerRate') if isinstance(data, dict) else None
        print({'shop': url_web, 'rate': rate})
        return

    print(sid, "gave up after", max_retries, "attempts")


async def getPosts(sid_li, start_id, end_id):
    """Download one batch of shops concurrently, skipping finished ones.

    Args:
        sid_li: Full list of shop ids.
        start_id: Index of the first shop in the batch.
        end_id: Index of the last shop in the batch (inclusive).

    Returns:
        None. Shops that already have a ``dianping/<sid>.json`` file are not
        requested again. When nothing in the batch is left to download,
        ``"{start_id} to {end_id} is full"`` is printed.
    """
    # Drop the 5-character ".json" suffix to recover the shop ids; a set
    # keeps the membership test below O(1) per shop.
    finished_sids = {filename[:-5] for filename in os.listdir("dianping")}
    pending = [sid for sid in sid_li[start_id: end_id + 1] if sid not in finished_sids]

    # Checked explicitly instead of relying on the ValueError that
    # asyncio.wait() raises for an empty task list.
    if not pending:
        print(f"{start_id} to {end_id} is full")
        return

    # return_exceptions=True lets one failing shop neither cancel the rest of
    # the batch nor leave an unretrieved task exception behind.
    results = await asyncio.gather(*(getPost(sid) for sid in pending), return_exceptions=True)
    for sid, result in zip(pending, results):
        if isinstance(result, BaseException):
            print(sid, "failed:", repr(result))
        
        
def makedir(directory):
    """Reset ``directory`` to a fresh, empty folder.

    An existing directory tree at that path is removed together with its
    contents before the new directory is created.

    Args:
        directory: Path of the directory to (re)create.
    """
    # Wipe whatever an earlier run left behind.
    if os.path.exists(directory):
        shutil.rmtree(directory)
    # Only create the directory when the path is free.
    if os.path.exists(directory):
        return
    os.mkdir(directory)

      
def get_csv():
    """Flatten the downloaded shop JSON files into one dated CSV file.

    Reads every ``dianping/*.json`` file in sorted order, copies selected
    keys of the ``data`` and ``rank`` payloads into columns of their own,
    stamps the rows with today's date, drops the raw payload columns and
    writes ``result_<YYYY-MM-DD>.csv`` without the index.
    """
    json_paths = sorted(glob.glob('dianping/*.json'))
    # Each file is read with orient='index' and transposed; presumably that
    # yields one row per file with url/sid/data/rank columns -- verify.
    frames = [pd.read_json(path, orient='index').T for path in json_paths]
    df = pd.concat(frames, ignore_index=True)

    # Keys copied straight from the shop payload, in output column order.
    shop_fields = (
        'name',
        'branchName',
        'shopPowerRate',
        'scoreText',
        'priceText',
        'voteTotal',
        'cityName',
        'regionName',
        'address',
    )
    for field in shop_fields:
        df[field] = df.data.str.get(field)
    df['recentBizTime'] = df.data.str.get('recentBizTime').str.get('title')

    # 'rank' is indexed with brackets because DataFrame.rank is a method.
    rank_info = df['rank'].str.get('rankInfo')
    df['rankShortName'] = rank_info.str.get('rankShortName')
    df['rankings'] = rank_info.str.get('rankings').astype('Int64')
    df['rankUrl'] = rank_info.str.get('rankUrl')

    df['date'] = datetime.now().strftime('%Y-%m-%d')

    flattened = df.drop(columns=['data', 'rank'])
    # Empty strings are written out as missing values.
    flattened = flattened.replace(np.nan, np.nan).replace('', np.nan)
    out_name = f"result_{datetime.now().strftime('%Y-%m-%d')}.csv"
    flattened.to_csv(out_name, index=False)


async def main(batch_size=100):
    """Run the whole scrape: reset the output folder, download, export CSV.

    Args:
        batch_size: Number of shops downloaded concurrently per batch.
    """
    # NOTE: this wipes every previously downloaded file, so each run starts
    # from scratch and the CSV only contains data fetched in this run.
    makedir("dianping")
    total = len(df_li)
    for i in tqdm(range(0, total, batch_size)):
        t2 = datetime.now()
        # getPosts() treats its end index as inclusive.
        await getPosts(sid_li, i, i + batch_size - 1)
        remaining = max(0, total - i - batch_size)
        # ETA = time per shop in this batch * shops still to download.
        print("Num:", min(total, i + batch_size), "/", total,
              "ETA:", (datetime.now() - t2) / batch_size * remaining)
    get_csv()
    print("Done!")


if __name__ == '__main__':
    # A bare top-level `await` is only valid inside IPython/Jupyter; in a
    # plain script it is a SyntaxError. Pick the launcher that fits.
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # Plain script: no loop is running yet, so start one.
        asyncio.run(main())
    else:
        # A loop is already running (e.g. a notebook kernel): schedule the
        # job on it and keep a reference so the task is not garbage-collected.
        _main_task = asyncio.ensure_future(main())

  0%|          | 0/41 [00:00<?, ?it/s]

47.92.113.71:80
47.92.113.71:80
47.100.201.85:80
47.57.188.208:80
110.242.49.230:80
124.193.74.31:80
47.57.188.208:80
183.232.186.111:80
59.148.173.254:80
182.61.201.201:80
60.188.5.130:80
110.242.49.230:80
59.148.173.254:80
124.193.74.31:80
183.232.186.111:80
60.188.5.136:80
47.100.201.85:80
183.232.186.111:80
60.188.5.136:80
59.148.173.254:80
59.148.173.254:80
60.188.5.130:80
47.92.113.71:80
47.92.113.71:80
47.92.113.71:80
59.148.173.254:80
60.188.5.136:80
124.193.74.31:80
47.92.113.71:80
182.61.201.201:80
47.57.188.208:80
124.193.74.31:80
59.148.173.254:80
60.188.5.130:80
59.148.173.254:80
59.148.173.254:80
183.232.186.111:80
60.188.5.130:80
47.57.188.208:80
47.57.188.208:80
47.57.188.208:80
47.100.201.85:80
59.148.173.254:80
47.57.188.208:80
124.193.74.31:80
183.232.186.111:80
110.242.49.230:80
110.242.49.230:80
183.232.186.111:80
47.92.113.71:80
59.148.173.254:80
47.100.201.85:80
182.61.201.201:80
60.188.5.130:80
124.193.74.31:80
183.232.186.111:80
182.61.201.201:80
47.100.201.85: