In [None]:
# This notebook scrapes keyword-search results and post data from Weibo (weibo.com).
# Parts of the program are adapted from https://github.com/dataabc/weibo-crawler

In [None]:
import requests
from requests.adapters import HTTPAdapter, Retry
import time
import json
from pathlib import Path
import os
import pandas as pd
from bs4 import BeautifulSoup

In [None]:
class Downloader:
    """Fetch Weibo keyword-search result pages from s.weibo.com.

    A single ``requests`` session is reused for all requests. Responses with
    status 500/502/503/504 are retried up to 5 times with a short backoff.
    """

    # (connect, read) timeout in seconds. Without a timeout a stalled
    # connection would block the whole crawl indefinitely.
    timeout = (10, 30)

    def __init__(self):
        self.s = requests.session()
        retries = Retry(total=5,
                        backoff_factor=0.1,
                        status_forcelist=[500, 502, 503, 504])
        self.s.mount('http://', HTTPAdapter(max_retries=retries))
        self.s.mount('https://', HTTPAdapter(max_retries=retries))

    def get(self, keyword, page, start_time, end_time):
        """Download one page of search results.

        Args:
            keyword: Search term, sent as the ``q`` query parameter.
            page: Result page number; converted to ``str`` for the query.
            start_time: Start of the search window, as expected by the
                ``timescope`` parameter (the notebook passes ``YYYY-MM-DD-HH``).
            end_time: End of the search window, same format as ``start_time``.

        Returns:
            bytes: The raw response body (no status check is performed).

        Raises:
            requests.exceptions.RequestException: On connection failure or
                when the timeout expires after the configured retries.
        """
        timescope = f"custom:{start_time}:{end_time}"
        resp = self.s.get(
            "https://s.weibo.com/weibo",
            params={
                "q": keyword,
                "typeall": "1",
                "suball": "1",
                "timescope": timescope,
                "Refer": "g",
                "page": str(page),
            },
            # Headers copied from a desktop Chrome session.
            headers={
                "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
                "accept-encoding": "gzip, deflate, br, zstd",
                "accept-language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
                "cache-control": "max-age=0",
                "priority": "u=0, i",
                "referer": "https://passport.weibo.com/",
                "sec-ch-ua": "\"Not/A)Brand\";v=\"8\", \"Chromium\";v=\"126\", \"Google Chrome\";v=\"126\"",
                "sec-ch-ua-mobile": "?0",
                "sec-ch-ua-platform": "\"macOS\"",
                "sec-fetch-dest": "document",
                "sec-fetch-mode": "navigate",
                "sec-fetch-site": "same-origin",
                "sec-fetch-user": "?1",
                "upgrade-insecure-requests": "1",
                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
            },
            # NOTE(security): these are hard-coded session cookies. They look
            # like account credentials and should be loaded from the
            # environment or a local, untracked file instead of being committed.
            cookies={
                "_s_tentry": "-",
                "Apache": "2702705434452.588.1723222797133",
                "SINAGLOBAL": "2702705434452.588.1723222797133",
                "ULV": "1723222797169:1:1:1:2702705434452.588.1723222797133:",
                "SCF": "AkJEZvqK2YnhrqOs5VDcuiy9TaRKPLvDuKc2mdSr12a23ejc9uz1-ytkJXLfSLuDypwbPPWNP-ydzfpD5Y2scZE.",
                "ALF": "1726763082",
                "SUB": "_2A25LwLMaDeRhGeRI4lET8ijLyT6IHXVovErSrDV8PUJbkNANLW3ukW1NUs4-fWVTYDCfj3yakyQhcqTzWdERcq4t",
                "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9W5RiTdkdGGuk7WzMs_fbBje5JpX5KzhUgL.Fozc1KeEeoqNeoz2dJLoIpXLxKBLB.zL1h-LxKBLB.zL1h-LxKqL1KnLB-93M7tt",
            },
            timeout=self.timeout,
        )
        return resp.content


In [None]:
import datetime

def parse_weibo_time_list(begin_time: str, end_time: str, day_interval: int = 10):
    """Split the range [begin_time, end_time] into consecutive search windows.

    Each window is ``day_interval`` days long. If the range is not an exact
    multiple of ``day_interval``, a final shorter window ending at
    ``end_time`` is appended so that the tail of the range is not lost.
    A range shorter than one interval yields the single window
    ``[begin_time, end_time]``.

    Args:
        begin_time (str): Range start, formatted ``%Y-%m-%d-%H``.
        end_time (str): Range end, formatted ``%Y-%m-%d-%H``.
        day_interval (int, optional): Window length in days. Defaults to 10.

    Returns:
        list[list[str]]: ``[begin, end]`` pairs formatted ``%Y-%m-%d-%H``
        (the hour is zero-padded on output).

    Raises:
        ValueError: If ``day_interval`` is not positive, or if either
            timestamp does not match the expected format.
    """
    if day_interval <= 0:
        # A zero or negative step would never advance the cursor.
        raise ValueError("day_interval must be a positive number of days")

    fmt = "%Y-%m-%d-%H"
    range_start = datetime.datetime.strptime(begin_time, fmt)
    range_end = datetime.datetime.strptime(end_time, fmt)
    step = datetime.timedelta(days=day_interval)

    windows = []
    cursor = range_start
    while cursor + step <= range_end:
        windows.append([cursor, cursor + step])
        cursor += step

    # Cover whatever is left after the last full window. This also handles a
    # range shorter than one interval, where no full window was produced.
    if cursor < range_end or not windows:
        windows.append([cursor, range_end])

    return [[lo.strftime(fmt), hi.strftime(fmt)] for lo, hi in windows]


# Search windows ([begin, end] pairs, 5 days each) used by every download below.
search_time = parse_weibo_time_list('2009-08-16-0', '2024-08-20-0', 5)

# Show a short summary instead of dumping the whole list into the cell output.
print(f"{len(search_time)} search windows: {search_time[0]} ... {search_time[-1]}")



In [None]:
from tqdm.notebook import tqdm
def download_one_key(keyword):
    """Download the search result pages for one keyword over every search window.

    Pages are saved as ``output/<keyword>/<begin>_<end>_<page>.html``. Files
    that already exist are not downloaded again, so an interrupted run can be
    resumed. Within a window, paging stops at the first page containing the
    ``card-no-result`` marker, or after page 50.

    Args:
        keyword: Search term; also used as the output sub-directory name.

    Raises:
        UnicodeDecodeError: If a response body is not valid UTF-8 (the raw
            body is printed first).
    """
    d = Downloader()
    outdir = Path('output') / keyword
    # parents=True: the top-level 'output' directory may not exist yet.
    outdir.mkdir(parents=True, exist_ok=True)

    for begin, end in tqdm(search_time):
        for page_num in range(1, 51):
            outf = outdir / f'{begin}_{end}_{page_num}.html'

            if outf.exists():
                # Already downloaded: only check whether this window is exhausted.
                # Read as UTF-8 explicitly, matching how the file is written
                # and how later cells read it.
                if "card-no-result" in outf.read_text(encoding='utf-8'):
                    tqdm.write(f'skip {outf}')
                    break
                continue

            raw = d.get(keyword, page_num, begin, end)
            try:
                result = raw.decode('utf-8')
            except UnicodeDecodeError:
                print(raw)
                raise
            time.sleep(0.6)  # throttle requests
            # Write as UTF-8 regardless of the platform's default encoding.
            outf.write_text(result, encoding='utf-8')

            if "card-no-result" in result:
                tqdm.write(f'reached no result {outf}')
                break
            tqdm.write(f'done {outf}')

def main():
    """Download the search result pages for each keyword of interest, in order."""
    keywords = ('雇佣黑社会', '雇打手', '雇小混混', '雇社会闲散人员')
    for keyword in keywords:
        download_one_key(keyword)


main()

In [None]:
# Keep only the saved result pages that actually list posts, i.e. drop the
# pages showing the "no results" card ('抱歉，未找到相关结果').

# Every path a download could have produced: keyword x search window x page 1..50
file_paths = [
    f'output/{k}/{begin}_{end}_{page_num}.html'
    for k in ['雇佣黑社会','雇打手','雇小混混','雇社会闲散人员']
    for begin, end in search_time
    for page_num in range(1,51)
]

# Of those, keep the ones that exist on disk and lack the no-result marker.
filtered_files = []
for candidate in file_paths:
    page = Path(candidate)
    if not page.exists():
        continue
    if "card-no-result" in page.read_text(encoding='utf-8'):
        continue
    filtered_files.append(candidate)

# Print the filtered file paths, one per line
print("Files that do not contain 'card-no-result':")
if filtered_files:
    print("\n".join(filtered_files))

In [None]:
# Extract the Weibo post IDs from the saved result pages (duplicates removed)

import re

# The ID is captured from the link attached to each post's
# '复制微博地址' ("copy post address") menu entry.
tweet_id_pattern = re.compile(r'\d+/(.*?)\?refer_flag=1001030103_\'\)">复制微博地址</a>')

# A set removes IDs that show up on more than one result page
unique_tweet_ids = set()

for file_path in filtered_files:
    with open(file_path, 'r', encoding='utf-8') as file:
        unique_tweet_ids.update(tweet_id_pattern.findall(file.read()))

# A set has no defined order, so sort to get the same list on every run
all_tweet_ids = sorted(unique_tweet_ids)

# Print all unique extracted tweet IDs
print("Extracted Tweet IDs:")
for tweet_id in all_tweet_ids:
    print(tweet_id)



In [None]:
# download without logging in

class Downloader_new:
    """Fetch a single post's JSON from the Weibo ajax endpoint.

    Subclasses may override ``url`` to target a different endpoint that takes
    the same ``id`` query parameter. Responses with status 500/502/503/504 are
    retried up to 5 times with a short backoff.
    """

    url = "https://weibo.com/ajax/statuses/show"

    # (connect, read) timeout in seconds. Without a timeout a stalled
    # connection would block the whole crawl indefinitely.
    timeout = (10, 30)

    def __init__(self):
        self.s = requests.session()
        retries = Retry(total=5,
                        backoff_factor=0.1,
                        status_forcelist=[500, 502, 503, 504])
        self.s.mount('http://', HTTPAdapter(max_retries=retries))
        self.s.mount('https://', HTTPAdapter(max_retries=retries))

    def get(self, tweet_id):
        """Request ``self.url`` for one post.

        Args:
            tweet_id: Post identifier, sent as the ``id`` query parameter.

        Returns:
            bytes: The raw response body (no status check is performed).

        Raises:
            requests.exceptions.RequestException: On connection failure or
                when the timeout expires after the configured retries.
        """
        resp = self.s.get(
            self.url,
            params={
                "id": tweet_id,
            },
            # Headers copied from a desktop Chrome session.
            headers={
                "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
                "accept-encoding": "gzip, deflate, br, zstd",
                "accept-language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
                "cache-control": "max-age=0",
                "priority": "u=0, i",
                "sec-ch-ua": "\"Not/A)Brand\";v=\"8\", \"Chromium\";v=\"126\", \"Google Chrome\";v=\"126\"",
                "sec-ch-ua-mobile": "?0",
                "sec-ch-ua-platform": "\"macOS\"",
                "sec-fetch-dest": "document",
                "sec-fetch-mode": "navigate",
                "sec-fetch-site": "none",
                "sec-fetch-user": "?1",
                "upgrade-insecure-requests": "1",
                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
            },
            # NOTE(security): hard-coded cookie values; prefer loading them
            # from the environment or a local, untracked file.
            cookies={
                "XSRF-TOKEN": "bRT86qipdndDABwvrUrtgLHu",
                "SUB": "_2AkMR6sQzdcPxrAFXkfsXzm3lbo9H-jyiP63FAn7uJhMyOhgP7nM1qSdutBF-XAXYLzAXdMZbjazkR8IdeVv_eg7b",
                "SUBP": "0033WrSXqPxfM72wWs9jqgMF55529P9D9W5RiTdkdGGuk7WzMs_fbBje5JpVhgU4PcDuBh2ESb4odcXt",
                "_s_tentry": "-",
                "Apache": "2702705434452.588.1723222797133",
                "SINAGLOBAL": "2702705434452.588.1723222797133",
                "ULV": "1723222797169:1:1:1:2702705434452.588.1723222797133:",
            },
            timeout=self.timeout,
        )

        return resp.content


def main():
    """Download the JSON for every extracted post ID into ``clean/<tweet_id>``.

    IDs whose file already exists are skipped, so an interrupted run can be
    resumed. The path of each processed ID is printed.
    """
    d = Downloader_new()
    # The output directory is not created anywhere else; make sure it exists
    # before the first write.
    Path('clean').mkdir(parents=True, exist_ok=True)
    for tweet_id in tqdm(all_tweet_ids):
        outf_new = Path(f'clean/{tweet_id}')
        if not outf_new.exists():
            result = d.get(tweet_id).decode('utf-8')
            time.sleep(0.1)  # throttle requests
            # Write as UTF-8 explicitly: the parsing cell reads these files
            # with encoding='utf-8'.
            outf_new.write_text(result, encoding='utf-8')
        print(outf_new)

main()

In [None]:
# Import the submodule explicitly: a bare `import dateutil` does not guarantee
# that `dateutil.parser` is loaded.
import dateutil.parser


def parse_time(s):
    """Reformat a Weibo timestamp as ``YYYY-MM-DD HH:MM:SS``.

    Example: ``Wed Oct 19 23:44:36 +0800 2022`` -> ``2022-10-19 23:44:36``.
    The clock time is kept as written; the UTC offset is dropped from the
    output, not converted.

    Args:
        s: Timestamp string understood by ``dateutil.parser.parse``.

    Returns:
        str: The timestamp formatted with ``%Y-%m-%d %H:%M:%S``.
    """
    return dateutil.parser.parse(s).strftime('%Y-%m-%d %H:%M:%S')


def parse_user_info(data):
    """Extract the basic profile fields from a user payload.

    Args:
        data: Mapping with the keys ``id``, ``screen_name`` and ``verified``.

    Returns:
        dict: ``_id`` (the ID as a string), ``nick_name`` and ``verified``.
    """
    user_id = str(data['id'])
    nick_name = data['screen_name']
    is_verified = data['verified']
    return {
        "_id": user_id,
        "nick_name": nick_name,
        "verified": is_verified,
    }

def parse_blog_info(data):
    """Flatten one post payload into a dict of the fields kept for analysis.

    Optional fields missing from the payload are replaced by the module-level
    placeholder ``g_none_word``. ``content_long`` is left empty here and is
    filled in by the caller. ``video`` is ``'N/A'`` unless the post carries a
    video with a 720p MP4 rendition.

    Args:
        data: Decoded post payload. Must contain ``user``, ``mid``,
            ``mblogid``, ``created_at`` and ``pic_num``.

    Returns:
        dict: The flattened post record, including its public ``url``.

    Raises:
        KeyError: If one of the required keys is missing.
    """
    user = parse_user_info(data['user'])
    tweet = {
        "_id": str(data['mid']),
        "mblogid": data['mblogid'],  # post ID as used in the post URL
        "created_at": parse_time(data['created_at']),  # publication time
        "user_id": user['_id'],
        'user_nickname': user['nick_name'],
        'user_verified': user['verified'],
        # Strip zero-width spaces from the text
        "content": data.get('text_raw', g_none_word).replace('\u200b', ''),
        'content_long': '',
        "geo": data.get('geo', g_none_word),
        "ip_location": data.get('region_name', g_none_word),
        "reposts_count": data.get('reposts_count', g_none_word),
        "comments_count": data.get('comments_count', g_none_word),
        "attitudes_count": data.get('attitudes_count', g_none_word),
        "source": data.get("source", g_none_word),
        "pic_urls": ["https://wx1.sinaimg.cn/orj960/" + pic_id for pic_id in data.get('pic_ids', [])],
        "pic_num": data['pic_num'],
        'is_long_text': 'continue_tag' in data and data['isLongText'],
        "video": 'N/A'
    }

    # A missing or null page_info / media_info, or a video without a 720p
    # rendition, all leave the 'N/A' placeholder in place instead of raising.
    page_info = data.get('page_info') or {}
    if page_info.get('object_type', '') == 'video':
        med_info = page_info.get('media_info') or {}
        tweet['video'] = med_info.get('mp4_720p_mp4', 'N/A')

    tweet["url"] = f"https://weibo.com/{user['_id']}/{tweet['mblogid']}"  # post address
    return tweet

In [None]:
# Directory scanned below for the saved per-post responses
directory = "clean"

# Initialize a list to hold all parsed data (one dict per post)
parsed_data = []

# Placeholder stored for optional fields that are missing from a post payload
# (read as a global by parse_blog_info)
g_none_word = 'NA'

class DownloaderLongTxt(Downloader_new):
    """Downloader_new variant that requests the long-text endpoint instead."""
    url = "https://weibo.com/ajax/statuses/longtext"

# Shared instance used by get_long_text
downloader_long = DownloaderLongTxt()

def get_long_text(blog_id):
    """Return the full text of a long post, using an on-disk cache.

    The raw response is stored in ``<directory>/long/<blog_id>.json`` on the
    first call and re-read from there afterwards, including responses that
    turn out to be errors.

    Args:
        blog_id: Post identifier passed to the long-text endpoint.

    Returns:
        str: The long text, or a string starting with ``'N/A due to'`` when
        the response is a 400 page, reports an error, or carries no data.

    Raises:
        json.JSONDecodeError: If the cached response is not valid JSON (the
            file path and content are printed first).
        KeyError: If the response has data but no ``longTextContent`` field.
    """
    dir_long = Path(directory) / 'long'
    dir_long.mkdir(parents=True, exist_ok=True)
    file_path = dir_long / f'{blog_id}.json'
    if not file_path.exists():
        print('requesting long text for', blog_id)
        file_path.write_bytes(downloader_long.get(blog_id))

    # The file holds the raw response bytes; decode them as UTF-8 rather than
    # with the platform's default encoding.
    content = file_path.read_text(encoding='utf-8')
    if content == '<h2>400 Bad Request</h2>':
        print("long 400", blog_id)
        return "N/A due to 400"
    try:
        data = json.loads(content)
    except json.JSONDecodeError:
        print('bad json', file_path)
        print(content)
        raise

    if not data.get('ok'):
        # Error fields are optional; .get avoids a KeyError when they are absent.
        if data.get('error_code'):
            return f'N/A due to {data.get("message")}'
        return 'N/A due to not OK but empty error code'

    payload = data.get('data') or {}
    if not payload:
        return 'N/A due to empty data'
    return payload['longTextContent']

# Parse every saved post response in `directory` into one record.
# os.listdir returns names in arbitrary order; sorting makes the row order of
# the resulting DataFrame reproducible.
for filename in tqdm(sorted(os.listdir(directory))):
    file_path = os.path.join(directory, filename)

    # Skip sub-directories (e.g. the long-text cache) and checkpoint entries
    if os.path.isdir(file_path) or '.ipynb_checkpoints' in filename:
        continue

    # Read the whole file first so it is closed before any network request
    # made by get_long_text.
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    if content == '<h2>400 Bad Request</h2>':
        print('400', filename)
        continue

    # Parse the blog information
    try:
        data = json.loads(content)
        if data.get('error_code', 0) == 20101:
            print('missing', filename)
            continue
        tweet_info = parse_blog_info(data)
        if tweet_info['is_long_text']:
            tweet_info['content_long'] = get_long_text(tweet_info['mblogid'])
        else:
            tweet_info['content_long'] = tweet_info['content']
    except Exception:
        # Name the offending file, then let the error propagate.
        print('failed', filename)
        raise

    # Append the parsed data to the list
    parsed_data.append(tweet_info)

# Convert the list of dictionaries into a DataFrame
df = pd.DataFrame(parsed_data)

# Save the DataFrame as Excel and Parquet files
df.to_excel("all_tweets.xlsx", index=False)
df.to_parquet("all_tweets.parquet", index=False)

print("DataFrame created and saved to all_tweets.xlsx")

In [None]:
# Summarize the parsed posts: columns, dtypes and non-null counts
df.info()

In [None]:
# Also save the DataFrame as a pickle file
df.to_pickle("all_tweets.pkl")