In [None]:
# This notebook scrapes data from WQW (https://wqw2010.blogspot.com/)

In [None]:
import requests
from requests.adapters import HTTPAdapter, Retry
import time
import json
from pathlib import Path
import os
import pandas as pd
from bs4 import BeautifulSoup

In [None]:
class Downloader:
    """Fetch paginated search-result pages from the WQW blog.

    A single ``requests`` session is reused for all calls. The session retries
    a request up to 5 times (with a short exponential backoff) when the server
    answers 500, 502, 503 or 504.
    """

    SEARCH_URL = "https://wqw2010.blogspot.com/search"

    # Browser-like request headers.
    # NOTE: "accept-encoding" is intentionally NOT set here. requests fills it
    # in with the encodings it is able to decode in the current environment;
    # hard-coding "br"/"zstd" can make the server send a body that comes back
    # still compressed when the optional decoder packages are not installed.
    HEADERS = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
        "priority": "u=0, i",
        "sec-ch-ua": "\"Google Chrome\";v=\"129\", \"Not=A?Brand\";v=\"8\", \"Chromium\";v=\"129\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"macOS\"",
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "none",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
    }

    def __init__(self):
        self.s = requests.Session()
        retries = Retry(total=5,
                        backoff_factor=0.1,
                        status_forcelist=[500, 502, 503, 504])
        self.s.mount('http://', HTTPAdapter(max_retries=retries))
        self.s.mount('https://', HTTPAdapter(max_retries=retries))

    def get(self, keyword, start_num, timeout=30):
        """Download one search-result page and return its raw body.

        Args:
            keyword: Search term sent as the ``q`` query parameter.
            start_num: Pagination offset sent as the ``start`` query parameter
                (each page requests 20 results).
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the notebook indefinitely.

        Returns:
            The response body as ``bytes``.

        Raises:
            requests.HTTPError: If the final response has a 4xx/5xx status, so
                an error page is never mistaken for a result page.
            requests.RequestException: On connection problems or timeouts.
        """
        resp = self.s.get(
            self.SEARCH_URL,
            params={
                "q": keyword,
                "max-results": "20",
                "start": start_num,
                "by-date": "true",
            },
            headers=self.HEADERS,
            timeout=timeout,
        )
        resp.raise_for_status()
        return resp.content


In [None]:
from tqdm.notebook import tqdm
def download_one_key(keyword):
    """Download every search-result page for ``keyword`` into ``../WQW/<keyword>/``.

    Pages are requested at offsets 0, 20, ..., 600 and stored as
    ``start=<offset>.html``. Pages that are already on disk are skipped, so an
    interrupted run can be resumed by calling the function again.

    Args:
        keyword: Search term; also used as the name of the output sub-directory.

    Raises:
        UnicodeDecodeError: If a downloaded page is not valid UTF-8 (the raw
            bytes are printed first to help diagnose the response).
    """
    d = Downloader()
    outdir = Path('../WQW') / keyword
    # parents=True: the top-level ../WQW folder may not exist on a first run.
    outdir.mkdir(parents=True, exist_ok=True)

    for num in range(0, 601, 20):
        outf = outdir / f'start={num}.html'
        if outf.exists():
            # Skip only this page; later pages may still be missing after an
            # interrupted run, so keep going instead of abandoning the keyword.
            tqdm.write(f'file {outf} already exists, skipping')
            continue

        raw = d.get(keyword, num)
        try:
            page = raw.decode('utf-8')
        except UnicodeDecodeError:
            print(raw)
            raise
        # Pause between requests to stay gentle on the server.
        time.sleep(0.6)
        # Explicit UTF-8: later cells read these files back with encoding='utf-8',
        # and the platform default encoding is not UTF-8 everywhere.
        with open(outf, 'w', encoding='utf-8') as f:
            f.write(page)
        tqdm.write(f'done {outf}')

# Search terms to scrape; append to this list to try new keywords
keywords = ['雇佣黑社会', '带领黑社会', '组织黑社会', '指使黑社会', '勾结黑社会', '安排黑社会', '聘请黑社会', '打手', '小混混', '闲散人员', '地痞流氓']


def main():
    """Run the search-page download once for each entry in ``keywords``."""
    for term in keywords:
        download_one_key(term)


main()

In [None]:
# Keep only the result pages that actually list posts: pages containing the
# blog's "no posts match the query" notice (the marker "没有符合") are dropped.

# Every page the search download may have written: one per keyword and offset
file_paths = [
    f'../WQW/{k}/start={num}.html'
    for k in keywords
    for num in range(0, 601, 20)
]

filtered_files = []

for candidate in file_paths:
    # Pages that were never downloaded are simply ignored
    if not Path(candidate).exists():
        continue
    with open(candidate, 'r', encoding='utf-8') as fh:
        page_text = fh.read()
    if "没有符合" in page_text:
        continue
    filtered_files.append(candidate)

# Report which pages survived the filter
print("Files that do not contain '没有符合':")
for kept in filtered_files:
    print(kept)

In [None]:
# Extract article IDs (URL paths) from the saved result pages, without duplicates

import re

# Collect into a set so an article linked from several pages is kept once
all_tweet_ids = set()

# Loop through each file path in filtered_files
for file_path in filtered_files:
    # Open the file and read its content
    with open(file_path, 'r', encoding='utf-8') as file:
        result_text = file.read()

    # Capture the path part of every single-quoted link that points at the blog
    tweet_ids = re.findall(r"<a href='https://wqw2010\.blogspot\.com/([^']+)", result_text)

    # Drop links whose path starts with 'search' (search/label pages, not articles)
    all_tweet_ids.update(
        tweet_id for tweet_id in tweet_ids if not tweet_id.startswith('search')
    )

# A set has no defined order; sort so the list and the printed output are
# identical on every run
all_tweet_ids = sorted(all_tweet_ids)

# Print all unique extracted IDs
print("Extracted Tweet IDs:")
for tweet_id in all_tweet_ids:
    print(tweet_id)



In [None]:
# Cut each ID at the first "#more" (if any) and de-duplicate the results

cleaned_ids = {post_id.partition("#more")[0] for post_id in all_tweet_ids}

# Show the resulting set of IDs
print(cleaned_ids)

In [None]:
# download articles with their links

class Downloader_new:
    """Fetch individual article pages from the WQW blog.

    A single ``requests`` session is reused for all calls. The session retries
    a request up to 5 times (with a short exponential backoff) when the server
    answers 500, 502, 503 or 504.
    """

    url = "https://wqw2010.blogspot.com/"

    # Browser-like request headers.
    # NOTE: two kinds of header are intentionally left out:
    #   * "if-modified-since" / "if-none-match": these make the request
    #     conditional, so the server may reply "304 Not Modified" with an
    #     empty body, which would then be stored as the article.
    #   * "accept-encoding": requests fills it in with the encodings it can
    #     decode here; hard-coding "br"/"zstd" can yield a body that comes
    #     back still compressed when the optional decoders are not installed.
    HEADERS = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
        "cache-control": "max-age=0",
        "priority": "u=0, i",
        "sec-ch-ua": "\"Google Chrome\";v=\"129\", \"Not=A?Brand\";v=\"8\", \"Chromium\";v=\"129\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"macOS\"",
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "none",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
    }

    def __init__(self):
        self.s = requests.Session()
        retries = Retry(total=5,
                        backoff_factor=0.1,
                        status_forcelist=[500, 502, 503, 504])
        self.s.mount('http://', HTTPAdapter(max_retries=retries))
        self.s.mount('https://', HTTPAdapter(max_retries=retries))

    def get(self, tweet_id, timeout=30):
        """Download one article page and return its raw body.

        Args:
            tweet_id: Path of the article relative to the blog root; it is
                appended to ``url`` to form the request URL.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the notebook indefinitely.

        Returns:
            The response body as ``bytes``.

        Raises:
            requests.HTTPError: If the final response has a 4xx/5xx status, so
                an error page is never stored as an article.
            requests.RequestException: On connection problems or timeouts.
        """
        full_url = f"{self.url}{tweet_id}"

        resp = self.s.get(full_url, headers=self.HEADERS, timeout=timeout)
        resp.raise_for_status()
        return resp.content


def main():
    """Download every article in ``cleaned_ids`` into ``../WQW/all/``.

    Each article is stored under its ID with "/" replaced by "-". Articles
    already on disk are not fetched again, so the cell can be re-run to resume.

    NOTE: this rebinds the name ``main`` that an earlier cell also defines.
    """
    d = Downloader_new()
    outdir = Path('../WQW/all')
    # The target folder is not created anywhere else; make sure it exists.
    outdir.mkdir(parents=True, exist_ok=True)

    for tweet_id in tqdm(cleaned_ids):
        outf_new = outdir / tweet_id.replace("/", "-")
        if not outf_new.exists():
            result = d.get(tweet_id).decode('utf-8')
            # Short pause between requests to stay gentle on the server.
            time.sleep(0.1)
            # Explicit UTF-8: the parsing cell reads these files back with
            # encoding='utf-8', and the platform default is not UTF-8 everywhere.
            with open(outf_new, 'w', encoding='utf-8') as f:
                f.write(result)
        print(outf_new)

main()

In [None]:
# parse information from html using BeautifulSoup

from bs4 import BeautifulSoup

def parse_blog_info(data):
    """Extract the id, date, title and text of one saved article page.

    Args:
        data: HTML source of an article page, as a string.

    Returns:
        A dict with the keys ``"blogid"``, ``"posted_date"``, ``"title"`` and
        ``"content"``. ``blogid`` is the last "-"-separated piece of the
        post-body element's ``id`` attribute; the other three values are the
        stripped text of the corresponding elements.

    Raises:
        ValueError: If the post body, date header or title element is missing
            from the page.
        KeyError: If the post body element has no ``id`` attribute.
    """
    soup = BeautifulSoup(data, 'html.parser')

    def _require(tag, css_class):
        # Look up a mandatory element and fail with a message that names it,
        # rather than an opaque error from using None further down.
        node = soup.find(tag, class_=css_class)
        if node is None:
            raise ValueError(f"page has no <{tag} class='{css_class}'> element")
        return node

    # Located once; it provides both the blog id and the article text.
    body = _require('div', 'post-body entry-content')
    blog_id = body['id'].split('-')[-1]

    posted_date = _require('h2', 'date-header').text.strip()
    title = _require('h3', 'post-title entry-title').text.strip()
    content = body.text.strip()

    tweet = {
        "blogid": blog_id,  # Blog id
        "posted_date": posted_date,  # Article posted time
        "title": title,  # Extracted title
        "content": content  # Extracted content
    }

    return tweet


In [None]:
# Directory containing the downloaded article pages
directory = "../WQW/all/"

# One dict per successfully parsed article
parsed_data = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the resulting DataFrame the same on every run
for filename in tqdm(sorted(os.listdir(directory))):
    file_path = os.path.join(directory, filename)

    # Skip sub-directories, notebook checkpoints and hidden files (for example
    # .DS_Store), none of which are article pages
    if (os.path.isdir(file_path)
            or '.ipynb_checkpoints' in filename
            or filename.startswith('.')):
        continue

    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    try:
        tweet_info = parse_blog_info(content)
    except Exception:
        # Name the offending file before letting the error propagate
        print('failed', filename)
        raise
    parsed_data.append(tweet_info)

# Convert the list of dictionaries into a DataFrame
df = pd.DataFrame(parsed_data)
df

In [None]:
# Persist the parsed posts in two formats: an Excel workbook and a Parquet file
# (index=False leaves the DataFrame index out of both outputs)
df.to_excel("../WQW/all_posts_wqw.xlsx", index=False)
df.to_parquet("../WQW/all_posts_wqw.parquet", index=False)