In [None]:
# This notebook uses the OpenAI chat-completions API (model "gpt-4o") to extract key information from Weibo data
# Kai Jia (https://jiakai.xyz/) has contributed to the code

In [None]:
from openai import OpenAI, AsyncOpenAI
import asyncio
import re
import json
from tqdm.notebook import tqdm
from pathlib import Path

class GPTQuery:
    """Send prompted chat-completion requests and parse every reply as JSON.

    Each request consists of ``system_prompt`` plus one user message made of
    ``prompt``, a blank line and the query text. The reply is expected to
    contain a JSON document, optionally wrapped in a ```json fenced block.
    Malformed JSON first goes through a few local repairs and, as a last
    resort, through a second model call that is asked to fix it.

    Queries and parsed replies are appended to ``log_filename``.
    """

    system_prompt = """You are an expert in the Chinese language and in Chinese politics. Answer all the questions from the user as accurately as possible."""
    log_filename = '../gpt-query-log.txt'
    # chat-completion model used for every request; override on a subclass or
    # instance to switch models
    model = "gpt-4o"

    prompt: str
    client: "AsyncOpenAI"

    # captures everything between the first ```json marker and the last ```
    _json_locator = re.compile(r'```json(.*)```', flags=re.DOTALL)

    # errors json.loads can raise for a str argument
    _json_parse_errors = (ValueError, RecursionError)

    # lazily created instance shared by the whole class, see _get_json_fixer()
    _json_fixer: "GPTQuery" = None
    # per-instance log handle, opened on first write
    _log_file = None

    def __init__(self, prompt: str, system_prompt=None):
        """Store the prompts and create the API client.

        :param prompt: task instructions placed before every query text
        :param system_prompt: optional replacement for the class-level
            ``system_prompt``

        The API key is read from ``../openai-key.txt``.
        """
        self.prompt = prompt
        if system_prompt is not None:
            self.system_prompt = system_prompt
        with open('../openai-key.txt') as fin:
            self.client = AsyncOpenAI(api_key=fin.read().strip())

    def _print_log(self, msg):
        """Append ``msg`` as one line to the log file and flush immediately."""
        if self._log_file is None:
            # the log holds Chinese text, so do not rely on the locale encoding
            self._log_file = open(self.log_filename, 'a', encoding='utf-8')

        self._log_file.write(str(msg))
        self._log_file.write('\n')
        self._log_file.flush()

    def _close_log(self):
        """Close this instance's log file and the shared json fixer's log.

        Safe to call repeatedly; the next ``_print_log`` reopens the file.
        """
        if self._log_file is not None:
            self._log_file.close()
            self._log_file = None

        # the fixer is only ever driven through _query(), so nobody else would
        # close the handle it opened
        fixer = self._json_fixer
        if fixer is not None and fixer is not self:
            fixer._close_log()

    @classmethod
    def _get_json_fixer(cls):
        """chatGPT sometimes outputs invalid json. Use itself to fix its output"""
        if cls._json_fixer is None:
            cls._json_fixer = cls(
                prompt="""The following json is invalid. Your task is to fix it to be a valid json. Your response should include the reason why it is invalid, followed by the corrected json. Do not produce any extra response after the corrected json.""",
                system_prompt="""You are an expert in computer science. Accurately answer the user's requests."""
            )
        return cls._json_fixer

    async def _auto_json_fix(self, jtxt: str) -> dict:
        """Try to recover a JSON value from the malformed reply ``jtxt``.

        Local repairs are tried in order: parse the contents of a bare
        ``` fenced block, parse the text before the first ``` marker, parse
        the text with trailing ``//`` comments removed. If all of them fail,
        the text is sent to the json-fixer instance.

        :return: the parsed JSON value
        """
        self._print_log(f'!!!! use json fixer: {jtxt}')

        start_m = jtxt.find('```')
        if start_m != -1:
            end_m = jtxt.find('```', start_m + 3)
            if end_m != -1:
                # ChatGPT occasionally misses the json format marker
                try:
                    return json.loads(jtxt[start_m+3:end_m])
                except self._json_parse_errors:
                    pass

            # if there are multiple jsons, use the first one
            try:
                return json.loads(jtxt[:start_m])
            except self._json_parse_errors:
                pass

        if '//' in jtxt:
            # remove comments and try again
            lines = jtxt.split('\n')
            for i, j in enumerate(lines):
                # a quote after the "//" means it may sit inside a string
                # value (e.g. a URL), so such lines are left untouched
                if (cmt_m := j.find('//')) != -1 and '"' not in j[cmt_m:]:
                    lines[i] = j[:cmt_m]
            try:
                return json.loads('\n'.join(lines))
            except self._json_parse_errors:
                pass

        fixer = self._get_json_fixer()
        assert self is not fixer  # avoid infinite recursion
        return await fixer._query(jtxt)

    async def _query(self, query: str) -> dict:
        """Send one query and return the parsed JSON reply.

        The raw reply is stored under ``'original_resp'`` whenever it was not
        exactly one ```json block that parsed on the first attempt.

        :raises RuntimeError: if the reply cannot be processed; the message
            contains the query and the raw reply, the cause is chained
        """
        resp = await self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": f"{self.prompt}\n\n{query}"}
            ]
        )
        msg = resp.choices[0].message
        try:
            assert msg.role == 'assistant' and msg.function_call is None and msg.tool_calls is None
            self._print_log('============')
            self._print_log(query)
            txt = msg.content
            jtxt_m = self._json_locator.search(txt)
            if jtxt_m is None:
                jtxt = txt  # gpt sometimes does not wrap the result in json blocks
            else:
                jtxt = jtxt_m.group(1)
            try:
                ret = json.loads(jtxt)
                json_succ = True
            except self._json_parse_errors:
                json_succ = False
                ret = await self._auto_json_fix(jtxt)

            self._print_log(json.dumps(ret, ensure_ascii=False, indent=2))
            if jtxt_m is None or jtxt_m.group(0) != txt or not json_succ:
                ret['original_resp'] = txt
        except Exception as exc:
            raise RuntimeError(f'Query: {query}\nResp: {msg}') from exc
        return ret

    async def query(self, query: str) -> dict:
        """query a single input, asynchronously"""
        try:
            return await self._query(query)
        finally:
            self._close_log()

    async def batch_query(self, concurrency: int, result_file: Path, queries: dict[str, str]):
        """Query multiple inputs concurrently and asynchronously.

        :param concurrency: number of concurrent queries allowed
        :param result_file: json file holding the results (a ``str`` path is
            accepted too). Existing content is loaded first and ids found in
            it are not queried again. The file is rewritten whenever the
            number of stored results is a multiple of ``concurrency`` and once
            more when the batch ends; the previous version is kept next to it
            with the suffix ``.json.bak``.
        :param queries: a dict mapping query id to query text; ids are
            converted to ``str``

        If a query fails, the queries still in flight are cancelled, the
        results collected so far are saved and the error is re-raised.
        """

        if not isinstance(result_file, Path):
            result_file = Path(result_file)

        if result_file.exists():
            with result_file.open(encoding='utf-8') as fin:
                result = json.load(fin)
                print(f'Loaded {len(result)} results')
        else:
            result = {}

        def save_result(force=False):
            if (not force) and len(result) % concurrency:
                # only save when we get a new batch of results to speed up
                return
            if result_file.exists():
                # replace() also overwrites an older backup on Windows, where
                # rename() raises if the target exists
                result_file.replace(result_file.with_suffix('.json.bak'))
            with result_file.open('w', encoding='utf-8') as fout:
                json.dump(result, fout, ensure_ascii=False, indent=2)
            self._print_log(f'******* saved {len(result)} results')

        async def one_task(qid):
            qres = await self._query(queries[qid])
            result[qid] = qres
            save_result()

        queries = {str(k): v for k, v in queries.items()}
        todo = [k for k in queries if k not in result]
        # coroutines are created lazily so that none is left un-awaited when
        # the batch stops early
        runner = self._limit_concurrency((one_task(k) for k in todo), concurrency)

        try:
            with tqdm(total=len(queries)) as pbar:
                # count only cached results that belong to this set of queries
                pbar.update(len(queries) - len(todo))
                async for i in runner:
                    await i
                    pbar.update(1)
        finally:
            try:
                # stop in-flight queries first so that nothing writes to the
                # result file or the log after this call has returned
                await runner.aclose()
            finally:
                save_result(True)
                self._close_log()

    @classmethod
    async def _limit_concurrency(cls, aws, limit):
        """Run awaitables with limited concurrency.

        At most ``limit`` awaitables taken from ``aws`` are scheduled at any
        time; each finished future is yielded so the consumer can ``await``
        it for its result or exception. When the generator is closed early,
        futures that are still running are cancelled and awaited, and the
        rest of ``aws`` is left unconsumed.
        """
        # see https://death.andgravity.com/limit-concurrency#asyncio-wait
        aws = iter(aws)
        aws_ended = False
        pending = set()
        done = set()

        try:
            while pending or not aws_ended:
                while len(pending) < limit and not aws_ended:
                    try:
                        aw = next(aws)
                    except StopIteration:
                        aws_ended = True
                    else:
                        pending.add(asyncio.ensure_future(aw))

                if not pending:
                    return

                done, pending = await asyncio.wait(
                    pending, return_when=asyncio.FIRST_COMPLETED
                )
                while done:
                    yield done.pop()
        finally:
            # both sets are empty after a normal run
            leftovers = pending | done
            for fut in pending:
                fut.cancel()
            if leftovers:
                # wait for the cancellations to take effect and consume the
                # outcome of finished futures that were never yielded
                await asyncio.gather(*leftovers, return_exceptions=True)


In [None]:
gpt_query = GPTQuery(
    """You will be provided with a tweet from Weibo in Chinese. The tweet will likely involve narratives of bullying, harassment, or threats by thugs, hooligans, ruffians, gangs, or unorganized stragglers hired by someone else.

Your tasks are as follows:

1. **Relevance Check:** Determine whether the tweet is relevant to the type of narratives described above. If relevant, continue with the following tasks; otherwise, skip the following information extraction tasks and mark "relevance" as "0".
2. **Information Extraction:** Extract and summarize the key information from the tweet:
   - **Perpetrators:** Identify and summarize the identities of the violence perpetrators (e.g., names, organizations, occupations).
   - **Victims:** Identify and summarize the identities of the victims (e.g., names, organizations, occupations).
   - **Relationships:** Describe the relationships between the perpetrators and the victims.
   - **Issue Areas:** Identify the areas of dispute (e.g., commerce, finance, employment).
   - **Locations:** Specify the locations mentioned (e.g., province, city, specific places) and categorize them as rural or urban.
   - **Degrees of Violence:** Describe the level of violence (e.g., damage, casualties, deaths).

3. **Response Format:** Provide your response in well-formed JSON with the following keys:
   - `"reason"`: A string summarizing your reasoning about the relevance and elements involved.
   - `"relevance"`: An integer (`1` for relevant, `0` for irrelevant, `-1` if the text is too short or incomplete for evaluation).
   - `"identities of violence perpetrators"`: A string extracted or interpreted from the text.
   - `"identities of victims"`: A string extracted or interpreted from the text.
   - `"relationships between perpetrators and victims"`: A string extracted or interpreted from the text.
   - `"dispute issues"`: A string extracted or interpreted from the text.
   - `"locations"`: A string extracted or interpreted from the text.
   - `"rural or urban"`: A string (`农村` for rural, `城市` for urban, 'NA' if hard to tell).
   - `"degrees of violence"`: A string extracted or interpreted from the text.

4. **Special Cases:**
   - If the text is too short or incomplete, mark `"relevance"` as `-1`.
   - If the event described occurs outside of China, mark `"relevance"` as `0`.

**IMPORTANT:** The provided text is already the complete post; do not request additional text. Your response must be in Chinese.

Below is the text for analysis:
""")

import pandas as pd
df = pd.read_excel('tweets_pilot.xlsx')

queries = {row['mblogid']: row['content_long'] for _, row in df.iterrows()}

await gpt_query.batch_query(32, 'extracted_info.json', queries)

In [None]:
gpt_query = GPTQuery(
    """You will be provided with a tweet from Weibo in Chinese language. The tweet indicates an incident of harassment or violence conducted by thugs or hooligans on behalf of some government or private entities.

Your tasks are to determine the categories the tweet belongs that best describe the characteristics of the incident. Provide your response in well-formed JSON with the following keys:
   - `"relevance"`: Determine whether the tweet describes a real incident of harassment or violence conducted by thugs or hooligans on behalf of some government or private entities. The response should be an integer: `1` for relevant, `0` for irrelevant, fiction, or occurs in countries outside China, `-1` if the text is too short for evaluation.   
   - `"dispute areas"`: Determine which one of the following areas best describes the dispute issue within this tweet. Your response should be an integer from "1" to "6" that indicates the best-matching category. If the dispute issue is unspecified, your response should be "-1". If the given tweet cannot be categoried into any of the 6 categories, your response should be "99".
   - `" perpetrators"`: Determine which one of the following categories best describes the identities of violence perpetrator. 1. Government officials or local authorities; 2. Businesses and Corporations; 3. Other public sectors (e.g., schools, hospitals). Your response should be an integer from "1" to "3" that indicates the best-matching category. If the perpetrator is unidentified individuals or groups, your response should be "-1". If the given tweet cannot be categoried into any of the 3 categories, your response should be "99".
   - `"relationships between perpetrators and victims"`: Determine which one of the following categories best describes the relationship between violence perpetrator and victim. 1. Government officials vs. citizens; 2. Employment relationships; 3. Commercial relations (e.g., business owners vs. customers, business rivalries); 4. Real estate developers vs. residences; 5. Creditors vs. Debtors. Your response should be an integer from "1" to "5" that indicates the best-matching category. If the relationship is not specified, your response should be "-1". If the given tweet cannot be categoried into any of the 5 categories, your response should be "99".
   - `"degrees of violence"`: Determine which of following categories of violence are involved in this incident. 1. Verbal and psychological intimidation; 2. Property damage; 3. Freedom restriction (e.g., stalking, illegal detention); 4. Physical assult and bodily harm; 5. Property destruction; 6. Life threatening actions and deaths. Your response should be a list that covers all kinds of violence involved. If the degree of violence is not specified, your response should be "-1". 

Below is the text for analysis:""")

import pandas as pd
df = pd.read_parquet('../pilot_data_for_gpt.parquet')

queries = {row['mblogid']: row['content_long'] for _, row in df.iterrows()}

await gpt_query.batch_query(32, 'extracted_info_pilot.json', queries)


In [None]:
gpt_query = GPTQuery(
    """You will be provided with a tweet in Chinese. Your task is to determine whether the tweet describes a real-world, violent incident conducted by thugs or hooligans on behalf of some entities or individuals. The incident must happen in China. 
Provide your response in well-formed JSON with two keys:
- “reason”: Your reasoning for the task.
- “relevance”:  Your response should be an integer: `1` for relevant, `0` for irrelevant. If the text is a narrative that describes a specific violent incident with clear violent perpetrators and dispute issues, mark it as `1`. If the text describes a fiction, occurs outside China, or is commentary rather than the description of an incident, mark it as `0`.
Below is the text for analysis:""")

import pandas as pd
df = pd.read_pickle('../for_gpt_cleaning.pkl')

queries = {row['mblogid']: row['content_clean'] for _, row in df.iterrows()}

await gpt_query.batch_query(32, 'gpt_cleaned_data.json', queries)




In [None]:
# 1st layer of classification
gpt_query = GPTQuery(
    """You will be given a Weibo post in Chinese that describes an incident involving harassment or violence by thugs on behalf of certain individuals or entities.

Your task is to analyze the causes of violence and categorize the post into one of the following areas:
1. Land or housing (e.g., land seizure, forced demolition, residents protests, unfinished housing projects)
2. Financial (e.g., fraud, debt, business competition)
3. Employment (e.g., unpaid wages, workplace safety)
4. Political, policing, or legal (e.g., local election disputes, corruption, police violence)
5. COVID-19 (e.g., mentions of "疫情", "口罩", "封城" or similar words)
6. Personal conflicts
7. Other (if none of the above apply)

Provide the response in well-formed JSON format with two keys:
- “reason”: Your reasoning for the task.
- "dispute_area": Return an integer from 1 to 7 that indicates the best-matching category. Your answer must be based on the available information from the post. DO NOT speculate. Return -1 if the disputed issue is unspecified in the text. 

Text for analysis:""")

import pandas as pd
df = pd.read_pickle('../for_gpt_classification.pkl')

queries = {row['mblogid']: row['content_clean'] for _, row in df.iterrows()}

await gpt_query.batch_query(32, 'classification_1st_layer.json', queries)


In [None]:
# 2nd layer of classification -- Class 1: Land or Housing 
gpt_query = GPTQuery("""You will be given a Weibo post in Chinese describing an incident involving harassment or violence by thugs on behalf of certain individuals or entities related to land or housing issues.

Your tasks are as follows:
Categorize the specific issue in the post into one of these categories:
1. Land seizure or forced demolition of houses (e.g., mentions of "征地" or "拆迁")
2. Residents' living rights (e.g., mentions of "业主"， "居民")
3. New or unfinished housing projects (e.g., mentions of “新楼盘”, "烂尾楼")
4. Others (if none of the above apply)

Identify the perpetrator of the violence in the incident from the following categories:
1. Government officials or local authorities
2. Real estate developers or construction companies
3. Property management companies
4. Other organizations
5. Individuals

Your answer must be based on available information from the post. DO NOT speculate. 

Provide the response in well-formed JSON format with two keys:
- "reason": Your reasoning for the tasks.
- "specific_issue": Return an integer from 1 to 5 representing the best-matching category. Return -1 if the specific issue is unclear from the text. 
- "perpetrator": Return an integer from 1 to 5 representing the best-matching category. Return -1 if the specific issue is unclear from the text. 

Text for analysis:""")

import pandas as pd
df = pd.read_pickle('../for_gpt_class1.pkl')

queries = {row['mblogid']: row['content_clean'] for _, row in df.iterrows()}

await gpt_query.batch_query(32, 'class_1_issues.json', queries)

In [None]:
# 2nd layer of classification -- Class 2: Financial
gpt_query = GPTQuery("""You will be given a Weibo post in Chinese describing an incident involving harassment or violence by thugs on behalf of certain individuals or entities related to financial disputes.

Your tasks are as follows:
Categorize the specific issue in the post into one of these categories:
1. Fraud
2. Debt
3. Business competition
4. Others (if none of the above apply)

Identify who hire thugs, i.e., the perpetrator of the violence in the incident from the following categories. 
1. Government officials or local authorities
2. Banks or other financial institutes
3. Private companies 
4. Other organizations
5. Individuals

Your answer must be based on available information from the post. DO NOT speculate. 

Provide the response in well-formed JSON format with two keys:
- "reason": Your reasoning for the tasks.
- "specific_issue": Return an integer from 1 to 5 representing the best-matching category. Return -1 if the specific issue is unclear from the text. 
- "perpetrator": Return an integer from 1 to 5 representing the best-matching category. Return -1 if the specific issue is unclear from the text. 

Text for analysis:""")

import pandas as pd
df = pd.read_pickle('../all_tweets_class2.pkl')

queries = {row['mblogid']: row['content_clean'] for _, row in df.iterrows()}

await gpt_query.batch_query(32, 'class_2_issues.json', queries)

In [None]:
# 2nd layer of classification -- Class 4: Political, policing, legal
gpt_query = GPTQuery("""You will be given a Weibo post in Chinese describing an incident involving harassment or violence by thugs on behalf of certain individuals or entities related to political, policing, or legal issues.

Your tasks are as follows:

Categorize the specific issue in the post into one of these categories:
1. Official corruption  
2. Village election 
3. Policing
4. Legal cases
5. Others (if none of the above apply)

Identify who hire thugs, i.e., the perpetrator of the violence in the incident from the following categories. 
1. Government officials
2. Village authorities 
3. Polices and auxiliary polices (e.g., mentions of “协管”, "城管", "协警", "执法")
4. Legal officials
5. Others (if none of the above apply) 

Your answer must be based on available information from the post. DO NOT speculate. 

Provide the response in well-formed JSON format with two keys:
- "reason": Your reasoning for the tasks.
- "specific_issue": Return an integer from 1 to 5 representing the best-matching category. Return -1 if the specific issue is unclear in the text. 
- "perpetrator": Return an integer from 1 to 5 representing the best-matching category. Return -1 if the perpetrator is specified in the text. 

Text for analysis:""")

import pandas as pd
df = pd.read_pickle('../all_tweets_class4.pkl')

queries = {row['mblogid']: row['content_clean'] for _, row in df.iterrows()}

await gpt_query.batch_query(32, 'class_4_issues.json', queries)

In [None]:
gpt_query = GPTQuery("""You will be given a Weibo post in Chinese that describes an incident involving harassment or violence by thugs on behalf of certain individuals or entities.
    
Your third task is to identify whether the incident happens in the rural or urban area:
1. Rurual
2. Urban

Your fourth task is to determine which types of following violence are involved in this incident. 
1. Verbal intimidation 
2. Property damage
3. Freedom restriction (e.g., stalking, illegal detention)
4. Physical assult and harm
5. Life threatening actions/deaths

Your answer must be based on available information from the post. DO NOT speculate. 

Provide the response in well-formed JSON format with two keys:
- "reason": Your reasoning for the tasks.
- "region": Return an integer from 1 to 2 that indicates the best-matching category. Return -1 if the location is unclear from the text. 
- "types_of_violence": Your response should be a list that covers all kinds of violence involved. If the degree of violence is not specified, your response should be "-1". 

Text for analysis:""")

df = pd.read_pickle('../all_tweets_cleaned_final.pkl')

queries = {row['mblogid']: row['content_clean'] for _, row in df.iterrows()}

await gpt_query.batch_query(32, 'region&violencetypes.json', queries)

In [None]:
import pandas as pd

gpt_query = GPTQuery("""You will be given a Weibo post in Chinese that describes an incident involving harassment or violence by thugs on behalf of certain individuals or entities. Your task is to identify in which province of China this incident happened.

Your answer must be based on available information from the post. DO NOT speculate. 

Provide the response in well-formed JSON format with the key 'province'. Your response should be a string. If the location of this incident is not specified, your response should be "-1". 

Text for analysis:""")

df = pd.read_pickle('../all_tweets_cleaned_final.pkl')

queries = {row['mblogid']: row['content_clean'] for _, row in df.iterrows()}

await gpt_query.batch_query(32, 'province.json', queries)