In [None]:
from openai import OpenAI, AsyncOpenAI
import asyncio
import re
import json
from tqdm.notebook import tqdm
from pathlib import Path

class GPTQuery:
    """Send a fixed prompt plus a per-item query to the OpenAI chat API and parse the JSON reply.

    Every request is made of ``system_prompt`` (system message) and
    ``prompt`` followed by the query text (user message). The reply is expected
    to contain JSON, optionally wrapped in a fenced ``json`` block; several
    local repairs and, as a last resort, a second GPT call are used when the
    JSON does not parse. Queries and parsed results are appended to
    ``log_filename``.
    """

    system_prompt = """You are an expert in the Chinese language and in the modern Chinese freight industry. Answer all the questions from the user as accurately as possible."""
    log_filename = '../gpt-query-log.txt'

    # task description that is prepended to every query
    prompt: str
    # quoted so that the class body does not need the name at definition time
    client: "AsyncOpenAI"

    # greedy on purpose: captures from the first ```json marker to the last ``` marker
    _json_locator = re.compile(r'```json(.*)```', flags=re.DOTALL)

    # lazily created instance (shared through the class) used to repair invalid json
    _json_fixer: "GPTQuery" = None
    # lazily opened handle on ``log_filename``; None while closed
    _log_file = None

    def __init__(self, prompt: str, system_prompt=None):
        """Create a query object.

        :param prompt: task description prepended to every query
        :param system_prompt: optional replacement for the class-level system prompt
        """
        self.prompt = prompt
        if system_prompt is not None:
            self.system_prompt = system_prompt
        with open('../Document/openai-key.txt') as fin:
            self.client = AsyncOpenAI(api_key=fin.read().strip())

    def _print_log(self, msg):
        """Append ``str(msg)`` and a newline to the log file, opening it on first use."""
        if self._log_file is None:
            # explicit encoding: queries and responses contain Chinese text, which
            # the platform default encoding may not be able to represent
            self._log_file = open(self.log_filename, 'a', encoding='utf-8')

        self._log_file.write(str(msg))
        self._log_file.write('\n')
        self._log_file.flush()

    def _close_log(self):
        """Close the log file if it is open; safe to call repeatedly.

        The log handle of the shared json fixer (if one was created) is closed
        as well, because nothing else ever closes it.
        """
        if self._log_file is not None:
            self._log_file.close()
            self._log_file = None

        fixer = type(self)._json_fixer
        if fixer is not None and fixer is not self:
            fixer._close_log()

    @classmethod
    def _get_json_fixer(cls):
        """chatGPT sometimes outputs invalid json. Use itself to fix its output"""
        if cls._json_fixer is None:
            cls._json_fixer = cls(
                prompt="""The following json is invalid. Your task is to fix it to be a valid json. Your response should include the reason why it is invalid, followed by the corrected json. Do not produce any extra response after the corrected json.""",
                system_prompt="""You are an expert in computer science. Accurately answer the user's requests."""
            )
        return cls._json_fixer

    async def _auto_json_fix(self, jtxt: str) -> dict:
        """automatically try to fix the json response from GPT

        Local repairs are tried first, in this order: parse the text between the
        first pair of ``` markers; parse the text before the first ``` marker;
        strip ``//`` comments and parse again. If none of them works, the text is
        sent to the json fixer.

        :param jtxt: text that ``json.loads`` rejected
        :return: the parsed json value
        :raises RuntimeError: if this object is itself the json fixer
        """
        self._print_log(f'!!!! use json fixer: {jtxt}')

        start_m = jtxt.find('```')
        if start_m != -1:
            end_m = jtxt.find('```', start_m + 3)
            if end_m != -1:
                # ChatGPT occasionally misses the json format marker
                try:
                    return json.loads(jtxt[start_m+3:end_m])
                except ValueError:
                    pass

            # if there are multiple jsons, use the first one
            try:
                return json.loads(jtxt[:start_m])
            except ValueError:
                pass

        if '//' in jtxt:
            # remove comments and try again
            lines = jtxt.split('\n')
            for i, j in enumerate(lines):
                # a quote after the `//` suggests it sits inside a string (e.g. a URL)
                if (cmt_m := j.find('//')) != -1 and '"' not in j[cmt_m:]:
                    lines[i] = j[:cmt_m]
            try:
                return json.loads('\n'.join(lines))
            except ValueError:
                pass

        fixer = self._get_json_fixer()
        if self is fixer:
            # avoid infinite recursion; an explicit raise still works under `python -O`
            raise RuntimeError('the json fixer itself produced invalid json')
        return await fixer._query(jtxt)

    async def _query(self, query: str) -> dict:
        """Send one query and return the parsed json value of the reply.

        When the reply is a dict and it was not exactly one clean fenced json
        block (missing fence, extra text around it, or it needed fixing), the
        raw reply text is stored under the ``original_resp`` key. Replies that
        parse to something else (e.g. a list) are returned unchanged and the raw
        text is only written to the log.

        :param query: text appended to ``self.prompt`` in the user message
        :raises RuntimeError: if the reply cannot be validated or parsed; the
            message contains the query and the raw reply message
        """
        resp = await self.client.chat.completions.create(
            model="gpt-4-turbo-preview",
            messages=[
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": f"{self.prompt}\n\n{query}"}
            ]
        )
        msg = resp.choices[0].message
        try:
            if not (msg.role == 'assistant' and msg.function_call is None
                    and msg.tool_calls is None):
                raise ValueError('expected a plain assistant message')
            self._print_log('============')
            self._print_log(query)
            txt = msg.content
            jtxt_m = self._json_locator.search(txt)
            if jtxt_m is None:
                jtxt = txt  # gpt sometimes does not wrap the result in json blocks
            else:
                jtxt = jtxt_m.group(1)
            try:
                ret = json.loads(jtxt)
                json_succ = True
            except ValueError:
                json_succ = False
                ret = await self._auto_json_fix(jtxt)

            self._print_log(json.dumps(ret, ensure_ascii=False, indent=2))
            if jtxt_m is None or jtxt_m.group(0) != txt or not json_succ:
                if isinstance(ret, dict):
                    ret['original_resp'] = txt
                else:
                    # a list (or scalar) has no place for the extra key
                    self._print_log(f'!!!! original response of a non-dict result: {txt}')
        except Exception as exc:
            raise RuntimeError(f'Query: {query}\nResp: {msg}') from exc
        return ret

    async def query(self, query: str) -> dict:
        """query a single input, asynchronously"""
        try:
            return await self._query(query)
        finally:
            self._close_log()

    async def batch_query(self, concurrency: int, result_file: Path, queries: dict[str, str]):
        """query multiple inputs concurrently and asynchrously.

        Result and backup files are read and written as UTF-8.

        :param concurrency: number of concurrent queries allowed
        :param result_file: json file to save the results; when a new result arrives, it will be saved immediately. The old result will be read
        :param queries: a dict of the queries
        """

        if not isinstance(result_file, Path):
            result_file = Path(result_file)

        if result_file.exists():
            with result_file.open(encoding='utf-8') as fin:
                result = json.load(fin)
            print(f'Loaded {len(result)} results')
        else:
            result = {}

        def save_result(force=False):
            if (not force) and len(result) % concurrency:
                # only save when we get a new batch of results to speed up
                return
            if result_file.exists():
                # replace() overwrites an older backup on every platform
                result_file.replace(result_file.with_suffix('.json.bak'))
            with result_file.open('w', encoding='utf-8') as fout:
                json.dump(result, fout, ensure_ascii=False, indent=2)
            self._print_log(f'******* saved {len(result)} results')

        async def one_task(qid):
            qres = await self._query(queries[qid])
            result[qid] = qres
            save_result()

        # json object keys are always strings, so compare on the string form
        queries = {str(k): v for k, v in queries.items()}
        # Lazy on purpose: a coroutine is only created when a slot is free, so an
        # early failure does not leave never-awaited coroutine objects behind.
        tasks = (one_task(k) for k in queries if k not in result)
        # results loaded from the file that belong to other query sets do not count
        already_done = sum(1 for k in queries if k in result)

        try:
            with tqdm(total=len(queries)) as pbar:
                pbar.update(already_done)
                async for i in self._limit_concurrency(tasks, concurrency):
                    await i  # re-raises the exception of a failed task
                    pbar.update(1)
        finally:
            save_result(True)
            self._close_log()

    @classmethod
    async def _limit_concurrency(cls, aws, limit):
        """run awaitables with limited concurrency

        Async generator that yields each finished future; awaiting a yielded
        future returns its result or raises its exception.

        :param aws: iterable of awaitables; consumed lazily
        :param limit: maximum number of awaitables scheduled at the same time
        """
        # see https://death.andgravity.com/limit-concurrency#asyncio-wait
        aws = iter(aws)
        aws_ended = False
        pending = set()

        while pending or not aws_ended:
            # top up the pending set until the limit is reached or the input runs out
            while len(pending) < limit and not aws_ended:
                try:
                    aw = next(aws)
                except StopIteration:
                    aws_ended = True
                else:
                    pending.add(asyncio.ensure_future(aw))

            if not pending:
                return

            done, pending = await asyncio.wait(
                pending, return_when=asyncio.FIRST_COMPLETED
            )
            while done:
                yield done.pop()


In [None]:
# Query object for the first pass over individual posts: the prompt asks for a
# topic summary and a 1-7 sentiment score, returned as JSON with the keys
# "reason", "topic" and "result". The post text is appended by GPTQuery.
gpt_query = GPTQuery(
    """You will be given a social media post in Chinese language. These texts involve truck drivers’ experiences or opinions about freight transportation platforms. Possible freight platform names include yunmanman(运满满)、huochebang (货车帮)、huolala (货拉拉)、and manbang (满帮). Please keep in mind that abbreviations, homophones, symbols, and placeholders may also be used to avoid direct mentioning of platform names. 
Based on this text, You need to complete two tasks:

1.	Summarize the main topic of the text;
2.	On a scale of 1-7, evaluate how positive or negative are truck drivers’ sentiments towards these platforms? A score of 1 means "very negative". A score of 7 means "very positive". 

IMPORTANT: If the text is too short or seems incomplete for a useful evaluation, then use 4 as the score. If you can not evaluate the sentiment, provide the reason and use 4 as the score. The text provided later is already the complete post. NEVER ask for more text.

Your response should be in English.

Your response must be in well-formed JSON format, with three keys as decribed below:

- "reason": a string that summarizes your reasoning about the topic and the sentiment score; 
- "topic": a string that summarizes the main topic of the text;
- "result": an integer, the sentiment score of the text

Below is the text for analysis:""")

import pandas as pd
df = pd.read_parquet('../processed_data/clean_data_final.parquet')

# Map each post id ('tid') to its text ('fulltext'). The two columns are paired
# directly: iterrows() would build a Series per row only to read two fields.
queries = dict(zip(df['tid'], df['fulltext']))

# The run itself is kept disabled; uncomment to send the queries.
# await gpt_query.batch_query(32, '../sentiment_and_topic.json', queries)

In [None]:
# Query object for grouping summaries into topics: the prompt asks for 5 to 10
# topics, returned as a JSON list of dicts with the keys "explanation", "topic"
# and "indexes". Note that the reply is a list, not a dict.
gpt_query = GPTQuery(
    """You will be given a corpus that contains multiple sentences with indexes. The sentences are separated by line breaks. These sentences are automatically-generated summarizations of social media posts involving truck drivers' experiences or opinions about freight transportation platforms. Possible freight platform names include yunmanman(运满满)、huochebang (货车帮)、huolala (货拉拉)、and manbang (满帮). The text may be a mixture of English or Chinese.

Your task is to categorize them based on their content, focusing on recurring themes or issues mentioned. Please identify 5 to 10 common topics from these sentences. The topics should be specific. Each sentence should have a corresponding topic. 

Your response should be in English.

Your response must be in well-formed JSON format that is a list containing multiple dicts, where each dict corresponds to one topic you identified. Each dict should have three keys as described below:

- "explanation": a string that explains the topic and your reasoning for giving the topic;
- "topic": a string shows the name of the topic; 
- "indexes": a list that contains the indexes of 5 sentences that best illustrate this topic.

Below is the text for analysis:""")

import pandas as pd
# Load the grouped corpus; the cell below reads its 'group' and 'corpus' columns.
# NOTE(review): read_pickle executes pickle data - only load files produced by
# this project.
df = pd.read_pickle('../processed_data/grouped_corpus.pkl')

def make_query(group):
    """Render a mapping as one ``key: value`` line per entry, ordered by sorted items."""
    numbered_lines = []
    for index, sentence in sorted(group.items()):
        numbered_lines.append(f'{index}: {sentence}')
    return '\n'.join(numbered_lines)

# One query per group: the group's corpus rendered as indexed lines. The two
# columns are paired directly: iterrows() would build a Series per row only to
# read two fields.
queries = {
    group: make_query(corpus)
    for group, corpus in zip(df['group'], df['corpus'])
}

# The run itself is kept disabled; uncomment to send the queries.
# await gpt_query.batch_query(8, '../topic_group.json', queries)

In [None]:
# Query object for the first round of theme categorization: the prompt lists
# eight themes and asks for a relevance score per theme plus the best-matching
# theme number, returned as JSON with the keys "explanation", "scores" and
# "theme" (-1 and an empty score list when the text cannot be analysed).
gpt_query = GPTQuery(
"""You will be given a social media post in Chinese language. The topic of the text is about freight transportation platforms. Possible freight platform names include yunmanman(运满满), huochebang (货车帮), huolala (货拉拉), and manbang (满帮). Abbreviations, homophones, symbols, and placeholders may also be used to avoid directly mentioning platform names. 

Here is the list of themes to which the given text may belong. The theme's definition is given following the colon. 

1. Financial Considerations: This theme encompasses discussions about the financial aspects of freight platforms, including platform fees, pricing and rates, payment, earnings, profitability, and other financial considerations of truck drivers.

2. Fraud, Scams, Privacy, and Security: This theme addresses fraudulent activities, scams, and discussions about data privacy and information security on freight platforms.

3. Regulatory and Legal Prospects: This theme involves discussions on the government’s regulatory practices and calls for regulatory oversight, compliance enforcement, and government intervention in the freight transportation industry.

4. Community Engagement and Solidarity: This theme involves community and solidarity among truck drivers, including sharing experiences, advice, and mutual support among truck drivers, as well as collective actions and boycotts against platforms.

5. Vehicle Selection and Purchase Strategies: This theme focuses on discussions and advice on selecting and purchasing vehicles for freight platforms.

6. Inquiries and Advice for Platform Usage: This theme involves inquiries and advice regarding the practical and operational aspects of working with freight platforms, including strategies for navigating platform policies and solving technical problems.

7. Evaluation of Platform Service Quality: This theme includes truck drivers' experiences working with freight transportation platforms and their evaluations of different platforms, focusing on customer support, service quality, and overall satisfaction.

8. General Discussions of Freight Industry and Job Market: This theme involves discussions on the overall situations of the freight transportation industry and the job market for truck drivers, including technological adoptions, market competition, and challenges for job seekers, etc.

You have two tasks. Your first task is to evaluate the relevance scores of the eight themes that describe how well each theme matches the given text. Each score should be an integer between 0 and 9. You should provide eight scores corresponding to each theme. Your second task is to determine which one of the above eight themes BEST describes the topic of the given text. You should respond by providing the number corresponding to the best-matching theme. 

The provided text is always complete. NEVER ask for more text. If the text is too short or incomplete for your analysis, please use -1 as the best matching theme's number and an empty list as the relevance score.

Your response must be in well-formed JSON format that is a dictionary containing three keys described below:
- "explanation": a string that explains your understanding of the given text's content and your reasoning of giving the relevance scores.
- "scores": a list of integers as the relevance scores of the themes.
- "theme": an integer that represents the theme the given text belongs to.

Make sure that your response is well-formated. Avoid the mistake of including comments in the scores list in the JSON.

Below is the text for analysis:""")


import pandas as pd
df = pd.read_parquet('../processed_data/with_gpt_results.parquet')

# Map each post id ('tid') to its text ('fulltext'). The two columns are paired
# directly: iterrows() would build a Series per row only to read two fields.
queries = dict(zip(df['tid'], df['fulltext']))

# The run itself is kept disabled; uncomment to send the queries.
# await gpt_query.batch_query(32, '../theme_categorization.json', queries)

In [None]:
# Generate sub-themes
# All sub-theme prompts live in one text file, separated by a rule of dashes.
with open('../prompt.txt', 'r') as file:
    prompt = file.read()
prompts = [
    section.strip().replace('\xa0', '')  # drop non-breaking spaces
    for section in prompt.split('--------------------')
]
# exactly one prompt per theme is expected
assert len(prompts) == 8

In [None]:
import pickle
# Load the per-theme chunk tables; the loop below indexes `data` by theme
# position and calls .iterrows() on each entry.
# NOTE(review): pickle.load executes code embedded in the file - only load
# files produced by this project.
with open('theme_with_chunks.pkl', 'rb') as fin:
    data = pickle.load(fin)
# one entry per sub-theme prompt
assert len(data) == len(prompts)

In [None]:
for theme_id in range(len(prompts)):
    print(f'working on {theme_id} ...')
    queries = {}
    for _, row in data[theme_id].iterrows():
        assert row['theme'] == theme_id + 1
        queries[f"{row['theme']}.{row['chunk_no']}"] = make_query(row['corpus'])
    gpt_query = GPTQuery(prompts[theme_id])
    await gpt_query.batch_query(32, f'../subthemes-{theme_id+1}.json', queries)

In [None]:
# Fine tune themes
#
# Query object for the second round of theme categorization: the prompt lists
# seven revised themes and asks for a relevance score per theme plus the
# best-matching theme number, returned as JSON with the keys "explanation",
# "scores" and "theme" (-1 and an empty score list when no theme can be assigned).

gpt_query = GPTQuery(
"""You will be given a social media post in Chinese language. The topic of the text is about freight transportation platforms. Possible freight platform names include yunmanman(运满满), huochebang (货车帮), huolala (货拉拉), and manbang (满帮). Abbreviations, homophones, symbols, and placeholders may also be used to avoid directly mentioning platform names. 

Here is the list of themes to which the given text may belong. The theme's definition is given following the colon. 

1. Vehicle selection and platform evaluation: This theme focuses on inquiries and discussions on selecting and purchasing vehicles and on the economic viability of different freight platforms, showing truck drivers' interest in entering the market through platforms. 

2. Shipping prices and Profitability on Platforms: This theme focuses on discussions on shipping prices and freight order availabilities on platforms, possible causes for low shipping prices and market downturn, and their impacts on truck drivers' earnings and livelihood.

3. Platform Fees and Service Issues: This theme focuses on platform payment issues (fees, charges, refund, etc.), strategies for nagivating platform features, and truck drivers' experience interacting with platform customer service.

4. Fraudulence, disputes, and platform mediation: This theme focuses on truck drivers' experience of fraudulence, scams, delayed payment, or nonpayment issues by freight owners and platforms' practices in resolving disputes and complaints.

5. Regulation and Governance: This theme involves truck drivers' discussions on state activities or policies in regulating freight platforms and comments on the effectiveness of government intervention. 

6. Community and Solidarity: This theme highlights the importance of community and solidarity among truck drivers, including sharing experiences, advice, and mutual support among truck drivers, as well as collective actions and boycotts against platforms.

7. General Discussions on Industry and Platforms: This theme involves discussions on the overall situations of the freight transportation industry (such as technological changes, market competition, and challenges for job seekers) and other general discussions on platforms.

You have two tasks. Your first task is to evaluate the relevance scores of the seven themes that describe how well each theme matches the given text. Each score should be an integer between 0 and 9. You should provide seven scores corresponding to each theme. Your second task is to determine which one of the above seven themes BEST describes the topic of the given text. You should respond by providing the number corresponding to the best-matching theme. 

The provided text is always complete. NEVER ask for more text. If the text is too short or too vague that you find challenging to assign a theme, please use -1 as the best matching theme's number and an empty list as the relevance score.

Your response must be in well-formed JSON format that is a dictionary containing three keys described below:
- "explanation": a string that explains your understanding of the given text's content and your reasoning of giving the relevance scores.
- "scores": a list of integers as the relevance scores of the themes.
- "theme": an integer that represents the theme the given text belongs to.

Make sure that your response is well-formated. Avoid the mistake of including comments in the scores list in the JSON. Remeber to escape the quotes in JSON strings if needed.

Below is the text for analysis:""")


import pandas as pd
df = pd.read_parquet('../processed_data/with_gpt_results.parquet')

queries = {row['id']: row['fulltext'] for _, row in df.iterrows()}

await gpt_query.batch_query(32, '../theme_categorization_2nd_round.json', queries)

In [None]:
# Scratch cell: a sample response text printed for inspection. The double
# quotes around 干 inside the "explanation" value are not escaped, so this text
# is not valid JSON as it stands.
s = '\n{\n  "explanation": "The text is inquiring about the viability and profitability of running a 6.8-meter high fence truck on the Yunmanman platform. The phrase \'干得来吃吗？\' is colloquially asking if it is profitable or feasible to work ("干") with such a vehicle on this specific freight platform. The user is seeking advice from experienced truck drivers (\'各位大神们\') about entering the market, specifically focusing on vehicle selection for the Yunmanman platform.",\n  "scores": [8, 2, 0, 0, 0, 0, 2],\n  "theme": 1\n}\n'
print(s)