In [None]:
from openai import OpenAI


# Optionally paste an OpenAI API key here. Leaving it empty is preferred: the key
# then never ends up in the saved notebook, and the client falls back to the
# OPENAI_API_KEY environment variable instead.
OPENAI_API_KEY = ""

# Shared client used by analyze_sentiment(). Passing None (rather than an empty
# string) lets the SDK look up OPENAI_API_KEY itself and fail right here with a
# clear error when no key is configured, instead of failing on every request.
client = OpenAI(
    api_key=OPENAI_API_KEY or None,
)

In [None]:
import pandas as pd
from tqdm import tqdm
import re


def extract_scores(output_text):
    """Extract the first two numbers from a model reply.

    The prompts ask the model to start its answer with two values, the first
    for the U.S. and the second for China. Both integer replies ("1, 3") and
    decimal replies ("0.5, -0.25") are accepted, with an optional leading
    minus sign.

    Args:
        output_text: Raw text returned by the model. Anything that is not a
            string (for example ``None``) is treated as "no scores".

    Returns:
        A ``(us_score, cn_score)`` tuple of floats, or ``(None, None)`` when
        fewer than two numbers are found.
    """
    if not isinstance(output_text, str):
        return None, None

    # The fractional part is optional so that category answers such as "1" or
    # "3" are recognised as well as decimal scores.
    scores = re.findall(r'-?\d+(?:\.\d+)?', output_text)
    if len(scores) >= 2:
        us_score, cn_score = map(float, scores[:2])
        return us_score, cn_score
    return None, None

def analyze_sentiment(row, model_name, lang):
    """Ask a chat model to classify one comment's sentiment toward the U.S. and China.

    Builds a Chinese or English prompt from ``row``, sends it through the
    module-level ``client`` and parses the first two numbers of the reply with
    ``extract_scores``.

    Args:
        row: A pandas Series for one comment. For ``lang == 'CN'`` it must
            provide ``label_3`` (video title), ``text_cleaned`` (comment) and
            ``ip_label``; for ``lang == 'EN'`` it must provide ``title`` and
            ``text_cleaned``.
        model_name: Name of the chat model passed to the API.
        lang: ``'CN'`` or ``'EN'``; selects the prompt.

    Returns:
        ``pd.Series([output_text, us_score, cn_score])``. On any error
        (missing column, unsupported ``lang``, API failure, ...) the error is
        printed and a Series of three ``None`` values is returned, so callers
        using ``DataFrame.apply`` always get three columns back.
    """
    try:
        # NOTE: the prompts use backslash line continuations inside the string
        # literals, so the leading whitespace of every continuation line is
        # part of the text sent to the model. It is intentionally left as is.
        if lang == 'CN':
            title = row['label_3']
            comment = row['text_cleaned']
            ip_label = row['ip_label']

            messages = [
                {
                    "role": "system",
                    "content": "你是一位文本情感分析师，你需要给文本的情感分类。\
                                     请将评论分为3类，分别是：1.仇恨、攻击性言论以及批评和抱怨，2.相对中立的评论，3.支持和认可。\
                                     你必须在回答的开头回复两个分类（分别代表对美国的分类和对中国的分类），并在后面给出相应的分析过程。 \
                                     对于无法判断指向美国还是中国的评论，请先尝试分析指向的对象，如果难以判断，请给出相对中立范围内的分类。请尽可能不要给出完全中立的分类。",
                },
                {
                    "role": "user",
                    "content": f"现在你需要对来自中国的短视频平台上的评论进行情感分析。 \
                                     此条短视频题目是：<text>{title}</text>，此条评论的内容是：<text>{comment}</text>，评论者的ip地址为<text>{ip_label}</text>。 \
                                     请根据以上信息，并结合美国和中国的现实情况，仔细揣摩评论在对应的视频下指向的对象和蕴含的信息，并考虑可能存在的反讽的情况， \
                                     分析这条短视频评论对美国和中国的整体国内社会，包括但不限于政策、企业、生活等方面的支持程度。 \
                                     请回复两个分类，分别代表对美国的分类和对中国的分类。",
                },
            ]

        elif lang == 'EN':
            title = row['title']
            comment = row['text_cleaned']

            messages = [
                {
                    "role": "system",
                    "content": "You are a sentiment analysis expert, and your task is to assign sentiment category to a given text. \
                                    Please categorize comments into 3 categories: 1. Hate, offensive speech and criticism and complaints, 2. Relatively neutral comments, 3. Support and approval.\
                                    You must provide two categories at the beginning of your response (one representing sentiment towards the U.S. and the other towards China), followed by a detailed analysis of the reasoning behind these categories. \
                                    For comments where it's unclear whether the sentiment is directed towards the U.S. or China, try to infer the target of the comment. If it's too ambiguous to determine, assign categories within the neutral range. Please avoid giving a fully neutral category whenever possible."
                },
                {
                    "role": "user",
                    "content": f"You are now required to perform sentiment analysis on a comment from an American short video platform. \
                                    The title of the video is: <text>{title}</text>, and the content of the comment is: <text>{comment}</text>. The commenter’s account is registered in the U.S.  \
                                    Based on this information, and considering the current situations in both the U.S. and China, \
                                    carefully analyze the underlying meaning of the comment, including any potential sarcasm, irony, or hidden comparisons. \
                                    For example, a comment that appears to praise another country might be indirectly criticizing domestic issues in the U.S. or China. \
                                    Please analyze the sentiment of this comment toward the overall domestic situation in both the U.S. and China, including but not limited to policies, culture, businesses, and living conditions. \
                                    Respond with two scores, representing the level of support for the U.S. and China, respectively."
                },
            ]

        else:
            # Raised inside the try block on purpose: it is reported like any
            # other per-row failure and yields the usual three-None result.
            raise ValueError("variable 'lang' must be 'CN' or 'EN'")

        # One request path shared by both languages.
        completion = client.chat.completions.create(
            model=model_name,
            top_p=0.1,
            presence_penalty=-2.0,
            messages=messages,
        )

        output_text = completion.choices[0].message.content
        us_score, cn_score = extract_scores(output_text)
        return pd.Series([output_text, us_score, cn_score])

    except Exception as e:
        print(f"Error processing row {row.name}: {e}")
        return pd.Series([None, None, None])



def handle_failed_rows(df_output, model_name, output_file, chunk_size, start_row, end_row, csv_file, lang):
    """Retry rows of ``df_output`` whose analysis or scores are missing.

    A row counts as failed when any of its three result columns for
    ``model_name`` is null. Failed rows are re-analysed from the original CSV
    in chunks, written back into ``df_output`` in place (matched by
    ``row_id``), and the frame is saved to ``output_file``. Nothing is read or
    written when there are no failed rows.

    Args:
        df_output: Results frame with a 1-based ``row_id`` column and the
            three ``*_{model_name}`` result columns. Modified in place.
        model_name: Model name; also used to build the result column names.
        output_file: CSV path the updated results are written to.
        chunk_size: Number of failed rows retried per batch.
        start_row: Lowest ``row_id`` to retry (inclusive), or ``None`` for no
            lower bound.
        end_row: Highest ``row_id`` to retry (inclusive), or ``None`` for no
            upper bound.
        csv_file: Path of the original input CSV.
        lang: ``'CN'`` or ``'EN'``, forwarded to ``analyze_sentiment``.

    Returns:
        None. Any exception is caught and reported with ``print``.
    """
    result_columns = [f'LLM_analysis_{model_name}', f'US_score_{model_name}', f'CN_score_{model_name}']

    try:
        failed_rows = df_output[df_output[result_columns].isnull().any(axis=1)]

        # Each bound is applied on its own, so a one-sided range also works.
        if start_row is not None:
            failed_rows = failed_rows[failed_rows['row_id'] >= start_row]
        if end_row is not None:
            failed_rows = failed_rows[failed_rows['row_id'] <= end_row]

        if failed_rows.empty:
            print(f"No failed rows found in range {start_row} to {end_row}.")
            return

        print(f"Found {len(failed_rows)} failed rows in range {start_row} to {end_row}. Reprocessing them...")

        # Read the input once instead of once per chunk, and strip '#' from the
        # title the same way process() does so retried rows get the same prompt.
        source_rows = pd.read_csv(csv_file)
        if lang == 'CN':
            source_rows['label_3'] = source_rows['label_3'].str.replace('#', '', regex=False)
        elif lang == 'EN':
            source_rows['title'] = source_rows['title'].str.replace('#', '', regex=False)

        # Result columns loaded from a CSV in which every value was missing are
        # float columns; cast to object so text and numbers can be written back
        # without a dtype conflict.
        df_output[result_columns] = df_output[result_columns].astype(object)

        for idx in tqdm(range(0, len(failed_rows), chunk_size), desc="Reprocessing failed rows"):
            chunk = failed_rows.iloc[idx:idx + chunk_size]

            row_ids = chunk['row_id'].values
            expected = len(pd.unique(row_ids))

            # -1 because row_id starts at 1 while the CSV index starts at 0.
            original_rows = source_rows.loc[source_rows.index.isin(row_ids - 1)]

            result_chunk = original_rows.apply(lambda row: analyze_sentiment(row, model_name, lang), axis=1)

            if len(result_chunk) != expected:
                print(f"Error: Result length {len(result_chunk)} does not match chunk length {expected}")
                continue

            # Match results to output rows by row_id rather than by position:
            # the output frame is not guaranteed to be sorted like the CSV.
            result_chunk.index = original_rows.index + 1
            target_mask = df_output['row_id'].isin(row_ids)
            aligned = result_chunk.reindex(df_output.loc[target_mask, 'row_id'].values)
            df_output.loc[target_mask, result_columns] = aligned.values

        df_output.to_csv(output_file, index=False, encoding='utf-8-sig')

    except Exception as e:
        print(f"Error reprocessing failed rows: {e}")










def process(csv_file, output_file, model_name, lang, chunk_size=10, start_row=None, end_row=None):
    """Run sentiment analysis over a CSV of comments and save results incrementally.

    Every input row gets a 1-based ``row_id``. Rows whose ``row_id`` already
    appears in ``output_file`` are skipped, so an interrupted run can be
    resumed. After each chunk the accumulated results are rewritten to
    ``output_file``; when all chunks are done, rows with missing results are
    retried through ``handle_failed_rows``.

    Args:
        csv_file: Path of the input CSV.
        output_file: Path of the results CSV (created if it does not exist).
        model_name: Chat model name; also used in the result column names.
        lang: ``'CN'`` (title column ``label_3``) or ``'EN'`` (title column
            ``title``).
        chunk_size: Number of rows analysed between two saves.
        start_row: First row to process (1-based, inclusive), or ``None`` to
            start at the first row.
        end_row: Last row to process (1-based, inclusive), or ``None`` to run
            to the last row.

    Returns:
        None. Any exception is caught and reported with ``print``.
    """
    result_columns = [f'LLM_analysis_{model_name}', f'US_score_{model_name}', f'CN_score_{model_name}']

    try:
        # Validate before touching any file so a bad argument is reported as such.
        if lang not in ['CN', 'EN']:
            raise ValueError("variable 'lang' must be 'CN' or 'EN'")

        df = pd.read_csv(csv_file)

        # Strip '#' from the video title before it is inserted into the prompt.
        title_column = 'label_3' if lang == 'CN' else 'title'
        df[title_column] = df[title_column].str.replace('#', '', regex=False)

        df['row_id'] = df.index + 1

        if start_row is not None or end_row is not None:
            first_position = 0 if start_row is None else max(start_row - 1, 0)
            df = df.iloc[first_position:end_row]
            print(f"Processing rows from {start_row} to {end_row}.")

        try:
            df_output = pd.read_csv(output_file)
            if 'row_id' not in df_output.columns:
                print("Warning: Output file does not contain 'row_id'. Cannot accurately check processed rows.")
                processed_row_ids = set()
            else:
                processed_row_ids = set(df_output['row_id'])
            print(f"Loaded {len(processed_row_ids)} already processed rows.")

        except (FileNotFoundError, pd.errors.EmptyDataError):
            # No usable previous output: start from an empty results frame.
            df_output = pd.DataFrame(columns=['row_id'] + result_columns)
            processed_row_ids = set()

        df_to_process = df[~df['row_id'].isin(processed_row_ids)]

        for start_idx in tqdm(range(0, len(df_to_process), chunk_size), desc="Processing data"):
            chunk = df_to_process.iloc[start_idx:start_idx + chunk_size]

            result_chunk = chunk.apply(lambda row: analyze_sentiment(row, model_name, lang), axis=1)

            if len(result_chunk) != len(chunk):
                print(f"Error: Result length {len(result_chunk)} does not match chunk length {len(chunk)}")
                continue

            # Build the chunk's output as its own frame instead of writing new
            # columns onto a slice of df_to_process.
            chunk_output = pd.DataFrame(result_chunk.values, columns=result_columns)
            chunk_output.insert(0, 'row_id', chunk['row_id'].values)

            # Failed rows (null results) are kept on purpose: handle_failed_rows
            # finds them in df_output and retries them.
            df_output = pd.concat([df_output, chunk_output], ignore_index=True)

            # Save after every chunk so progress survives an interruption.
            df_output.to_csv(output_file, index=False, encoding='utf-8-sig')

        handle_failed_rows(df_output, model_name, output_file, chunk_size, start_row, end_row, csv_file, lang)

    except Exception as e:
        print(f"Processing failed: {e}")


In [None]:
# Variables

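# csv_file: path of the input CSV file to read
# output_file: path of the CSV file the results are written to
# model_name: name of the OpenAI chat model to use
# lang: 'CN' or 'EN'; selects the prompt language and the title column
# chunk_size: number of rows processed per batch (results are saved after each batch)
# start_row: first row to process (1-based, inclusive)
# end_row: last row to process (1-based, inclusive)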

# NOTE(review): the input file name suggests the Chinese (Douyin) data set, while
# lang='EN' selects the English prompt and the 'title' column - confirm that this
# pairing is intended.
process(
    csv_file='labeled_CN_douyin.csv',
    output_file='labeled_CN_douyin_.csv',
    model_name="gpt-4o-mini",
    lang='EN',
    chunk_size=10,
    start_row=1,
    end_row=1000,
)