### 将csv文件根据cause进行分割

In [3]:
import pandas as pd
from pathlib import Path

file_path='./added_CAMS_data.csv'
# Load the CSV file
try:
    df = pd.read_csv(file_path, encoding='utf-8')
except UnicodeDecodeError:
    try:
        df = pd.read_csv(file_path, encoding='ISO-8859-1')  # 'latin1' is another common encoding
    except UnicodeDecodeError:
        df = pd.read_csv(file_path, encoding='cp1252')  # Another common encoding in Windows


# Ensure the output directory exists
output_dir = Path('./added_output/')
output_dir.mkdir(parents=True, exist_ok=True)

# Group the dataframe by the 'ANNOTATIONS' column
grouped = df.groupby('cause')

# Iterate over each group and save to a separate CSV file
for name, group in grouped:
    output_file_path = output_dir / f'{name}.csv'
    group.to_csv(output_file_path, index=False)

print("Files have been saved successfully.")


Files have been saved successfully.


### 翻译csv文件并导出为中文csv文件

In [2]:
import csv
from googletrans import Translator
import pandas as pd

translator = Translator()

def translate_to_chinese(text):
    try:
        # 尝试翻译文本
        translation = translator.translate(text, dest="zh-cn")
        return translation.text
    except Exception as e:
        print(f"An error occurred during translation: {e}")
        return text  # 如果翻译失败，返回原始文本


# 读取CSV文件
df = pd.read_csv('./added_output/0.csv')

# 对第一列进行翻译
df.iloc[:, 0] = df.iloc[:, 0].apply(translate_to_chinese)

# 保存修改后的CSV文件
df.to_csv('translated_output_file.csv', index=False)

print("Translation completed and file saved.")


An error occurred during translation: 'Translator' object has no attribute 'raise_Exception'
An error occurred during translation: 'Translator' object has no attribute 'raise_Exception'
An error occurred during translation: 'Translator' object has no attribute 'raise_Exception'
An error occurred during translation: 'Translator' object has no attribute 'raise_Exception'
An error occurred during translation: 'Translator' object has no attribute 'raise_Exception'
An error occurred during translation: 'Translator' object has no attribute 'raise_Exception'
An error occurred during translation: 'Translator' object has no attribute 'raise_Exception'
An error occurred during translation: 'Translator' object has no attribute 'raise_Exception'
An error occurred during translation: 'Translator' object has no attribute 'raise_Exception'
An error occurred during translation: 'Translator' object has no attribute 'raise_Exception'
An error occurred during translation: 'Translator' object has no attri

### 删选一些单词少的event


In [4]:
from langchain.llms import Ollama
from guidance import models, gen
import json
import os
import pandas as pd
from tqdm import tqdm


input_filename='0_test_event.json'
output_filename = "0_test_event_filtered_data.json"


with open(input_filename, 'r') as file:
        try:
            data = json.load(file)
        except:
            print("error")

filtered_data_list = [element for element in data if len(element["Event"].split()) >= 5]

# 保存为新的 JSON 文件

with open(output_filename, 'w', encoding='utf-8') as json_file:
    json.dump(filtered_data_list, json_file, indent=4)

print(f"Before:{len(data)}")
print(f"After:{len(filtered_data_list)}")
print(f"已将过滤后的数据保存到 '{output_filename}' 文件中。")

Before:50
After:40
已将过滤后的数据保存到 '0_test_event_filtered_data.json' 文件中。


In [4]:
import json
import glob
import os

# 定义原始文件夹和目标文件夹
source_folder = '/home/qiang/projects/CAMS/CAMS/data/added_output/IAS_EAS_fewshot_70B'
target_folder = '/home/qiang/projects/CAMS/CAMS/data/added_output/IAS_EAS_fewshot_70B_modified'

# 确保目标文件夹存在
os.makedirs(target_folder, exist_ok=True)

# 获取所有json文件的列表
files = glob.glob(os.path.join(source_folder, '*.json'))

# 初始化ID
current_id = 1

for file in files:
    # 读取文件内容
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    # 为每个Post添加ID
    for item in data:
        item['ID'] = current_id
        current_id += 1
    
    # 写回修改后的数据到新文件夹中
    # 获取原始文件名
    base_name = os.path.basename(file)
    new_file_name = os.path.join(target_folder, f"modified_{base_name}")
    with open(new_file_name, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    print(file)

print("所有文件处理完毕。")


/home/qiang/projects/CAMS/CAMS/data/added_output/IAS_EAS_fewshot_70B/3_2_event_filtered_IAS_IAS2EAS.json
/home/qiang/projects/CAMS/CAMS/data/added_output/IAS_EAS_fewshot_70B/5_2_event_filtered_IAS_IAS2EAS.json
/home/qiang/projects/CAMS/CAMS/data/added_output/IAS_EAS_fewshot_70B/4_8_event_filtered_IAS_IAS2EAS.json
/home/qiang/projects/CAMS/CAMS/data/added_output/IAS_EAS_fewshot_70B/5_0_event_filtered_IAS_IAS2EAS.json
/home/qiang/projects/CAMS/CAMS/data/added_output/IAS_EAS_fewshot_70B/5_1_event_filtered_IAS_IAS2EAS.json
/home/qiang/projects/CAMS/CAMS/data/added_output/IAS_EAS_fewshot_70B/3_7_event_filtered_IAS_IAS2EAS.json
/home/qiang/projects/CAMS/CAMS/data/added_output/IAS_EAS_fewshot_70B/4_3_event_filtered_IAS_IAS2EAS.json
/home/qiang/projects/CAMS/CAMS/data/added_output/IAS_EAS_fewshot_70B/4_2_event_filtered_IAS_IAS2EAS.json
/home/qiang/projects/CAMS/CAMS/data/added_output/IAS_EAS_fewshot_70B/5_9_event_filtered_IAS_IAS2EAS.json
/home/qiang/projects/CAMS/CAMS/data/added_output/IAS_EA