In [1]:
from langchain.llms import Ollama
import json
import os
import pandas as pd
from tqdm import tqdm
# LLM client: LangChain's Ollama wrapper configured for the "llama3:70b" model.
llm = Ollama(model="llama3:70b")


# Few-shot instruction prompt. event_extraction() appends the raw event text
# directly after the final colon of this string.
# NOTE: this is ONE string literal continued with trailing backslashes, so the
# source lines are joined WITHOUT newlines and the leading indentation of every
# continuation line becomes part of the prompt text. The \' and \" escapes
# simply produce ' and ".
# NOTE(review): inside each <example>, the first "Event:" line looks like the
# input and the second like the expected answer -- confirm; both use the same
# label, and event_extraction() keeps whatever follows the first "Event:" in
# the model's reply.
prompt_few_shot="An event is defined as any stimulus that an individual\'s \
                environment or within that individual(e.g. thoughts of feelings) that has a good or bad effect from the individual\'s \
                point of view. Events can be mental(e.g. I was afraid), social(e.g. I got a pay raise) or physical(e.g. I got in a car accident).\
                Events should be unambiguously good or bad from the individual\'s point of view and may occur in the past, present or hypothetical future.  \
                Please analyze and determine the provided events. We should retain events that are unfavorable for the participants, i.e., bad events. \
                These events should not state facts and one's emotion or pose questions but rather describe a trigger for a bad situation for the participants. \
                Please keep such events and rewrite them into concise and fluent versions. For events that do not meet the requirements, please output:None\
                If there is no event in the post, you should output:\"Event:None\"\
                Please refer to the examples I gave:\
                <example>\
                Event: I got in a fight with a good friend.\
                Event: I got in a fight with friend.\
                </example>\
                <example>\
                Event: not well\
                Event: None\
                </example>\
                <example>\
                Event: why are you sad?\
                Event: None\
                </example>\
                <example>\
                Event: I bet you received lots of hit from that tweet; at work i cannot, wish i could\
                Event: I received hit from that tweet\
                </example>\
                <example>\
                Event: I`m so sad, really really sad\
                Event: None\
                </example>\
                You should answer with a specific format. For example, you should output:\"Event:...\"\
                Please understand what is event that we need from above explanation and extract the main event from the following paragraph:"
          

def event_extraction(input_file, output_folder):
    """Filter and rewrite the events of one JSON file with the LLM.

    Reads ``input_file`` (a JSON list of objects that each carry an
    ``"Event"`` key), sends every event text to the module-level ``llm``
    appended to ``prompt_few_shot``, and writes the answers to
    ``<output_folder>/<input file stem>_event_filter.json`` as a list of
    ``{"Origin_Event": ..., "Event": ...}`` records.

    Events whose LLM call fails are reported and left out of the result
    instead of aborting the whole file.

    Args:
        input_file: Path of the JSON file to process.
        output_folder: Existing directory that receives the result file.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        json.JSONDecodeError: If ``input_file`` is not valid JSON.
        KeyError: If an input record has no ``"Event"`` key.
    """
    with open(input_file, 'r', encoding='utf-8') as file:
        records = json.load(file)
    posts = [record["Event"] for record in records]

    data_list = []

    for post in tqdm(posts, desc="Processing Event"):
        prompt = prompt_few_shot + str(post)
        try:
            res = llm.predict(prompt)
        except Exception as exc:
            # Best-effort batch run: report the failing event and move on,
            # rather than continuing with an undefined or stale prompt.
            print(f"some errors: skipping event {post!r}: {exc}")
            continue
        # Keep only what follows the first "Event:" label in the reply; if the
        # label is absent the whole reply is kept.
        result_str = res.split("Event:", 1)[-1].strip()
        data_list.append({
            "Origin_Event": post,
            "Event": result_str
        })

    # splitext strips only the final extension, so "a.1.json" and "a.2.json"
    # no longer collapse onto the same output file name.
    stem = os.path.splitext(os.path.basename(input_file))[0]
    output_json_path = os.path.join(output_folder, stem + '_event_filter.json')

    # UTF-8 with ensure_ascii=False matches how the later filter cell reads
    # and rewrites these files.
    with open(output_json_path, 'w', encoding='utf-8') as json_file:
        json.dump(data_list, json_file, indent=4, ensure_ascii=False)

# Source folder with the raw event files and destination folder for the
# LLM-filtered results.
input_folder = "/home/qiang/projects/Digital_mental_health/Dataset/Condidate_sentiment_dataset/1_event_full/"
output_folder = "/home/qiang/projects/Digital_mental_health/Dataset/Condidate_sentiment_dataset/1_event_filtered_all/"

# Make sure the output folder exists.
os.makedirs(output_folder, exist_ok=True)

# Run the extraction on every JSON file in the input folder; other entries
# are skipped but still counted by the progress bar.
for filename in tqdm(os.listdir(input_folder), desc="Processing Json"):
    if not filename.endswith(".json"):
        continue
    input_filename = os.path.join(input_folder, filename)
    event_extraction(input_filename, output_folder)

print("处理完成。")

Processing Event: 100%|██████████| 50/50 [00:55<00:00,  1.11s/it]
Processing Event: 100%|██████████| 50/50 [00:38<00:00,  1.31it/s]]
Processing Event: 100%|██████████| 50/50 [00:35<00:00,  1.39it/s]]
Processing Event: 100%|██████████| 50/50 [00:37<00:00,  1.33it/s]]
Processing Event: 100%|██████████| 50/50 [00:36<00:00,  1.37it/s]]
Processing Event: 100%|██████████| 50/50 [00:37<00:00,  1.33it/s]]
Processing Event: 100%|██████████| 50/50 [00:36<00:00,  1.37it/s]]
Processing Event: 100%|██████████| 50/50 [00:37<00:00,  1.33it/s]]
Processing Event: 100%|██████████| 50/50 [00:38<00:00,  1.31it/s]]
Processing Event: 100%|██████████| 50/50 [00:37<00:00,  1.35it/s]]
Processing Event: 100%|██████████| 50/50 [00:40<00:00,  1.24it/s]t]
Processing Event: 100%|██████████| 50/50 [00:39<00:00,  1.25it/s]t]
Processing Event: 100%|██████████| 50/50 [00:37<00:00,  1.32it/s]t]
Processing Event: 100%|██████████| 50/50 [00:35<00:00,  1.43it/s]t]
Processing Event: 100%|██████████| 50/50 [00:37<00:00,  1.3

处理完成。





Filter out the `None` events

In [2]:
import os
import json

def _is_none_event(event):
    """Return True when ``event`` is a "None" answer rather than a real event."""
    if event is None:
        # A JSON null carries no event text at all.
        return True
    # Replace every non-alphanumeric character with a space so that "None",
    # "None." and "Event: None" all yield the token NONE, while words that
    # merely contain those letters (e.g. "nonetheless") do not.
    normalized = ''.join(ch if ch.isalnum() else ' ' for ch in str(event).upper())
    return 'NONE' in normalized.split()


def remove_none_events(folder_path):
    """Drop "None" answers from every ``.json`` file in ``folder_path``.

    Each file is expected to hold a JSON list of objects. Objects whose
    ``"Event"`` value is null or contains the standalone word "none"
    (case-insensitive) are removed, and the file is rewritten in place.
    Objects without an ``"Event"`` key are kept.

    Args:
        folder_path: Directory whose ``.json`` files are filtered in place.
    """
    for filename in tqdm(os.listdir(folder_path)):
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        updated_data = [
            element for element in data
            if not _is_none_event(element.get('Event', ''))
        ]

        with open(file_path, 'w', encoding='utf-8') as file:
            json.dump(updated_data, file, indent=4, ensure_ascii=False)

# Apply the None filter to the folder of LLM-filtered event files.
folder_path = '/home/qiang/projects/Digital_mental_health/Dataset/Condidate_sentiment_dataset/1_event_filtered_all'
remove_none_events(folder_path=folder_path)

100%|██████████| 156/156 [00:00<00:00, 2815.82it/s]
