In [2]:
# Inspect the column labels of `data`. `data` is not created in the cells shown
# here, so it must be loaded before this cell runs — TODO confirm where.
data.columns

Index(['_unit_id', '_golden', '_unit_state', '_trusted_judgments',
       '_last_judgment_at',
       'can_the_verb_span_stylecolorblueverb_span_be_anchored_in_time',
       'can_the_verb_span_stylecolorblueverb_span_be_anchored_in_time:confidence',
       'after', 'before', 'bodytext',
       'can_the_verb_span_stylecolorblueverb_span_be_anchored_in_time_gold',
       'docid', 'eventid', 'verb'],
      dtype='object')

In [10]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter

# Build a one-sentence-per-line text file of short sentences that contain a
# temporal expression, drawn from the 'bodytext' column of each CSV below.

# CSV files to load; each is expected to provide a 'bodytext' column.
files = ['MATRES.csv', 'MATRES2.csv', 'MATRES3.csv']
# A token counts as temporal when its lowercase form contains one of these.
TEMPORAL_MARKERS = ('before', 'after', 'during', 'when', 'while', 'since',
                    'until', 'day', 'month', 'year')
# Sentences with more tokens than this are dropped.
MAX_SENTENCE_TOKENS = 20
OUTPUT_PATH = 'filtered_temporal_sentences.txt'

all_cleaned_sentences = []

# Make sure the 'punkt' resource is installed before any tokenizing happens.
nltk.download('punkt')

for csv_path in files:
    df = pd.read_csv(csv_path)

    # Split every document into sentences. Missing bodytext (NaN) becomes an
    # empty string so sent_tokenize always receives a str.
    df['sentences'] = df['bodytext'].fillna('').astype(str).apply(sent_tokenize)

    # Strip <p>/</p> markup *before* de-duplicating and measuring length, so
    # tags neither count as tokens nor make equal sentences look distinct.
    # dict.fromkeys de-duplicates while keeping first-seen order, and the
    # nested generator avoids flattening with Series.sum(), which concatenates
    # lists repeatedly and yields 0 (not a list) when the frame is empty.
    unique_sentences = list(dict.fromkeys(
        sent.replace('<p>', '').replace('</p>', '').strip()
        for sentences in df['sentences']
        for sent in sentences
    ))

    # Tokenize each unique sentence once and reuse the tokens below.
    tokens_by_sentence = {sent: word_tokenize(sent) for sent in unique_sentences}

    # Distinct tokens in this file that contain a temporal marker.
    temporal_keywords = {
        token
        for tokens in tokens_by_sentence.values()
        for token in tokens
        if any(marker in token.lower() for marker in TEMPORAL_MARKERS)
    }

    # Keep sentences that are short enough and contain a temporal keyword.
    # An empty sentence has no tokens, so it can never match and is dropped.
    cleaned_sentences = [
        sent
        for sent, tokens in tokens_by_sentence.items()
        if len(tokens) <= MAX_SENTENCE_TOKENS
        and not temporal_keywords.isdisjoint(tokens)
    ]

    # Append to the master list
    all_cleaned_sentences.extend(cleaned_sentences)

# De-duplicate across files while preserving first-seen order, so repeated
# runs over the same inputs write an identical file.
unique_cleaned_sentences = list(dict.fromkeys(all_cleaned_sentences))

# Explicit UTF-8 keeps the write independent of the platform default encoding.
with open(OUTPUT_PATH, 'w', encoding='utf-8') as out_file:
    out_file.writelines(sent + '\n' for sent in unique_cleaned_sentences)

print(f"Filtered temporal sentences saved to '{OUTPUT_PATH}'.")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Filtered temporal sentences saved to 'filtered_temporal_sentences.txt'.
