In [None]:
from EntityDataset import TrainDataset
from LoadData import LoadData
import pandas as pd
import numpy as np

In [1]:

sentiments = ["Very Negative", "Negative", "Neutral", "Positive", "Very Positive"]
main_roles = ["Protagonist", "Antagonist", "Innocent"]
sentiment_to_main_role = {
    "Very Negative": "Antagonist",
    "Negative": "Antagonist",
    "Neutral": "Innocent",
    "Positive": "Protagonist",
    "Very Positive": "Protagonist"
}
languages = ["EN", "RU", "PT", "HI", "BG"]
base_dir = "train"
txt_file = "subtask-1-annotations.txt"

ld = LoadData()

results_df = pd.DataFrame(columns = ['Language', 'Accuracy', 'Precision_Protagonist', 'Precision_Antagonist', 'Precision_Innocent', 'Recall_Protagonist', 'Recall_Antagonist', 'Recall_Innocent'])

for lang_idx, lang in enumerate(languages):

    total_instances = 0
    match = 0
    # Load data for each language
    data = ld.load_data(base_dir, txt_file, lang)
    train_dataset = TrainDataset(data, base_dir, language=lang, return_sentiment=True, coref=False)
    
    df = pd.DataFrame(columns=['article_id', 'entity_mention', 'main_role', 'predicted_main_role'])

    for idx, sample in enumerate(train_dataset):

        index = np.argmax(sample['sent_sent'])

        sent_main_role = sentiment_to_main_role[sentiments[index]]
        true_main_role = sample["main_role"]

        if sent_main_role == true_main_role:
            match += 1
        
        total_instances += 1

        df = pd.concat([df, pd.DataFrame([{
            'article_id': sample['article_id'],
            'entity_mention': sample['entity_mention'],
            'main_role': true_main_role,
            'predicted_main_role': sent_main_role
        }])], ignore_index=True)
        
    accuracy = match / total_instances

    # Calculate precision and recall for each main role
    precision_dict = {}
    recall_dict = {}

    for role in main_roles:
        true_positives = len(df[(df['main_role'] == role) & (df['predicted_main_role'] == role)])
        predicted_positives = len(df[df['predicted_main_role'] == role])
        actual_positives = len(df[df['main_role'] == role])
        
        precision = true_positives / predicted_positives if predicted_positives > 0 else 0
        recall = true_positives / actual_positives if actual_positives > 0 else 0
        
        precision_dict[role] = precision
        recall_dict[role] = recall

    # Create a row for the CSV
    results_df = pd.concat([results_df,pd.DataFrame({
        'Language': [lang],
        'Accuracy': [accuracy],
        **{f'Precision_{role}': [precision_dict[role]] for role in main_roles},
        **{f'Recall_{role}': [recall_dict[role]] for role in main_roles}
        
    })], ignore_index= True)
    
    # Append to CSV file, create if doesn't exist
    

    df.to_csv(f"./sentiment_res/{lang}_sentiment.csv", index=False)
results_df.to_csv('./sentiment_res/evaluation.csv', header=not pd.io.common.file_exists('./sentiment_res/sentiment.csv'), index=False)

NameError: name 'LoadData' is not defined

## Dependency Parsing Playground

In [5]:
import spacy
from spacy import displacy
from spacy.tokens import Span

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Sample text (you can replace this with reading from a file)
text = """
NASA launched a new satellite into orbit. The satellite will observe solar storms. 
Elon Musk announced a new partnership with NASA. 
The mission is expected to last two years.
"""

# Process the document
doc = nlp(text)

# Extract named entities we care about
entities = {ent.text for ent in doc.ents if ent.label_ in {"PERSON", "ORG", "GPE"}}

# Recursive function to trace up the dependency tree until a verb or meaningful head is found
def trace_dependency_path(token, depth=0):
    if token.pos_ == "VERB" or token.dep_ in {"attr", "acomp", "dobj"}:
        return token
    if token.head == token or depth > 5:  # prevent infinite loop
        return token
    return trace_dependency_path(token.head, depth + 1)

# Collect (entity_token, defining_token) pairs
arrows = []
for token in doc:
    for ent in entities:
        if token.text == ent.split()[0]:  # basic match on first word of entity
            root = trace_dependency_path(token)
            if root != token:
                arrows.append((token, root))

# Convert to custom spans for visualization
custom_spans = []
for start, end in arrows:
    label = f"↠ {end.text}"
    span = Span(doc, start.i, start.i + 1, label=label)
    custom_spans.append(span)

# Replace the document's entities with our custom ones
doc.set_ents([], default="unmodified")
doc.ents = custom_spans

# Generate unique colors for each arrow label
options = {
    "colors": {span.label_: "lightblue" for span in custom_spans}
}

# Launch visualization in browser
displacy.serve(doc, style="ent", options=options, auto_select_port=True)



Using the 'ent' visualizer
Serving on http://0.0.0.0:5001 ...



127.0.0.1 - - [11/Mar/2025 16:49:20] "GET / HTTP/1.1" 200 1553
127.0.0.1 - - [11/Mar/2025 16:49:20] "GET /favicon.ico HTTP/1.1" 200 1553


Shutting down server on port 5001.


In [8]:
import spacy
from collections import defaultdict

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Sample article
text = """
NASA launched a new satellite into orbit. The satellite will observe solar storms. 
Elon Musk announced a new partnership with NASA. 
The mission is expected to last two years.
"""

# Process text
doc = nlp(text)

# Extract named entities
entities = {ent.text: ent.root for ent in doc.ents if ent.label_ in {"PERSON", "ORG", "GPE", "NORP"}}

# Helper to build a short action phrase
def get_action_phrase(verb_token):
    phrase = [verb_token.text]
    for child in verb_token.children:
        if child.dep_ in {"dobj", "pobj", "attr", "xcomp", "acomp", "nmod"}:
            subtree = ' '.join([t.text for t in child.subtree])
            phrase.append(subtree)
    return ' '.join(phrase)

# Store results
entity_actions = defaultdict(list)

for sent in doc.sents:
    for ent_text, ent_root in entities.items():
        if ent_root in sent:
            verb = None
            role = None

            if ent_root.dep_ in {"nsubj", "nsubjpass"}:
                verb = ent_root.head
                role = "Subject"
            elif ent_root.dep_ in {"dobj", "pobj"}:
                verb = ent_root.head
                role = "Object"

            if verb:
                phrase = get_action_phrase(verb)
                entity_actions[ent_text].append([role, phrase])

# Pretty-print results
for ent, actions in entity_actions.items():
    print(f'"{ent}": {actions}')


"NASA": [['Object', 'with NASA']]
"Elon Musk": [['Subject', 'announced a new partnership with NASA']]


In [None]:
from LoadData import LoadData
from EntityDataset import DataLoader
from TextFilter import TextFilter

base_dir = "train"
txt_file = "subtask-1-annotations.txt"
subdirs = "EN"
load_data = LoadData()
data = load_data.load_data(base_dir, txt_file, subdirs)
data_loader = DataLoader(data, base_dir)

filter = TextFilter()

for target, text in data_loader.yield_text():
    print(target)
    
    print(filter.extract_target_context(text, target))
    print()
    break