In [None]:
from FoRC4CL import FoRC4CLData
from utils.utils import get_all_labels

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier,  VotingClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
from xgboost import XGBClassifier

import warnings
import numpy as np
import pandas as pd


In [None]:
# Suppress an annoying warning
warnings.filterwarnings("ignore", message="Label not .* is present in all training examples.")

## Trains and tests basic ML models on the FoRC4CL train/test split. 

In [None]:
# Get the training and testing data + labels
data_train = FoRC4CLData(forc4cl_data_path="data/forc4cl_fulltext/train_fulltext.csv")
data_test = FoRC4CLData(forc4cl_data_path="data/forc4cl_fulltext/test_fulltext.csv")

X_train = data_train._get_documents(lowercase=True,stem=True)
y_train = data_train._get_labels()

X_test = data_test._get_documents(lowercase=True,stem=True)
y_test = data_test._get_labels()

print("Got training and testing data.")

Got FoRC4CL data
Got FoRC4CL data
Got training and testing data.


In [None]:
# Tfidf-vectorize the training and testing data
vectorizer = TfidfVectorizer()

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print("Data is vectorized.")

Data is vectorized.


In [None]:
#Initialise the classifier. Initialise multiple classifiers if using an ensemble later
clf1 = OneVsRestClassifier(LogisticRegression(class_weight='balanced',solver='liblinear',penalty='l1'))


In [None]:
# Train the classifier. Again, train multiple classifiers if using an ensemble later
clf1.fit(X_train_tfidf, y_train)
print("Classifier initialized.")




Classifier 1 initialized.


In [11]:
# Make predictions
y_pred_1 = clf1.predict(X_test_tfidf)
print("Predictions have been made by classifier 1.")


Predictions have been made by classifier 1.


### The following two cells define and run a majority voting ensemble using as many models as you like

In [48]:
def majority_voting(predictions, threshold=0.5):
    """
    Perform majority voting on multi-label predictions.
    
    :param predictions: List of numpy arrays of shape (num_samples, num_labels)
                        Each array is a model’s binary predictions.
    :param threshold: Fraction of models that must predict 1 for the final vote.
                      Default is 0.5 (majority voting).
    :return: Final ensemble predictions (numpy array of shape (num_samples, num_labels))
    """
    predictions = np.array(predictions)  # Shape: (num_models, num_samples, num_labels)
    vote_counts = np.sum(predictions, axis=0)  # Sum over models, shape: (num_samples, num_labels)
    
    # Apply threshold: If more than (threshold * num_models) models predict 1, assign 1
    num_models = predictions.shape[0]
    final_predictions = (vote_counts >= (threshold * num_models)).astype(int)
    
    return final_predictions

In [59]:
# Create ensemble predictions
y_pred = majority_voting([y_pred_1, y_pred_2, y_pred_3], threshold=0.4)
print(y_pred[0])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [60]:
# Print evaluations
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=get_all_labels()))


Classification Report:
                                                     precision    recall  f1-score   support

             Abstract Meaning Representation (AMR)       0.00      0.00      0.00         0
                    Abstractive Text Summarization       1.00      1.00      1.00         1
Acronyms and Abbreviations Detection and Expansion       0.00      0.00      0.00         0
                                   Active Learning       0.00      0.00      0.00         1
                Adversarial Attacks and Robustness       1.00      1.00      1.00         2
                              Adversarial Learning       0.50      0.33      0.40         3
                               Anaphora Resolution       0.00      0.00      0.00         1
                              Annotation Processes       0.46      0.30      0.36        20
                                   Argument Mining       1.00      1.00      1.00         2
                            Aspect-Based SA (ABSA)     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Generate weakly labeled predictions for the ACL dataset

In [None]:
# Turns a prediction vector into a list of labels
def vector_to_label_list(predictions):
    labels = []
    for i in range(len(predictions)):
        pred = predictions[i]
        to_labels = [get_all_labels()[j] for j in range(len(pred)) if pred[j] == 1]
        labels.append(to_labels)
    return labels

# Unflatten a list of labels
def sort_predictions_in_hierarchy(predictions):
    level1labels, level2labels, level3labels = get_all_labels(level='lvl1'), get_all_labels(level='lvl2'), get_all_labels(level='lvl3')
    hierarchical_predictions = []
    for prediction in predictions:
        level1 = []
        level2 = []
        level3 = []
        for label in prediction:
            if label in level1labels:
                level1.append(label)
            elif label in level2labels:
                level2.append(label)
            elif label in level3labels:
                level3.append(label)
        prediction_hierarchical = [level1, level2, level3]
        hierarchical_predictions.append(prediction_hierarchical)
    return hierarchical_predictions

In [None]:
# Get the training data and the data to label.
# This code takes 6 minutes to run.
data_train = FoRC4CLData(forc4cl_data_path="data/forc4cl_fulltext/train_fulltext.csv")
data_test = FoRC4CLData(forc4cl_data_path="data/acl/acl_with_fulltext.csv") #acl data can be processed with FoRC4CLData because the formatting is the same

X_train = data_train._get_documents(lowercase=True,stem=True,full_text=False)
print("Got training documents")
y_train = data_train._get_labels()
print("Got training labels")

# y_test is not needed as it is used only for evaluation
X_test = data_test._get_documents(lowercase=True,stem=True)
print("Got training and testing data.")

Got FoRC4CL data
Got FoRC4CL data
Got training documents
Got training labels
Got training and testing data.


In [30]:
# Tfidf-vectorize the training and testing data
vectorizer = TfidfVectorizer()

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print("Data is vectorized.")

Data is vectorized.


In [6]:
# Make the classifier
#classifier = OneVsRestClassifier(LogisticRegression(class_weight='balanced'))
classifier = OneVsRestClassifier(LogisticRegression(class_weight='balanced',solver='liblinear',penalty='l1'))
#classifier = OneVsRestClassifier(SGDClassifier(class_weight='balanced'))
#classifier = OneVsRestClassifier(RandomForestClassifier(class_weight='balanced'))
#classifier = OneVsRestClassifier(SVC(class_weight='balanced',kernel='linear',probability=True))
#classifier = XGBClassifier(n_estimators = 2, max_depth=8, learning_rate=1, objective='binary:logistic')

In [None]:
# Train the classifier
classifier.fit(X_train_tfidf, y_train)
print("Classifier initialized.")

# If you want to use a model ensemble, copy the relevant code from the previous section.

Classifier initialized.


In [28]:
# Make predictions
y_pred = classifier.predict(X_test_tfidf)
print("Predictions have been made.")
print(y_pred[0])

Predictions have been made.
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [None]:
# Process the predictions to get hierarchical labels
# This code runs for 4-5 minutes.
labels = vector_to_label_list(y_pred)
hierarchical_labels = sort_predictions_in_hierarchy(labels)

In [None]:
# Create a new file with the predictions
data_test._write_predictions_to_new_file(hierarchical_labels, "preds.csv")

In [None]:
# Convert to the formatting required for the FoRC CodaBench competition
df = pd.read_csv("preds.csv")

new_df = df[['data_index', 'Level1', 'Level2', 'Level3']]
new_df = new_df.rename(columns={'Level1': 'Level1_pred', 'Level2': 'Level2_pred', 'Level3': 'Level3_pred'})

new_df.to_csv("predictions.csv", index=False)

## Analyse the weakly supervised dataset

In [None]:
# Load the file with predictions
finished_data = FoRC4CLData("acl_predictions/acl_logisticregression_preds_with_fulltext.csv")

Got FoRC4CL data


In [None]:
finished_data._count_missingclasses()

Number of classes missing: 40
['Acronyms and Abbreviations Detection and Expansion', 'Annotation Processes', 'Aspect-Based SA (ABSA)', 'Backdoor Attacks', 'Biomedical NLP', 'Causality Relations Extraction', 'Citation Analysis', 'Constituency Parsing', 'Dependency Parsing', 'Disfluency Detection', 'Explainability and Interpretability', 'Fake News Detection', 'Fake Review Detection', 'Human-machine Interaction', 'Infrastructure or Platform Development', 'Irony Detection', 'Long Short-Term Memory (LSTM) Models', 'Multi-document Summarization', 'NER for Nested Entities', 'NLP for Arts', 'NLP for Climate', 'NLP for Education', 'NLP for Literature', 'NLP for Mental Health', 'NLP for Music', 'NLP for Politics', 'NLP for Social Media', 'Narrative Plot in Storytelling', 'Natural Language Inference (NLI)', 'Ontology Construction', 'Ontology Extension', 'Ontology Matching', 'Recommender Systems', 'Robotics', 'Scientific Document Summarization', 'Semantic Role Labeling', 'Sentence Segmentation', '

In [None]:
finished_data._count_instances_per_class()

Number of rows: 41107
Author Detection: 2
Speech Synthesis: 3
Idiomatic Expressions: 4
Scientific Document Summarization: 4
Ontology Construction: 9
Email Spam and Phishing Detection: 14
Long Form QA: 21
Lyrics Generation: 23
NER for Nested Entities: 30
Narrative Plot in Storytelling: 31
Mathematical QA: 32
Causality Relations Extraction: 33
Hope Speech Detection: 35
Fake Review Detection: 39
Authorship Verification: 50
Automated Essay Scoring: 51
Hypernymy Extraction: 52
Latent Dirichlet Allocation (LDA): 55
Poetry Generation: 57
Rumor Detection: 61
Optical Character Recognition (OCR): 62
Story Generation: 74
Table-to-Text Generation: 82
Citation Analysis: 84
Anaphora Resolution: 86
Plagiarism Detection: 97
Sign Language and Fingerspelling Recognition: 99
Text-to-SQL: 105
Recommender Systems: 110
Multi-agent Communication Systems: 116
Intelligent Agents: 116
Multihop Reasoning: 116
Personality Trait Prediction: 118
Multiple Choice QA (MCQA): 133
Video Captioning: 136
Sentence Segmenta