<a href="https://colab.research.google.com/github/rachanabn20/MUCS-IberLEF2025/blob/main/Hope_ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Hope Ensemble Models**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install pyspellchecker
!pip install contractions
!pip install num2words
!pip install emoji



 ***Import necessary libraries***

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
import nltk
import contractions
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer, SnowballStemmer
from spellchecker import SpellChecker
from nltk.tokenize import word_tokenize
import string
import emoji
from num2words import num2words
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from itertools import chain
from scipy.sparse import hstack
from sklearn.feature_selection import SelectKBest, chi2

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')



# Stopwords
en_stopwords = set(nltk.corpus.stopwords.words('english'))
es_stopwords = set(nltk.corpus.stopwords.words('spanish'))
all_stopwords = list(chain(en_stopwords, es_stopwords))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


***DATA LOADING AND EXPLORING***

In [None]:
# English train and dev datasets
en_train = pd.read_csv('/content/drive/MyDrive/PolyHopeAtIberLEF_Train_and_Dev_dat/English Train and Dev datasets/en_train.csv')
en_dev = pd.read_csv('/content/drive/MyDrive/PolyHopeAtIberLEF_Train_and_Dev_dat/English Train and Dev datasets/en_dev.csv')

# Spanish train and dev datasets
es_train = pd.read_csv('/content/drive/MyDrive/PolyHopeAtIberLEF_Train_and_Dev_dat/Spanish Train and Dev datasets/es_train.csv')
es_dev= pd.read_csv('/content/drive/MyDrive/PolyHopeAtIberLEF_Train_and_Dev_dat/Spanish Train and Dev datasets/es_dev.csv')

# Test sets
en_test=pd.read_csv('/content/drive/MyDrive/TestSets/TestSets/en_test_without_labels.csv')
es_test=pd.read_csv('/content/drive/MyDrive/TestSets/TestSets/es_test_without_labels.csv')

In [None]:
train = pd.concat([en_train, es_train], ignore_index=True)

Pre-processing

In [None]:
def expand_contractions(text):
    return contractions.fix(text)

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_stopwords(text):
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in all_stopwords]
    return ' '.join(filtered_words)

def lemmatize_text(text, language='english'):
    lemmatizer = WordNetLemmatizer()
    words = word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

def stem_text(text, language='english'):
    stemmer = SnowballStemmer(language)
    words = word_tokenize(text)
    stemmed_words = [stemmer.stem(word) for word in words]
    return ' '.join(stemmed_words)

def convert_emojis(text):
    return emoji.demojize(text)

def convert_numbers_to_words(text, language='en'):
    superscript_map = str.maketrans('⁰¹²³⁴⁵⁶⁷⁸⁹', '0123456789')
    text = text.translate(superscript_map)
    tokens = word_tokenize(text)
    converted_tokens = []
    for token in tokens:
        if token.isdigit():
            try:
                converted_tokens.append(num2words(int(token), lang=language))
            except NotImplementedError:
                converted_tokens.append(token)
        else:
            converted_tokens.append(token)
    return ' '.join(converted_tokens)

def remove_non_ascii(text):
    return ''.join(char for char in text if ord(char) < 128)

def remove_tags(text):
    return re.sub(r'#\w+', '', text)

def preprocess_text(text, language='english'):
    text = text.lower()
    text = remove_non_ascii(text)
    text = expand_contractions(text)
    text = remove_tags(text)
    text = remove_punctuation(text)
    text = convert_emojis(text)
    text = convert_numbers_to_words(text, language='en' if language == 'english' else 'es')
    text = remove_stopwords(text)   #, language)
    #text = lemmatize_text(text)
    #text = stem_text(text, language)
    return text


In [None]:
# Remove rows with missing values
train = train.dropna(subset=['text'])
en_dev = en_dev.dropna(subset=['text'])
es_dev = es_dev.dropna(subset=['text'])
en_test = en_test.dropna(subset=['text'])
es_test = es_test.dropna(subset=['text'])

In [None]:
# Preprocess the text data
train['processed_text'] = train['text'].apply(lambda x: preprocess_text(x))
en_dev['processed_text']=en_dev['text'].apply(lambda x:preprocess_text(x))
es_dev['processed_text']=es_dev['text'].apply(lambda x:preprocess_text(x))
en_test['processed_text'] = en_test['text'].apply(lambda x: preprocess_text(x))
es_test['processed_text'] = es_test['text'].apply(lambda x: preprocess_text(x))

In [None]:
# Train a single BPE tokenizer on the combined training dataset
def train_bpe_tokenizer(texts, vocab_size=30000):
    tokenizer = Tokenizer(models.BPE())
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()
    trainer = trainers.BpeTrainer(vocab_size=vocab_size, show_progress=True)
    tokenizer.train_from_iterator(texts, trainer=trainer)
    return tokenizer

# Train the tokenizer on the combined training data
train_tokenizer = train_bpe_tokenizer(train['processed_text'])

def process(text):
  text=  text.replace('Ġ', '')
  return [word for word in text.split() if len(word)>2]

# Tokenize texts using the trained BPE tokenizer
def tokenize_texts(texts, tokenizer):
  text = texts.apply(lambda x: ' '.join(tokenizer.encode(x).tokens))
  return text.apply(lambda x: ' '.join(process(x)))


In [None]:

# Tokenize the training and test datasets
X_train_bpe = tokenize_texts(train['processed_text'], train_tokenizer)
X_dev_bpe_en = tokenize_texts(en_dev['processed_text'], train_tokenizer)
X_dev_bpe_es = tokenize_texts(es_dev['processed_text'], train_tokenizer)

# TF-IDF for words
tfidf_word = TfidfVectorizer(analyzer='word')
train_tfidf_word = tfidf_word.fit_transform(train['processed_text'])
X_dev_tfidf_word_en = tfidf_word.transform(en_dev['processed_text'])
X_dev_tfidf_word_es = tfidf_word.transform(es_dev['processed_text'])

# TF-IDF for characters
tfidf_char = TfidfVectorizer(analyzer='char', ngram_range=(1, 3))
train_tfidf_char = tfidf_char.fit_transform(train['processed_text'])
X_dev_tfidf_char_en = tfidf_char.transform(en_dev['processed_text'])
X_dev_tfidf_char_es = tfidf_char.transform(es_dev['processed_text'])

# TF-IDF for BPE tokenized texts
tfidf_bpe = TfidfVectorizer()
train_tfidf_bpe = tfidf_bpe.fit_transform(X_train_bpe)
X_dev_tfidf_bpe_en = tfidf_bpe.transform(X_dev_bpe_en)
X_dev_tfidf_bpe_es = tfidf_bpe.transform(X_dev_bpe_es)


In [None]:

# Tokenize the training and test datasets
X_train_bpe = tokenize_texts(train['processed_text'], train_tokenizer)
X_test_bpe_en = tokenize_texts(en_test['processed_text'], train_tokenizer)
X_test_bpe_es = tokenize_texts(es_test['processed_text'], train_tokenizer)

# TF-IDF for words
tfidf_word = TfidfVectorizer(analyzer='word')
train_tfidf_word = tfidf_word.fit_transform(train['processed_text'])
X_test_tfidf_word_en = tfidf_word.transform(en_test['processed_text'])
X_test_tfidf_word_es = tfidf_word.transform(es_test['processed_text'])

# TF-IDF for characters
tfidf_char = TfidfVectorizer(analyzer='char', ngram_range=(1, 3))
train_tfidf_char = tfidf_char.fit_transform(train['processed_text'])
X_test_tfidf_char_en = tfidf_char.transform(en_test['processed_text'])
X_test_tfidf_char_es = tfidf_char.transform(es_test['processed_text'])

# TF-IDF for BPE tokenized texts
tfidf_bpe = TfidfVectorizer()
train_tfidf_bpe = tfidf_bpe.fit_transform(X_train_bpe)
X_test_tfidf_bpe_en = tfidf_bpe.transform(X_test_bpe_en)
X_test_tfidf_bpe_es = tfidf_bpe.transform(X_test_bpe_es)


# **Binary Classification**

**DEV set**

In [None]:
y_train=train['binary']

In [None]:
# Select top k features from each matrix (adjust k as needed)
k = 1000
selector_word = SelectKBest(chi2, k=k)
selector_char = SelectKBest(chi2, k=k)
selector_bpe = SelectKBest(chi2, k=k)

train_tfidf_word_selected = selector_word.fit_transform(train_tfidf_word, y_train)
train_tfidf_char_selected = selector_char.fit_transform(train_tfidf_char, y_train)
train_tfidf_bpe_selected = selector_bpe.fit_transform(train_tfidf_bpe, y_train)

# Combine selected features
train_combined = hstack([train_tfidf_word_selected, train_tfidf_char_selected, train_tfidf_bpe_selected])

In [None]:
# Define classifiers
clf1 = LogisticRegression(max_iter=1000)
clf2 = MultinomialNB()
clf3 = SVC(probability=True)

# Train classifiers on different features
clf1.fit(train_tfidf_word_selected, y_train)
clf2.fit(train_tfidf_char_selected, y_train)
clf3.fit(train_tfidf_bpe_selected, y_train)

# Voting classifier with soft voting
voting_clf_soft = VotingClassifier(estimators=[
    ('word_tfidf', clf1),
    ('char_tfidf', clf2),
    ('bpe_tfidf', clf3)
], voting='soft')

# Voting classifier with hard voting
voting_clf_hard = VotingClassifier(estimators=[
    ('word_tfidf', clf1),
    ('char_tfidf', clf2),
    ('bpe_tfidf', clf3)
], voting='hard')


In [None]:
# Fit voting classifiers
voting_clf_soft.fit(train_combined, y_train)
voting_clf_hard.fit(train_combined, y_train)


In [None]:
#feature selection applied for test sets

X_dev_tfidf_word_sel_en = selector_word.transform(X_dev_tfidf_word_en)
X_dev_tfidf_char_sel_en = selector_char.transform(X_dev_tfidf_char_en)
X_dev_tfidf_bpe_sel_en = selector_bpe.transform(X_dev_tfidf_bpe_en)

X_dev_tfidf_word_sel_es = selector_word.transform(X_dev_tfidf_word_es)
X_dev_tfidf_char_sel_es = selector_char.transform(X_dev_tfidf_char_es)
X_dev_tfidf_bpe_sel_es = selector_bpe.transform(X_dev_tfidf_bpe_es)

In [None]:
# Combine the features for the voting classifiers
X_dev_combined_en = hstack([X_dev_tfidf_word_sel_en, X_dev_tfidf_char_sel_en, X_dev_tfidf_bpe_sel_en])
X_dev_combined_es = hstack([X_dev_tfidf_word_sel_es, X_dev_tfidf_char_sel_es, X_dev_tfidf_bpe_sel_es])

# Make predictions using voting classifiers for English test data
y_pred_soft_en = voting_clf_soft.predict(X_dev_combined_en)
y_pred_hard_en = voting_clf_hard.predict(X_dev_combined_en)

# Make predictions using voting classifiers for Spanish test data
y_pred_soft_es = voting_clf_soft.predict(X_dev_combined_es)
y_pred_hard_es = voting_clf_hard.predict(X_dev_combined_es)

In [None]:
from sklearn.metrics import classification_report

# Generate classification reports for English dev dataset
print("Classification Report for English Dev Dataset (Soft Voting):")
print(classification_report(en_dev['binary'], y_pred_soft_en))

print("Classification Report for English Dev Dataset (Hard Voting):")
print(classification_report(en_dev['binary'], y_pred_hard_en))

# Generate classification reports for Spanish dev dataset
print("Classification Report for Spanish Dev Dataset (Soft Voting):")
print(classification_report(es_dev['binary'], y_pred_soft_es))

print("Classification Report for Spanish Dev Dataset (Hard Voting):")
print(classification_report(es_dev['binary'], y_pred_hard_es))




Classification Report for English Dev Dataset (Soft Voting):
              precision    recall  f1-score   support

        Hope       0.79      0.73      0.76       899
    Not Hope       0.77      0.83      0.80      1003

    accuracy                           0.78      1902
   macro avg       0.78      0.78      0.78      1902
weighted avg       0.78      0.78      0.78      1902

Classification Report for English Dev Dataset (Hard Voting):
              precision    recall  f1-score   support

        Hope       0.80      0.74      0.76       899
    Not Hope       0.78      0.83      0.80      1003

    accuracy                           0.79      1902
   macro avg       0.79      0.78      0.78      1902
weighted avg       0.79      0.79      0.78      1902

Classification Report for Spanish Dev Dataset (Soft Voting):
              precision    recall  f1-score   support

        Hope       0.77      0.73      0.75      1926
    Not Hope       0.77      0.81      0.79      2162


**Test Set**

In [None]:
#feature selection applied for test sets

X_test_tfidf_word_sel_en = selector_word.transform(X_test_tfidf_word_en)
X_test_tfidf_char_sel_en = selector_char.transform(X_test_tfidf_char_en)
X_test_tfidf_bpe_sel_en = selector_bpe.transform(X_test_tfidf_bpe_en)

X_test_tfidf_word_sel_es = selector_word.transform(X_test_tfidf_word_es)
X_test_tfidf_char_sel_es = selector_char.transform(X_test_tfidf_char_es)
X_test_tfidf_bpe_sel_es = selector_bpe.transform(X_test_tfidf_bpe_es)

In [None]:
# Combine the features for the voting classifiers
X_test_combined_en = hstack([X_test_tfidf_word_sel_en, X_test_tfidf_char_sel_en, X_test_tfidf_bpe_sel_en])
X_test_combined_es = hstack([X_test_tfidf_word_sel_es, X_test_tfidf_char_sel_es, X_test_tfidf_bpe_sel_es])

# Make predictions using voting classifiers for English test data
y_pred_soft_en = voting_clf_soft.predict(X_test_combined_en)
y_pred_hard_en = voting_clf_hard.predict(X_test_combined_en)

# Make predictions using voting classifiers for Spanish test data
y_pred_soft_es = voting_clf_soft.predict(X_test_combined_es)
y_pred_hard_es = voting_clf_hard.predict(X_test_combined_es)

In [None]:
#submission file
# to create .csv file consisting of tweet ids and predicted labels
y_pred_en = pd.DataFrame(data=y_pred_soft_en, columns=['final_prediction'])
Submisssion_en = pd.DataFrame()
Submisssion_en['Text'] = en_test['text']
Submisssion_en['Tag'] =y_pred_soft_en
Submisssion_en.to_csv('predictions.csv', index = None)

In [None]:
#submission file
# to create .csv file consisting of tweet ids and predicted labels
y_pred_en = pd.DataFrame(data=y_pred_soft_en, columns=['final_prediction'])
Submisssion_en = pd.DataFrame()
Submisssion_en['Text'] = en_test['text']
Submisssion_en['Tag'] =y_pred_hard_en
Submisssion_en.to_csv('predictions.csv', index = None)

In [None]:
#submission file
# to create .csv file consisting of tweet ids and predicted labels
y_pred_es = pd.DataFrame(data=y_pred_hard_es, columns=['final_prediction'])
Submisssion_es = pd.DataFrame()
Submisssion_es['Text'] = es_test['text']
Submisssion_es['Tag'] = y_pred_hard_es
Submisssion_es.to_csv('predictions.csv', index = None)

In [None]:
#submission file
# to create .csv file consisting of tweet ids and predicted labels
y_pred_es = pd.DataFrame(data=y_pred_hard_es, columns=['final_prediction'])
Submisssion_es = pd.DataFrame()
Submisssion_es['Text'] = es_test['text']
Submisssion_es['Tag'] = y_pred_soft_es
Submisssion_es.to_csv('predictions.csv', index = None)

# Multi-class Classifier

**DEV set**

In [None]:
y_train_m = train['multiclass']

# Define classifiers
m_clf1 = LogisticRegression(max_iter=1000)
m_clf2 = MultinomialNB()
m_clf3 = SVC(probability=True)

# Train classifiers on different features
m_clf1.fit(train_tfidf_word_selected, y_train)
m_clf2.fit(train_tfidf_char_selected, y_train)
m_clf3.fit(train_tfidf_bpe_selected, y_train)

# Voting classifier with soft voting
m_voting_clf_soft = VotingClassifier(estimators=[
    ('word_tfidf', m_clf1),
    ('char_tfidf', m_clf2),
    ('bpe_tfidf', m_clf3)
], voting='soft')

# Voting classifier with hard voting
m_voting_clf_hard = VotingClassifier(estimators=[
    ('word_tfidf', m_clf1),
    ('char_tfidf', m_clf2),
    ('bpe_tfidf', m_clf3)
], voting='hard')

In [60]:
m_voting_clf_soft.fit(train_combined, y_train_m)
m_voting_clf_hard.fit(train_combined, y_train_m)

In [61]:
#feature selection applied for test sets

X_dev_tfidf_word_sel_en = selector_word.transform(X_dev_tfidf_word_en)
X_dev_tfidf_char_sel_en = selector_char.transform(X_dev_tfidf_char_en)
X_dev_tfidf_bpe_sel_en = selector_bpe.transform(X_dev_tfidf_bpe_en)

X_dev_tfidf_word_sel_es = selector_word.transform(X_dev_tfidf_word_es)
X_dev_tfidf_char_sel_es = selector_char.transform(X_dev_tfidf_char_es)
X_dev_tfidf_bpe_sel_es = selector_bpe.transform(X_dev_tfidf_bpe_es)

In [62]:
# Combine the features for the voting classifiers
X_dev_combined_en = hstack([X_dev_tfidf_word_sel_en, X_dev_tfidf_char_sel_en, X_dev_tfidf_bpe_sel_en])
X_dev_combined_es = hstack([X_dev_tfidf_word_sel_es, X_dev_tfidf_char_sel_es, X_dev_tfidf_bpe_sel_es])

# Make predictions using voting classifiers for English test data
y_pred_soft_en = m_voting_clf_soft.predict(X_dev_combined_en)
y_pred_hard_en = m_voting_clf_hard.predict(X_dev_combined_en)

# Make predictions using voting classifiers for Spanish test data
y_pred_soft_es = m_voting_clf_soft.predict(X_dev_combined_es)
y_pred_hard_es = m_voting_clf_hard.predict(X_dev_combined_es)

In [63]:
from sklearn.metrics import classification_report

# Generate classification reports for English dev dataset
print("Classification Report for English Dev Dataset (Soft Voting):")
print(classification_report(en_dev['multiclass'], y_pred_soft_en))

print("Classification Report for English Dev Dataset (Hard Voting):")
print(classification_report(en_dev['multiclass'], y_pred_hard_en))

# Generate classification reports for Spanish dev dataset
print("Classification Report for Spanish Dev Dataset (Soft Voting):")
print(classification_report(es_dev['multiclass'], y_pred_soft_es))

print("Classification Report for Spanish Dev Dataset (Hard Voting):")
print(classification_report(es_dev['multiclass'], y_pred_hard_es))




Classification Report for English Dev Dataset (Soft Voting):
                  precision    recall  f1-score   support

Generalized Hope       0.56      0.60      0.58       467
        Not Hope       0.64      0.85      0.73       816
  Realistic Hope       0.54      0.18      0.27       196
         Sarcasm       0.85      0.54      0.66       252
Unrealistic Hope       0.54      0.30      0.39       171

        accuracy                           0.63      1902
       macro avg       0.63      0.50      0.53      1902
    weighted avg       0.63      0.63      0.61      1902

Classification Report for English Dev Dataset (Hard Voting):
                  precision    recall  f1-score   support

Generalized Hope       0.55      0.62      0.58       467
        Not Hope       0.61      0.86      0.71       816
  Realistic Hope       0.57      0.16      0.25       196
         Sarcasm       0.88      0.37      0.52       252
Unrealistic Hope       0.55      0.27      0.36       171

   

In [64]:
#feature selection applied for test sets

X_test_tfidf_word_sel_en = selector_word.transform(X_test_tfidf_word_en)
X_test_tfidf_char_sel_en = selector_char.transform(X_test_tfidf_char_en)
X_test_tfidf_bpe_sel_en = selector_bpe.transform(X_test_tfidf_bpe_en)

X_test_tfidf_word_sel_es = selector_word.transform(X_test_tfidf_word_es)
X_test_tfidf_char_sel_es = selector_char.transform(X_test_tfidf_char_es)
X_test_tfidf_bpe_sel_es = selector_bpe.transform(X_test_tfidf_bpe_es)

In [65]:
# Combine the features for the voting classifiers
X_test_combined_en = hstack([X_test_tfidf_word_sel_en, X_test_tfidf_char_sel_en, X_test_tfidf_bpe_sel_en])
X_test_combined_es = hstack([X_test_tfidf_word_sel_es, X_test_tfidf_char_sel_es, X_test_tfidf_bpe_sel_es])

# Make predictions using voting classifiers for English test data
y_pred_soft_en = voting_clf_soft.predict(X_test_combined_en)
y_pred_hard_en = voting_clf_hard.predict(X_test_combined_en)

# Make predictions using voting classifiers for Spanish test data
y_pred_soft_es = voting_clf_soft.predict(X_test_combined_es)
y_pred_hard_es = voting_clf_hard.predict(X_test_combined_es)

In [66]:
#submission file

y_pred_en = pd.DataFrame(data=y_pred_soft_en, columns=['final_prediction'])
Submisssion_en = pd.DataFrame()
Submisssion_en['Text'] = en_test['text']
Submisssion_en['Tag'] =y_pred_soft_en
Submisssion_en.to_csv('predictions.csv', index = None)

In [67]:
#submission file

y_pred_en = pd.DataFrame(data=y_pred_soft_en, columns=['final_prediction'])
Submisssion_en = pd.DataFrame()
Submisssion_en['Text'] = en_test['text']
Submisssion_en['Tag'] =y_pred_hard_en
Submisssion_en.to_csv('predictions.csv', index = None)

In [68]:
#submission file

y_pred_es = pd.DataFrame(data=y_pred_hard_es, columns=['final_prediction'])
Submisssion_es = pd.DataFrame()
Submisssion_es['Text'] = es_test['text']
Submisssion_es['Tag'] = y_pred_hard_es
Submisssion_es.to_csv('predictions.csv', index = None)

In [69]:
#submission file

y_pred_es = pd.DataFrame(data=y_pred_hard_es, columns=['final_prediction'])
Submisssion_es = pd.DataFrame()
Submisssion_es['Text'] = es_test['text']
Submisssion_es['Tag'] = y_pred_soft_es
Submisssion_es.to_csv('predictions.csv', index = None)