## Import libraries

In [4]:
# built-in
import re, codecs
import time
import os

# pandas
import pandas as pd
import numpy as np

# nlp
import nltk 

# sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# plotting
import matplotlib.pyplot as plt


In [5]:
import sklearn
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold,train_test_split,cross_val_score,cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report,accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, roc_curve

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import seaborn as sns
import matplotlib.pyplot as plt

import pickle

In [6]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

from nltk.corpus import stopwords

In [7]:
import random

def set_seeds(seed=42):
    """Seed the random number generators used in this notebook.

    Writes ``PYTHONHASHSEED`` into the process environment and seeds, in
    order, Python's ``random`` module, PyTorch and NumPy's global generator.

    Args:
        seed: Value handed to every seeding call (default 42).
    """
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up,
    # so setting it here presumably only affects child processes - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)

    for seed_fn in (random.seed, torch.manual_seed, np.random.seed):
        seed_fn(seed)


set_seeds(seed=42)

## read data

In [8]:
import pandas as pd

def read_semeval_data(data_type='train', language='EN'):
    """Load the subtask-2 annotations of one language joined with the document texts.

    All paths are relative to the current working directory.

    Args:
        data_type: ``'train'`` reads from ``training_data/<language>/``,
            ``'test'`` reads from ``dev_data/<language>/``.
        language: Name of the language sub-directory (the notebook uses
            ``'EN'`` and ``'PT'``); also stored in the ``language`` column.

    Returns:
        pandas.DataFrame with columns ``filename``, ``narrative``,
        ``sub_narrative``, ``text`` and ``language``. Only files that appear
        both in the annotation file and in the documents directory are kept
        (inner join on ``filename``). The label columns hold the raw
        annotation strings.

    Raises:
        ValueError: If ``data_type`` is neither ``'train'`` nor ``'test'``.
    """
    # (annotation file, documents directory) for each supported split.
    split_paths = {
        'train': (
            f'training_data/{language}/subtask-2-annotations.txt',
            f'training_data/{language}/raw-documents',
        ),
        'test': (
            f'dev_data/{language}/subtask-2-annotations.txt',
            f'dev_data/{language}/subtask-2-documents',
        ),
    }
    # Fail early with a clear message; previously an unknown split surfaced
    # as an UnboundLocalError on `annotation` further down.
    if data_type not in split_paths:
        raise ValueError(
            f"Unknown data_type {data_type!r}; expected one of {sorted(split_paths)}"
        )
    annotation, directory_path = split_paths[data_type]

    # The annotation file is tab-separated and has no header row.
    label_info = pd.read_csv(annotation, sep='\t', header=None)
    label_info.columns = ['filename', 'narrative', 'sub_narrative']

    # Read every regular file in the documents directory as UTF-8 text.
    file_info = []
    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        if os.path.isfile(file_path):
            with open(file_path, "r", encoding="utf-8") as file:
                file_info.append((filename, file.read()))

    text_df = pd.DataFrame(file_info, columns=['filename', 'text'])
    df = label_info.merge(text_df, on='filename', how='inner')
    df['language'] = language
    return df

In [9]:
# Stack the English and Portuguese training sets into one frame with a fresh index.
train_df = pd.concat(
    (read_semeval_data(data_type='train', language=lang) for lang in ('EN', 'PT')),
    ignore_index=True,
)
train_df

Unnamed: 0,filename,narrative,sub_narrative,text,language
0,EN_CC_100013.txt,CC: Criticism of climate movement,CC: Criticism of climate movement: Ad hominem ...,Bill Gates Says He Is ‘The Solution’ To Climat...,EN
1,EN_UA_300009.txt,Other,Other,Russia: Clashes erupt in Bashkortostan as righ...,EN
2,EN_UA_300017.txt,Other,Other,"McDonald's to exit Russia, sell business in co...",EN
3,EN_CC_100021.txt,Other,Other,"Collaborative plans, innovation keys to circul...",EN
4,EN_UA_300041.txt,Other,Other,Russia intends to supply light ‘Mountain’ tank...,EN
...,...,...,...,...,...
794,PT_272.txt,URW: Praise of Russia;URW: Praise of Russia;UR...,URW: Praise of Russia: Praise of Russian milit...,Exército da Rússia anuncia conquista da cidade...,PT
795,PT_323.txt,CC: Amplifying Climate Fears,CC: Amplifying Climate Fears: Other,Cabo Verde vai ter novo laboratório de Observa...,PT
796,PT_74.txt,URW: Praise of Russia,URW: Praise of Russia: Praise of Russian milit...,Rússia assume controlo de mais uma povoação no...,PT
797,PT_31.txt,URW: Praise of Russia;URW: Discrediting Ukrain...,URW: Praise of Russia: Praise of Russian milit...,Quais foram as consequências do ataque de Iska...,PT


In [10]:
# Stack the English and Portuguese dev sets into one frame with a fresh index.
test_df = pd.concat(
    (read_semeval_data(data_type='test', language=lang) for lang in ('EN', 'PT')),
    ignore_index=True,
)
test_df

Unnamed: 0,filename,narrative,sub_narrative,text,language
0,EN_UA_DEV_100012.txt,"URW: Discrediting the West, Diplomacy;URW: Dis...","URW: Discrediting the West, Diplomacy: The Wes...",Ukraine's Minerals: What the West is Fighting ...,EN
1,EN_CC_200053.txt,Other,Other,UK’s Sunak Reverses Decision to Skip COP27 Cli...,EN
2,EN_CC_200040.txt,CC: Criticism of climate movement;CC: Criticis...,CC: Criticism of climate movement: Other;CC: C...,Climate Protesters Out Of Control As They Atta...,EN
3,EN_CC_200070.txt,CC: Criticism of institutions and authorities;...,CC: Criticism of institutions and authorities:...,Wat? L.A. Mayor Garcetti Flies to Argentina to...,EN
4,EN_UA_DEV_100034.txt,URW: Overpraising the West,URW: Overpraising the West: The West belongs i...,Opinion: Restructuring Ukrainian debt is a ste...,EN
...,...,...,...,...,...
71,PT_207.txt,CC: Criticism of institutions and authorities;...,CC: Criticism of institutions and authorities:...,Zequinha critica UE por adiar obrigatoriedade ...,PT
72,PT_217.txt,CC: Amplifying Climate Fears;CC: Amplifying Cl...,CC: Amplifying Climate Fears: Other;CC: Amplif...,O que é a cúpula de calor? Entenda fenómeno qu...,PT
73,PT_204.txt,CC: Amplifying Climate Fears;CC: Amplifying Cl...,CC: Amplifying Climate Fears: Amplifying exist...,COP28: Papa Francisco planeia participar nas n...,PT
74,PT_229.txt,CC: Amplifying Climate Fears,CC: Amplifying Climate Fears: Amplifying exist...,Queda do tráfego pelo Canal do Panamá pode cus...,PT


## cleansing

In [19]:
import os
import re
import spacy
import spacy.cli
from typing import Dict

# One-off download of the spaCy models; uncomment on a machine where they are
# not installed yet.
# spacy.cli.download("en_core_web_sm")
# spacy.cli.download("pt_core_news_sm")

# Load the English and Portuguese pipelines once at module level; cleanse_text
# picks one of them according to the document language.
nlp_en = spacy.load("en_core_web_sm")
nlp_pt = spacy.load("pt_core_news_sm")

# Contraction maps for English and Portuguese.
# Keys are lowercase and matched case-sensitively, so the text must be
# lowercased before it is passed to expand_contractions.
english_contractions = {
    "it's": "it is",
    "he's": "he is",
    "she's": "she is",
    "that's": "that is",
    "who's": "who is",
    "what's": "what is",
    "you're": "you are",
    "we're": "we are",
    "they're": "they are",
    "i'm": "i am",
    "isn't": "is not",
    "aren't": "are not",
    "wasn't": "was not",
    "weren't": "were not",
    "won't": "will not",
    "can't": "cannot",
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "couldn't": "could not",
    "wouldn't": "would not",
    "shouldn't": "should not",
}

portuguese_contractions = {
    "da": "de a",
    "das": "de as",
    "do": "de o",
    "dos": "de os",
    "dele": "de ele",
    "dela": "de ela",
    "deles": "de eles",
    "delas": "de elas",
    "nesta": "em esta",
    "nestas": "em estas",
    "neste": "em este",
    "nestes": "em estes",
    "nisto": "em isto",
    "pro": "para o",
    "pra": "para a",
    "pros": "para os",
    "pras": "para as",
}

# Expand contractions
def expand_contractions(text: str, language: str) -> str:
    """Replace every whole-word contraction in ``text`` with its expanded form.

    Matching is case-sensitive against the lowercase keys of the language's
    contraction map. An apostrophe in a key matches both the ASCII apostrophe
    (') and the typographic one (U+2019), which is common in news text.

    Args:
        text: Text to process (expected to be lowercased already).
        language: ``"EN"`` or ``"PT"``.

    Returns:
        ``text`` with every matched contraction replaced; all other
        characters are left untouched.

    Raises:
        ValueError: If ``language`` is not ``"EN"`` or ``"PT"``.
    """
    # Looked up at call time so later edits to the maps are picked up.
    maps_by_language = {
        "EN": english_contractions,
        "PT": portuguese_contractions,
    }
    if language not in maps_by_language:
        # Previously this fell through to an UnboundLocalError.
        raise ValueError(
            f"Unsupported language {language!r}; expected one of {sorted(maps_by_language)}"
        )
    contractions_map = maps_by_language[language]

    # Escape each key so it is matched literally, let its apostrophe match
    # either apostrophe character, and try longer keys first.
    alternatives = (
        "['\u2019]".join(re.escape(part) for part in contraction.split("'"))
        for contraction in sorted(contractions_map, key=len, reverse=True)
    )
    pattern = re.compile(r'\b(' + '|'.join(alternatives) + r')\b')

    # Normalise a typographic apostrophe back to ASCII for the dictionary lookup.
    return pattern.sub(
        lambda match: contractions_map[match.group().replace('\u2019', "'")],
        text,
    )

# Cleanse file
def cleanse_text(text, language='EN'):
    """Normalise a raw document into a single string of space-separated lemmas.

    Steps, in order: lowercase, expand contractions, replace characters
    outside printable ASCII / Latin-1 letters with a space, delete digits,
    run the language's spaCy pipeline, keep the lemma of every token that is
    not a stop word, not punctuation and longer than one character, then
    collapse whitespace.

    Args:
        text: Raw document text.
        language: ``"EN"`` or ``"PT"``; selects the contraction map and the
            spaCy pipeline.

    Returns:
        The cleansed text as one stripped string.

    Raises:
        ValueError: If ``language`` is not ``"EN"`` or ``"PT"``.
    """
    # Resolve the pipeline first so an unsupported language fails with a
    # clear error instead of an UnboundLocalError later on.
    pipelines = {"EN": nlp_en, "PT": nlp_pt}
    if language not in pipelines:
        raise ValueError(
            f"Unsupported language {language!r}; expected one of {sorted(pipelines)}"
        )
    nlp = pipelines[language]

    # to lowercase (the contraction maps use lowercase keys)
    content = text.lower()
    # expand contractions
    content = expand_contractions(content, language)
    # replace everything except printable ASCII and the Latin-1 range À-ÿ
    # (accented letters needed for Portuguese) with a single space
    content = re.sub(r'[^\x20-\x7EÀ-ÿ]+', ' ', content)
    # remove numbers
    content = re.sub(r'\d+', '', content)

    doc = nlp(content)

    # lemmatize; drop stop words, punctuation and one-character tokens
    filtered_tokens = [
        token.lemma_
        for token in doc
        if not token.is_stop and not token.is_punct and len(token.text) > 1
    ]

    # rejoin
    cleansed_content = ' '.join(filtered_tokens)

    # Remove extra whitespaces (replace multiple spaces with one)
    cleansed_content = re.sub(r'\s+', ' ', cleansed_content).strip()

    return cleansed_content

In [22]:
%%time

# Cleanse every training document with the pipeline matching its language.
train_df['cleaned'] = train_df.apply(
    lambda row: cleanse_text(row['text'], row['language']),
    axis=1,
)

CPU times: total: 1min 1s
Wall time: 1min 2s


In [21]:
%%time

# Cleanse every dev document with the pipeline matching its language.
test_df['cleaned'] = test_df.apply(
    lambda row: cleanse_text(row['text'], row['language']),
    axis=1,
)

CPU times: total: 7.36 s
Wall time: 7.4 s


In [29]:
# Label frequency in the training set: one count per ';'-separated narrative.
(
    train_df['narrative']
    .str.split(';')
    .explode()
    .value_counts()
)

narrative
URW: Discrediting Ukraine                                 285
URW: Discrediting the West, Diplomacy                     221
CC: Amplifying Climate Fears                              197
Other                                                     196
CC: Criticism of institutions and authorities             175
URW: Praise of Russia                                     162
URW: Amplifying war-related fears                         134
URW: Russia is the Victim                                 134
CC: Criticism of climate policies                          88
URW: Blaming the war on others rather than the invader     83
CC: Criticism of climate movement                          81
URW: Negative Consequences for the West                    53
CC: Hidden plots by secret schemes of powerful groups      51
CC: Downplaying climate change                             50
URW: Speculating war outcomes                              42
CC: Controversy about green technologies                   3

In [30]:
# Label frequency in the dev set: one count per ';'-separated narrative.
(
    test_df['narrative']
    .str.split(';')
    .explode()
    .value_counts()
)

narrative
CC: Amplifying Climate Fears                              34
CC: Criticism of institutions and authorities             23
URW: Discrediting the West, Diplomacy                     23
URW: Discrediting Ukraine                                 18
CC: Criticism of climate movement                         14
Other                                                     12
CC: Criticism of climate policies                         12
URW: Praise of Russia                                      9
URW: Blaming the war on others rather than the invader     7
URW: Russia is the Victim                                  6
CC: Questioning the measurements and science               5
CC: Hidden plots by secret schemes of powerful groups      5
URW: Speculating war outcomes                              4
URW: Amplifying war-related fears                          4
URW: Distrust towards Media                                4
CC: Controversy about green technologies                   3
CC: Downplayin

In [None]:
# Persist both frames (including the 'cleaned' column) without the row index.
train_df.to_csv(path_or_buf='training_data.csv', index=False)
test_df.to_csv(path_or_buf='target.csv', index=False)