In [78]:
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import re
from transformers import RobertaTokenizer
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score


nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rodolfocacacho/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rodolfocacacho/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
file_train = 'data/subtask1/train.json'
file_validation = 'data/subtask1/validation.json'

# Open the JSON file
with open(file, 'r') as file:
    # Load the JSON data
    data = json.load(file)

# Create a list of dictionaries
entries_list = []

# Iterate through each entry in the JSON data
for entry in data:
    # Extract relevant information from each entry
    entry_dict = {
        "id": entry["id"],
        "text": entry["text"],
        "labels": entry["labels"],
        "link": entry["link"]
    }

    # Append the dictionary to the list
    entries_list.append(entry_dict)

# Now 'entries_list' contains a list of dictionaries, each representing an entry in your JSON file
# You can access the information as needed
for entry_dict in entries_list:
    print(f"ID: {entry_dict['id']}, Text: {entry_dict['text']}, Labels: {entry_dict['labels']}, Link: {entry_dict['link']}")


ID: 65635, Text: THIS IS WHY YOU NEED\n\nA SHARPIE WITH YOU AT ALL TIMES, Labels: ['Black-and-white Fallacy/Dictatorship'], Link: https://www.facebook.com/photo/?fbid=4023552137722493&set=g.633131750534436
ID: 67927, Text: GOOD NEWS!\n\nNAZANIN ZAGHARI-RATCLIFFE AND ANOOSHEH ASHOORI HAVE BEEN RELEASED\n\nAfter years of being unjustly detained in Iran, they are making their way safely back to the UK., Labels: ['Loaded Language', 'Glittering generalities (Virtue)'], Link: https://www.facebook.com/amnesty/photos/5311988665480629/
ID: 68031, Text: PAING PHYO MIN IS FREE!, Labels: [], Link: https://www.facebook.com/amnesty/photos/4274191309260375/
ID: 77490, Text: Move your ships away!\n\noooook\n\nMove your ships away!\n\nNo, and I just added 10 more, Labels: [], Link: https://www.facebook.com/rightpatriots/photos/pb.100064494145299.-2207520000./606109399734747/?type=3
ID: 67641, Text: WHEN YOU'RE THE FBI, THEY LET YOU DO IT., Labels: ['Thought-terminating cliché'], Link: https://www.faceb

In [92]:
def preprocess_text(text):
    # Lowercasing
    text = text.lower()

    # Replace multiple whitespace characters with a single space
    text = re.sub(r'\s+', ' ', text)
    # Replace line breaks with a space
    text = text.replace('\\n', ' ')
    text = text.replace('\n', ' ')

    # Replace dashes with spaces
    text = text.replace('-', ' ')

    # Remove special characters, punctuation (except apostrophes), and symbols
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization
    tokens = word_tokenize(text)
    
    # Remove stop words
    # stop_words = set(stopwords.words('english'))
    # tokens = [token for token in tokens if token not in stop_words]
    
    # Porter stemming (optional)
    # stemmer = PorterStemmer()
    # tokens = [stemmer.stem(token) for token in tokens]
    
    # Join the tokens back into a string
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

# Example usage
original_text = "Your text with some special characters and line breaks."
preprocessed_text = preprocess_text(original_text)
print(preprocessed_text)


your text with some special characters and line breaks


In [93]:
# Create a new list to store the preprocessed dictionaries
preprocessed_list = []

for entry in entries_list:
    preprocessed_entry = entry.copy()
    preprocessed_entry['text'] = preprocess_text(entry['text'])
    if len(preprocessed_entry['labels']) == 0:
        preprocessed_entry['labels'] = ['nocat']
    else:
        t = preprocessed_entry.get('labels',[0])[0]
        # print(f'aos {t}')
        preprocessed_entry['labels'] = [t]
        # preprocessed_entry['labels'] = preprocessed_entry['labels']    
    preprocessed_list.append(preprocessed_entry)


# Display the preprocessed data
for id,entry in enumerate(preprocessed_list):
    print(f"ID: {entry['id']}, Text: {entry['text']} Label: {entry['labels']}")
    # print(f"ID: {entries_list[id]['id']}, Text: {entries_list[id]['text']}")

ID: 65635, Text: this is why you need a sharpie with you at all times Label: ['Black-and-white Fallacy/Dictatorship']
ID: 67927, Text: good news nazanin zaghari ratcliffe and anoosheh ashoori have been released after years of being unjustly detained in iran they are making their way safely back to the uk Label: ['Loaded Language']
ID: 68031, Text: paing phyo min is free Label: ['nocat']
ID: 77490, Text: move your ships away oooook move your ships away no and i just added more Label: ['nocat']
ID: 67641, Text: when youre the fbi they let you do it Label: ['Thought-terminating cliché']
ID: 66402, Text: putins secret camouflage army Label: ['nocat']
ID: 79204, Text: january you dont buy a million dollar waterfront house on marthas vineyard if you really believe the oceans are rising Label: ['Whataboutism']
ID: 79372, Text: term limits are everywhere politicians cant ignore it term limits ahead Label: ['Slogans']
ID: 68254, Text: nicola sturgeon were scottish getusoutofhere will his bushtu

In [72]:

# Initialize a dictionary to store label counts per category
category_counts = {}
ids_ohne_label = []
ids_mit_label = []
max_id = 0

# Loop through the dataset
for entry in preprocessed_list:
    labels = entry.get('labels', [])  # Get the labels for the current entry

    if not labels:
        ids_ohne_label.append(entry['id'])
    else:
        ids_mit_label.append(entry['id'])
        for label in labels:
            if label not in category_counts:
                category_counts[label] = 1
            else:
                category_counts[label] += 1
    if len(labels) > max_id:
        max_id = len(labels)
        idx = entry['id']


# Display the label counts per category
for category, count in category_counts.items():
    print(f"Category: {category}, Count: {count}")

f_list = [d for d in preprocessed_list if d.get('id') == idx]

print(f'ids ohne labels: {len(ids_ohne_label)} ids mit labels: {len(ids_mit_label)} max_labels {max_id}')
print(f'id {idx} text: {f_list}')

Category: Black-and-white Fallacy/Dictatorship, Count: 371
Category: Loaded Language, Count: 1384
Category: nocat, Count: 1264
Category: Thought-terminating cliché, Count: 136
Category: Whataboutism, Count: 81
Category: Slogans, Count: 252
Category: Causal Oversimplification, Count: 110
Category: Name calling/Labeling, Count: 639
Category: Appeal to authority, Count: 816
Category: Repetition, Count: 132
Category: Smears, Count: 714
Category: Exaggeration/Minimisation, Count: 142
Category: Flag-waving, Count: 225
Category: Doubt, Count: 287
Category: Glittering generalities (Virtue), Count: 238
Category: Obfuscation, Intentional vagueness, Confusion, Count: 11
Category: Appeal to fear/prejudice, Count: 86
Category: Bandwagon, Count: 48
Category: Presenting Irrelevant Data (Red Herring), Count: 27
Category: Reductio ad hitlerum, Count: 20
Category: Misrepresentation of Someone's Position (Straw Man), Count: 17
ids ohne labels: 0 ids mit labels: 7000 max_labels 1
id 65635 text: [{'id': '6

In [77]:
# Load the RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenize the text for each entry in the dataset
for entry in preprocessed_list:
    text = entry['text']
    tokens = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

    # Update the entry with tokenized information
    entry['input_ids'] = tokens['input_ids']
    entry['attention_mask'] = tokens['attention_mask']

# Display the tokenized entries
for entry in preprocessed_list:
    print(f"ID: {entry['id']}")
    print(f"Text: {entry['text']}")
    print("Token IDs:", entry['input_ids'])
    print("Attention Mask:", entry['attention_mask'])
    print()

ID: 65635
Text: this is why you need a sharpie with you at all times
Token IDs: tensor([[   0, 9226,   16,  596,   47,  240,   10, 4406,  324,   19,   47,   23,
           70,  498,    2]])
Attention Mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

ID: 67927
Text: good news nazanin zaghari ratcliffe and anoosheh ashoori have been released after years of being unjustly detained in iran they are making their way safely back to the uk
Token IDs: tensor([[    0,  8396,   340,   295,  1222,   260,   179,   992,  7669,  1512,
         12378, 20152,     8,    41, 16154,   700,   298, 14016,  4623,   118,
            33,    57,   703,    71,   107,     9,   145, 20134,   352,  5624,
            11, 10209,   260,    51,    32,   442,    49,   169,  7385,   124,
             7,     5,  1717,   330,     2]])
Attention Mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

ID: 6

In [95]:
def read_json_labels(file):
    # Open the JSON file
    with open(file, 'r') as file:
        # Load the JSON data
        data = json.load(file)

    # Create a list of dictionaries
    entries_list = []
    # Iterate through each entry in the JSON data
    for entry in data:
        # Extract relevant information from each entry
        entry_dict = {
            "id": entry["id"],
            "text": entry["text"],
            "labels": entry["labels"],
            "link": entry["link"]
        }

        # Append the dictionary to the list
        entries_list.append(entry_dict)
    
    return entries_list


def preprocess_text_list(entries_list):
    # Create a new list to store the preprocessed dictionaries
    preprocessed_list = []

    for entry in entries_list:
        preprocessed_entry = entry.copy()
        preprocessed_entry['text'] = preprocess_text(entry['text'])
        if len(preprocessed_entry['labels']) == 0:
            preprocessed_entry['labels'] = ['nocat']
        else:
            t = preprocessed_entry.get('labels',[0])[0]
            # print(f'aos {t}')
            preprocessed_entry['labels'] = [t]
            # preprocessed_entry['labels'] = preprocessed_entry['labels']    
        preprocessed_list.append(preprocessed_entry)
    
    return preprocessed_list

def preprocess_labels(entries_list):

    # Create a new list to store the preprocessed dictionaries
    preprocessed_list = []

    for entry in entries_list:
        preprocessed_entry = entry.copy()
        preprocessed_entry['text'] = preprocess_text(entry['text'])
        if len(preprocessed_entry['labels']) == 0:
            preprocessed_entry['labels'] = ['nocat']
        else:
            t = preprocessed_entry.get('labels',[0])[0]
            # print(f'aos {t}')
            preprocessed_entry['labels'] = [t]
            # preprocessed_entry['labels'] = preprocessed_entry['labels']    
        preprocessed_list.append(preprocessed_entry)

    return preprocessed_list

def tokenize_text(preprocessed_list):
    # Tokenize the text for each entry in the dataset
    for entry in preprocessed_list:
        text = entry['text']
        tokens = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

        # Update the entry with tokenized information
        entry['input_ids'] = tokens['input_ids']
        entry['attention_mask'] = tokens['attention_mask']
    
    return preprocessed_list


In [99]:
file_train = 'data/subtask1/train.json'
file_validation = 'data/subtask1/validation.json'

# Train
train_json = read_json_labels(file_train)
train_list = preprocess_text_list(train_json)
train_list = preprocess_labels(train_list)
train_list = tokenize_text(train_list)

# # Validation
val_json = read_json_labels(file_validation)
val_list = preprocess_text_list(val_json)
val_list = preprocess_labels(val_list)
val_list = tokenize_text(val_list)


# Display the tokenized entries
# for entry in val_list:
#     print(f"ID: {entry['id']}")
#     print(f"Text: {entry['text']}")
#     print("Token IDs:", entry['input_ids'])
#     print("Attention Mask:", entry['attention_mask'])
#     print()



In [100]:
# Example: Replace with your tokenized data and labels
# Assume 'input_ids', 'attention_mask', and 'labels' are keys in your dataset

# Convert data to PyTorch tensors
train_inputs = torch.cat([entry['input_ids'] for entry in train_list])
train_masks = torch.cat([entry['attention_mask'] for entry in train_list])
train_labels = torch.tensor([entry['labels'] for entry in train_list])

val_inputs = torch.cat([entry['input_ids'] for entry in val_list])
val_masks = torch.cat([entry['attention_mask'] for entry in val_list])
val_labels = torch.tensor([entry['labels'] for entry in val_list])

# Create DataLoader
train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
val_dataset = TensorDataset(val_inputs, val_masks, val_labels)

# batch_size = 8
# train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

# # Load pre-trained RoBERTa model and tokenizer
# model_name = 'roberta-base'
# model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=len(train_labels[0]))
# tokenizer = RobertaTokenizer.from_pretrained(model_name)

# # Optimizer and Scheduler
# optimizer = AdamW(model.parameters(), lr=5e-5)
# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader))

# # Loss Function
# criterion = torch.nn.BCEWithLogitsLoss()

# # Training Loop
# epochs = 3
# for epoch in range(epochs):
#     model.train()
#     total_loss = 0

#     for batch in train_dataloader:
#         input_ids, attention_mask, labels = batch

#         optimizer.zero_grad()
#         outputs = model(input_ids, attention_mask=attention_mask)
#         logits = outputs.logits

#         loss = criterion(logits, labels.float())
#         total_loss += loss.item()

#         loss.backward()
#         optimizer.step()
#         scheduler.step()

#     # Print average training loss for the epoch
#     average_loss = total_loss / len(train_dataloader)
#     print(f"Epoch {epoch + 1}/{epochs}, Average Training Loss: {average_loss}")

#     # Validation
#     model.eval()
#     all_val_labels = []
#     all_val_preds = []

#     with torch.no_grad():
#         for batch in val_dataloader:
#             input_ids, attention_mask, labels = batch
#             outputs = model(input_ids, attention_mask=attention_mask)
#             logits = outputs.logits

#             val_preds = (torch.sigmoid(logits) > 0.5).long()
#             all_val_labels.append(labels.numpy())
#             all_val_preds.append(val_preds.numpy())

#     all_val_labels = torch.tensor(all_val_labels).float()
#     all_val_preds = torch.tensor(all_val_preds).float()

#     # Calculate F1-score for validation set
#     f1 = f1_score(all_val_labels, all_val_preds, average='micro')
#     print(f"Epoch {epoch + 1}/{epochs}, Validation F1-Score: {f1}")




RuntimeError: Sizes of tensors must match except in dimension 0. Expected size 15 but got size 45 for tensor number 1 in the list.