In [None]:
!pip install transformers pandas scikit-learn torch datasets iterative-stratification


In [None]:
import inspect
import json
import re

import pandas as pd
import torch
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import DataLoader, Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainerCallback,
    TrainingArguments,
)

In [None]:
# Function to read a JSON Lines file and convert it to a DataFrame
def json_to_dataframe(json_file_path):
    """Read a JSON Lines file and flatten its records into a DataFrame.

    Every non-blank line is parsed with ``json.loads``. Blank lines (for
    example a trailing newline at the end of the file) are skipped silently.
    Lines that fail to parse are reported on stdout together with their
    1-based line number and then skipped, so one bad record does not abort
    the whole load.

    Args:
        json_file_path: Path to a UTF-8 encoded file holding one JSON
            document per line.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.json_normalize`` from the
        successfully parsed records.
    """
    data_list = []
    with open(json_file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            # An empty line is not a malformed record; do not report it.
            if not line.strip():
                continue
            try:
                data_list.append(json.loads(line))
            except json.JSONDecodeError as e:
                # The decoder only knows positions inside this one line, so
                # add the file line number to make the record findable.
                print(f"Error decoding JSON on line {line_number}: {e}")
    return pd.json_normalize(data_list)

# Function to strip Arabic characters and hyphens from a label and normalise it
def remove_arabic_and_hyphen(text):
    """Return a normalised, lowercase version of ``text``.

    The steps run in this order:
      1. delete every character in the range U+0600..U+06FF,
      2. delete every '-' character,
      3. rewrite the fragment 'Offensive  /' to 'Offensive',
      4. collapse runs of whitespace to single spaces and trim the ends,
      5. lowercase the result.
    """
    without_arabic = re.sub(r'[\u0600-\u06FF]', '', text)
    # Splitting on '-' and re-joining drops every hyphen.
    without_hyphens = ''.join(without_arabic.split('-'))
    merged = without_hyphens.replace('Offensive  /','Offensive')
    return ' '.join(merged.split()).lower()

# Function to normalise every string in a list
def process_list(lst):
    """Apply ``remove_arabic_and_hyphen`` to each item and return the results as a new list."""
    cleaned = []
    for entry in lst:
        cleaned.append(remove_arabic_and_hyphen(entry))
    return cleaned

# Load the annotated records and normalise both label columns
json_file_path = '/content/sample_data/output_1287.json'  # Replace with your JSON file path
df = json_to_dataframe(json_file_path)

# 'offensiveness' is cleaned in place; the cleaned 'emotions' labels are
# stored in a new 'Emotion_Label' column.
for source_column, target_column in (('offensiveness', 'offensiveness'), ('emotions', 'Emotion_Label')):
    df[target_column] = df[source_column].apply(process_list)


In [None]:
# Function to collapse a label list to a binary clean/offensive label
def simplify_labels(labels):
    """Return ``['clean']`` when 'clean' is among ``labels``, otherwise ``['offensive']``."""
    return ['clean'] if 'clean' in labels else ['offensive']

# Derive a binary label column: every row's cleaned 'offensiveness' list is
# reduced by simplify_labels to either ['clean'] or ['offensive'].
df['offensiveness_label'] = df['offensiveness'].apply(simplify_labels)


In [None]:
# Function to keep only rows that carry a hate-speech label
def filter_and_clean(labels):
    """Return ``labels`` without 'offensive' entries, or ``None``.

    ``None`` is returned when no label starts with 'hate speech:'; otherwise
    a new list is returned containing every label except those equal to
    'offensive'.
    """
    has_hate_speech = False
    for label in labels:
        if label.startswith('hate speech:'):
            has_hate_speech = True
            break
    if not has_hate_speech:
        return None
    # The generic 'offensive' tag is dropped; the specific labels are kept.
    return list(filter(lambda label: label != 'offensive', labels))

# Apply the function; rows without any 'hate speech:' label get None
df['offensiveness_cleaned'] = df['offensiveness'].apply(filter_and_clean)

# Drop rows where the result is None (no 'hate speech:' label).
# An explicit copy is taken because later cells add and overwrite columns on
# df_filtered; writing into the un-copied result of dropna() is what emits
# pandas' SettingWithCopyWarning.
df_filtered = df.dropna(subset=['offensiveness_cleaned']).copy()


In [None]:
# Function to filter and clean labels
def filter_hate_speech(labels):
    """Keep the hate-speech labels and normalise them to underscore form.

    Only labels starting with 'hate speech:' are kept. In each kept label
    every comma, space and dot is replaced by an underscore, so
    'hate speech: origin, ethnicity, or country' becomes
    'hate_speech:_origin__ethnicity__or_country'.

    Args:
        labels: Iterable of label strings.

    Returns:
        A new list with the normalised hate-speech labels (possibly empty).
    """
    # Keep only the labels that start with 'hate speech:'
    filtered_labels = [label for label in labels if label.startswith('hate speech:')]
    # Replace commas, spaces and dots with underscores. The previous
    # implementation searched for the literal 'hate_speech:' (underscore),
    # which never occurs in labels that passed the filter above, so it
    # changed nothing.
    underscore_table = str.maketrans(', .', '___')
    return [label.translate(underscore_table) for label in filtered_labels]

# Apply the function to each row.
# assign() returns a new DataFrame instead of writing into df_filtered, which
# may be flagged by pandas as a copy of a slice; this avoids the
# SettingWithCopyWarning that a plain column assignment triggers here.
df_filtered = df_filtered.assign(
    offensiveness_cleaned2=df_filtered['offensiveness_cleaned'].apply(filter_hate_speech)
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['offensiveness_cleaned2'] = df_filtered['offensiveness_cleaned'].apply(filter_hate_speech)


In [None]:
# Function to remove the hate-speech prefix from each label
def remove_hate_speech_prefix(labels):
    """Strip the hate-speech prefix from every label.

    Both spellings of the prefix are removed: the raw form 'hate speech: '
    and the underscore form 'hate_speech:_'. The separator character after
    the colon is optional. Labels without the prefix are returned unchanged,
    so calling the function twice is harmless.

    Args:
        labels: Iterable of label strings.

    Returns:
        A new list of labels with the prefix removed.
    """
    # The earlier implementation only matched 'hate_speech:_', which never
    # occurs in the space-separated labels built by the preceding cells.
    prefix_pattern = r'hate[ _]speech:[ _]?'
    return [re.sub(prefix_pattern, '', label) for label in labels]

# Apply the function to each row.
# assign() builds a new DataFrame rather than writing into df_filtered, which
# avoids the SettingWithCopyWarning a plain column assignment raises here.
df_filtered = df_filtered.assign(
    offensiveness_cleaned=df_filtered['offensiveness_cleaned'].apply(remove_hate_speech_prefix)
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['offensiveness_cleaned'] = df_filtered['offensiveness_cleaned'].apply(remove_hate_speech_prefix)


In [None]:
# Show how often each distinct label combination occurs in the filtered rows.
df_filtered['offensiveness_cleaned'].value_counts()

Unnamed: 0_level_0,count
offensiveness_cleaned,Unnamed: 1_level_1
[other],1632
[political_ideology__sports__etc],1570
[origin__ethnicity__or_country],1349
[religion_or_sect],632
"[origin__ethnicity__or_country, political_ideology__sports__etc]",143
"[origin__ethnicity__or_country, religion_or_sect]",139
"[political_ideology__sports__etc, other]",137
"[origin__ethnicity__or_country, other]",107
[social_class__profession__etc],92
"[religion_or_sect, political_ideology__sports__etc]",66


In [None]:
# Preview the first two filtered rows to check the derived label columns.
df_filtered.head(2)

Unnamed: 0,id,text,emotions,offensiveness,Emotion_Label,offensiveness_cleaned,offensiveness_cleaned2
1,374173,قلنا ميت مره قضاء فاسد وقذر قطيع ضال https://t...,[Disgust - قرف],"[offensive, hate speech: other]",[disgust],[other],[]
5,374169,@AydaNews الله يلع؛ ـْكم يا كذبكم.😁😂,"[Anger - غضب, Disgust - قرف]","[offensive, hate speech: other]","[anger, disgust]",[other],[]


In [None]:
# Binarize the labels (multi-hot encoding for multi-label classification).
# The binarizer is fitted once on all filtered rows so that the class set and
# column order are identical for the train, dev and test matrices.
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(df_filtered['offensiveness_cleaned'])

# First split: 80% train+dev, 20% test, stratified on the multi-hot labels.
# n_splits=1, so the single split is taken with next() instead of a loop.
msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.2, train_size=0.8, random_state=42)
train_dev_idx, test_idx = next(msss.split(df_filtered, labels))
train_dev_df = df_filtered.iloc[train_dev_idx]
test_df = df_filtered.iloc[test_idx]
train_dev_labels = labels[train_dev_idx]
test_labels = labels[test_idx]

# Second split: 12.5% of train+dev becomes dev (10% of the full data).
msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.125, train_size=0.875, random_state=42)
train_idx, dev_idx = next(msss.split(train_dev_df, train_dev_labels))
train_df = train_dev_df.iloc[train_idx]
dev_df = train_dev_df.iloc[dev_idx]

# Slice the already-binarized matrix instead of refitting the binarizer on
# the train split: refitting could drop a class that is absent from train
# and would then disagree with the labels used for stratification.
train_labels = train_dev_labels[train_idx]
dev_labels = train_dev_labels[dev_idx]

# Every filtered row must land in exactly one split.
assert len(train_df) + len(dev_df) + len(test_df) == len(df_filtered)


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn as nn

class CustomCamelBERTForSequenceClassification(BertForSequenceClassification):
    """BERT sequence classifier with explicit dropout and a multi-label loss.

    The class derives from the concrete ``BertForSequenceClassification``
    rather than ``AutoModelForSequenceClassification``. Auto classes are
    factories: their ``from_pretrained`` returns an instance of the resolved
    architecture class, so ``__init__``/``forward`` overrides placed on a
    subclass of the Auto class are never executed.

    The loss is ``BCEWithLogitsLoss`` because each example carries a
    multi-hot float label vector (one independent yes/no per class).
    """

    def __init__(self, config):
        super().__init__(config)
        # Dropout applied to the pooled output before the classifier head
        # (probability 0.1; change this value to tune regularisation).
        self.dropout = nn.Dropout(p=0.1)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None):
        """Run the encoder and classifier head.

        Args:
            input_ids, attention_mask, token_type_ids: Forwarded to the
                underlying ``self.bert`` encoder.
            labels: Optional multi-hot targets with the same shape as the
                logits; when given, the loss is computed and returned first.

        Returns:
            A tuple ``(loss, logits, *extra)`` when ``labels`` is given,
            otherwise ``(logits, *extra)``, where ``extra`` is whatever the
            encoder returned after its first two outputs.
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = self.dropout(outputs[1])  # outputs[1] is the pooled output
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            # Multi-label targets: one sigmoid/BCE term per class.
            # CrossEntropyLoss would require a single class index per example.
            loss_fct = nn.BCEWithLogitsLoss()
            loss = loss_fct(logits, labels.float())

        output = (logits,) + outputs[2:]
        return ((loss,) + output) if loss is not None else output

# Load the tokenizer and the custom model.
# problem_type is set explicitly so that the multi-label (BCE-with-logits)
# objective does not depend on the library inferring it from the label dtype.
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = CustomCamelBERTForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(mlb.classes_),
    problem_type="multi_label_classification",
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenize the three splits with identical settings: every text is padded or
# truncated to exactly 128 tokens.
tokenizer_options = {"padding": "max_length", "truncation": True, "max_length": 128}
train_encodings = tokenizer(list(train_df['text']), **tokenizer_options)
dev_encodings = tokenizer(list(dev_df['text']), **tokenizer_options)
test_encodings = tokenizer(list(test_df['text']), **tokenizer_options)


In [None]:
# PyTorch dataset pairing tokenizer encodings with label vectors
class EmotionDataset(Dataset):
    """Map-style dataset over tokenizer encodings and their label rows.

    ``encodings`` is a mapping from field name to a per-example sequence;
    ``labels`` is an indexable collection with one label row per example.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The number of examples is defined by the number of label rows.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, with float 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx]).float()
        return sample

# Wrap each split's encodings and label matrix in a dataset object.
train_dataset, dev_dataset, test_dataset = (
    EmotionDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (dev_encodings, dev_labels),
        (test_encodings, test_labels),
    )
)

In [None]:
# Metrics reported at every evaluation
def compute_metrics(p):
    """Compute accuracy and micro-averaged precision/recall/F1.

    ``p.predictions`` holds the raw logits and ``p.label_ids`` the reference
    labels. Logits are passed through a sigmoid and thresholded at 0.5 to
    obtain the predicted label matrix.
    """
    probabilities = torch.sigmoid(torch.tensor(p.predictions))
    pred_labels = probabilities > 0.5
    true_labels = p.label_ids

    accuracy = accuracy_score(true_labels, pred_labels)
    scores = precision_recall_fscore_support(true_labels, pred_labels, average='micro')
    precision, recall, f1 = scores[0], scores[1], scores[2]

    return dict(accuracy=accuracy, precision=precision, recall=recall, f1=f1)

In [None]:
# Custom callback that only requests a checkpoint when eval F1 improves
class SaveBestModelCallback(TrainerCallback):
    """Track the best "eval_f1" seen and toggle ``control.should_save``.

    After each evaluation the callback compares the reported "eval_f1"
    (treated as 0 when missing) with the best value so far. On a strict
    improvement it records the new best, sets ``control.should_save`` to
    True and prints a message; otherwise it sets ``control.should_save`` to
    False. Nothing happens when no metrics are passed.
    """

    def __init__(self):
        super().__init__()
        self.best_f1 = 0  # highest "eval_f1" observed so far

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        if metrics is None:
            return
        f1 = metrics.get("eval_f1", 0)
        if not f1 > self.best_f1:
            control.should_save = False
            return
        self.best_f1 = f1
        control.should_save = True  # Save the model
        print(f"Best model saved with F1: {f1:.4f}")

In [None]:
# The evaluation-schedule argument was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases. The notebook installs an
# unpinned version, so use whichever name this installation accepts.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir='./results',
    save_strategy="epoch",        # Save every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,           # Keep only the best model
    load_best_model_at_end=True,  # Load the best model at the end
    metric_for_best_model="f1",   # Use F1 score to determine the best model
    greater_is_better=True,       # Higher F1 score is better
    **{eval_strategy_key: "epoch"},  # Evaluate every epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics,
    callbacks=[SaveBestModelCallback()]  # Add custom callback to save the best model
)



In [None]:
# Train the model; the object returned by trainer.train() is kept in
# train_result for later inspection.
train_result = trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2792,0.255812,0.533011,0.717514,0.548991,0.622041
2,0.2283,0.23613,0.628019,0.744681,0.65562,0.697318
3,0.1855,0.234949,0.639291,0.739269,0.670029,0.702948


Best model saved with F1: 0.6220
Best model saved with F1: 0.6973
Best model saved with F1: 0.7029


In [None]:
# Run the trained model on the held-out test set
predictions = trainer.predict(test_dataset)
# Same decision rule as compute_metrics: sigmoid on the logits, threshold 0.5.
test_probabilities = torch.sigmoid(torch.tensor(predictions.predictions))
pred_labels = test_probabilities > 0.5

# Exact-match accuracy plus micro-averaged precision / recall / F1
accuracy = accuracy_score(test_labels, pred_labels)
test_scores = precision_recall_fscore_support(test_labels, pred_labels, average='micro')
precision, recall, f1, _ = test_scores

# Report the test set metrics
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1 Score: {f1:.4f}")

Test Accuracy: 0.5985
Test Precision: 0.7172
Test Recall: 0.6628
Test F1 Score: 0.6889


In [None]:
# Convert the boolean prediction matrix back to label names using the fitted
# binarizer (one entry of label names per test example).
# NOTE(review): the variable is called predicted_emotions, but it is stored in
# the 'Predicted_hate_speech' column below and mlb was fitted on the
# 'offensiveness_cleaned' labels - consider renaming.
predicted_emotions = mlb.inverse_transform(pred_labels)

In [None]:
# Attach the predicted labels to the test rows. test_df was produced by
# .iloc on df_filtered, so assigning a column in place can raise
# SettingWithCopyWarning; assign() returns a fresh DataFrame instead.
test_df = test_df.assign(Predicted_hate_speech=predicted_emotions)

In [None]:
# Preview the first two test rows together with their predicted labels.
test_df.head(2)

Unnamed: 0,id,text,emotions,offensiveness,Emotion_Label,Predicted_Emotions
4,374170,@latifaalnaff3 لا ، معي ٨ كلاب ماعندي مكان وبم...,[No emotions - لا عواطف محددة],[clean],[no emotions],"(disgust,)"
5,374169,@AydaNews الله يلع؛ ـْكم يا كذبكم.😁😂,"[Anger - غضب, Disgust - قرف]","[offensive, hate speech: other]","[anger, disgust]","(disgust,)"


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# The Trainer keeps every logged record in trainer.state.log_history, a list
# of dicts. Training records carry a 'loss' key and evaluation records an
# 'eval_loss' key; both carry the (fractional) 'epoch' they were logged at.
# No log file is written by this notebook and the object returned by
# trainer.train() has no `train_logs` attribute, so the history is read from
# the trainer directly.
log_history = trainer.state.log_history
train_points = [
    (entry['epoch'], entry['loss'])
    for entry in log_history
    if 'loss' in entry and 'epoch' in entry
]
val_points = [
    (entry['epoch'], entry['eval_loss'])
    for entry in log_history
    if 'eval_loss' in entry and 'epoch' in entry
]

# Plotting the loss curves
fig, ax = plt.subplots(figsize=(10, 6))
if train_points:
    train_epochs, train_loss = zip(*train_points)
    ax.plot(train_epochs, train_loss, label='Training Loss', color='blue', marker='o')
if val_points:
    val_epochs, val_loss = zip(*val_points)
    ax.plot(val_epochs, val_loss, label='Validation Loss', color='red', marker='o')

# Adding titles and labels
ax.set_title('Training and Validation Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
if train_points or val_points:
    # legend() warns when there is nothing to label.
    ax.legend()
ax.grid(True)

# Show the plot
plt.show()


NameError: name 'epochs' is not defined

<Figure size 1000x600 with 0 Axes>