In [1]:
# Attach Google Drive to the Colab runtime; the CSV files read later in this
# notebook are addressed under /content/drive/MyDrive.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [6]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [9]:
import os

# Set the WANDB_DISABLED flag up front so the Weights & Biases integration
# stays off for everything that runs afterwards in this kernel.
os.environ.update({"WANDB_DISABLED": "true"})

In [15]:
import pandas as pd
import re
import torch
from sklearn.metrics import roc_auc_score, accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Read the three competition splits (train / dev / test) from Google Drive
DATA_DIR = "/content/drive/MyDrive/DataSets_competition"
train_data, dev_data, test_data = (
    pd.read_csv(f"{DATA_DIR}/{split}_set.csv")
    for split in ("train", "dev", "test")
)

#Cleaning and normalization
def clean_text(text):
    """Normalize one Arabic text sample before tokenization.

    Steps, in order:
      1. Delete Arabic diacritics (tashkeel) so they cannot split words.
      2. Replace punctuation and digit runs with spaces.
      3. Unify alef variants, alef maqsura, and hamza-carrying letters.
      4. Collapse whitespace runs and trim both ends.

    Args:
        text: Raw text. Any non-string value (for example the NaN that pandas
            produces for an empty CSV cell) is treated as empty text.

    Returns:
        The cleaned string, or "" when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    # Diacritics are combining marks, which \w does not match. Left in place,
    # the punctuation rule below would turn each one into a space and break a
    # vocalized word into separate letters, so they are deleted first.
    text = re.sub(r"[\u064B-\u065F\u0670]", "", text)
    text = re.sub(r"[^\w\s]", " ", text)  # Punctuation/symbols -> space
    text = re.sub(r"\d+", " ", text)  # Digit runs -> space
    text = re.sub(r"[إأآا]", "ا", text)  # Normalize alef variants
    text = re.sub(r"ى", "ي", text)  # Alef maqsura -> ya
    text = re.sub(r"ؤ", "و", text)  # Waw with hamza -> waw
    text = re.sub(r"ء", "", text)  # Drop standalone hamza
    text = re.sub(r"ئ", "ي", text)  # Ya with hamza -> ya
    text = re.sub(r"\s+", " ", text).strip()  # Collapse whitespace, trim ends
    return text

# Run the cleaner over the "text" column of every split (each frame is updated in place)
for split_frame in (train_data, dev_data, test_data):
    split_frame["text"] = split_frame["text"].apply(clean_text)

# Wrap the frames as Hugging Face datasets; only train and dev carry the
# "generated" label column, the test split is text only.
labelled_columns = ["text", "generated"]
train_dataset = Dataset.from_pandas(train_data[labelled_columns])
dev_dataset = Dataset.from_pandas(dev_data[labelled_columns])
test_dataset = Dataset.from_pandas(test_data[["text"]])

# AraBERT checkpoint shared by the tokenizer and the classifier
model_name = "aubmindlab/bert-base-arabertv02"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Binary classification, so the classification head gets two labels
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

#Tokenization function
def tokenize(batch):
    """Tokenize a batch of examples for the classifier.

    Every text is truncated or padded to exactly 256 tokens. When the batch
    has a "generated" column, its values are copied into "labels".

    Args:
        batch: Mapping from column name to a list of values; it must contain
            "text" and may contain "generated".

    Returns:
        The tokenizer output, with a "labels" entry added for labelled batches.
    """
    encoded = tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=256,
    )
    has_labels = "generated" in batch
    if has_labels:
        encoded["labels"] = batch["generated"]
    return encoded

# Tokenize every split in batches. Train and dev are additionally exposed as
# PyTorch tensors; the test split has no labels and keeps the default format.
train_dataset = train_dataset.map(tokenize, batched=True).with_format("torch")
dev_dataset = dev_dataset.map(tokenize, batched=True).with_format("torch")
test_dataset = test_dataset.map(tokenize, batched=True)

# Training configuration
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version before changing it.
training_args = TrainingArguments(
    # Outputs and logging
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    seed=42,
    # Evaluate and checkpoint once per epoch; keep a single checkpoint and
    # reload the one with the highest eval_roc_auc when training finishes
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    metric_for_best_model="eval_roc_auc",
    greater_is_better=True,
    load_best_model_at_end=True,
)

#Defining metrics
def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. The logits are softmaxed along
            dimension 1 and column 1 is taken as the positive-class score.

    Returns:
        dict with "roc_auc" (computed from the positive-class probabilities)
        and "accuracy" (probabilities thresholded at 0.5).
    """
    logits, labels = eval_pred
    class_probs = torch.softmax(torch.tensor(logits), dim=1)
    positive_probs = class_probs[:, 1].numpy()
    hard_labels = (positive_probs >= 0.5).astype(int)
    return {
        "roc_auc": roc_auc_score(labels, positive_probs),
        "accuracy": accuracy_score(labels, hard_labels),
    }

# Assemble the Trainer from the model, its configuration, both labelled
# splits, and the metric function.
# NOTE(review): the saved cell output contains a warning fragment pointing at
# this call; presumably the `tokenizer=` argument is deprecated in the
# installed transformers release — confirm before renaming it.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    eval_dataset=dev_dataset,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()

# Score the dev split (the Trainer's eval_dataset) and report both metrics
eval_results = trainer.evaluate()
dev_roc_auc = eval_results["eval_roc_auc"]
dev_accuracy = eval_results["eval_accuracy"]
print(f"Dev ROC-AUC: {dev_roc_auc:.4f}")
print(f"Dev Accuracy: {dev_accuracy:.4f}")

# Run the trained model over the unlabeled test split and keep the softmax
# probability of class index 1 for every example
predictions = trainer.predict(test_dataset)
test_logits = torch.tensor(predictions.predictions)
test_preds = torch.softmax(test_logits, dim=1)[:, 1].numpy()

# Submission file: the test ids next to their predicted probabilities
submission_columns = {
    "text_id": test_data["text_id"],
    "generated": test_preds,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission.csv", index=False)
print("Submission saved as 'submission.csv'")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/6763 [00:00<?, ? examples/s]

Map:   0%|          | 0/1450 [00:00<?, ? examples/s]

Map:   0%|          | 0/1451 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Roc Auc,Accuracy
1,0.0442,0.019965,0.999802,0.995862
2,0.0074,0.027028,0.99997,0.993793
3,0.0056,0.022828,0.999935,0.994483


Dev ROC-AUC: 1.0000
Dev Accuracy: 0.9938
Submission saved as 'submission.csv'
