In [1]:
import torch
torch.cuda.empty_cache()

In [2]:
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

In [3]:
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, pipeline ,GPT2LMHeadModel, GPT2Tokenizer
import pandas as pd
import random
import re
from sklearn.model_selection import train_test_split
from datasets import Dataset , DatasetDict
from math import exp
import shutil
import warnings
warnings.filterwarnings("ignore")
import logging
logging.getLogger('tensorflow').setLevel(logging.ERROR)
logging.getLogger('absl').setLevel(logging.ERROR)
transformers.logging.set_verbosity_error()




2025-05-12 19:09:34.677317: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747076974.699457     830 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747076974.706275     830 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
df = pd.read_csv("/kaggle/input/arabic-classification/arabic_dataset_classifiction.csv/arabic_dataset_classifiction.csv")


In [5]:
def remove_tashkeel(text):
    tashkeel_pattern = r'[\u0617-\u061A\u064B-\u0652]'
    return re.sub(tashkeel_pattern, '', text)

def normalize_arabic(text):
    text = re.sub(r'[إأآا]', 'ا', text)
    text = re.sub(r'ى', 'ي', text)
    text = re.sub(r'ؤ', 'و', text)
    text = re.sub(r'ئ', 'ي', text)
    text = re.sub(r'ة', 'ه', text)
    return text

def remove_emojis(text):
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  
        u"\U0001F300-\U0001F5FF"  
        u"\U0001F680-\U0001F6FF"  
        u"\U0001F1E0-\U0001F1FF"  
        u"\u200d"               
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def clean_text(text):
    text = text.lower()
    text = remove_emojis(text)
    text = remove_tashkeel(text)
    text = normalize_arabic(text)
    text = re.sub(r'[^\w\s\u0600-\u06FF]', ' ', text)  
    text = re.sub(r'\d+', '', text)                   
    text = re.sub(r'\s+', ' ', text).strip()      
    return text

In [6]:
df = df.dropna()
df = df.drop_duplicates()
df["text"] = df["text"].apply(clean_text)
texts = df["text"].tolist()

In [7]:
train_texts, test_texts = train_test_split(texts, test_size=0.2, random_state=42)

In [8]:
train_dataset = Dataset.from_dict({"text": train_texts})
test_dataset = Dataset.from_dict({"text": test_texts})

dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})


In [9]:
model_name = "akhooli/gpt2-small-arabic"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [10]:
def tokenize_function(examples):
    output = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128
    )
    output["labels"] = output["input_ids"].copy()
    return output

In [11]:
dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/84468 [00:00<?, ? examples/s]

Map:   0%|          | 0/21118 [00:00<?, ? examples/s]

In [12]:
dataset = dataset.remove_columns(["text"])

In [13]:
train_dataset = dataset["train"]
test_dataset = dataset["test"]

print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

Train dataset size: 84468
Test dataset size: 21118


In [14]:
training_args = TrainingArguments(
    output_dir="./gpt2-arabic-finetuned",
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    save_steps=1000,
    run_name="gpt2-arabic-finetuned",
    logging_steps=500, 
    logging_dir='./logs',
    report_to="none",
    disable_tqdm=False, 
    save_total_limit=2,
    weight_decay=0.01,
    fp16=True,
    eval_strategy="steps",
)



In [15]:
trainer = Trainer(
    model=model,                     
    args=training_args,              
    train_dataset=train_dataset,     
    eval_dataset=test_dataset,       
    tokenizer=tokenizer,             
)


In [16]:
trainer.train()

Step,Training Loss,Validation Loss
500,2.6363,4.720624
1000,2.3513,4.473229
1500,2.2572,4.339206
2000,2.1999,4.249968
2500,2.1563,4.185018
3000,2.1271,4.13646
3500,2.1019,4.100063
4000,2.0861,4.069813
4500,2.0722,4.048211
5000,2.0618,4.03159


TrainOutput(global_step=6600, training_loss=2.165949348680901, metrics={'train_runtime': 10179.1099, 'train_samples_per_second': 41.491, 'train_steps_per_second': 0.648, 'total_flos': 2.758851919872e+16, 'train_loss': 2.165949348680901, 'epoch': 5.0})

In [17]:
eval_results = trainer.evaluate()
eval_loss = eval_results["eval_loss"]
perplexity = exp(eval_loss)

print(f"Evaluation Loss: {eval_loss}")
print(f"Perplexity: {perplexity}")

Evaluation Loss: 4.0097856521606445
Perplexity: 55.13505121768219


In [18]:
# text-generation pipeline
text_generator = pipeline(
    "text-generation", 
    model=model, 
    tokenizer=tokenizer, 
    device=0 if torch.cuda.is_available() else -1
)

def predict_next_word(input_text):
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    max_len = input_ids.shape[1] + 1 

    output = text_generator(
        input_text,
        max_length=max_len,
        do_sample=False,  
        num_return_sequences=1
    )

    generated_text = output[0]["generated_text"]
    predicted_part = generated_text[len(input_text):].strip().split(" ")[0]
    return predicted_part

user_input = input("اكتب كلمة أو جملة: ")
next_word = predict_next_word(user_input)
print(f"الكلمة المتوقعة التالية: {next_word}")

اكتب كلمة أو جملة:  في صباح


الكلمة المتوقعة التالية: اليوم


In [19]:
trainer.save_model("./gpt2-arabic-finetuned")

In [20]:
model = GPT2LMHeadModel.from_pretrained("./gpt2-arabic-finetuned")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-arabic-finetuned")

In [21]:
shutil.make_archive('/kaggle/working/gpt2-arabic-finetuned', 'zip', './gpt2-arabic-finetuned')

'/kaggle/working/gpt2-arabic-finetuned.zip'

In [35]:
model_path = "/kaggle/working/gpt2-arabic-finetuned"  
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

reference_sentence = "يعتبر مرض هشاشة العظام أكثر شيوعًا بين كبار السن"
input_text = "يعتبر مرض هشاشة العظام أكثر شيوعًا بين كبار"
references = [["السن"], ["العمر"], ["الكبار"]] 

inputs = tokenizer(input_text, return_tensors="pt")
outputs = model.generate(
    **inputs,
    max_new_tokens=1,
    pad_token_id=tokenizer.eos_token_id,
    num_beams=5,
    early_stopping=True,
    no_repeat_ngram_size=2
)


generated_word = tokenizer.decode(outputs[0][-1:], skip_special_tokens=True).strip()
print(f"النص المدخل: '{input_text}'")
print(f"الكلمة المولدة: '{generated_word}'")
print(f"المراجع المقبولة: {[ref[0] for ref in references]}")


generated_word_list = generated_word.split()

accuracy = 1 if generated_word in [ref[0] for ref in references] else 0
print(f"الدقة (Accuracy): {accuracy:.2f}")

النص المدخل: 'يعتبر مرض هشاشة العظام أكثر شيوعًا بين كبار'
الكلمة المولدة: 'السن'
المراجع المقبولة: ['السن', 'العمر', 'الكبار']
الدقة (Accuracy): 1.00
