# Install dependencies

In [1]:
! git clone https://gitlab.com/bigirqu/quranqa.git
!pip install farasa
!pip install transformers farasapy datasets
!pip install arabic-reshaper python-bidi
!pip install pyyaml==5.4.1
!pip install accelerate -U
! pip install optuna

Cloning into 'quranqa'...
remote: Enumerating objects: 333, done.[K
remote: Counting objects: 100% (86/86), done.[K
remote: Compressing objects: 100% (43/43), done.[K
remote: Total 333 (delta 43), reused 86 (delta 43), pack-reused 247[K
Receiving objects: 100% (333/333), 312.88 KiB | 827.00 KiB/s, done.
Resolving deltas: 100% (130/130), done.
Collecting farasa
  Downloading Farasa-0.0.1-py2.py3-none-any.whl (12.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.6/12.6 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: farasa
Successfully installed farasa-0.0.1
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m52.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting farasapy
  Downloading farasapy-0.0.14-py3-none-any.whl (11 kB)
Collecting datasets
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[2K     [90m

# Import libraries

In [2]:
import pandas as pd
import numpy as np
import json
import re
import string
import os
import sys
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer, AutoConfig
from transformers import default_data_collator as data_collator
from transformers import pipeline
from scipy.special import softmax
import torch
import ast
import joblib
import optuna
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer

In [3]:
# Make the modules in the cloned repository's code/ directory importable.
sys.path.insert(0,"/content/quranqa/code/")
# NOTE(review): the alias `eval` shadows Python's built-in eval() for the rest of
# the notebook; the alias is not referenced in the cells visible here.
import quranqa22_eval as eval
from tokenization import tokenize_fun

# Load data

In [4]:
# Locations of the QRCD v1.1 JSON-lines splits inside the cloned repository.
train_data_path = "/content/quranqa/datasets/qrcd_v1.1_train.jsonl"
dev_data_path = "/content/quranqa/datasets/qrcd_v1.1_dev.jsonl"
test_data_path = "/content/quranqa/datasets/qrcd_v1.1_test_gold.jsonl"

In [5]:
# Read each JSON-lines split (one record per line) into its own frame,
# in the order train, dev, test.
train_data, val_data, test_data = (
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in (train_data_path, dev_data_path, test_data_path)
)

In [6]:
# Wrap each loaded split in a DataFrame object of its own.
train_datadf, val_datadf, test_datadf = map(
    pd.DataFrame, (train_data, val_data, test_data)
)


In [7]:
# Merge the train and dev splits into a single frame.
# NOTE(review): this frame is what the model is later trained on (via data.csv ->
# train_df), while val_datadf is also used as the evaluation set, so dev-set
# scores are measured on examples seen during training. Confirm this is intended.
data = pd.concat([train_datadf, val_datadf])

In [8]:
# Number of rows in the merged train+dev frame.
len(data)

819

In [9]:
# Write the merged frame to disk without the index column; a later cell reads
# this file back as train_df.
data.to_csv('/content/quranqa/datasets/data.csv', index=False, encoding='utf-8-sig')

In [10]:
# Reload the merged train+dev data from the CSV file as the training frame.
train_df=pd.read_csv("/content/quranqa/datasets/data.csv")

In [11]:
# Parse the `answers` cells from their string form back into Python objects.
# Cells that are already parsed are left untouched, so re-running this cell does
# not raise on non-string values.
train_df['answers'] = train_df['answers'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)

# Tokenization

In [12]:
# Load the tokenizer published with the checkpoint named below.
tokenizer_name = "Damith/AraELECTRA-discriminator-QuranQA"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/761k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [13]:
# Training side: tokenize the merged frame, then wrap the result in a Dataset.
token_train_data = tokenize_fun(train_df, tokenizer)
train_dataset = Dataset.from_dict(token_train_data)

# Evaluation side: same two steps for the dev split.
token_val_data = tokenize_fun(val_datadf, tokenizer)
val_dataset = Dataset.from_dict(token_val_data)

# Configure Parameters

In [14]:
# Checkpoint to fine-tune.
model_name = "Damith/AraELECTRA-discriminator-QuranQA"

# Training schedule.
per_device_train_batch_size = 2
per_device_eval_batch_size = 2
num_train_epochs = 5

# Filesystem locations for this run.
training_path = "/content/quranqa/datasets/data.csv"
model_path = "/content/quranqa/model.hd5"
result_path = "/content/quranqa/result_run01.json"

In [15]:
# Load the question-answering model from the checkpoint named by `model_name`.
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/539M [00:00<?, ?B/s]

In [16]:
# Training configuration: evaluate after every epoch, never write checkpoints.
training_args = TrainingArguments(
    output_dir="/content/quranqa/result_run01",
    num_train_epochs=num_train_epochs,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    evaluation_strategy="epoch",
    save_strategy='no',
)

# Bind the model, the tokenized datasets and the collator into one Trainer.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train model

In [17]:
# Fine-tune the model using the Trainer configured above.
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,0.828349
2,0.622600,0.415353
3,0.252800,0.185989
4,0.107800,0.106321
5,0.044400,0.096594


TrainOutput(global_step=2050, training_loss=0.2510019558813514, metrics={'train_runtime': 379.9014, 'train_samples_per_second': 10.779, 'train_steps_per_second': 5.396, 'total_flos': 802507664125440.0, 'train_loss': 0.2510019558813514, 'epoch': 5.0})

# Predictions

In [18]:
def predict_answer(passage, question, show_all=False):
    """Extract up to five candidate answer spans for ``question`` from ``passage``.

    The question/passage pair is encoded with the module-level ``tokenizer`` and
    scored by the module-level ``model``. The i-th highest start logit is paired
    with the i-th highest end logit to form the i-th candidate span. A candidate
    is dropped when it decodes to an empty string or when its score is not above
    0.1. The question and every kept answer are printed.

    Args:
        passage: Text in which to look for the answer.
        question: Question text.
        show_all: Accepted for backward compatibility; currently has no effect.

    Returns:
        A list of dicts with keys ``'answer'`` (decoded span text), ``'rank'``
        (1-based position among the kept answers) and ``'score'`` (float).
        The list is empty when no candidate survives the filters.
    """
    max_candidates = 5
    min_score = 0.1

    # Put the inputs on the model's device instead of assuming a GPU is present.
    inputs = tokenizer(question, passage, add_special_tokens=True, return_tensors="pt").to(model.device)
    input_ids = inputs["input_ids"].tolist()[0]

    # Inference only: switch off dropout and skip building the autograd graph.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    start_logits = outputs["start_logits"]
    end_logits = outputs["end_logits"]

    # Never ask for more candidates than there are token positions.
    k = min(max_candidates, start_logits.shape[-1])
    top_starts = torch.topk(start_logits, k)
    top_ends = torch.topk(end_logits, k)

    start_probs = softmax(top_starts.values.detach().cpu().numpy())
    end_probs = softmax(top_ends.values.detach().cpu().numpy())
    # Candidate score: softmax over the element-wise product of the two
    # probability vectors (rank-aligned, not an exhaustive start/end search).
    full_probs = softmax(np.multiply(start_probs, end_probs))[0]

    start_positions = top_starts.indices.tolist()[0]
    # +1 turns each end token index into an exclusive slice bound.
    end_positions = (top_ends.indices + 1).tolist()[0]

    print(f"Question: {question}")
    print('top predicted answers:')

    ranked_answers = []
    for idx, (answer_start, answer_end) in enumerate(zip(start_positions, end_positions), start=1):
        answer = tokenizer.convert_tokens_to_string(
            tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
        )
        score = full_probs[idx - 1]

        # Skip empty spans (e.g. end before start) and low-scoring candidates.
        if answer.strip() == '' or not (score > min_score):
            continue

        print(f"Answer number {idx}: {answer}")
        ranked_answers.append(
            {'answer': answer, 'rank': len(ranked_answers) + 1, 'score': float(score)}
        )

    if len(ranked_answers) == 0:
        print(' Empty Answer ')
    return ranked_answers

In [19]:
# Run the model over every dev-set row; predictions are keyed by the row's pq_id.
result = {
    row['pq_id']: predict_answer(row['passage'], row['question'], show_all=True)
    for _, row in val_datadf.iterrows()
}

Question: ما هي مصارف الزكاة؟
top predicted answers:
Answer number 1: للفقراء والمساكين والعاملين عليها والمؤلفة قلوبهم وفي الرقاب والغارمين وفي سبيل الله وابن السبيل
Answer number 3: إنما الصدقات للفقراء
Answer number 4: الصدقات للفقراء والمساكين والعاملين عليها والمؤلفة قلوبهم وفي الرقاب والغارمين وفي سبيل الله وابن
Question: متى يحل الإسلام دم الشخص؟
top predicted answers:
Answer number 1: كتب عليكم القصاص في القتلى الحر بالحر والعبد بالعبد والأنثى بالأنثى
Answer number 3: يا أيها الذين آمنوا كتب عليكم القصاص في القتلى
Answer number 4: والأنثى بالأنثى فمن عفي له من أخيه شيء فاتباع بالمعروف وأداء إليه بإحسان
Answer number 5: والعبد بالعبد
Question: متى يحل الإسلام دم الشخص؟
top predicted answers:
Answer number 1: وقاتلوا في سبيل الله الذين يقاتلونكم
Answer number 4: واقتلوهم حيث ثقفتموهم وأخرجوهم من حيث أخرجوكم
Answer number 5: ##تلوا في سبيل الله الذين يقاتلون
Question: متى يحل الإسلام دم الشخص؟
top predicted answers:
Answer number 1: وقاتلوا في سبيل الله
Answer number 3: ##تلوا
Ans

In [20]:
# Save the dev-set predictions as a JSON run file; ensure_ascii=False writes the
# Arabic text as-is instead of \u escapes.
with open(result_path, mode='w', encoding='utf8') as run_file:
    json.dump(result, run_file, ensure_ascii=False)

# Evaluation

In [21]:
# Score the dev-set run file against the dev gold answers with the repository's evaluation script.
!python /content/quranqa/code/quranqa22_eval.py --gold_answers_file=/content/quranqa/datasets/qrcd_v1.1_dev.jsonl --run_file=/content/quranqa/result_run01.json

100% 241M/241M [00:19<00:00, 12.2MiB/s]
Loaded 109 records from /content/quranqa/datasets/qrcd_v1.1_dev.jsonl
The run file is correct.
{"pRR": 0.9791160809968148, "exact_match": 0.908256880733945, "f1": 0.9791160809968148}


In [22]:
# Predict on the test split and store the predictions as a second run file.
result_test_path = '/content/quranqa/result_run02.json'
result = {
    row['pq_id']: predict_answer(row['passage'], row['question'])
    for _, row in test_datadf.iterrows()
}
with open(result_test_path, mode='w', encoding='utf8') as run_file:
    json.dump(result, run_file, ensure_ascii=False)

Question: من هم الملائكة المذكورون في القرآن؟
top predicted answers:
Answer number 1: عيسى ابن مريم
Answer number 2: مريم البينات وأيدناه بروح القدس
Answer number 3: موسى الكتاب وقفينا من بعده بالرسل وآتينا عيسى
Answer number 5: ولقد آتينا موسى الكتاب وقفينا من بعده بالرسل
Question: من هم الملائكة المذكورون في القرآن؟
top predicted answers:
Answer number 1: وجبريل وميكال
Answer number 2: من كان عدوا لله وملائكته
Answer number 3: ##كال فإن الله عدو للكافرين
Answer number 4: لله وملائكته ورسله وجبريل
Answer number 5: قل من كان عدوا لجبريل فإنه نزله على قلبك بإذن الله مصدقا لما بين يديه وهدى وبشرى للمؤمنين. من كان عدوا لله وملائكته ورسله وجب
Question: من هم الملائكة المذكورون في القرآن؟
top predicted answers:
Answer number 1: هاروت وماروت
Answer number 3: واتبعوا ما تتلو الشياطين على ملك سليمان وما كفر سليمان ولكن الشياطين كفروا يعلمون الناس السحر وما أنزل على الملكين ببابل
Answer number 4: سليمان وما كفر سليمان ولكن الشياطين كفروا يعلمون الناس السحر وما أنزل على الملكين ببابل هاروت
Quest

In [23]:
# Score the test-set run file against the test gold answers with the repository's evaluation script.
!python /content/quranqa/code/quranqa22_eval.py --gold_answers_file=/content/quranqa/datasets/qrcd_v1.1_test_gold.jsonl --run_file=/content/quranqa/result_run02.json

Loaded 238 records from /content/quranqa/datasets/qrcd_v1.1_test_gold.jsonl
The run file is correct.
{"pRR": 0.53031857526226, "exact_match": 0.24789915966386555, "f1": 0.5116277357173454}


# Hyperparameter tuning

In [None]:
def objective(trial):
    """Optuna objective: fine-tune once with sampled hyperparameters and score the run.

    A fresh model is loaded from the base checkpoint for every trial, trained on
    the module-level ``train_dataset`` and evaluated on ``val_dataset``.

    Args:
        trial: Optuna trial used to sample the learning rate, number of epochs,
            per-device train batch size and dropout rate.

    Returns:
        The negated evaluation loss as a float, so that a study created with
        ``direction="maximize"`` favours the lowest loss. No F1 metric is
        computed by this Trainer (it has no ``compute_metrics``), so the loss
        is the only score available here.
    """
    # Hyperparameters to optimize.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
    num_train_epochs = trial.suggest_int("num_train_epochs", 2, 5)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [2, 4, 8])
    dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.5)

    checkpoint = "Damith/AraELECTRA-discriminator-QuranQA"

    # Apply the sampled dropout rate through the model config; otherwise the
    # suggested value would have no effect on training.
    config = AutoConfig.from_pretrained(
        checkpoint,
        hidden_dropout_prob=dropout_rate,
        attention_probs_dropout_prob=dropout_rate,
    )
    model = AutoModelForQuestionAnswering.from_pretrained(checkpoint, config=config)

    # save_strategy="no": do not write checkpoints for every trial.
    training_args = TrainingArguments(
        output_dir="/content/quranqa/result_ft",
        save_strategy="no",
        evaluation_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=2,
        num_train_epochs=num_train_epochs,
        weight_decay=0.01,
    )

    # The module-level tokenizer is reused: it is the one the datasets were
    # tokenized with, so there is no need to reload it on every trial.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    trainer.train()
    result = trainer.evaluate()

    # Lower loss is better, so negate it for a maximizing study.
    return -float(result["eval_loss"])


In [None]:
# Create a study that maximizes the value returned by `objective`.
study = optuna.create_study(direction="maximize")

# Run the search.
study.optimize(objective, n_trials=100)

# Report the best trial: its hyperparameters and its objective value.
print("Best hyperparameters: ", study.best_params)
print("Best objective value: ", study.best_value)