# Mount Google Drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; the dataset
# CSVs and the Terrier index used below are read from paths under this mount.
from google.colab import drive
drive.mount('/content/drive')

# Install dependencies

In [None]:
# Hugging Face Transformers: supplies `pipeline` and the Auto* classes imported below.
! pip install transformers

In [None]:
# Arabic stop-word list (presumably the package behind the `arabicstopwords` import below).
!pip install Arabic-Stopwords

In [None]:
# PyTerrier (presumably the package behind the `pyterrier` import below).
! pip install python-terrier

# Import libraries

In [None]:
import re
import pandas as pd
from snowballstemmer import stemmer
import arabicstopwords.arabicstopwords as ar_stp
import pyterrier as pt
# pyterrier is a Python API for Terrier. Link: https://github.com/terrier-org/pyterrier
# Terrier IR Platform is a modular open source software for the rapid development of large-scale information retrieval applications.
# Initialise PyTerrier; the guard skips initialisation when it has already been started.
if not pt.started():
   pt.init(helper_version="0.0.6")
import os
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import time
import uuid

# Define important constants

In [None]:
# Shared string constants. QUERY and QID are the DataFrame column names used
# by prepare_query_for_search when it builds the query frame. The remaining
# names are not referenced in the visible cells (presumably column names of
# the retrieval run -- verify before relying on them).
TEXT = "text"
QUERY = "query"
LABEL = "label"
RANK = "rank"
TAG = "tag"
SCORE = "score"
QID = "qid"
DOC_NO = "docno"
DOCID = "docid"

### Preprocessing
Preprocess the Arabic input text by normalizing characters, removing stop words, and stemming.

In [None]:
# Clean text from urls, handles, special characters, tabs, line jumps, and extra white space.
def clean(text):
    """Strip URLs, handles, punctuation and redundant whitespace from ``text``.

    Steps, in order:
      1. URLs (``http...``) and ``@handles`` are replaced by a space.
      2. A fixed set of separator characters (``. , # _ | : ? / =``) is
         replaced by a space so the words around them stay apart.
      3. Every remaining character that is neither a word character nor
         whitespace is deleted.
      4. Runs of whitespace (spaces, tabs, newlines) collapse to one space
         and the result is stripped.

    Whitespace is collapsed *after* punctuation is deleted: deleting a
    punctuation mark that sits between two spaces would otherwise leave a
    double space in the output.

    Args:
        text: the raw input string.

    Returns:
        The cleaned string, with single spaces between tokens and no
        leading or trailing whitespace.
    """
    text = re.sub(r"http\S+", " ", text)  # remove urls
    text = re.sub(r"@[\w]*", " ", text)  # remove handles
    text = re.sub(r"[\.\,\#_\|\:\?\/\=]", " ", text)  # separators -> space
    text = re.sub(r"[^\w\s]", "", text)  # delete remaining punctuation
    # Tabs and newlines are whitespace too, so one pass normalises them all.
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# Snowball stemmer for Arabic, created once here and shared by ar_stem below.
ar_stemmer = stemmer("arabic")
# remove arabic stop words
def ar_remove_stop_words(sentence):
    """Drop Arabic stop words from a whitespace-separated sentence.

    The sentence is split on whitespace; every token found in the list
    returned by ``ar_stp.stopwords_list()`` is discarded, and the surviving
    tokens are joined back with single spaces.

    Args:
        sentence: the input string.

    Returns:
        The sentence without stop words ("" when nothing survives).
    """
    # Build the stop-word set on the first call only and keep it on the
    # function object; this function runs once per passage and per query,
    # so rebuilding the set each time is wasted work.
    stop_words = getattr(ar_remove_stop_words, "_stop_words", None)
    if stop_words is None:
        stop_words = frozenset(ar_stp.stopwords_list())
        ar_remove_stop_words._stop_words = stop_words
    return " ".join(term for term in sentence.split() if term not in stop_words)


# normalize the arabic text
def normalize_arabic(text):
    """Collapse common Arabic letter variants onto a single canonical letter.

    The substitutions are applied one after another, in the order listed:
    the alef variants become a bare alef, alef maksura becomes yeh, hamza
    on waw / on yeh becomes a standalone hamza, and ta marbuta becomes heh.

    Args:
        text: the input string.

    Returns:
        The string with every listed variant replaced; all other characters
        are left untouched.
    """
    substitutions = (
        ("[إأٱآا]", "ا"),
        ("ى", "ي"),
        ("ؤ", "ء"),
        ("ئ", "ء"),
        ("ة", "ه"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

# stem the arabic text
def ar_stem(sentence):
    """Stem each whitespace-separated token of ``sentence``.

    Every token is passed through the shared ``ar_stemmer`` and the stemmed
    tokens are joined back with single spaces.
    """
    stemmed_tokens = map(ar_stemmer.stemWord, sentence.split())
    return " ".join(stemmed_tokens)


# apply all preprocessing steps needed for Arabic text
def preprocess_arabic(text):
    """Run the full Arabic preprocessing chain on ``text``.

    The steps run in a fixed order: letter normalization, stop-word removal,
    then stemming. The same function is applied to the passages and to the
    query, so both sides are processed identically.

    Args:
        text: the raw Arabic string.

    Returns:
        The normalized, stop-word-free, stemmed string.
    """
    # NOTE(review): stop words are matched against already-normalized tokens;
    # if the stop-word list contains un-normalized spellings those entries
    # can no longer match -- verify against the list.
    return ar_stem(ar_remove_stop_words(normalize_arabic(text)))


def prepare_query_for_search(query):
    """Wrap one raw query string in the one-row DataFrame used for retrieval.

    The query text is run through ``preprocess_arabic`` and paired with a
    freshly generated random UUID, stored as a string, that serves as its
    query id.

    Args:
        query: the raw question text.

    Returns:
        A DataFrame with the columns ``QUERY`` and ``QID`` and a single row.
    """
    print("Cleaning queries and applying preprocessing steps")
    query_id = str(uuid.uuid4())  # ids are kept as strings
    query_text = preprocess_arabic(query)
    query_frame = pd.DataFrame(
        data={QUERY: [query_text], QID: [query_id]},
        columns=[QUERY, QID],
    )
    print("Done with preparation!")
    return query_frame


# Create index for search

In [None]:
# Load the training and validation splits from the mounted Drive folder.
# Later cells read the "pq_id" and "passage" columns of these frames.
train = pd.read_csv("/content/drive/MyDrive/Quran_QA/DataCsv/train.csv")
validation = pd.read_csv("/content/drive/MyDrive/Quran_QA/DataCsv/validation.csv")

In [None]:
# Build the passage table for indexing: pool both splits, keep one row per
# distinct (pq_id, passage) pair, preprocess the passage text, and rename the
# columns to the docno/text names used by the indexing cell.
passage_columns = ["pq_id", "passage"]
all_data = pd.concat([train, validation])[passage_columns].drop_duplicates()
all_data["passage"] = all_data["passage"].apply(preprocess_arabic)
df = all_data.rename(columns={"pq_id":"docno","passage":"text"})

In [None]:
# Lift pandas' display limits so DataFrames are shown with every row and
# with untruncated cell text (useful for reading long passages).
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [None]:
# index the text, record the docnos as metadata
# pd_indexer = pt.DFIndexer("/content/drive/MyDrive/Quran_QA/QPC_Index")
# pd_indexer.setProperty("tokeniser", "UTFTokeniser")
# indexref = pd_indexer.index(df["text"], df["docno"])

## Search in the index

Before searching, we load the previously built index from disk; the query itself is cleaned with the preprocessing functions defined above.

In [None]:
def load_index(index_path):
    """Open the Terrier index found at ``index_path``.

    Args:
        index_path: location handed to ``pt.IndexFactory.of``.

    Returns:
        The loaded index on success. On any exception the error is printed
        and an empty list is returned instead, so callers should check the
        result before using it.
    """
    try:
        loaded = pt.IndexFactory.of(index_path)
        print("Index was loaded successfully from this path: ", index_path)
    except Exception as err:
        # Best-effort: report the problem instead of propagating it.
        failure_message = 'Cannot load the index, check exception details {}'.format(err)
        print(failure_message)
        return []
    return loaded

In [None]:
# Location of the index's data.properties file on the mounted Drive.
index_path = "/content/drive/MyDrive/Quran_QA/QPC_Index/data.properties"

# NOTE: load_index returns [] (after printing the error) when loading fails,
# so check the printed message before running the search cell.
index = load_index(index_path=index_path)

## Search
Search in the index and find the relevant passages.

In [None]:
# 1. initialize the BM25 retrieval model (top 5 passages per query).
#    The weighting model is passed by name.
BM25_model = pt.BatchRetrieve(index, wmodel="BM25", metadata=["docno"], num_results=5)

# 2. wrap the raw query string in the one-row DataFrame used for retrieval
# NOTE(review): `query` is not assigned in any visible cell -- it must hold
# the question text before this cell runs.
df_query = prepare_query_for_search(query)

# 3. search using BM25 model
df_run = BM25_model.transform(df_query)

# 4. add the constant TREC run-file columns (nothing is written to disk here)
df_run["Q0"] = "Q0"
df_run["tag"] = "BM25"
df_run

# Prepare data to get final context

In [None]:
# Sanity check: show the raw passage stored in row 30 of the training split.
train.iloc[30]["passage"]

In [None]:
# docno values of the retrieved passages, in the row order of df_run.
retrivd_passage = list(df_run["docno"].values)

In [None]:
# Display the retrieved docnos.
retrivd_passage

In [None]:
# Show the training rows whose pq_id is among the retrieved docnos.
# Only `train` is searched here, although the passage table was built from
# both train and validation.
train[train["pq_id"].isin(retrivd_passage)]

In [None]:
# Passage text used as the QA context, read by answer_question as a global.
# NOTE(review): the pq_id is hard-coded rather than taken from
# retrivd_passage, and .values[0] raises IndexError if no row matches.
context = train[train["pq_id"]=="6:91-92_407"]["passage"].values[0]

In [None]:
# Display the selected context passage.
context

In [None]:
# Hugging Face model id used for both the QA model and its tokenizer.
QA_MODEL_NAME = "hemagamal/mdeberta_Quran_qa"

# Built lazily by _get_qa_pipeline so the model is loaded only once.
_qa_pipeline = None


def _get_qa_pipeline():
    """Return the shared question-answering pipeline, building it on first use."""
    global _qa_pipeline
    if _qa_pipeline is None:
        _qa_pipeline = pipeline(
            "question-answering",
            model=QA_MODEL_NAME,
            tokenizer=QA_MODEL_NAME,
        )
    return _qa_pipeline


def answer_question(message):
    """Extract the answer to ``message`` from the module-level ``context``.

    The pipeline is created on the first call and reused afterwards, so the
    model is not reloaded for every question.

    Args:
        message: the question text.

    Returns:
        The "answer" field of the single best prediction. The pipeline is
        called with ``handle_impossible_answer=True``; see the Transformers
        QA pipeline docs for how an unanswerable question is reported.
    """
    qa = _get_qa_pipeline()
    # `top_k` is the current name of the deprecated `topk` argument; with a
    # value of 1 the pipeline returns one result dict rather than a list.
    best = qa(
        question=message,
        context=context,
        handle_impossible_answer=True,
        top_k=1,
    )
    return best["answer"]

In [None]:
# Answer the current query against the context selected above.
answer_question(query)

In [None]:
# Display the current query.
# NOTE(review): `query` is never assigned in the visible cells; it has to be
# defined before the search and answer cells are run.
query