In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
!git clone https://gitlab.com/bigirqu/quranqa.git

Cloning into 'quranqa'...
remote: Enumerating objects: 333, done.[K
remote: Counting objects: 100% (86/86), done.[K
remote: Compressing objects: 100% (43/43), done.[K
remote: Total 333 (delta 43), reused 86 (delta 43), pack-reused 247[K
Receiving objects: 100% (333/333), 312.88 KiB | 2.61 MiB/s, done.
Resolving deltas: 100% (130/130), done.


In [2]:
! pip install elasticsearch

Collecting elasticsearch
  Downloading elasticsearch-8.8.2-py3-none-any.whl (393 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m393.9/393.9 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting elastic-transport<9,>=8 (from elasticsearch)
  Downloading elastic_transport-8.4.0-py3-none-any.whl (59 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.5/59.5 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: elastic-transport, elasticsearch
Successfully installed elastic-transport-8.4.0 elasticsearch-8.8.2


In [3]:
# Install the farasapy package for Arabic text processing
!pip install farasapy

Collecting farasapy
  Downloading farasapy-0.0.14-py3-none-any.whl (11 kB)
Installing collected packages: farasapy
Successfully installed farasapy-0.0.14


In [8]:
import sys
sys.path.insert(1, './quranqa/code')
sys.path.insert(0,"/content/drive/MyDrive/Quran_QA/Notebooks/")
import quranqa22_eval as eval

In [9]:
import json
import argparse
import read_write_qrcd as q_reader
from gensim import corpora, models, similarities
from typing import List, Tuple, Union
from collections import defaultdict
import pprint
from gensim import corpora, models, similarities

In [10]:
!python /content/quranqa/code/read_write_qrcd.py --/content/quranqa/datasets/qrcd_v1.1_train.jsonl

usage: read_write_qrcd.py
       [-h]
       --input_file
       INPUT_FILE
       [--output_file OUTPUT_FILE]
read_write_qrcd.py: error: the following arguments are required: --input_file


In [12]:
def preprocess_text(docs, stoplist):
    """Preprocesses the text by removing stop words and splitting into tokens."""
    return [[word for word in doc.split() if word not in stoplist] for doc in docs]

In [13]:
def build_model(texts, dims=4):
    """Builds the TF-IDF and LSI models for the given texts."""
    # Build dictionary and corpus
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # Build TF-IDF and LSI models
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=dims)
    corpus_lsi = lsi_model[corpus_tfidf]

    return dictionary, corpus, tfidf, corpus_tfidf, lsi_model, corpus_lsi

In [14]:
def preprocess_query(query):
    """Preprocesses the query by removing special characters and splitting into tokens."""
    return query.replace('؟', '').replace('"', '').split()

In [15]:
def build_query_vectors(dictionary, query_bow, lsi_model):
    """Builds the BOW and LSI vectors for the given query."""
    vec_bow = dictionary.doc2bow(query_bow)
    vec_lsi = lsi_model[vec_bow]

    return vec_bow, vec_lsi

In [16]:
def get_similarity_index(model, corpus):
    """Builds a similarity index for the given model and corpus."""
    index = similarities.MatrixSimilarity(corpus)
    return index

In [17]:
def get_similarities(query_vector, index):
    """Computes the similarities between the query vector and the documents."""
    return index[query_vector]

In [18]:
def ir(data_item, query_key='question', docs_key='passage', index_method='lsi', dims=4, stoplist=[]):
    """Performs information retrieval on a single PassageQuestion object."""
    # Get data items
    docs = [ver.strip() for ver in data_item.get(docs_key).split('.') if ver.strip()]
    query = data_item.get(query_key)

    # Preprocess text
    texts = preprocess_text(docs, stoplist)

    # Build model
    dictionary, corpus, tfidf, corpus_tfidf, lsi_model, corpus_lsi = build_model(texts, dims=dims)

    # Preprocess query
    query_bow = preprocess_query(query)

    # Build query vectors
    vec_bow, vec_lsi = build_query_vectors(dictionary, query_bow, lsi_model)

    # Build similarity index
    if index_method == 'tfidf':
        index = get_similarity_index(tfidf, corpus_tfidf)
    elif index_method == 'lsi':
        index = get_similarity_index(lsi_model, corpus_lsi)
    else:
        raise ValueError(f'Invalid index method: {index_method}')

    # Compute similarities and sort documents
    sims = get_similarities(vec_bow, index) if index_method == 'tfidf' else get_similarities(vec_lsi, index)
    sims = sorted(enumerate(sims), key=lambda item: -item[1])

    # Create result list
    result = [(docs[doc_position], doc_score) for doc_position, doc_score in sims]

    return result

In [19]:
if __name__ == '__main__':
    # Load dataset
    train_data_file = '/content/quranqa/datasets/qrcd_v1.1_test_gold.jsonl'
    train_data = q_reader.load_jsonl(train_data_file)

    # Choose a sample data item to perform IR on
    data_item = train_data[0]
    print("  ")
    print('Query (Question):', data_item['question'])
    print("  ")
    print('Answers:', data_item['answers'])

    # Perform IR and print results
    print("  ")
    print('Passages sorted by similarity score (TF-IDF):')
    print("  ")
    tfidf_results = ir(data_item, index_method='tfidf', stoplist=[])
    for passage, score in tfidf_results:
        print(f'{score:.2f}: {passage}')
    print("  ")
    print('Passages sorted by similarity score (LSI):')
    print("  ")
    lsi_results = ir(data_item, index_method='lsi', stoplist=[])
    for passage, score in lsi_results:
        print(f'{score:.2f}: {passage}')



Loaded 710 records from /content/quranqa/datasets/qrcd_v1.1_train.jsonl
  
Query (Question): لماذا سيُحاسب ويُعذب الضال يوم القيامة ان كان ""من يضلل الله فما له من هاد"" كما ورد من قوله تعالى في آية 23 و آية 36 من سورة الزمر؟
  
Answers: [{'text': 'أولئك الذين اشتروا الضلالة بالهدى', 'start_char': 504}]
  
Passages sorted by similarity score (TF-IDF):
  
0.31: ومن الناس من يقول آمنا بالله وباليوم الآخر وما هم بمؤمنين
0.11: وإذا قيل لهم آمنوا كما آمن الناس قالوا أنؤمن كما آمن السفهاء ألا إنهم هم السفهاء ولكن لا يعلمون
0.10: الله يستهزئ بهم ويمدهم في طغيانهم يعمهون
0.07: أولئك الذين اشتروا الضلالة بالهدى فما ربحت تجارتهم وما كانوا مهتدين
0.07: في قلوبهم مرض فزادهم الله مرضا ولهم عذاب أليم بما كانوا يكذبون
0.05: وإذا قيل لهم لا تفسدوا في الأرض قالوا إنما نحن مصلحون
0.04: يخادعون الله والذين آمنوا وما يخدعون إلا أنفسهم وما يشعرون
0.00: ألا إنهم هم المفسدون ولكن لا يشعرون
0.00: وإذا لقوا الذين آمنوا قالوا آمنا وإذا خلوا إلى شياطينهم قالوا إنا معكم إنما نحن مستهزئون
  
Passages sorted by sim