In [None]:
!pip install transformers flask pandas scikit-learn



In [None]:
!pip install kaggle
!mkdir ~/.kaggle
!echo '{"username":"ramaalyoubi","key":"82e22c4988257fba9141f3be474f0b66"}' > ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d fahd09/hadith-dataset
!unzip hadith-dataset.zip

mkdir: cannot create directory ‘/root/.kaggle’: File exists
hadith-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  hadith-dataset.zip
replace all_hadiths_clean.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
!pip install langdetect



In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, AdamW
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from langdetect import detect
import re

# Read the cleaned hadith CSV into a DataFrame.
hadith_csv = 'all_hadiths_clean.csv'
df = pd.read_csv(hadith_csv)


In [None]:
# Normalize the English text: lowercase it, then strip every character that is
# neither a word character nor whitespace (i.e. punctuation).
# - regex=True is required: without it newer pandas treats the pattern as a
#   literal string and nothing is removed.
# - the raw string avoids invalid-escape warnings for \w and \s.
df['text_en'] = df['text_en'].str.lower().str.replace(r'[^\w\s]', '', regex=True)

  df['text_en'] = df['text_en'].str.lower().str.replace('[^\w\s]', '')


In [None]:
# Keep only the rows where both the Arabic and the English text are present.
required_columns = ['text_ar', 'text_en']
df = df.dropna(subset=required_columns)

In [None]:
from langdetect import detect, LangDetectException

def remove_english_words(chapter_name):
    """Strip Latin-script (English) words from a chapter name.

    The previous implementation removed every ``\\w+`` run. For ``str``
    patterns ``\\w`` is Unicode-aware, so Arabic words were deleted as well and
    only stray punctuation survived. It also cut the string at the first
    ``' and '``, discarding anything after it. This version removes only
    ASCII-letter words (including the word "and" itself), so the non-Latin
    text is preserved wherever it appears in the string.

    Args:
        chapter_name: The raw chapter title. Non-string values (e.g. NaN/None
            from missing cells) are returned unchanged instead of raising.

    Returns:
        The title without Latin-script words, with runs of whitespace collapsed
        and leading/trailing whitespace, dashes and colons removed. An empty
        string is returned when nothing else remains.
    """
    if not isinstance(chapter_name, str):
        return chapter_name

    # Match ASCII-letter words only; allow inner apostrophes/hyphens so that
    # tokens such as "Qur'an" or "Al-Bukhari" are removed as a whole.
    without_latin = re.sub(r"[A-Za-z]+(?:['’-][A-Za-z]+)*", ' ', chapter_name)

    # Collapse the gaps left behind by the removed words.
    collapsed = re.sub(r'\s+', ' ', without_latin)

    # Drop separators that only made sense between the two language parts.
    return collapsed.strip(' -–—:')

# Clean every chapter title by passing it through remove_english_words.
chapter_column = df["chapter"]
df["chapter"] = chapter_column.apply(remove_english_words)


In [None]:
# Define dataset class
class HadithDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Indexable sequence of input strings.
        chapters: Indexable sequence of labels, aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding='max_length', truncation=True,
            max_length=..., return_tensors='pt')``; its result must provide
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every item is padded/truncated to.
        label2id: Optional mapping from raw label to a numeric id. Supply the
            same mapping to every split so that ids agree between them. When
            omitted, labels are passed to ``torch.tensor`` as-is and therefore
            must already be numeric.
    """

    def __init__(self, texts, chapters, tokenizer, max_length, label2id=None):
        self.texts = texts
        self.chapters = chapters
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label2id = label2id

    def __len__(self):
        """Return the number of items, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize item ``idx`` and return its tensors.

        Returns:
            dict with ``'input_ids'``, ``'attention_mask'`` (batch dimension
            removed) and ``'labels'``.

        Raises:
            KeyError: if ``label2id`` is given but lacks the item's label.
            TypeError: if the label is a string and no ``label2id`` was given.
        """
        text = self.texts[idx]
        chapter = self.chapters[idx]

        if self.label2id is not None:
            chapter = self.label2id[chapter]
        elif isinstance(chapter, str):
            # torch.tensor cannot hold strings; fail with an actionable message.
            raise TypeError(
                f"Label {chapter!r} is a string; pass label2id to HadithDataset "
                "or encode the labels as integers first."
            )

        encoding = self.tokenizer(text, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        return {
            # squeeze(0) removes only the batch dimension; a bare squeeze()
            # would also collapse the sequence dimension when max_length == 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(chapter)
        }

In [None]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
train_texts, val_texts, train_chapters, val_chapters = train_test_split(
    df['text_ar'],
    df['chapter'],
    test_size=0.2,
    random_state=42,
)

In [None]:
from transformers import AutoTokenizer, AutoModel

# Load the tokenizer and the base model from the same checkpoint.
model_id = "aubmindlab/aragpt2-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)


In [None]:
# Wrap each split in a HadithDataset (items padded/truncated to 128 tokens).
train_dataset = HadithDataset(
    train_texts.tolist(),
    train_chapters.tolist(),
    tokenizer,
    max_length=128,
)
val_dataset = HadithDataset(
    val_texts.tolist(),
    val_chapters.tolist(),
    tokenizer,
    max_length=128,
)

# Batches of 8; only the training loader shuffles.
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# Run on CUDA when it is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

GPT2Model(
  (wte): Embedding(64000, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

In [None]:
import torch
import numpy as np

# Function to encode text to embeddings using the last hidden state of the model
def encode_text_with_labels(texts, labels, model, tokenizer, max_length=None, batch_size=32):
    """Embed ``texts`` in batches using the hidden state of each text's last real token.

    The batch is padded to a common length, so position ``-1`` is a padding
    token for every sequence shorter than the longest one. Reading that
    position (as the previous version did) yields pad-token states whose value
    depends on which texts happen to share a batch. Here the attention mask is
    used to locate the last non-padding position of every row instead; this
    works for either padding side.

    Args:
        texts: List of strings to embed.
        labels: List aligned with ``texts``; it is only sliced per batch and
            returned, never used for the computation.
        model: Module whose output exposes ``last_hidden_state``; it is moved
            to the selected device and put in eval mode.
        tokenizer: Callable returning an object with ``.to(device)`` that holds
            ``'attention_mask'`` and can be unpacked into ``model(**...)``.
        max_length: Forwarded to the tokenizer (truncation is always enabled).
        batch_size: Number of texts encoded per forward pass.

    Returns:
        ``(embeddings, labels)`` where ``embeddings`` is a NumPy array with one
        row per text and ``labels`` is a list. For empty ``texts`` an array
        with zero rows is returned instead of raising.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    all_embeddings = []
    all_labels = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        batch_labels = labels[i:i+batch_size]
        encoded_input = tokenizer(batch_texts, padding=True, truncation=True, return_tensors='pt', max_length=max_length).to(device)

        with torch.no_grad():
            outputs = model(**encoded_input)

        hidden = outputs.last_hidden_state
        mask = encoded_input['attention_mask'].long()

        # Index of the last position whose mask is 1 in each row: multiplying
        # by the position index zeroes out padding, so argmax lands on the
        # right-most real token (row index 0 if a row has no real tokens).
        positions = torch.arange(mask.shape[1], device=mask.device)
        last_real = (mask * positions).argmax(dim=1).to(hidden.device)
        rows = torch.arange(hidden.shape[0], device=hidden.device)

        embeddings = hidden[rows, last_real].detach().cpu().numpy()
        all_embeddings.append(embeddings)
        all_labels.extend(batch_labels)

    if not all_embeddings:
        # np.concatenate rejects an empty list; return a well-formed empty matrix.
        hidden_size = getattr(getattr(model, 'config', None), 'hidden_size', 0)
        return np.empty((0, hidden_size), dtype=np.float32), all_labels

    # Concatenate all batch embeddings
    all_embeddings = np.concatenate(all_embeddings, axis=0)
    return all_embeddings, all_labels

# Encoding settings for the corpus pass.
max_length = 256
batch_size = 32

# Batched encoding pads the inputs, which needs a pad token; reuse the EOS token for it.
tokenizer.pad_token = tokenizer.eos_token

# Embed every Arabic hadith; the labels handed back by the helper are discarded.
embeddings, _ = encode_text_with_labels(
    df['text_ar'].tolist(),
    df['chapter'].tolist(),
    model,
    tokenizer,
    max_length=max_length,
    batch_size=batch_size,
)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def semantic_search(query, embeddings, texts, tokenizer, model, top_k=5):
    """Return the ``top_k`` texts whose embeddings are most cosine-similar to ``query``.

    Args:
        query: Search string; embedded with ``encode_text_with_labels``.
        embeddings: 2-D array of corpus embeddings, one row per entry of ``texts``.
        texts: Sequence of corpus texts aligned with ``embeddings``.
        tokenizer: Tokenizer forwarded to ``encode_text_with_labels``.
        model: Model forwarded to ``encode_text_with_labels``.
        top_k: Maximum number of hits to return.

    Returns:
        List of ``(text, score)`` tuples ordered from most to least similar.
    """
    # The helper returns (embeddings, labels); only the embedding is needed.
    query_vectors, _ = encode_text_with_labels([query], [None], model, tokenizer)

    # One row of similarities: the query against every corpus embedding.
    similarity = cosine_similarity(query_vectors.reshape(1, -1), embeddings)

    # argsort is ascending, so reverse it to rank the best matches first.
    ranked = similarity.argsort(axis=1)[:, ::-1]
    best = ranked[0, :top_k]

    return [(texts[idx], similarity[0, idx]) for idx in best]


In [None]:
# Read a query from the user and search the embedded Arabic texts with it.
query = input("Enter your query: ")
results = semantic_search(
    query,
    embeddings,
    df['text_ar'].tolist(),
    tokenizer,
    model,
    top_k=5,
)

# Show each hit together with its similarity score.
for text, score in results:
    print(f"Score: {score}, Hadith: {text}")


Enter your query: جد لي حديث عن أهمية الصلاة
Score: 0.5251553654670715, Hadith: أخبرنا محمد بن بشار، قال حدثنا يحيى بن سعيد، عن هشام، عن قتادة، ح وأنبأنا محمد بن المثنى، قال حدثنا يحيى، قال حدثنا هشام، قال حدثنا قتادة، عن يونس بن جبير، عن حطان بن عبد الله، أن الأشعري، قال إن رسول الله صلى الله عليه وسلم خطبنا فعلمنا سنتنا وبين لنا صلاتنا فقال ‏"‏ إذا قمتم إلى الصلاة فأقيموا صفوفكم ثم ليؤمكم أحدكم فإذا كبر فكبروا وإذا قال ‏{‏ ولا الضالين ‏}‏ فقولوا آمين يجبكم الله ثم إذا كبر وركع فكبروا واركعوا فإن الإمام يركع قبلكم ويرفع قبلكم ‏"‏ ‏.‏ قال نبي الله صلى الله عليه وسلم ‏"‏ فتلك بتلك وإذا قال سمع الله لمن حمده فقولوا اللهم ربنا لك الحمد فإن الله عز وجل قال على لسان نبيه صلى الله عليه وسلم سمع الله لمن حمده ثم إذا كبر وسجد فكبروا واسجدوا فإن الإمام يسجد قبلكم ويرفع قبلكم ‏"‏ ‏.‏ قال نبي الله صلى الله عليه وسلم ‏"‏ فتلك بتلك وإذا كان عند القعدة فليكن من قول أحدكم أن يقول التحيات الطيبات الصلوات لله السلام عليك أيها النبي ورحمة الله وبركاته السلام علينا وعلى عباد الله الصالحين أشهد أن لا إله 

In [None]:
# Serialize the whole model object to model.pth in the working directory.
torch.save(obj=model, f='model.pth')

In [None]:
# Save only the model's state dictionary (its parameters and buffers).
# A relative path is used, as for model.pth in the cell above, so the cell also
# works where the Colab-specific /content directory does not exist.
torch.save(model.state_dict(), 'model_state_dict.pth')