# Install important libraries

In [None]:

!pip install faiss-cpu
! pip install streamlit -q
! pip install -U sentence-transformers
!sudo apt-get install poppler-utils tesseract-ocr libmagic-dev
!pip install -Uq "unstructured[pdf]" pillow lxml
!sudo apt-get install tesseract-ocr-ara
!pip install google-generativeai

# Convert PDF to text file

In [None]:
from unstructured.documents.elements import CompositeElement
from unstructured.partition.pdf import partition_pdf

file_path = '/content/أثر العرب في الحضارة الأوروبية.pdf'

# Partition the PDF and chunk it in a single call.
# Reference: https://docs.unstructured.io/open-source/core-functionality/chunking
chunks = partition_pdf(
    filename=file_path,
    # infer_table_structure=False,            # extract tables
    strategy="hi_res",                     # mandatory to infer tables
    languages=["ara", "eng"],

    # extract_image_block_types=["Image"],   # Add 'Table' to list to extract image of tables
    # image_output_dir_path=output_path,   # if None, images and tables will saved in base64

    # extract_image_block_to_payload=True,   # if true, will extract base64 for API usage

    chunking_strategy="basic",             # alternative: "by_title"
    max_characters=10000,                  # defaults to 500
    # NOTE(review): combine_text_under_n_chars is documented as a "by_title"
    # option; confirm it has any effect with the "basic" strategy.
    combine_text_under_n_chars=2000,
    new_after_n_chars=6000,
)

# Keep only the merged text chunks; any other element type returned by the
# chunker is skipped. isinstance replaces the former substring match on the
# type's repr, which depended on how the class happens to be printed.
texts = [str(chunk) for chunk in chunks if isinstance(chunk, CompositeElement)]

# Write to file with double newlines between paragraphs
with open("أثر العرب في الحضارة الأوروبية.txt", "w", encoding="utf-8") as f:
    f.write("\n\n".join(texts))

# Split text into paragraphs

In [None]:
import re
import random

def _split_into_sentences(text):
    """Split *text* into trimmed, sentence-like pieces.

    A boundary is a newline, or a run of whitespace that follows one of:
    full stop, exclamation mark, Arabic question mark, Arabic comma or
    Arabic semicolon. Pieces of two characters or fewer are discarded.
    """
    # \u061F = Arabic question mark, \u060C = Arabic comma,
    # \u061B = Arabic semicolon.
    # NOTE(review): the ASCII '?' is not treated as a boundary, matching the
    # original character class -- confirm whether that is intended.
    boundary = re.compile(r'(?<=[.!\u061F\u060C\u061B])\s+(?=\S)')

    sentences = []
    # Split on newlines BEFORE collapsing whitespace. Collapsing first (as the
    # previous implementation did) turns every newline into a space, so
    # newlines could never act as boundaries.
    for line in text.split('\n'):
        line = re.sub(r'\s+', ' ', line).strip()
        for piece in boundary.split(line):
            piece = piece.strip()
            if len(piece) > 2:
                sentences.append(piece)
    return sentences


def enhanced_split_to_paragraphs(text, min_sentences=2, max_sentences=4, seed=None):
    """Group the sentences of *text* into paragraphs of random size.

    The text is split into sentences at newlines and after sentence
    punctuation (see ``_split_into_sentences``). Consecutive sentences are
    then joined with single spaces into paragraphs that each hold between
    ``min_sentences`` and ``max_sentences`` sentences. Trailing sentences that
    are too few to form a paragraph are appended to the last paragraph, or
    become the only paragraph when none was produced.

    Args:
        text: The raw text to split.
        min_sentences: Smallest number of sentences per paragraph (>= 1).
        max_sentences: Largest number of sentences per paragraph
            (>= ``min_sentences``).
        seed: Optional seed for reproducible paragraph sizes. When ``None``
            the shared ``random`` module state is used, as before.

    Returns:
        A list of paragraph strings; empty when *text* has no usable sentence.

    Raises:
        ValueError: If ``min_sentences < 1`` or
            ``max_sentences < min_sentences``.
    """
    if min_sentences < 1:
        # A size of 0 would append empty paragraphs without making progress.
        raise ValueError("min_sentences must be at least 1")
    if max_sentences < min_sentences:
        raise ValueError("max_sentences must be >= min_sentences")

    sentences = _split_into_sentences(text)
    rng = random if seed is None else random.Random(seed)

    paragraphs = []
    total = len(sentences)
    pos = 0
    while total - pos >= min_sentences:
        # Never ask for more sentences than are left.
        size = rng.randint(min_sentences, min(max_sentences, total - pos))
        paragraphs.append(' '.join(sentences[pos:pos + size]))
        pos += size

    # Fewer than min_sentences remain: fold them into the last paragraph.
    leftover = sentences[pos:]
    if leftover:
        tail = ' '.join(leftover)
        if paragraphs:
            paragraphs[-1] += ' ' + tail
        else:
            paragraphs.append(tail)

    return paragraphs


# Read the full extracted book text back from disk.
with open('أثر العرب في الحضارة الأوروبية.txt', 'r', encoding='utf-8') as source_file:
    arabic_text = source_file.read()

# Regroup the text into short multi-sentence paragraphs.
paragraphs = enhanced_split_to_paragraphs(arabic_text)

# Report the count and preview the first five paragraphs.
print(f"Total paragraphs created: {len(paragraphs)}")
preview = paragraphs[:5]
for i, para in enumerate(preview):
    print(f"\nParagraph {i+1}:\n{para}")

# Persist the paragraphs, each one followed by a blank line as separator.
with open("Paragraphs.txt", "w", encoding="utf-8") as output_file:
    output_file.writelines(paragraph + "\n\n" for paragraph in paragraphs)

print("Chunked book saved to 'Paragraphs.txt'.")

# Generate Embeddings

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
# Read the saved paragraphs (separated by blank lines), dropping empty entries.
# Only the read happens inside the `with`, so the file is closed before the
# model is loaded and the paragraphs are encoded.
with open("/content/Paragraphs.txt", "r", encoding="utf-8") as f:
    paragraphs = [p.strip() for p in f.read().split("\n\n") if p.strip()]

# Load the paraphrase-multilingual-MiniLM-L12-v2 sentence-embedding model
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# Generate embeddings for paragraphs
print("Generating paragraph embeddings...")
paragraph_embeddings = model.encode(paragraphs, show_progress_bar=True)

# Save embeddings for reuse (optional)
np.save("paragraph_embeddings.npy", paragraph_embeddings)


# Indexing with FAISS

In [None]:
import faiss
import numpy as np

# Load embeddings and coerce them to a C-contiguous float32 array. The dtype
# is therefore guaranteed from here on, so no separate dtype check is needed.
embeddings = np.load("/content/paragraph_embeddings.npy")
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

# Check dtype and shape
print(embeddings.dtype)
print(embeddings.shape)

# Validate BEFORE reading shape[1]: a non-2D array (e.g. when no paragraphs
# were encoded) would otherwise fail with an unhelpful IndexError. An explicit
# raise is used instead of assert because asserts are stripped under -O.
if embeddings.ndim != 2:
    raise ValueError("Embeddings should be a 2D array.")
embedding_dim = embeddings.shape[1]

# Build an exact L2 index over all vectors and persist it.
try:
    index = faiss.IndexFlatL2(embedding_dim)
    index.add(embeddings)
    faiss.write_index(index, "faiss_index.bin")
except Exception as e:
    # Report the failure, then re-raise so later cells do not run without a
    # usable index file.
    print(f"Error during FAISS indexing: {e}")
    raise
else:
    print(f"FAISS index created and saved with {index.ntotal} vectors.")

# Run Retrieval System

In [None]:

# Print the response of ipv4.icanhazip.com to stdout (presumably this machine's
# public IPv4 address, which the localtunnel landing page asks for -- confirm).
!wget -q -O - ipv4.icanhazip.com
# Start the Streamlit retrieval app in the background (&), then open a
# localtunnel to local port 8501.
# NOTE(review): assumes Streamlit binds to port 8501 -- verify.
! streamlit run /content/Retrieval_app.py & npx localtunnel --port 8501

# Set API Key

In [None]:
import os
# Export the API key through the process environment (presumably read by the
# RAG app -- confirm). Replace the placeholder with a real key before running,
# and avoid saving a real key in the notebook.
os.environ.update(GOOGLE_API_KEY="YOUR_API_KEY")

# Run RAG System

In [None]:

# Print the response of ipv4.icanhazip.com to stdout (presumably this machine's
# public IPv4 address, which the localtunnel landing page asks for -- confirm).
!wget -q -O - ipv4.icanhazip.com
# Start the Streamlit RAG app in the background (&), then open a localtunnel
# to local port 8501.
# NOTE(review): the retrieval cell also tunnels port 8501; if that Streamlit
# process is still running, this app may not be the one served on 8501 --
# verify, or stop the earlier process first.
! streamlit run /content/Rag_app.py & npx localtunnel --port 8501