<a href="https://colab.research.google.com/github/stellarIV/My-portfolio/blob/main/rag_redo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



# Data Scraping

## Install Dependancies

In [1]:
# Install dependencies
!pip install -q PyMuPDF
!pip install -q spacy
!python -m spacy download en_core_web_sm
!pip install chromadb sentence-transformers requests


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m44.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


## Upload PDF to ingest

In [61]:
import fitz  # PyMuPDF
import json
from google.colab import files
import json
import chromadb
import openai
import re
from sentence_transformers import SentenceTransformer
import spacy
import numpy as np

In [63]:


def upload_and_process_pdf():
    print("📤 Please upload a PDF file...")
    uploaded = files.upload()

    for filename in uploaded.keys():
        if filename.endswith('.pdf'):
            # Open the PDF file
            doc = fitz.open(filename)
            scraped_data = []

            # Extract text from each page
            for i, page in enumerate(doc):
                text = page.get_text()
                scraped_data.append({"page": i + 1, "text": text.strip()})

            # Save to JSONL format
            with open("scraped_data.jsonl", "w", encoding="utf-8") as f:
                for item in scraped_data:
                    json.dump(item, f, ensure_ascii=False)
                    f.write("\n")

            print(f"✅ Extracted {len(scraped_data)} pages and saved to scraped_data.jsonl")
            files.download("scraped_data.jsonl")
        else:
            print("❌ Please upload a valid PDF file.")

# Run the function
upload_and_process_pdf()


📤 Please upload a PDF file...


Saving 2014.pdf to 2014.pdf
✅ Extracted 71 pages and saved to scraped_data.jsonl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Clean Ingested Data

In [70]:


def remove_common_headers(text):
    # Define common patterns to strip out from the top of the page
    header_patterns = [
        r'^\s*(Incomplete advance copy)\s*',  # Header alone
        r'^\s*\d+\s+Grade 9\s+Incomplete advance copy',  # e.g., '15 Grade 9 Incomplete advance copy'
        r'^\s*Grade 9.*',  # Any other Grade 9 variation
        r'^\s*የፋይናንስ ደህንነት አገልግሎት',  # Amharic: "Finance Security Service"
        r'^\s*አመታዊ መጽሐፍ',  # Amharic: "Annual Book"
    ]

    lines = text.split('\n')
    cleaned_lines = []

    for i, line in enumerate(lines):
        # Only check first 3 lines of the page (where headers typically occur)
        if i < 3:
            if any(re.match(pat, line, flags=re.IGNORECASE) for pat in header_patterns):
                continue  # Skip header line
        cleaned_lines.append(line)

    return '\n'.join(cleaned_lines).strip()


def clean_text(text):
    # Remove header
    text = remove_common_headers(text)

    # Preserve '.\n' but clean up messy whitespace
    text = text.replace('.\n', '[DOT_NEWLINE]')
    text = text.replace('።\n', '[DOT_NEWLINE]')
    text = re.sub(r'[\t\r\f\v]+', ' ', text)
    text = re.sub(r'\s*\n\s*', ' ', text)
    text = text.replace('[DOT_NEWLINE]', '.\n')
    text = re.sub(r'[\[\]{}<>“”"\'()_/\\=+@#%*~`|^•●]+', ' ', text)  # generic symbol cleaner
    text = re.sub(r'\s+', ' ', text)  # collapse multiple spaces
    # Remove standalone Amharic letters
    tokens = text.split()
    filtered_tokens = [
        word for word in tokens
        if not (len(word) == 1 and re.fullmatch(r'[\u1200-\u137F]', word))
    ]
    text = ' '.join(filtered_tokens)

    return text.strip()

def extract_amharic_text(jsonl_path='scraped_data.jsonl', output_path='amharic_text.jsonl'):
    amharic_pattern = re.compile(r'[\u1200-\u137F0-9\s\.\,\:\;\-\–\(\)\[\]\{\}\'\"!@#\$%\^&\*\+=\?\/\\]+')

    with open(jsonl_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8') as outfile:
        for line in infile:
            entry = json.loads(line)
            cleaned = clean_text(entry['text'])
            matches = amharic_pattern.findall(cleaned)
            result_text = ''.join(matches).strip()
            if result_text:
                json.dump({"page": entry["page"], "text": result_text}, outfile, ensure_ascii=False)
                outfile.write('\n')

def extract_english_text(jsonl_path='scraped_data.jsonl', output_path='english_text.jsonl'):
    english_pattern = re.compile(r'[A-Za-z0-9\s\.\,\:\;\-\–\(\)\[\]\{\}\'\"!@#\$%\^&\*\+=\?\/\\]+')

    with open(jsonl_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8') as outfile:
        for line in infile:
            entry = json.loads(line)
            cleaned = clean_text(entry['text'])
            matches = english_pattern.findall(cleaned)
            result_text = ''.join(matches).strip()
            if result_text:
                json.dump({"page": entry["page"], "text": result_text}, outfile, ensure_ascii=False)
                outfile.write('\n')


In [71]:
# Call the functions to extract and save filtered text
extract_amharic_text()   # creates amharic_text.jsonl


In [11]:
extract_english_text()   # creates english_text.jsonl


# Chunk our Data

### For English

In [None]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Sentence splitter using spaCy
def split_into_sentences(text):
    doc = nlp(text)
    return [sent.text.strip() for sent in doc.sents if sent.text.strip()]

# Paragraph splitter using double line breaks (manually preserved)
def split_into_paragraphs(text):
    return [p.strip() for p in text.strip().split('\n') if p.strip()]

### For Amharic

In [72]:
def split_into_sentences_amharic(text):
    sentences = []
    current_sentence = ""
    for char in text:
        current_sentence += char
        if char in {'።', '!', '?'}:
            sentence = current_sentence.strip()
            if sentence:
                sentences.append(sentence)
            current_sentence = ""
    # Add any remaining text as a sentence
    if current_sentence.strip():
        sentences.append(current_sentence.strip())
    return [sent for sent in sentences if sent]


In [73]:


# Clean text by removing all '\n' characters, tabs, and excess spaces
def clean_text_linebreaks(text):
    # Remove all \n characters
    text = text.replace('\n', ' ')
    # Remove any tabs and excess spaces
    text = re.sub(r'\t+', ' ', text)
    text = re.sub(r' +', ' ', text)   # Collapse multiple spaces into one
    return text.strip()

# Main chunking function
def chunk_jsonl_text(
    filepath='english_text.jsonl',
    output_path='chunked_text.jsonl',
    chunk_by='sentence',  # 'sentence', 'paragraph', 'page'
    group_size=1,
    max_characters=3000
):
    with open(filepath, 'r', encoding='utf-8') as infile:
        data = [json.loads(line) for line in infile]

    chunks = []
    buffer = ""
    unit_count = 0
    current_pages = []

    for entry in data:
        text = entry['text']
        page = entry['page']

        # Clean all newlines from the text
        text = clean_text_linebreaks(text)

        # Split text by chosen method
        if chunk_by == 'sentence':
            units = split_into_sentences_amharic(text)
        elif chunk_by == 'paragraph':
            units = split_into_paragraphs(text)
        elif chunk_by == 'page':
            units = [text]
        else:
            raise ValueError("chunk_by must be 'sentence', 'paragraph', or 'page'")

        # Build chunks
        for unit in units:
            proposed = buffer + (" " if buffer else "") + unit
            if len(proposed) > max_characters or unit_count >= group_size:
                if buffer:
                    chunks.append({
                        "pages": current_pages,
                        "text": buffer.strip()
                    })
                buffer = unit
                unit_count = 1
                current_pages = [page]
            else:
                buffer = proposed
                unit_count += 1
                if page not in current_pages:
                    current_pages.append(page)

    # Add last chunk
    if buffer.strip():
        chunks.append({
            "pages": current_pages,
            "text": buffer.strip()
        })

    # Write to output
    with open(output_path, 'w', encoding='utf-8') as outfile:
        for chunk in chunks:
            json.dump(chunk, outfile, ensure_ascii=False)
            outfile.write('\n')

    print(f"✅ Chunked into {len(chunks)} items and saved to {output_path}")


In [75]:
chunk_jsonl_text(
    filepath='amharic_text.jsonl',
    output_path='chunked_by_sentences_amharic.jsonl',
    chunk_by='sentence',
    group_size=6,
    max_characters=7000
)

✅ Chunked into 64 items and saved to chunked_by_sentences_amharic.jsonl


Hurray!! data collection and chunking finished

## Embedding Our Data

## For FAISS

In [34]:
def embed_text_chunks(
    input_jsonl='chunked_text.jsonl',
    output_npy='embeddings.npy',
    output_metadata='metadata.jsonl',
    model_name='paraphrase-MiniLM-L6-v2',
    batch_size=32,
    show_preview=True
):
    print(f"🔄 Loading model: {model_name}")
    model = SentenceTransformer(model_name)

    texts = []
    metadata = []

    # Load text chunks
    with open(input_jsonl, 'r', encoding='utf-8') as f:
        for line in f:
            entry = json.loads(line)
            texts.append(entry['text'])
            metadata.append({
                "pages": entry.get("pages", []),
                "text_preview": entry["text"][:100] + "..."  # For quick inspection
            })

    print(f"📄 Loaded {len(texts)} text chunks")

    # Compute embeddings
    print("⚙️ Computing embeddings...")
    embeddings = model.encode(texts, batch_size=batch_size, convert_to_numpy=True, show_progress_bar=True)

    # Save embeddings as .npy
    np.save(output_npy, embeddings)
    print(f"💾 Saved embeddings to {output_npy}")

    # Save metadata
    with open(output_metadata, 'w', encoding='utf-8') as meta_out:
        for item in metadata:
            json.dump(item, meta_out, ensure_ascii=False)
            meta_out.write('\n')
    print(f"📎 Metadata saved to {output_metadata}")

    # Optional preview
    if show_preview:
        print("\n🧾 Example preview:")
        print("Text:", texts[0][:200])
        print("Embedding shape:", embeddings[0].shape)

    return embeddings


In [38]:
embed_text_chunks(
    input_jsonl='chunked_by_sentences.jsonl',
    output_npy='paraphrase_MiniLM_embeddings.npy',
    output_metadata='paraphrase_MiniLM_metadata.jsonl',
    model_name='paraphrase-MiniLM-L6-v2'  # Try changing this to other models!
)


🔄 Loading model: paraphrase-MiniLM-L6-v2
📄 Loaded 231 text chunks
⚙️ Computing embeddings...


Batches:   0%|          | 0/8 [00:00<?, ?it/s]

💾 Saved embeddings to paraphrase_MiniLM_embeddings.npy
📎 Metadata saved to paraphrase_MiniLM_metadata.jsonl

🧾 Example preview:
Text: Author: Ann Fullick Adviser: Alemu Asfaw Evaluators: Solomon Belayneh Getachew Bogale Silas Araya Federal Democratic Republic of Ethiopia Ministry of Education Biology Student Textbook Grade 9
Embedding shape: (384,)


array([[-0.36088732,  0.62437767, -0.31661388, ..., -0.34875688,
         0.30137637,  0.29754072],
       [-0.43010095,  0.17494476, -0.41270575, ..., -0.2968747 ,
         0.20853299,  0.06345865],
       [-0.20878713, -0.02464597, -0.02219432, ..., -0.24649675,
         0.04722616,  0.10193737],
       ...,
       [-0.09510569, -0.16385382,  0.01368616, ..., -0.08052471,
         0.21493797,  0.04344206],
       [-0.39261466,  0.05208082, -0.02295829, ..., -0.31241482,
         0.2776665 ,  0.06188554],
       [-0.33360326,  0.07018267,  0.12846112, ..., -0.37540123,
         0.2615606 ,  0.17897749]], dtype=float32)

In [32]:
# Test the function with different models
embed_chunks(input_jsonl='chunked_by_sentences.jsonl', output_jsonl='chunked_embeddings_paraphrase.jsonl', model_name='paraphrase-MiniLM-L6-v2')
embed_chunks(input_jsonl='chunked_by_sentences.jsonl', output_jsonl='chunked_embeddings_all_miniLM.jsonl', model_name='all-MiniLM-L6-v2')
embed_chunks(input_jsonl='chunked_by_sentences.jsonl', output_jsonl='chunked_embeddings_distilbert.jsonl', model_name='distilbert-base-nli-stsb-mean-tokens')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.51k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

FileNotFoundError: [Errno 2] No such file or directory: 'chunked_text.jsonl'

## For Chroma

### With out embedding declared

In [82]:
import json
import uuid
import chromadb

# Initialize Chroma client and collection
chroma_client = chromadb.Client()
collection = chroma_client.get_or_create_collection(name="my_collection")

# Step 1: Read the JSONL file and load the data
with open("chunked_by_sentences_amharic.jsonl", "r") as file:
    documents = []
    ids = []
    for idx, line in enumerate(file):
        data = json.loads(line)
        text = data["text"]
        # Make the ID unique using page number + line index
        document_id = f"id{data['pages']}_{idx}"
        documents.append(text)
        ids.append(document_id)

# Step 2: Add documents to Chroma collection
collection.upsert(
    documents=documents,
    ids=ids
)

# Step 3: Query the collection
query_text = "አቶ ኪሩቤል ጨምረው እንደገለጹት ሶፍትዌሩ ከወንጀሎቹ ጋር በተያያዘ ስንት መሰረታዊ ጉዳዮችን ያከናውናል።"
results = collection.query(
    query_texts=[query_text],
    n_results=2
)

# Print results
print(results)


{'ids': [['id[44]_39', 'id[20]_14']], 'embeddings': None, 'documents': [['ይህ ደግሞ የተለያዩ አካትን ይመለከታል። ከነዚህም መካከል ለትርፍ ያልተቋቋሙ ድርጅቶች ተጠቃሽ ናቸው። የዚህን ወንጀል ተግባር ለመከላከልና ለትርፍ ያልተቋቋሙ ድርጅቶች ለወንጀሉ ያላቸውን ተጋላጭነት መቀነስ ያስፈልጋል. እነዚሁ ድርጅቶች የሚቋቋሙት ለትርፍ ዓላማ ሳይሆን የተለያዩ ማሕበራዊ እና ሰብአዊ አገልግሎቶችን ለማሕበረሰቡ ለመስጠት ነው። ተቋማቱም ሀገር በቀል ወይም በአለም ዓቀፍ ደረጃ የሚቋቋሙ ሲሆኑ በውስጣቸውም የተለያዩ አደረጃጀቶች ይኖሯቸዋል። የተቋቋሙበትን ዓላማ ለማሳካት የሚያስችላቸውን ገንዘብ በአብዛኛው የሚያገኙት ዓላማቸውን ከሚደግፉ ግለሰቦች እና ድርጅቶች በሚገኝ እርዳታ ነው። ተቋማቱ ዓላማቸውን ከሚደግፉ ግለሰቦች እና ድርጅቶች ባለፈ ከመንግሥታትም የገንዘብ እና የቁሳቁስ ድጋፍ የሚያገኙበት አጋጣሚም ሰፊ ነው. የእርዳታ ድርጅቶች በኃያላን ሀገራት መንግሥታት ድጋፍ የሚደረግላቸው ከሆነ የዚያን ሀገር ጥቅም ማስጠበቂያ እና ከተቋቋሙበት ዓላማ ውጪ በድብቅ ሌሎች ዓላማዎች ማስፈጸሚያ መሳሪያ ሆነው እንደሚያገለግሉ የተለያዩ መረጃዎች ይገልጻሉ።', 'እነዚህ ወንጀሎች የሚያስከትሉት ጉዳት ሁሉን-አቀፍ ነው። አገራዊ ጉዳታቸውም ኢኮኖሚያዊ፣ ማህበራዊ እንዲሁም ፖለቲካዊ አሉታዊ ተፅዕኖ ከማሳደርም ባለፈ ድንበር ተሻጋሪ በመሆናቸው ጉዳታቸው በአገር ውስጥ ብቻ የማይወሰን ነው። በዓለም አቀፍ ደረጃ ጥቁር ገበያን ከሚፈጥሩ ወንጀሎች መካከል ሕገ-ወጥ ወይም የኮንትሮባንድ ንግድ ይጠቀሳል። የሕገ-ወጥ ንግድ ወንጀል ገንዘብን፣ ዕቃን ወይም ከሕገ- ወጥ የሚገኝ ማንኛውንም ትርፍ ያካትታል። ወንጀሉም በዋናነት የተለያዩ ቁሳቁስንና ሰዎችን በሕገ-ወጥ መንገድ 

# Open Router Integration

## Setup chroma

In [85]:
import json
import chromadb
from sentence_transformers import SentenceTransformer

# Load the sentence transformer model
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# Initialize Chroma client
chroma_client = chromadb.Client()

# Delete the old collection (if it exists) to ensure clean state
try:
    chroma_client.delete_collection("my_collection2")
    print("Old collection deleted.")
except:
    pass  # Ignore if the collection doesn't exist

# Create a new collection (no embedding_function)
collection = chroma_client.create_collection(name="my_collection2")

# Step 1: Load and process JSONL data
documents = []
ids = []
metadatas = []
embeddings = []

try:
    with open("chunked_by_sentences_amharic.jsonl", "r") as file:
        for i, line in enumerate(file):
            data = json.loads(line)
            text = data.get("text", "")
            if not text:
                continue
            page_num = data.get("pages", [0])[0]
            doc_id = f"id{page_num}_{i}"

            documents.append(text)
            ids.append(doc_id)
            metadatas.append({"page": page_num, "line_number": i})
            embeddings.append(model.encode(text).tolist())
except Exception as e:
    print("Error reading file:", e)

# Sanity check
print(f"Loaded {len(documents)} documents.")
print(f"Sample embedding (first 5 values): {embeddings[0][:5]}")

# Step 2: Add to Chroma collection
collection.add(
    documents=documents,
    ids=ids,
    metadatas=metadatas,
    embeddings=embeddings
)

print("Documents added to collection.")
print(f"Total in collection: {collection.count()}")

# Step 3: Query using manual embedding
query_text = "በፋይናንስ ደህንነት አገልግሎት ስለ ወንጀል ተግባር ምን ተሰምቷል"
query_embedding = model.encode(query_text).tolist()

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=2
)

# Step 4: Print the results
print("\n--- Query Results ---")
for i, doc in enumerate(results["documents"][0]):
    print(f"\nResult {i+1}:")
    print("Document ID:", results["ids"][0][i])
    print("Metadata:", results["metadatas"][0][i])
    print("Text:", doc)


Old collection deleted.
Loaded 64 documents.
Sample embedding (first 5 values): [-0.1400429904460907, -0.04620487987995148, 0.20908226072788239, -0.08718230575323105, -0.22662471234798431]
Documents added to collection.
Total in collection: 64

--- Query Results ---

Result 1:
Document ID: id54_50
Metadata: {'line_number': 50, 'page': 54}
Text: ሰባተኛ የፋይናንስ ደህንነት አገልግሎት ከፋይናንስ እና ኢኮኖሚ ወንጀሎች ጋር ግንኙነት ያላቸው መረጃዎችን ከተለያዩ ሪፖርት አቅራቢ አካላት እና በራሱ ያሰባስባል። እነዚህን መረጃዎች በመፈተሽና በመተንተን ወንጀል ነክ ጉዳዮችን የመለየት እና ግኝቱን ሕጋዊ እርምጃ እንዲወሰድባቸው ለሕግ አስከባሪ አካላት ያስተላልፋል. ተቋሙ አገራዊ ሪፎርሙን ተከትሎ የውስጥ አቅሙን በማጠናከር እና ከሕግ አስከባሪ አካላት ጋር በቅርበት በመሥራት በርካታ ቢሊዮን ብሮች ይዘት ያላቸው ወንጀል-ነክ ጉዳዮች ወንጀል ፈፃሚ አካላት ላይ ሕጋዊ እርምጃ እንዲወሰድ አድርጓል። በርካታ ጉዳዮች በሕግ ተይዘው በሂደት ላይ ናቸው. ስምንተኛ ሕግ አስከባሪ አካላት ወንጀል ተፈፅሞ የሚገኘው ገንዘብ ሀብት ሕጋዊ ለማስመሰል የሚደረጉ ሂደቶችን የመለየትና የመመርመር አቅም በመገንባትወንጀሎቹን እና ከወንጀሎቹ በስተጀርባ ያሉ ወንጀል ፈፃሚዎችን መለየትና ምርመራ ማከናወን ይጠበቃል. ለፋይናንስ ጤናማነት እና ኢኮኖሚ ደህንነት የባለድርሻ አካላት ሚና ለፋይናንስ ጤናማነት እና ኢኮኖሚ ደህንነት የባለድርሻ አካላት ሚና ዘጠነኛ ዐቃቢያን ሕጎች ከወንጀሎቹ ጋር በተያየዘ የሕግ የ

## load model

In [98]:
# Load MiniLM model
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# Initialize Chroma client
chroma_client = chromadb.Client()

# Load or create Chroma collection
collection = chroma_client.get_or_create_collection(name="my_collection2")


## Define query and top_k

In [99]:
# Define your query
query_text = "አቶ ኪሩቤል ጨምረው እንደገለጹት ሶፍትዌሩ ከወንጀሎቹ ጋር በተያያዘ ስንት መሰረታዊ ጉዳዮችን ያከናውናል?"
query_embedding = model.encode(query_text).tolist()

# Query top N results
n_results = 2
results = collection.query(
    query_embeddings=[query_embedding],
    n_results=n_results
)
# Extract and combine top documents
retrieved_chunks = results["documents"][0]
context = "\n\n".join(retrieved_chunks)

# Fill in the prompt template
import requests

# Use the same prompt built earlier
prompt = f"""

እርስዎ አጋዥ የ AI ረዳት ነዎት። ጥያቄውን ለመመለስ የቀረበውን ጽሑፍ ብቻ ይጠቀሙ።
ጽሑፍ:
{context}
ጥያቄ:

{query_text}
መልሱን ግልጽ እና አጭር በሆነ መንገድ በጽሑፉ ላይ ብቻ በመመስረት ይመልሱ።

"""


In [112]:
OPENROUTER_API_KEY = "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"  # Replace with your real key

# Choose the model: "openai/gpt-3.5-turbo" or "google/gemini-pro"
model = "facebook/mbart-large-50-many-to-one-mmt"

In [113]:
response = requests.post(
    "https://openrouter.ai/api/v1/chat/completions",
    headers={
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json"
    },
    json={
        "model": model,
        "messages": [
            {"role": "system", "content": "ጠቃሚ ረዳት ነዎት።"},
            {"role": "user", "content": prompt}
        ]
    }
)
# Show result
if response.status_code == 200:
    result = response.json()
    print("\n--- የመጨረሻ መልስ ---\n")
    print(result['choices'][0]['message']['content'])
else:
    print("❌ ስህተት:", response.status_code)
    print(response.text)

❌ ስህተት: 400
{"error":{"message":"facebook/mbart-large-50-many-to-one-mmt is not a valid model ID","code":400},"user_id":"user_2x5FBdk5fwfv9NJk1bNvfh8SpOc"}
