# Cross-Language Information Retrieval System
This notebook loads the pretrained components of our CLIR system (a translation model and a multilingual sentence encoder) and demonstrates how to evaluate them on sample queries.

In [None]:
import torch
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

## 1. Load Models

In [None]:
# Checkpoint identifiers, named once so the model and its tokenizer cannot drift apart.
_TRANSLATOR_CHECKPOINT = "facebook/m2m100_418M"
_ENCODER_CHECKPOINT = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

# Translation: the tokenizer and the seq2seq model come from the same checkpoint.
translator_tokenizer = M2M100Tokenizer.from_pretrained(_TRANSLATOR_CHECKPOINT)
translator_model = M2M100ForConditionalGeneration.from_pretrained(_TRANSLATOR_CHECKPOINT)

# Sentence embeddings used for similarity search.
encoder_model = SentenceTransformer(_ENCODER_CHECKPOINT)

## 2. Translation Function

In [None]:
def translate_text(text, target_lang="hi", source_lang="en"):
    """Translate ``text`` into ``target_lang`` with the module-level M2M100 model.

    Args:
        text: The string to translate.
        target_lang: Language code of the desired output language.
        source_lang: Language code of ``text``. Defaults to ``"en"`` so
            existing two-argument calls keep working.

    Returns:
        The translated string. Empty or whitespace-only input is returned
        unchanged because there is nothing to translate.
    """
    if not text.strip():
        return text

    # M2M100 needs the source language set on the tokenizer before encoding;
    # otherwise it silently uses whatever language was configured last.
    translator_tokenizer.src_lang = source_lang
    inputs = translator_tokenizer(text, return_tensors="pt")

    # Forcing the first generated token to the target-language id is what
    # selects the output language.
    translated = translator_model.generate(
        **inputs,
        forced_bos_token_id=translator_tokenizer.get_lang_id(target_lang),
    )
    return translator_tokenizer.batch_decode(translated, skip_special_tokens=True)[0]

## 3. Document Embedding and Similarity Search

In [None]:
def compute_embeddings(texts):
    """Encode ``texts`` with the module-level sentence encoder and return the result."""
    embeddings = encoder_model.encode(texts)
    return embeddings

def search_documents(query, documents, threshold=0.5):
    """Rank ``documents`` by cosine similarity to ``query``.

    Args:
        query: The query string.
        documents: Iterable of dicts, each with string ``'title'`` and
            ``'content'`` entries.
        threshold: Only documents whose similarity is strictly greater than
            this value are returned.

    Returns:
        A list of ``{'document': doc, 'similarity': score}`` dicts sorted by
        descending similarity. Empty when no document passes the threshold
        or when ``documents`` is empty.
    """
    documents = list(documents)
    if not documents:
        return []

    query_embedding = compute_embeddings(query)

    # Encode the whole corpus in one batch instead of one encoder call per
    # document; the encoder is far more efficient on batches.
    doc_texts = [doc['title'] + ' ' + doc['content'] for doc in documents]
    doc_embeddings = compute_embeddings(doc_texts)

    # One (1 x n_docs) similarity row for the single query.
    similarities = cosine_similarity([query_embedding], doc_embeddings)[0]

    results = [
        # float() turns the NumPy scalar into a plain Python float so results
        # can be serialized without extra handling.
        {'document': doc, 'similarity': float(score)}
        for doc, score in zip(documents, similarities)
        if score > threshold
    ]
    results.sort(key=lambda item: item['similarity'], reverse=True)
    return results

## 4. Evaluation

In [None]:
# Sample evaluation data
test_queries = [
    "machine learning applications",
    "deep learning frameworks",
    "neural networks"
]

# The search step below needs a corpus of dicts with 'title' and 'content'
# keys. Fall back to a tiny illustrative corpus only when no `documents`
# variable has been defined earlier, so a real corpus is never overwritten.
if "documents" not in globals():
    documents = [
        {
            "title": "Machine Learning in Practice",
            "content": "Applications of machine learning in healthcare, finance and retail.",
        },
        {
            "title": "Comparing Deep Learning Frameworks",
            "content": "An overview of popular deep learning frameworks and their trade-offs.",
        },
        {
            "title": "Introduction to Neural Networks",
            "content": "How neural networks learn representations from data.",
        },
    ]

# Evaluate translation quality and search relevance
for query in test_queries:
    print(f"Query: {query}")
    
    # Test translation (uses translate_text's default target language)
    translated = translate_text(query)
    print(f"Translated: {translated}\n")
    
    # Test search: show at most the three best matches above the threshold
    results = search_documents(query, documents)
    for result in results[:3]:
        print(f"Relevance: {result['similarity']:.3f}")
        print(f"Title: {result['document']['title']}")
        print("---")
    print("\n")