# Sanskrit NLP Components for Vedic Language Model

This notebook demonstrates the core Sanskrit NLP components we've implemented for the Vedic Language Model:
1. Sandhi processing (phonological junctions)
2. Pāṇinian grammar validation
3. Grammatical parsing with the Aṣṭādhyāyī engine
4. Basic Sanskrit tokenization

In [None]:
# Import required modules
import os
import sys

# Make the project root importable so the `src.*` imports below resolve.
# A notebook has no `__file__`; the previous code passed the *string* "__file__"
# to os.path.abspath, which resolves against the working directory, so the path
# it added was really "the parent of the current working directory". That is
# spelled out directly here. This assumes the notebook is launched from a
# directory that sits one level below the project root.
PROJECT_ROOT = os.path.dirname(os.getcwd())

# Guard the append so re-running this cell does not stack duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from src.vlm.grammar.sandhi import SandhiProcessor
from src.vlm.grammar.ashtadhyayi import AshtadhyayiEngine
from src.vlm.core.tokenizer import SanskritTokenizer

## 1. Sandhi Processing

Demonstrate Sanskrit sandhi (phonological junction) rules: first applying them to word pairs, then reversing them on the combined forms.

In [None]:
# Build the processor; the reversal demo further down this cell uses it too
sandhi_processor = SandhiProcessor()

# Forward direction: join two words and compare with the expected combined form
print("Sandhi Application Demo:")
test_cases = [
    # (first word, second word, expected combined form)
    ("rama", "iva", "rameva"),      # a + i -> e
    ("deva", "atra", "devātra"),    # a + a -> ā
    ("gaccha", "uta", "gacchota"),  # a + u -> o
]

for left, right, want in test_cases:
    joined = sandhi_processor.apply(left, right)
    # One print call, newline-separated; the trailing "" yields the blank line
    print(
        f"Sandhi rule: {left} + {right} = {joined}",
        f"Expected: {want}",
        f"Correct: {joined == want}",
        "",
        sep="\n",
    )

# Reverse direction: hand each combined form to the processor and show what comes back
print("Sandhi Reversal Demo:")
reversal_cases = (
    "devātra",   # built from deva + atra
    "rameva",    # built from rama + iva
    "gacchota",  # built from gaccha + uta
)

for joined_form in reversal_cases:
    split_result = sandhi_processor.reverse(joined_form)
    print(f"Sandhi reversal of '{joined_form}' = {split_result}")

## 2. Grammar Validation

Demonstrate Pāṇinian grammar validation on valid and invalid sentences, including suggested corrections for the invalid ones.

In [None]:
# Create the grammar engine; the parsing cell later in the notebook reuses it
grammar_engine = AshtadhyayiEngine()

print("Grammar Validation Demo:")

# Sentences the demo expects to pass validation
valid_sentences = [
    "rāmaḥ vanam gacchati",   # "Rama goes to the forest"
    "devāḥ yajñam rakṣanti",  # "The gods protect the sacrifice"
]

for candidate in valid_sentences:
    verdict = grammar_engine.validate(candidate)
    # The trailing "" yields the blank separator line between sentences
    print(f"Sentence: {candidate}", f"Valid: {verdict}", "", sep="\n")

# Sentences the demo expects to fail validation; each one is also passed to correct()
invalid_sentences = [
    "rāma vanam gacchati",   # subject is missing its visarga
    "gacchati rāmaḥ vanam",  # verb-first (VSO) order instead of SOV
]

for candidate in invalid_sentences:
    # Validate first, then ask for a correction, in the same order as before
    verdict = grammar_engine.validate(candidate)
    suggestion = grammar_engine.correct(candidate)
    print(
        f"Invalid Sentence: {candidate}",
        f"Valid: {verdict}",
        f"Corrected: {suggestion}",
        "",
        sep="\n",
    )

## 3. Grammatical Parsing

Parse a Sanskrit sentence using the Aṣṭādhyāyī engine

In [None]:
# Parse one sentence and list every attribute reported for each word
print("Grammatical Parsing Demo:")
sentence = "rāmaḥ vanam gacchati"  # "Rama goes to the forest"
analysis = grammar_engine.parse_sentence(sentence)
print(f"Sentence: {sentence}")
print("Analysis:")
# The result is indexed by "words"; each entry is read as a mapping with a "text" key
for position, word_info in enumerate(analysis["words"], start=1):
    print(f"  Word {position}: {word_info['text']}")
    for field, detail in word_info.items():
        if field == "text":
            continue  # already shown in the heading line above
        print(f"    {field}: {detail}")

## 4. Sanskrit Tokenization

Demonstrate the Sanskrit tokenizer (for transliterated text)

In [None]:
# Word used by both the tokenizer demo and the character-level fallback
text = "rāmāyaṇam"  # Sanskrit word for 'Ramayana'

try:
    tokenizer = SanskritTokenizer()

    # Split the word into tokens, then look up an id for each token
    tokens = tokenizer._tokenize(text)
    token_ids = list(map(tokenizer._convert_token_to_id, tokens))

    print(f"Text: {text}")
    print(f"Tokens: {tokens}")
    print(f"Token IDs: {token_ids}")

    # Join the tokens back into a single string
    reconstructed = tokenizer.convert_tokens_to_string(tokens)
    print(f"Reconstructed: {reconstructed}")

except Exception as err:
    # NOTE(review): this handler catches a failure from any step above, not only
    # construction, even though the message mentions initialization.
    print(f"Error initializing tokenizer: {err}")
    print("Using simple character tokenization as fallback:")
    tokens = list(text)  # one token per character
    print(f"Text: {text}")
    print(f"Simple tokenization: {tokens}")

## 5. Next Steps for Sanskrit NLP Components

Future improvements for these components:

1. Sandhi Processor:
   - Implement a comprehensive sandhi rule set
   - Add probabilistic sandhi splitting for ambiguous cases
   - Support Devanagari text directly

2. Aṣṭādhyāyī Grammar Engine:
   - Implement the full rule set from Pāṇini's original work
   - Add support for all declension and conjugation patterns
   - Implement constraint satisfaction for complex grammatical validation

3. Sanskrit Tokenizer:
   - Improve handling of compound words (samāsa)
   - Add support for Vedic accents and meter
   - Implement subword tokenization optimized for Sanskrit morphology