In [1]:
# Open Colab's file-upload widget. In the recorded run the uploaded file was
# dev.sn, which later cells open by that name.
from google.colab import files
uploaded = files.upload()


Saving dev.sn to dev.sn


In [2]:
import os
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

# === Set Up ===
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Ignore GPU

# Load model and tokenizer
model_name = "chronbmm/sanskrit-byt5-dp"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()

# === Load the First MAX_SENTENCES Non-Empty Sentences from File ===
# Single knob for the sample size; the limit in force is 1000 sentences.
MAX_SENTENCES = 1000
sn_file = "dev.sn"  # Make sure the path is correct and file exists
sentences_byt5 = []

with open(sn_file, "r", encoding="utf-8") as f:
    for line in f:
        if len(sentences_byt5) >= MAX_SENTENCES:
            break
        stripped = line.strip()
        if stripped:
            sentence = stripped.split("\t")[0]  # If tab-separated, get first column
            sentences_byt5.append(sentence)

# Fail with a readable message instead of letting np.vstack complain about an
# empty list further down.
if not sentences_byt5:
    raise ValueError(f"No non-empty lines found in {sn_file}")

# === Prepare Embeddings ===
# One sentence per forward pass: mean over the encoder's token states, then
# L2-normalise so dot products equal cosine similarities.
embeddings = []

for sentence in sentences_byt5:
    # No padding/truncation flags: with a single sentence padding does nothing,
    # and truncation without a max_length was ignored by the tokenizer anyway
    # (it only logged a "Default to no truncation" warning).
    inputs = tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():
        output = model.encoder(**inputs).last_hidden_state
    pooled = output.mean(dim=1)
    pooled = torch.nn.functional.normalize(pooled, p=2, dim=1)
    embeddings.append(pooled.cpu().numpy()[0])

# === Save Embeddings and Metadata ===
# vectors.tsv holds one embedding per row; metadata.tsv holds the matching
# sentence on the same row number.
all_embeddings = np.vstack(embeddings)
np.savetxt("vectors.tsv", all_embeddings, delimiter="\t")

with open("metadata.tsv", "w", encoding="utf-8") as f:
    for sent in sentences_byt5:
        f.write(sent + "\n")

# === Sample Cosine Similarities ===
if len(all_embeddings) > 1:
    sim_matrix = cosine_similarity(all_embeddings[:5])
    print("Sample cosine similarity matrix (first 5 sentences):")
    print(sim_matrix)

print("First 5 sentences:")
for i, sent in enumerate(sentences_byt5[:5], 1):
    print(f"{i}. {sent}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/866 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Sample cosine similarity matrix (first 5 sentences):
[[1.         0.940488   0.95522827 0.94646865 0.9622255 ]
 [0.940488   0.9999999  0.9245204  0.958811   0.96352   ]
 [0.95522827 0.9245204  1.0000002  0.94436234 0.9551693 ]
 [0.94646865 0.958811   0.94436234 1.0000002  0.97181344]
 [0.9622255  0.96352    0.9551693  0.97181344 1.0000002 ]]
First 5 sentences:
1. तस्यां चीरं वसानायां नाथवत्यामनाथवत्। प्रचुक्रोश जनः सर्वो धिक् त्वां दशरथं त्विति ॥
2. तेन तत्र प्रणादेन दुःखितः स महीपतिः। चिच्छेद जीविते श्रद्धां धर्मे यशसि चात्मनः॥ स निःश्वस्योष्णमैक्ष्वाकस्तां भार्यामिदमब्रवीत्। कैकेयि कुशचीरेण न सीता गन्तुमर्हति॥
3. सुकुमारी च बाला च सततं च सुखोचिता। नेयं वनस्य योग्येति सत्यमाह गुरुर्मम ॥
4. इयं हि कस्यापि करोति किंचित् तपस्विनी राजवरस्य पुत्री। या चीरमासाद्य वनस्य मध्ये जाता विसंज्ञा श्रमणीव काचित्॥
5. चीराण्यपास्याज्जनकस्य कन्या नेयं प्रतिज्ञा मम दत्तपूर्वा। यथासुखं गच्छतु राजपुत्री वनं समग्रा सह सर्वरत्नैः॥


In [4]:
# Upload prompt (relies on `files` imported in the first cell). In the recorded
# run the uploaded file was merged_sentences_with_analysis.xlsx.
uploaded = files.upload()

Saving merged_sentences_with_analysis.xlsx to merged_sentences_with_analysis.xlsx


In [5]:
import random

In [10]:
import pandas as pd
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import random

# Load Excel file
df = pd.read_excel("merged_sentences_with_analysis.xlsx")

# Load IndicBERT model and tokenizer
model_name = "ai4bharat/indic-bert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()
device = torch.device("cpu")
model.to(device)

# Extract the English sentences: this cell ranks neighbours by English-side
# similarity (the output file name says so too).
sentences = df["English"].astype(str).tolist()

# Generate embeddings: L2-normalised vector of the first ([CLS]) position.
embeddings = []
for sentence in tqdm(sentences, desc="Generating embeddings"):
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # CLS token
        cls_embedding = torch.nn.functional.normalize(cls_embedding, p=2, dim=1)
        embeddings.append(cls_embedding.cpu().numpy()[0])

# Compute cosine similarity matrix
similarities = cosine_similarity(embeddings)

# Randomly pick the base sentences (seeded, so the selection is repeatable).
# min() keeps random.sample from raising when the sheet has fewer rows than
# requested.
NUM_BASE_SENTENCES = 25
TOP_K_NEIGHBOURS = 10
random.seed(42)
selected_indices = random.sample(
    range(len(sentences)), min(NUM_BASE_SENTENCES, len(sentences))
)

# Prepare the top neighbours for each selected sentence
comparison_data = []
for idx in selected_indices:
    # Stable argsort on the negated row gives descending scores with ties kept
    # in index order, the same ordering as a stable reverse sort of the row.
    ranked = np.argsort(-similarities[idx], kind="stable")
    ranked = ranked[ranked != idx][:TOP_K_NEIGHBOURS]  # exclude self

    for j in ranked:
        j = int(j)
        score = similarities[idx, j]
        comparison_data.append({
            "Base English": df.loc[idx, "English"],
            "Base Sanskrit": df.loc[idx, "Sanskrit"],
            "Base Grammatical Analysis": df.loc[idx].get("Grammatical Analysis", ""),
            "Similar Sanskrit": df.loc[j, "Sanskrit"],
            "Similar English": df.loc[j, "English"],
            "Similar Grammatical Analysis": df.loc[j].get("Grammatical Analysis", ""),
            "Similarity Score": round(score, 4)
        })

# Save results: one row per (base sentence, neighbour) pair.
comparison_df = pd.DataFrame(comparison_data)
comparison_df.to_excel("indicbert_top25_each_English_similar.xlsx", index=False)


Generating embeddings: 100%|██████████| 6148/6148 [24:30<00:00,  4.18it/s]


In [None]:
import pandas as pd
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import itertools
from tqdm import tqdm

# Load Excel file
df = pd.read_excel("merged_sentences_with_analysis.xlsx")

# Load IndicBERT model and tokenizer (force CPU)
model_name = "ai4bharat/indic-bert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()
device = torch.device("cpu")
model.to(device)

# Extract Sanskrit sentences
sentences = df["Sanskrit"].astype(str).tolist()

# Generate embeddings: L2-normalised vector of the first ([CLS]) position.
embeddings = []
for sentence in tqdm(sentences, desc="Generating embeddings"):
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # CLS token
        cls_embedding = torch.nn.functional.normalize(cls_embedding, p=2, dim=1)
        embeddings.append(cls_embedding.cpu().numpy()[0])

# Compute cosine similarity
similarities = cosine_similarity(embeddings)

# Get the most similar sentence pairs (excluding self-similarity).
# Building a Python tuple per pair means roughly 18.9 million tuples for the
# 6148 sentences in the recorded run, so the pairs are kept as NumPy arrays.
# np.triu_indices(n, k=1) lists every i < j in row-major order, which is the
# order itertools.combinations(range(n), 2) would produce.
TOP_K_PAIRS = 25
n = len(sentences)
pair_rows, pair_cols = np.triu_indices(n, k=1)
pair_scores = similarities[pair_rows, pair_cols]

# Stable argsort of the negated scores: descending, ties kept in pair order.
best = np.argsort(-pair_scores, kind="stable")[:TOP_K_PAIRS]
top_pairs = [(pair_scores[p], int(pair_rows[p]), int(pair_cols[p])) for p in best]

# Create result DataFrame
comparison_data = []
for sim_score, i, j in top_pairs:
    comparison_data.append({
        "Sanskrit 1": df.loc[i, "Sanskrit"],
        "English 1": df.loc[i, "English"],
        "Grammatical Analysis 1": df.loc[i].get("Grammatical Analysis", ""),
        "Sanskrit 2": df.loc[j, "Sanskrit"],
        "English 2": df.loc[j, "English"],
        "Grammatical Analysis 2": df.loc[j].get("Grammatical Analysis", ""),
        "Similarity Score": round(sim_score, 4)
    })

# Save to Excel
comparison_df = pd.DataFrame(comparison_data)
comparison_df.to_excel("indicbert_similarity_top25_pairs.xlsx", index=False)


In [None]:
# Upload prompt. In the recorded run the uploaded file was dev.en, which a
# later cell opens by that name.
from google.colab import files
uploaded = files.upload()

Saving dev.en to dev.en


In [14]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Load Excel file
df = pd.read_excel("indicbert_top25_each_English_similar.xlsx")

# Load ByT5 model
model_name = "chronbmm/sanskrit-byt5-dp"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.eval()
device = torch.device("cpu")
model.to(device)

# Function to get normalized mean-pooled encoder embedding
def get_embedding(sentence):
    """Embed one sentence with the ByT5 encoder.

    Args:
        sentence: text to embed.

    Returns:
        1-D NumPy array: the mean of the encoder's last hidden states,
        L2-normalised.
    """
    # NOTE(review): ByT5 tokenizes UTF-8 bytes, so max_length=512 presumably
    # cuts long Devanagari verses well before their end - confirm this is
    # intended.
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)
    with torch.no_grad():
        encoder_outputs = model.encoder(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = encoder_outputs.last_hidden_state
        mean_embedding = hidden_states.mean(dim=1)
        normalized_embedding = torch.nn.functional.normalize(mean_embedding, p=2, dim=1)
        return normalized_embedding.cpu().numpy()[0]

# The same sentence can occur in many rows (each base sentence is listed once
# per neighbour), so every distinct sentence is embedded only once.
embedding_cache = {}

def _cached_embedding(sentence):
    """Return get_embedding(sentence), computing it at most once per sentence."""
    if sentence not in embedding_cache:
        embedding_cache[sentence] = get_embedding(sentence)
    return embedding_cache[sentence]

# Compute ByT5 cosine similarity
byt5_scores = []
for _, row in tqdm(df.iterrows(), total=len(df), desc="Computing ByT5 similarities"):
    base = str(row["Base Sanskrit"])
    similar = str(row["Similar Sanskrit"])
    emb1 = _cached_embedding(base)
    emb2 = _cached_embedding(similar)
    sim = cosine_similarity([emb1], [emb2])[0][0]
    byt5_scores.append(round(sim, 4))

# Add and save
df["ByT5 Similarity Score"] = byt5_scores
df.to_excel("indicbert_top25_each_English_similar_1.xlsx", index=False)


Computing ByT5 similarities: 100%|██████████| 250/250 [37:20<00:00,  8.96s/it]


In [15]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

from pathlib import Path  # imported here because this cell's import list has no path helper

# Load Excel file.
# This cell writes to OUTPUT_XLSX, the same file the ByT5 cell writes. Reading
# the untouched source workbook and then saving over OUTPUT_XLSX would drop any
# score column already stored there, so continue from OUTPUT_XLSX when it exists.
SOURCE_XLSX = "indicbert_top25_each_English_similar.xlsx"
OUTPUT_XLSX = "indicbert_top25_each_English_similar_1.xlsx"
df = pd.read_excel(OUTPUT_XLSX if Path(OUTPUT_XLSX).exists() else SOURCE_XLSX)

# Load Sentence-BERT model for English
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# The ByT5 checkpoint is not loaded here: nothing in this cell uses it.

# Function to get SBERT embeddings
def get_sbert_embedding(text):
    """Encode `text` with the SBERT model and return the embedding as a tensor."""
    return sbert_model.encode(text, convert_to_tensor=True)

# Compute SBERT cosine similarity for English sentences
sbert_scores = []
for _, row in tqdm(df.iterrows(), total=len(df), desc="Computing SBERT English similarities"):
    base_eng = str(row["Base English"])
    sim_eng = str(row["Similar English"])
    emb1 = get_sbert_embedding(base_eng)
    emb2 = get_sbert_embedding(sim_eng)
    sim = util.pytorch_cos_sim(emb1, emb2).item()
    sbert_scores.append(round(sim, 4))

# Add SBERT similarity column
df["SBERT Similarity Score"] = sbert_scores

# Save updated DataFrame
df.to_excel(OUTPUT_XLSX, index=False)
print("✅ English SBERT similarity computation complete.")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Computing SBERT English similarities: 100%|██████████| 250/250 [00:18<00:00, 13.75it/s]


✅ English SBERT similarity computation complete.


In [16]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Read the neighbour table produced earlier in the notebook
df = pd.read_excel("indicbert_top25_each_English_similar.xlsx")

# IndicBERT checkpoint; runs on the GPU when one is visible, otherwise on CPU
model_name = "ai4bharat/indic-bert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


def get_indicbert_embedding(sentence):
    """Return one sentence's attention-masked mean of the last hidden states, L2-normalised, as a 1-D array."""
    encoded = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=128)
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        token_states = model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state

        # Weight every position by its mask value so padded slots add nothing
        weights = attention_mask.unsqueeze(-1).expand(token_states.size()).float()
        token_sum = (token_states * weights).sum(dim=1)
        token_count = weights.sum(dim=1).clamp(min=1e-9)  # guard the division
        unit_vector = torch.nn.functional.normalize(token_sum / token_count, p=2, dim=1)

    return unit_vector.cpu().numpy()[0]


# Score every (base, similar) Sanskrit pair in the sheet
indicbert_scores = []
for _, row in tqdm(df.iterrows(), total=len(df), desc="Computing IndicBERT Sanskrit similarities"):
    base_vector = get_indicbert_embedding(str(row["Base Sanskrit"]))
    similar_vector = get_indicbert_embedding(str(row["Similar Sanskrit"]))
    pair_score = cosine_similarity([base_vector], [similar_vector])[0][0]
    indicbert_scores.append(round(pair_score, 4))

# Attach the new column and write the workbook
df["IndicBERT Sanskrit Similarity"] = indicbert_scores
df.to_excel("indicbert_top25_each_Sanakrit_similar_with_sn_similarity.xlsx", index=False)
print("✅ IndicBERT Sanskrit similarity computation complete.")


Computing IndicBERT Sanskrit similarities: 100%|██████████| 250/250 [01:29<00:00,  2.79it/s]

✅ IndicBERT Sanskrit similarity computation complete.





In [17]:
import pandas as pd
import re

# Workbook written by the IndicBERT Sanskrit-similarity cell
input_file = "indicbert_top25_each_Sanakrit_similar_with_sn_similarity.xlsx"
df = pd.read_excel(io=input_file)

# Function to extract lemma values as comma-separated string
def extract_lemmas(grammatical_analysis):
    """Collect every lemma="..." value found in an analysis string.

    Args:
        grammatical_analysis: cell value from the analysis column; may be
            missing (NaN/None) or, for spreadsheet data, a non-string value.

    Returns:
        The lemma values joined by ", " in order of appearance, or "" when the
        value is missing or contains no lemma attribute.
    """
    if pd.isna(grammatical_analysis):
        return ""
    # Spreadsheet cells are not guaranteed to be text; re.findall raises
    # TypeError on numbers, so coerce anything that is not already a string.
    text = grammatical_analysis if isinstance(grammatical_analysis, str) else str(grammatical_analysis)
    # Find all occurrences of lemma="..."
    lemmas = re.findall(r'lemma="(.*?)"', text)
    return ", ".join(lemmas)

# Derive the lemma column from each row's base-sentence analysis
df["Base_Lemmas"] = df["Base Grammatical Analysis"].map(extract_lemmas)

# Write the augmented table and report where it went
output_file = "Similarity_score_with_lemmas.xlsx"
df.to_excel(output_file, index=False)
print(f"File saved as: {output_file}")


File saved as: Similarity_score_with_lemmas.xlsx


In [None]:
import os
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

# === Set Up ===
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Ignore GPU

# Load model and tokenizer
# NOTE(review): the checkpoint name says Sanskrit while the input below is
# dev.en (English text in the recorded output) - confirm this pairing is the
# intended experiment.
model_name = "chronbmm/sanskrit-byt5-dp"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()

# === Load the First MAX_SENTENCES Non-Empty Sentences from File ===
# Single knob for the sample size; the limit in force is 500 sentences.
MAX_SENTENCES = 500
sn_file = "dev.en"  # Make sure the path is correct and file exists
sentences_byts_eng = []

with open(sn_file, "r", encoding="utf-8") as f:
    for line in f:
        if len(sentences_byts_eng) >= MAX_SENTENCES:
            break
        stripped = line.strip()
        if stripped:
            sentence = stripped.split("\t")[0]  # If tab-separated, get first column
            sentences_byts_eng.append(sentence)

# Fail with a readable message instead of letting np.vstack complain about an
# empty list further down.
if not sentences_byts_eng:
    raise ValueError(f"No non-empty lines found in {sn_file}")

# === Prepare Embeddings ===
# One sentence per forward pass: mean over the encoder's token states, then
# L2-normalise so dot products equal cosine similarities.
embeddings = []

for sentence in sentences_byts_eng:
    # No padding/truncation flags: with a single sentence padding does nothing,
    # and truncation without a max_length was ignored by the tokenizer anyway
    # (it only logged a "Default to no truncation" warning).
    inputs = tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():
        output = model.encoder(**inputs).last_hidden_state
    pooled = output.mean(dim=1)
    pooled = torch.nn.functional.normalize(pooled, p=2, dim=1)
    embeddings.append(pooled.cpu().numpy()[0])

# === Save Embeddings and Metadata ===
# vectors_en.tsv holds one embedding per row; metadata_en.tsv holds the
# matching sentence on the same row number.
all_embeddings = np.vstack(embeddings)
np.savetxt("vectors_en.tsv", all_embeddings, delimiter="\t")

with open("metadata_en.tsv", "w", encoding="utf-8") as f:
    for sent in sentences_byts_eng:
        f.write(sent + "\n")

# === Sample Cosine Similarities ===
if len(all_embeddings) > 1:
    sim_matrix = cosine_similarity(all_embeddings[:5])
    print("Sample cosine similarity matrix (first 5 sentences):")
    print(sim_matrix)


print("First 5 sentences:")
for i, sent in enumerate(sentences_byts_eng[:5], 1):
    print(f"{i}. {sent}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Sample cosine similarity matrix (first 5 sentences):
[[1.0000001  0.9141112  0.88752854 0.89655155 0.9052173 ]
 [0.9141112  1.0000001  0.87539583 0.93222344 0.9212436 ]
 [0.88752854 0.87539583 1.0000001  0.8796254  0.90854055]
 [0.89655155 0.93222344 0.8796254  0.9999998  0.9153956 ]
 [0.9052173  0.9212436  0.90854055 0.9153956  1.0000002 ]]
First 5 sentences:
1. When Şītā, having a husband although seeming as if she had none, was putting on the ascetic guise, the people got into a wrath and exclaimed, “O Dasaratha, fie on you!"
2. Aggrieved at the uproar that arose there in consequence, the lord of earth banished from his heart all regard for life, virtue, and fame. And sighing hot, that descendant of Ikşvāku spoke to that wife of his, saying, O Kaikeyi, Sītā deserves not to go in a Kuća dress.
3. Tender, and youthful, and worthy of happiness, she is by no means capable of living in the forest. My spiritual guide has spoken the truth.
4. Whom has this one injured that, being the daugh

In [None]:
# Show the first five entries of `sentences`, numbered from 1.
# `sentences` is not defined in this cell; it must already exist in the kernel.
print("First 5 sentences:")
for position, text in enumerate(sentences[:5], start=1):
    print(f"{position}. {text}")

First 5 sentences:
1. तस्यां चीरं वसानायां नाथवत्यामनाथवत्। प्रचुक्रोश जनः सर्वो धिक् त्वां दशरथं त्विति ॥
2. तेन तत्र प्रणादेन दुःखितः स महीपतिः। चिच्छेद जीविते श्रद्धां धर्मे यशसि चात्मनः॥ स निःश्वस्योष्णमैक्ष्वाकस्तां भार्यामिदमब्रवीत्। कैकेयि कुशचीरेण न सीता गन्तुमर्हति॥
3. सुकुमारी च बाला च सततं च सुखोचिता। नेयं वनस्य योग्येति सत्यमाह गुरुर्मम ॥
4. इयं हि कस्यापि करोति किंचित् तपस्विनी राजवरस्य पुत्री। या चीरमासाद्य वनस्य मध्ये जाता विसंज्ञा श्रमणीव काचित्॥
5. चीराण्यपास्याज्जनकस्य कन्या नेयं प्रतिज्ञा मम दत्तपूर्वा। यथासुखं गच्छतु राजपुत्री वनं समग्रा सह सर्वरत्नैः॥


In [None]:
import os
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

# === Set Up ===
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Disable GPU if needed

# Load IndicBERT model and tokenizer
model_name = "ai4bharat/indic-bert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()

# === Load the First MAX_SENTENCES Non-Empty Sentences from File ===
MAX_SENTENCES = 500
# Same token cap the other IndicBERT cells in this notebook pass to the
# tokenizer. Without an explicit max_length, truncation=True was ignored (the
# run logged "Default to no truncation").
MAX_TOKENS = 512
sn_file = "dev.sn"  # Ensure this file exists in the same directory
sentences = []

with open(sn_file, "r", encoding="utf-8") as f:
    for line in f:
        if len(sentences) >= MAX_SENTENCES:
            break
        stripped = line.strip()
        if stripped:
            sentence = stripped.split("\t")[0]  # keep only the first tab-separated column
            sentences.append(sentence)

# Fail with a readable message instead of letting np.vstack complain about an
# empty list further down.
if not sentences:
    raise ValueError(f"No non-empty lines found in {sn_file}")

# === Prepare Embeddings ===
# NOTE(review): the recorded run printed similarities of about 0.998 between
# all of the first five sentences, so these raw [CLS] vectors barely separate
# them; consider the masked mean pooling used by get_indicbert_embedding.
embeddings = []

for sentence in sentences:
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=MAX_TOKENS)
    with torch.no_grad():
        outputs = model(**inputs)
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # [CLS] token representation
        cls_embedding = torch.nn.functional.normalize(cls_embedding, p=2, dim=1)
        embeddings.append(cls_embedding.cpu().numpy()[0])

# === Save Embeddings and Metadata ===
# vectors_indic_bert.tsv holds one embedding per row; metadata_indic_bert.tsv
# holds the matching sentence on the same row number.
all_embeddings = np.vstack(embeddings)
np.savetxt("vectors_indic_bert.tsv", all_embeddings, delimiter="\t")

with open("metadata_indic_bert.tsv", "w", encoding="utf-8") as f:
    for sent in sentences:
        f.write(sent + "\n")

# === Sample Cosine Similarities ===
if len(all_embeddings) > 1:
    sim_matrix = cosine_similarity(all_embeddings[:5])
    print("Sample cosine similarity matrix (first 5 sentences):")
    print(sim_matrix)



Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Sample cosine similarity matrix (first 5 sentences):
[[1.0000005  0.9981257  0.998922   0.99869347 0.9985336 ]
 [0.9981257  0.99999994 0.99801064 0.99931884 0.9994087 ]
 [0.998922   0.99801064 0.99999994 0.9984797  0.99865603]
 [0.99869347 0.99931884 0.9984797  1.         0.9993559 ]
 [0.9985336  0.9994087  0.99865603 0.9993559  1.0000002 ]]


In [None]:
# Upload prompt. In the recorded run the uploaded file was
# Rāmāyaṇa-0000-Rām-Bā-1-1067.conllu, which a later cell parses.
from google.colab import files
uploaded = files.upload()

Saving Rāmāyaṇa-0000-Rām-Bā-1-1067.conllu to Rāmāyaṇa-0000-Rām-Bā-1-1067.conllu


In [None]:
pip install conllu

Collecting conllu
  Downloading conllu-6.0.0-py3-none-any.whl.metadata (21 kB)
Downloading conllu-6.0.0-py3-none-any.whl (16 kB)
Installing collected packages: conllu
Successfully installed conllu-6.0.0


In [None]:
from conllu import parse_incr
import pandas as pd

file_path = "Rāmāyaṇa-0000-Rām-Bā-1-1067.conllu"

# (output column, token key) pairs, in the column order of the spreadsheet
CONLLU_COLUMNS = (
    ("ID", "id"),
    ("FORM", "form"),
    ("LEMMA", "lemma"),
    ("UPOS", "upos"),      # Universal POS tag
    ("XPOS", "xpos"),      # Language-specific POS tag
    ("FEATS", "feats"),    # Morphological features
    ("HEAD", "head"),      # Head word (by index)
    ("DEPREL", "deprel"),  # Dependency relation
    ("MISC", "misc"),      # Miscellaneous (e.g., glosses)
)

# Flatten the file into one record per token, sentence after sentence
with open(file_path, "r", encoding="utf-8") as f:
    data = [
        {column: token.get(key) for column, key in CONLLU_COLUMNS}
        for token_list in parse_incr(f)
        for token in token_list
    ]

# Tabulate the records and export them
df = pd.DataFrame(data)
df.to_excel("parsed_conllu.xlsx", index=False)
print("Parsed CoNLL-U file saved as 'parsed_conllu.xlsx'")


Parsed CoNLL-U file saved as 'parsed_conllu.xlsx'


In [None]:
# Plain-text dump of the parsed token table; in the recorded output pandas
# elided the middle rows ("...") and wrapped the columns across several blocks.
print(df)

             ID                   FORM      LEMMA  UPOS  XPOS  \
0     (1, -, 3)  tapaḥsvādhyāyanirataṃ          _     _  None   
1             1                  tapas      tapas  NOUN  None   
2             2              svādhyāya  svādhyāya  NOUN  None   
3             3                niratam      niram  VERB  None   
4             4                tapasvī   tapasvin  NOUN  None   
...         ...                    ...        ...   ...   ...   
1257          6                     ca         ca  CONJ  None   
1258          7                  śūdro      śūdra  NOUN  None   
1259          8                    'pi        api  PART  None   
1260          9              mahattvam   mahattva  NOUN  None   
1261         10                   īyāt          i  VERB  None   

                                                  FEATS  HEAD DEPREL  \
0                                                  None  None      _   
1                                       {'Case': 'Cpd'}  None      _   
2  

In [None]:
# Upload prompt. In the recorded run the uploaded file was
# merged_sentences_with_analysis.xlsx, which the next cell reads.
from google.colab import files
uploaded = files.upload()

Saving merged_sentences_with_analysis.xlsx to merged_sentences_with_analysis.xlsx


In [None]:
import pandas as pd
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import itertools
from tqdm import tqdm

# Load Excel file
df = pd.read_excel("merged_sentences_with_analysis.xlsx")

# Load IndicBERT model and tokenizer (force CPU)
model_name = "ai4bharat/indic-bert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()
device = torch.device("cpu")
model.to(device)

# Extract Sanskrit sentences
sentences = df["Sanskrit"].astype(str).tolist()

# Generate embeddings: L2-normalised vector of the first ([CLS]) position.
embeddings = []
for sentence in tqdm(sentences, desc="Generating embeddings"):
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # CLS token
        cls_embedding = torch.nn.functional.normalize(cls_embedding, p=2, dim=1)
        embeddings.append(cls_embedding.cpu().numpy()[0])

# Compute cosine similarity
similarities = cosine_similarity(embeddings)

# Get the most similar sentence pairs (excluding self-similarity).
# Building a Python tuple per pair means roughly 18.9 million tuples for the
# 6148 sentences in the recorded run, so the pairs are kept as NumPy arrays.
# np.triu_indices(n, k=1) lists every i < j in row-major order, which is the
# order itertools.combinations(range(n), 2) would produce.
TOP_K_PAIRS = 25
n = len(sentences)
pair_rows, pair_cols = np.triu_indices(n, k=1)
pair_scores = similarities[pair_rows, pair_cols]

# Stable argsort of the negated scores: descending, ties kept in pair order.
best = np.argsort(-pair_scores, kind="stable")[:TOP_K_PAIRS]
top_pairs = [(pair_scores[p], int(pair_rows[p]), int(pair_cols[p])) for p in best]

# Create result DataFrame
comparison_data = []
for sim_score, i, j in top_pairs:
    comparison_data.append({
        "Sanskrit 1": df.loc[i, "Sanskrit"],
        "English 1": df.loc[i, "English"],
        "Grammatical Analysis 1": df.loc[i].get("Grammatical Analysis", ""),
        "Sanskrit 2": df.loc[j, "Sanskrit"],
        "English 2": df.loc[j, "English"],
        "Grammatical Analysis 2": df.loc[j].get("Grammatical Analysis", ""),
        "Similarity Score": round(sim_score, 4)
    })

# Save to Excel
comparison_df = pd.DataFrame(comparison_data)
comparison_df.to_excel("indicbert_similarity_top25_pairs.xlsx", index=False)


config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/5.65M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/135M [00:00<?, ?B/s]

Generating embeddings:   0%|          | 0/6148 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/135M [00:00<?, ?B/s]

Generating embeddings: 100%|██████████| 6148/6148 [22:21<00:00,  4.58it/s]


In [None]:
import os
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from sklearn.cluster import KMeans

# ByT5 checkpoint, evaluated on CPU
model_name = "chronbmm/sanskrit-byt5-dp"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()
device = torch.device("cpu")
model.to(device)

# Collect up to 200 non-empty lines, keeping only the first tab-separated field
sn_file = "dev.sn"  # Update if needed
sentences = []
with open(sn_file, "r", encoding="utf-8") as f:
    for raw_line in f:
        text = raw_line.strip()
        if not text:
            continue
        sentences.append(text.split("\t")[0])
        if len(sentences) >= 200:  # cap the sample to keep memory use low
            break

# One vector per sentence: mean of the encoder states, L2-normalised
embeddings = []
for text in sentences:
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    with torch.no_grad():
        token_states = model.encoder(**encoded).last_hidden_state
    unit_vector = torch.nn.functional.normalize(token_states.mean(dim=1), p=2, dim=1)
    embeddings.append(unit_vector.squeeze().cpu().numpy().astype(np.float32))  # stored as float32

# Turn the list of vectors into a single 2-D array
all_embeddings = np.stack(embeddings)

# Partition the vectors into five groups (seeded)
kmeans = KMeans(n_clusters=5, random_state=42)
labels = kmeans.fit_predict(all_embeddings)

# Pair each sentence with its cluster id and export
df = pd.DataFrame({"Sentence": sentences, "Cluster": labels})
df.to_excel("byt5_sanskrit_clusters.xlsx", index=False)
print("Clustering complete and saved to 'byt5_sanskrit_clusters.xlsx'")


Clustering complete and saved to 'byt5_sanskrit_clusters.xlsx'


In [18]:
# Upload prompt. In the recorded run the uploaded file was
# Similarity_score_baseine.xlsx, which a later cell reads by that name.
from google.colab import files
uploaded = files.upload()

Saving Similarity_score_baseine.xlsx to Similarity_score_baseine.xlsx


In [19]:
pip install dharmamitra_sanskrit_grammar


Collecting dharmamitra_sanskrit_grammar
  Downloading dharmamitra_sanskrit_grammar-0.1.7-py3-none-any.whl.metadata (2.2 kB)
Downloading dharmamitra_sanskrit_grammar-0.1.7-py3-none-any.whl (4.9 kB)
Installing collected packages: dharmamitra_sanskrit_grammar
Successfully installed dharmamitra_sanskrit_grammar-0.1.7


# Lemma and morphosyntactic tag extraction (Dharmamitra)

In [20]:
import pandas as pd
from tqdm import tqdm
from dharmamitra_sanskrit_grammar import DharmamitraSanskritProcessor

# Load the Excel file
df = pd.read_excel("Similarity_score_baseine.xlsx")

# Initialize the Sanskrit processor
processor = DharmamitraSanskritProcessor()

# Define a function to extract lemmas from a Sanskrit sentence
def extract_lemmas(sentence):
    """Return the lemmas the processor reports for one sentence.

    Args:
        sentence: Sanskrit text, sent to the processor as a one-item batch.

    Returns:
        The "lemma" values from the first result's "grammatical_analysis"
        entries, joined by ", ". If anything raises, returns "ERROR: <message>"
        instead, so that failed rows can be told apart from real lemma lists
        (the same marker extract_lemma_tag uses).
    """
    try:
        results = processor.process_batch([sentence], mode="unsandhied-lemma-morphosyntax", human_readable_tags=True)
        grammatical_info = results[0].get("grammatical_analysis", [])
        lemmas = [entry["lemma"] for entry in grammatical_info if "lemma" in entry]
        return ", ".join(lemmas)
    except Exception as e:
        # Deliberately best-effort: one bad sentence must not abort the whole
        # column, but the failure has to stay visible in the output.
        return f"ERROR: {e}"

# Apply the function to the Base Sanskrit and Similar Sanskrit columns
tqdm.pandas(desc="Extracting Lemmas for Base Sanskrit")
df["Base Sanskrit Lemmas"] = df["Base Sanskrit"].progress_apply(extract_lemmas)

tqdm.pandas(desc="Extracting Lemmas for Similar Sanskrit")
df["Similar Sanskrit Lemmas"] = df["Similar Sanskrit"].progress_apply(extract_lemmas)

# Save the updated dataframe
df.to_excel("Similarity_score_with_lemmas.xlsx", index=False)


Extracting Lemmas for Base Sanskrit: 100%|██████████| 250/250 [02:13<00:00,  1.88it/s]
Extracting Lemmas for Similar Sanskrit: 100%|██████████| 250/250 [02:13<00:00,  1.87it/s]


In [23]:
import pandas as pd
from tqdm import tqdm
from dharmamitra_sanskrit_grammar import DharmamitraSanskritProcessor

# === Read the Workbook That Already Holds the Lemma Columns ===
df = pd.read_excel("Similarity_score_with_lemmas.xlsx")

# === Create the Sanskrit Processor ===
processor = DharmamitraSanskritProcessor()

# === Lemma + Tag Extraction ===
def extract_lemma_tag(sentence):
    """Return "lemma: tag" items for one sentence, joined by ", "; on failure return "ERROR: <message>"."""
    try:
        batch = processor.process_batch(
            [sentence],
            mode="unsandhied-lemma-morphosyntax",
            human_readable_tags=True
        )
        parts = []
        for entry in batch[0].get("grammatical_analysis", []):
            if "lemma" not in entry:
                continue
            # A missing or empty tag leaves the text after the colon blank
            tag = entry["tag"] if entry.get("tag") else ""
            parts.append(f"{entry['lemma']}: {tag}")
        return ", ".join(parts)
    except Exception as e:
        return f"ERROR: {str(e)}"

# === Fill Both Lemma+Tag Columns ===
for source_column, target_column, progress_label in (
    ("Base Sanskrit", "Base Sanskrit Lemma+Tags", "Extracting Lemma+Tags for Base Sanskrit"),
    ("Similar Sanskrit", "Similar Sanskrit Lemma+Tags", "Extracting Lemma+Tags for Similar Sanskrit"),
):
    tqdm.pandas(desc=progress_label)
    df[target_column] = df[source_column].progress_apply(extract_lemma_tag)

# === Write the Result Back ===
output_path = r"Similarity_score_with_lemmas.xlsx"
df.to_excel(output_path, index=False)


Extracting Lemma+Tags for Base Sanskrit: 100%|██████████| 250/250 [02:13<00:00,  1.87it/s]
Extracting Lemma+Tags for Similar Sanskrit: 100%|██████████| 250/250 [02:13<00:00,  1.87it/s]
