In [1]:
# Mount Google Drive at /content/drive; the chapter files read later in this
# notebook are opened from paths under that mount point.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install necessary libraries
!pip install umap-learn matplotlib

Collecting umap-learn
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading umap_learn-0.5.7-py3-none-any.whl (88 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.8/88.8 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pynndescent-0.5.13-py3-none-any.whl (56 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.9/56.9 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynndescent, umap-learn
Successfully installed pynndescent-0.5.13 umap-learn-0.5.7


In [4]:
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import euclidean
from sklearn.metrics import jaccard_score
import torch
import umap
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [38]:

# Load the pre-trained IndicBERT model and tokenizer for Sanskrit and English
# Both objects are module-level globals that get_embeddings() reads directly.
# NOTE(review): confirm that Sanskrit is among the languages this checkpoint
# was pre-trained on; if it is not, the Sanskrit embeddings may be unreliable.
indicbert_model_name = 'ai4bharat/indic-bert'  # IndicBERT model for multiple Indian languages
# Tokenizer and encoder are loaded from the same checkpoint name so they match.
tokenizer = AutoTokenizer.from_pretrained(indicbert_model_name)
model = AutoModel.from_pretrained(indicbert_model_name)

# Function to encode text and get embeddings
def get_embeddings(text):
    """Encode ``text`` with the global tokenizer/model and mean-pool the tokens.

    Args:
        text: A string, or a list of strings, to embed. The tokenizer is asked
            to truncate at 512 tokens, so only the leading part of a longer
            text contributes to the result.

    Returns:
        torch.Tensor: the last hidden state averaged over the token axis
        (dim 1), counting only positions whose attention mask is 1. For a
        single unpadded string this equals a plain mean over all tokens.
    """
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():  # inference only; no gradients needed
        outputs = model(**inputs)
    hidden_states = outputs.last_hidden_state
    # Weight each token by its attention mask so that padding positions (added
    # when a batch mixes texts of different lengths) do not dilute the mean.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
    token_sum = (hidden_states * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1)  # avoid division by zero
    embeddings = token_sum / token_count
    return embeddings

# Load Sanskrit and English chapters from files on the mounted Drive.
# NOTE(review): the Sanskrit file is "chapter 37" while the English file is
# "chapter 46" -- confirm these two files are meant to be compared.
SANSKRIT_CHAPTER_PATH = '/content/drive/MyDrive/Valmiki Ramayana/Sanskrit/Yuddhakanda/chapters/chapter 37.txt'
ENGLISH_CHAPTER_PATH = '/content/drive/MyDrive/Valmiki Ramayana/English/Yuddhakanda/chapters/chapter 46.txt'


def read_chapter(path):
    """Return the UTF-8 text stored at ``path``.

    Raises:
        ValueError: if the file contains no non-whitespace text, since an
            embedding of an empty chapter would be meaningless.
    """
    with open(path, 'r', encoding='utf-8') as chapter_file:
        chapter_text = chapter_file.read()
    if not chapter_text.strip():
        raise ValueError(f"Chapter file is empty: {path}")
    return chapter_text


sanskrit_chapter = read_chapter(SANSKRIT_CHAPTER_PATH)
english_chapter = read_chapter(ENGLISH_CHAPTER_PATH)

# Get embeddings for Sanskrit and English chapters as NumPy arrays
sanskrit_embeddings = get_embeddings(sanskrit_chapter).numpy()
english_embeddings = get_embeddings(english_chapter).numpy()

# UMAP projection of the embeddings, drawn only when each language contributes
# more than one embedding row.
# NOTE: get_embeddings averages over the token axis, so embedding one whole
# chapter string per language gives a single row each and this branch is
# skipped (the else-branch message is printed instead).
if sanskrit_embeddings.shape[0] > 1 and english_embeddings.shape[0] > 1:
    # Stack both languages and keep a parallel list of language labels
    embeddings = np.concatenate((sanskrit_embeddings, english_embeddings), axis=0)
    labels = ['Sanskrit'] * sanskrit_embeddings.shape[0] + ['English'] * english_embeddings.shape[0]

    # UMAP is stochastic; a fixed random_state makes the layout reproducible
    # across Restart & Run All.
    reducer = umap.UMAP(n_neighbors=2, random_state=42)
    umap_embeddings = reducer.fit_transform(embeddings)

    # Create a DataFrame for seaborn visualization
    df = pd.DataFrame(umap_embeddings, columns=['UMAP 1', 'UMAP 2'])
    df['Language'] = labels

    # Plot the projection on an explicit Axes, coloured by language
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.scatterplot(x='UMAP 1', y='UMAP 2', hue='Language', palette='Spectral', data=df, s=100, alpha=0.7, ax=ax)
    ax.set_title('UMAP projection of Sanskrit and English chapter embeddings')
    plt.show()
else:
    print("Not enough data points for UMAP. Skipping UMAP projection.")

# Compute cosine similarity between the Sanskrit and English embeddings.
# cosine_similarity returns a pairwise matrix; [0][0] is the score between the
# first row of each input.
cosine_similarity_score = cosine_similarity(sanskrit_embeddings, english_embeddings)[0][0]
print("Cosine Similarity Score:", cosine_similarity_score)

# Compute Jaccard similarity.
# Jaccard similarity is typically used for binary or categorical data, so each
# embedding is binarized first: 1 where a component is > 0, otherwise 0.
sanskrit_binarized = (sanskrit_embeddings[0] > 0).astype(int)
english_binarized = (english_embeddings[0] > 0).astype(int)

# jaccard_score comes from the notebook's import cell; it is not re-imported here.
jaccard_similarity_score = jaccard_score(sanskrit_binarized, english_binarized)
print("Jaccard Similarity Score:", jaccard_similarity_score)

# Compute Euclidean distance between the first Sanskrit and English embeddings
euclidean_distance = euclidean(sanskrit_embeddings[0], english_embeddings[0])
print("Euclidean Distance:", euclidean_distance)





Not enough data points for UMAP. Skipping UMAP projection.
Cosine Similarity Score: 0.6935919
Jaccard Similarity Score: 0.46520874751491054
Euclidean Distance: 6.095340251922607
