# Cross-Lingual Alignment (English ↔ Hindi)

## Preprocess Hindi Corpus
You can use the indic-nlp-library for Hindi-specific tokenization and normalization.

In [3]:
import os
os.environ["INDIC_RESOURCES_PATH"] = "D:\RESEARCH related\PreCog tasks\indic_nlp_resources"

  os.environ["INDIC_RESOURCES_PATH"] = "D:\RESEARCH related\PreCog tasks\indic_nlp_resources"


## Preprocessing of Hindi language

In [6]:
pip install indic-nlp-library

Collecting indic-nlp-library
  Using cached indic_nlp_library-0.92-py3-none-any.whl.metadata (5.7 kB)
Collecting sphinx-argparse (from indic-nlp-library)
  Using cached sphinx_argparse-0.5.2-py3-none-any.whl.metadata (3.7 kB)
Collecting sphinx-rtd-theme (from indic-nlp-library)
  Using cached sphinx_rtd_theme-3.0.2-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting morfessor (from indic-nlp-library)
  Using cached Morfessor-2.0.6-py3-none-any.whl.metadata (628 bytes)
Collecting sphinx>=5.1.0 (from sphinx-argparse->indic-nlp-library)
  Using cached sphinx-8.2.3-py3-none-any.whl.metadata (7.0 kB)
Collecting docutils>=0.19 (from sphinx-argparse->indic-nlp-library)
  Using cached docutils-0.21.2-py3-none-any.whl.metadata (2.8 kB)
Collecting sphinxcontrib-jquery<5,>=4 (from sphinx-rtd-theme->indic-nlp-library)
  Using cached sphinxcontrib_jquery-4.1-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting sphinxcontrib-applehelp>=1.0.7 (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-library)
  Usin

In [2]:
import indicnlp
from indicnlp import common
from indicnlp import loader
from indicnlp.tokenize import indic_tokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
from collections import Counter
import numpy as np

# Set up Indic NLP resources
INDIC_RESOURCES_PATH = "D:\\RESEARCH related\\PreCog tasks\\indic_nlp_resources"  # Replace with your path
common.set_resources_path(INDIC_RESOURCES_PATH)
loader.load()

# Load Hindi corpus file
path = "D:\\RESEARCH related\\PreCog tasks\\Language_representations\\Data\\hin_news_2020_300K\\hin_news_2020_300K-sentences.txt"
with open(path, 'r', encoding='utf-8') as f:
    hindi_lines = f.readlines()

# Preprocess Hindi text
def preprocess_hindi(text):
    normalizer = IndicNormalizerFactory().get_normalizer("hi") # Hindi language
    text = normalizer.normalize(text) # Normalize the text
    text = text.replace('\n', ' ')  # Replace newlines with spaces
    tokens = list(indic_tokenize.trivial_tokenize(text, lang='hi')) # Tokenize the text
    tokens = [token for token in tokens if token.strip()]  # Remove empty tokens
    tokens = [token.lower() for token in tokens]
    return tokens

processed_hindi = [preprocess_hindi(sent) for sent in hindi_lines]

# Example usage
for i in range(5):
    print(f"Original: {hindi_lines[i]}")
    print(f"Processed: {processed_hindi[i]}")
    print()

Original: 1	⏺️ 03 मजदूरों को बेहतर इलाज के लिए रायपुर ले जाने की करवाई गई व्यवस्था pic.

Processed: ['1', '⏺️', '03', 'मजदूरों', 'को', 'बेहतर', 'इलाज', 'के', 'लिए', 'रायपुर', 'ले', 'जाने', 'की', 'करवाई', 'गई', 'व्यवस्था', 'pic', '.']

Original: 2	• 06.00 PM से 07.00 PM: सांस्कृतिक कार्यक्रमों में हिस्सा.

Processed: ['2', '•', '06.00', 'pm', 'से', '07.00', 'pm', ':', 'सांस्कृतिक', 'कार्यक्रमों', 'में', 'हिस्सा', '.']

Original: 3	० में कहा कि लॉकडाउन के बाद गरीब कल्याण योजना का ऐलान किया गया था।

Processed: ['3', '०', 'में', 'कहा', 'कि', 'लॉकडाउन', 'के', 'बाद', 'गरीब', 'कल्याण', 'योजना', 'का', 'ऐलान', 'किया', 'गया', 'था', '।']

Original: 4	"100 मरीजों पर नियंत्रित क्लिनिकल ट्रायल किया गया, जिसमें तीन दिन के अंदर 69 प्रतिशत और चार दिन के अंदर शत प्रतिशत मरीज ठीक हो गए और उनकी जांच रिपोर्ट निगेटिव आई।"

Processed: ['4', '"', '100', 'मरीजों', 'पर', 'नियंत्रित', 'क्लिनिकल', 'ट्रायल', 'किया', 'गया', ',', 'जिसमें', 'तीन', 'दिन', 'के', 'अंदर', '69', 'प्रतिशत', 'और', 'चार', 'दिन', 'के', 'अंदर'

## Build Vocabulary & Get Pretrained Embeddings

In [3]:
from collections import Counter

# Flatten and count
flat_tokens = [token for sent in processed_hindi for token in sent]
vocab_counter = Counter(flat_tokens)

# Top-N words
top_n = 10000
vocab = [word for word, freq in vocab_counter.most_common(top_n)]
word2id = {word: idx for idx, word in enumerate(vocab)}


# Build the Co-occurrence Matrix

In [6]:
from collections import defaultdict
import numpy as np

def build_cooc_matrix(sentences, word2id, window_size=4):
    cooc_mat = np.zeros((len(word2id), len(word2id)), dtype=np.float32)
    
    for sent in sentences:
        token_ids = [word2id[w] for w in sent if w in word2id]
        for center_pos, center_id in enumerate(token_ids):
            start = max(0, center_pos - window_size)
            end = min(len(token_ids), center_pos + window_size + 1)
            for context_pos in range(start, end):
                if context_pos == center_pos:
                    continue
                context_id = token_ids[context_pos]
                cooc_mat[center_id][context_id] += 1.0
    return cooc_mat

cooc_matrix = build_cooc_matrix(processed_hindi, word2id)

# Apply Dimensionality Reduction (SVD or NMF)

In [7]:
from sklearn.decomposition import TruncatedSVD

def reduce_with_svd(cooc_matrix, dim=300):
    svd = TruncatedSVD(n_components=dim, random_state=42)
    return svd.fit_transform(cooc_matrix)

hindi_embeddings = reduce_with_svd(cooc_matrix, dim=300)

# Build Final Word2Vec-like Dictionary

In [9]:
id2word = {idx: word for word, idx in word2id.items()}
hindi_word2vec = {id2word[i]: vec for i, vec in enumerate(hindi_embeddings)}

# save my model in pickle format
import pickle
with open("D:\\RESEARCH related\\PreCog tasks\\Language_representations\\models\\my_hindi_embeddings.pkl", "wb") as f:
    pickle.dump(hindi_word2vec, f)