In [3]:
#!/usr/bin/env python3

import os, json, pickle, random,re
from pathlib import Path

INPUT = "C://Users/rani/Desktop/nlp lab/lab1/hindi_tokens.txt"   # will try .json .pkl .txt or bare file
VAL_SIZE = 1000
TEST_SIZE = 1000
SEED = 42
max_sentences = 10000 
def load_sentences(base):
    candidates = [base, base + ".json", base + ".pkl", base + ".txt"]
    for f in candidates:
        if not os.path.exists(f):
            continue
        sentences = []
        sentence_count = 0
        if f.endswith(".txt"):
            with open(f, "r", encoding="utf8") as fh:
                for line in fh:
                    # Split paragraph into sentences by punctuation marks
                    parts = re.split(r'[।!?.]', line)
                    for sentence in parts:
                        sentence = sentence.strip()
                        if sentence:
                            tokens = sentence.split()
                            if tokens:
                                sentences.append(tokens)
                                sentence_count += 1
                                if sentence_count >= max_sentences:
                                    break
                    if sentence_count >= max_sentences:
                        break
            return sentences
        

sentences = load_sentences(INPUT)
print(f"Loaded {len(sentences)} sentences.")
random.seed(SEED)
random.shuffle(sentences)
if len(sentences) < VAL_SIZE + TEST_SIZE:
    raise ValueError("Not enough sentences to create the requested splits.")
val = sentences[:VAL_SIZE]
test = sentences[VAL_SIZE:VAL_SIZE+TEST_SIZE]
train = sentences[VAL_SIZE+TEST_SIZE:]




Loaded 10000 sentences.


In [4]:
import math
from collections import Counter

def compute_pmi(unigram_counts, bigram_counts, total_unigrams):
    """Compute PMI for all bigrams seen."""
    pmi = {}
    for (w1, w2), c12 in bigram_counts.items():
        p_w1 = unigram_counts.get((w1,), 0) / total_unigrams
        p_w2 = unigram_counts.get((w2,), 0) / total_unigrams
        p_w1w2 = c12 / (total_unigrams - 1)
        if p_w1 > 0 and p_w2 > 0 and p_w1w2 > 0:
            pmi[(w1, w2)] = math.log2(p_w1w2 / (p_w1 * p_w2))
    return pmi

def score_bigrams_in_set(tokenized_sentences, pmi_dict):
    scores = []
    for sent in tokenized_sentences:
        for i in range(len(sent)-1):
            bigram = (sent[i], sent[i+1])
            if bigram in pmi_dict:
                scores.append((bigram, pmi_dict[bigram]))
    return scores

from sklearn.feature_extraction.text import TfidfVectorizer

# Convert your tokenized sentences to text strings
def detokenize(sentences):
    return [" ".join(tokens) for tokens in sentences]

train_texts = detokenize(train)
val_texts = detokenize(val)
test_texts = detokenize(test)

# Learn IDF on train only
tfidf = TfidfVectorizer(min_df=2, ngram_range=(1,2))  # you can include bigrams
X_train = tfidf.fit_transform(train_texts)
X_val = tfidf.transform(val_texts)
X_test = tfidf.transform(test_texts)


In [8]:
features = tfidf.get_feature_names_out()
print("Vocabulary size:", len(features))
print("Sample features:", features[:20])


Vocabulary size: 9680
Sample features: ['00' '00 लर' '000' '000 तक' '000 पय' '01' '02' '04' '04 तत' '042'
 '042 पय' '05' '06' '07' '08' '08 58' '09' '10' '10 000' '10 30']


In [22]:
import pandas as pd

sample = X_train[:50].toarray()  # first 5 sentences
df = pd.DataFrame(sample, columns=features)
print(df.sample(10))


     00  00 लर       000  000 तक  000 पय   01   02   04  04 तत  042  ...  \
20  0.0    0.0  0.000000     0.0     0.0  0.0  0.0  0.0    0.0  0.0  ...   
11  0.0    0.0  0.000000     0.0     0.0  0.0  0.0  0.0    0.0  0.0  ...   
42  0.0    0.0  0.000000     0.0     0.0  0.0  0.0  0.0    0.0  0.0  ...   
35  0.0    0.0  0.000000     0.0     0.0  0.0  0.0  0.0    0.0  0.0  ...   
28  0.0    0.0  0.000000     0.0     0.0  0.0  0.0  0.0    0.0  0.0  ...   
13  0.0    0.0  0.437387     0.0     0.0  0.0  0.0  0.0    0.0  0.0  ...   
19  0.0    0.0  0.000000     0.0     0.0  0.0  0.0  0.0    0.0  0.0  ...   
33  0.0    0.0  0.000000     0.0     0.0  0.0  0.0  0.0    0.0  0.0  ...   
38  0.0    0.0  0.000000     0.0     0.0  0.0  0.0  0.0    0.0  0.0  ...   
1   0.0    0.0  0.000000     0.0     0.0  0.0  0.0  0.0    0.0  0.0  ...   

    हस षर  हसन  हड़क   ज़त  ज़त बर   ड़क  ड़कर   ड़छ   ड़त   ड़न  
20    0.0  0.0  0.0  0.0    0.0  0.0  0.0  0.0  0.0  0.0  
11    0.0  0.0  0.0  0.0    0.0  0.0  0.0  0.

In [14]:
def top_words_for_sentence(X, features, index, top_n=10):
    row = X[index].toarray()[0]
    top_idx = row.argsort()[-top_n:][::-1]
    print("Top TF–IDF words for sentence", index, ":")
    for i in top_idx:
        if row[i] > 0:
            print(f"{features[i]} -> {row[i]:.4f}")

top_words_for_sentence(X_train, features, 0, top_n=10)


Top TF–IDF words for sentence 0 :
धन -> 0.7540
आर -> 0.6569


In [23]:
import numpy as np

features = tfidf.get_feature_names_out()
with open("tfidf_readable.txt", "w", encoding="utf-8") as f:
    for i in range(X_train.shape[0]):  # each sentence
        row = X_train[i].toarray()[0]
        nz_idx = row.nonzero()[0]
        words_weights = [(features[j], row[j]) for j in nz_idx]
        words_weights = sorted(words_weights, key=lambda x: x[1], reverse=True)[:10]  # top 10
        f.write(f"Sentence {i}:\n")
        for word, weight in words_weights:
            f.write(f"  {word}: {weight:.4f}\n")
        f.write("\n")


In [24]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def nearest_neighbors_within_set(X):
    # compute cosine similarity matrix
    sim_matrix = cosine_similarity(X)
    np.fill_diagonal(sim_matrix, -1)  # ignore self-similarity
    nearest_idx = np.argmax(sim_matrix, axis=1)
    return nearest_idx, np.max(sim_matrix, axis=1)

val_nn_idx, val_nn_score = nearest_neighbors_within_set(X_val)
test_nn_idx, test_nn_score = nearest_neighbors_within_set(X_test)


In [25]:
print("Validation sentence:", val_texts[0])
print("Nearest neighbor:", val_texts[val_nn_idx[0]])
print("Similarity score:", val_nn_score[0])


Validation sentence: जरुरत की चीजें गांव की दुकानों से खरीदते हैं
Nearest neighbor: व्यापारिक समूह को इससे चिन्तित होने की जरूरत नहीं
Similarity score: 0.4398336019931508
