In [7]:
#!/usr/bin/env python3
"""
Task 1: Create data splits (random sampling):
 - Validation Set: 1000 sentences
 - Test Set: 1000 sentences
 - Training Set: remaining sentences
"""

import os, json, pickle, random,re
from pathlib import Path

INPUT = "C://Users/rani/Desktop/nlp lab/lab1/hindi_tokens.txt"   # will try .json .pkl .txt or bare file
VAL_SIZE = 1000
TEST_SIZE = 1000
SEED = 42
max_sentences = 10000 
def load_sentences(base):
    candidates = [base, base + ".json", base + ".pkl", base + ".txt"]
    for f in candidates:
        if not os.path.exists(f):
            continue
        sentences = []
        sentence_count = 0
        if f.endswith(".txt"):
            with open(f, "r", encoding="utf8") as fh:
                for line in fh:
                    # Split paragraph into sentences by punctuation marks
                    parts = re.split(r'[।!?.]', line)
                    for sentence in parts:
                        sentence = sentence.strip()
                        if sentence:
                            tokens = sentence.split()
                            if tokens:
                                sentences.append(tokens)
                                sentence_count += 1
                                if sentence_count >= max_sentences:
                                    break
                    if sentence_count >= max_sentences:
                        break
            return sentences
        

def save_pickle(obj, fn):
    with open(fn, "wb") as fh:
        pickle.dump(obj, fh)

def main():
    sentences = load_sentences(INPUT)
    print(f"Loaded {len(sentences)} sentences.")
    random.seed(SEED)
    random.shuffle(sentences)
    if len(sentences) < VAL_SIZE + TEST_SIZE:
        raise ValueError("Not enough sentences to create the requested splits.")
    val = sentences[:VAL_SIZE]
    test = sentences[VAL_SIZE:VAL_SIZE+TEST_SIZE]
    train = sentences[VAL_SIZE+TEST_SIZE:]
    save_pickle(train, "train.pkl")
    save_pickle(val, "val.pkl")
    save_pickle(test, "test.pkl")
    print("Splits written: train.pkl ({}), val.pkl ({}), test.pkl ({})".format(len(train), len(val), len(test)))
    # also write simple summaries
    with open("split_summary.txt", "w", encoding="utf8") as fh:
        fh.write(f"Total sentences: {len(sentences)}\n")
        fh.write(f"Train: {len(train)}\nVal: {len(val)}\nTest: {len(test)}\n")
    print("Done.")

if __name__ == "__main__":
    main()


Loaded 10000 sentences.
Splits written: train.pkl (8000), val.pkl (1000), test.pkl (1000)
Done.


In [12]:
import pickle

with open("train.pkl", "rb") as f:
    data = pickle.load(f)
    print(data)


[['तुला', ':', 'आर्थिक', 'विकास,', 'धन', 'लाभ'], ['21', 'तारीख', 'को', 'शाम', 'को', '5', 'बजकर', '20', 'मिनट', 'से', '22', 'फरवरी,', 'शनिवार', 'को', 'शाम', 'सात', 'बजकर', '2', 'मिनट', 'तक', 'रहेगी'], ['झारखंड', 'के', 'अतिरिक्त', 'पुलिस', 'महानिदेशक', '(एडीजीपी)', 'और', 'प्रवक्ता', 'मुरारी', 'लाल', 'मीणा', 'ने', 'यह', 'जानकारी', 'देते', 'हुए', 'बताया', 'कि', 'शाम', 'को', 'गश्त', 'पर', 'निकले', 'पुलिस', 'जवानों', 'के', 'दल', 'को', 'नक्\u200dसलियों', 'ने', 'बंगाल', 'की', 'सीमा', 'के', 'निकट', 'तिरुलडीह', 'थाना', 'क्षेत्र', 'में', 'जंगलों', 'में', 'चारों', 'ओर', 'से', 'घेर', 'लिया', 'और', 'अंधाधुंध', 'गोलीबारी', 'की', 'जिससे', 'गाड़ी', 'में', 'सवार', 'दो', 'सहायक', 'पुलिस', 'निरीक्षकों', 'और', 'तीन', 'सिपाहियों', 'की', 'मौत', 'हो', 'गई', 'जबकि', 'वाहन', 'चालक', 'ने', 'मौके', 'से', 'किसी', 'तरह', 'भाग', 'कर', 'अपनी', 'जान', 'बचाई'], ['सुबह', 'संकल्प', 'लेते', 'समय,', 'ऊं', 'अद्य', 'अमुक', 'गोत्रो', 'अमुक', 'नामाहं', 'मम', 'सर्व,', 'पापानक्षय', 'पूर्वक', 'शररा', 'रोग्यार्थ', 'श्री,', 'सूर्यना

In [8]:
#!/usr/bin/env python3
"""
Task 2: Build unigram/bigram/trigram/quadrigram counts on TRAIN and compute Good-Turing
adjusted joint probabilities for each n (1..4).

Input: train.pkl (produced by task1)
Outputs:
 - counts_n{1..4}.pkl  (Counter of n-grams)
 - gt_joint_n{1..4}.pkl (dict: seen ngram -> probability)
 - diagnostics_n{1..4}.json (Nc, N1, N_distinct, total_tokens_observed)
"""

import pickle, math, json
from collections import Counter, defaultdict
from pathlib import Path

TRAIN_PICKLE = "train.pkl"
MAX_N = 4
SPECIAL_START = "<s>"
SPECIAL_END = "</s>"

def load_pickle(fn):
    with open(fn, "rb") as fh:
        return pickle.load(fh)

def ngrams_from_sentence(tokens, n):
    padded = [SPECIAL_START] * (n-1) + tokens + [SPECIAL_END]
    return [tuple(padded[i:i+n]) for i in range(len(padded)-n+1)]

def build_counts(sentences, n):
    c = Counter()
    total = 0
    for s in sentences:
        ngs = ngrams_from_sentence(s, n)
        c.update(ngs)
        total += len(ngs)
    return c, total

def compute_Nc(counter):
    freq_of_freq = Counter(counter.values())
    return dict(freq_of_freq)

def good_turing_counts(counter, V_power_n=None):
    """
    Compute C* per seen n-gram using c* = (c+1) * N_{c+1} / N_c when possible.
    Also compute Punseen per unseen n-gram using formula from prompt if V_power_n provided.
    Returns: C_star_dict (ngram->c_star), Punseen_per_unseen (float or None), diagnostics
    """
    Nc = compute_Nc(counter)
    N_distinct = len(counter)
    N1 = Nc.get(1, 0)
    max_c = max(counter.values()) if counter else 0
    C_star = {}
    for ng, c in counter.items():
        Nc_c = Nc.get(c, 0)
        Nc_cp1 = Nc.get(c+1, 0)
        if Nc_c > 0 and Nc_cp1 > 0:
            c_star = (c+1) * (Nc_cp1 / Nc_c)
        else:
            c_star = float(c)
        C_star[ng] = c_star
    Punseen = None
    if V_power_n is not None:
        unseen_count = max(V_power_n - N_distinct, 0)
        if unseen_count > 0 and N_distinct > 0:
            Punseen = (N1 / N_distinct) / unseen_count
        else:
            Punseen = 0.0
    diagnostics = {
        "Nc": Nc,
        "N_distinct": N_distinct,
        "N1": N1,
        "max_c": max_c,
        "total_seen_tokens": sum(counter.values())
    }
    return C_star, Punseen, diagnostics

def normalize_joint_from_Cstar(C_star, punseen, V_power_n):
    """
    Convert C* to probabilities that sum to 1 by scaling seen mass and including unseen mass = punseen * unseen_count.
    Returns: P_seen (dict ngram->prob), p_unseen_each
    """
    sum_cstar = sum(C_star.values())
    unseen_count = max(V_power_n - len(C_star), 0)
    unseen_mass = (punseen * unseen_count) if punseen is not None else 0.0
    if sum_cstar == 0:
        scale = 0.0
    else:
        scale = max(0.0, (1.0 - unseen_mass) / sum_cstar)
    P_seen = {ng: cstar * scale for ng, cstar in C_star.items()}
    # final numeric adjust (small)
    total = sum(P_seen.values()) + (punseen or 0.0) * unseen_count
    if total > 0:
        # renormalize to exactly 1
        factor = 1.0 / total
        for ng in list(P_seen.keys()):
            P_seen[ng] *= factor
        if punseen is not None:
            punseen *= factor
    return P_seen, punseen

def save_pickle(obj, fn):
    with open(fn, "wb") as fh:
        pickle.dump(obj, fh)

def main():
    train = load_pickle(TRAIN_PICKLE)
    print(f"Loaded train ({len(train)} sentences). Building counts and Good-Turing models...")
    # compute vocab size from train tokens
    vocab = set()
    for s in train:
        vocab.update(s)
    vocab.update([SPECIAL_START, SPECIAL_END])
    V = len(vocab)
    for n in range(1, MAX_N+1):
        counts, total = build_counts(train, n)
        save_pickle(counts, f"counts_n{n}.pkl")
        Vpow = V ** n
        C_star, punseen, diag = good_turing_counts(counts, V_power_n=Vpow)
        P_seen, punseen = normalize_joint_from_Cstar(C_star, punseen, Vpow)
        save_pickle(P_seen, f"gt_joint_n{n}.pkl")
        with open(f"diagnostics_n{n}.json", "w", encoding="utf8") as fh:
            # JSON-friendly: reduce large Nc dict by converting keys to str
            dd = dict(diag)
            dd["Nc"] = {str(k): v for k, v in diag["Nc"].items()}
            dd["vocab_size"] = V
            fh.write(json.dumps(dd, indent=2))
        print(f"n={n}: distinct_seen={diag['N_distinct']} N1={diag['N1']} total_tokens={diag['total_seen_tokens']}")
    print("Done. Files produced: counts_n*.pkl, gt_joint_n*.pkl, diagnostics_n*.json")

if __name__ == "__main__":
    main()


Loaded train (8000 sentences). Building counts and Good-Turing models...
n=1: distinct_seen=19027 N1=11015 total_tokens=140600
n=2: distinct_seen=81338 N1=66451 total_tokens=140600
n=3: distinct_seen=118149 N1=110029 total_tokens=140600
n=4: distinct_seen=129145 N1=124921 total_tokens=140600
Done. Files produced: counts_n*.pkl, gt_joint_n*.pkl, diagnostics_n*.json


In [21]:
# import pickle

with open("counts_n1.pkl", "rb") as f:
    data = pickle.load(f)
    print(data)
print(len(data))

Counter({('</s>',): 8000, ('के',): 5430, ('में',): 4109, ('की',): 3418, ('है',): 3221, ('को',): 2590, ('से',): 2431, ('ने',): 1902, ('का',): 1780, ('और',): 1724, ('पर',): 1537, ('कि',): 1274, ('हैं',): 1164, ('भी',): 1112, ('लिए',): 912, ('इस',): 787, ('कर',): 786, ('एक',): 774, ('नहीं',): 722, ('किया',): 668, ('ही',): 611, ('गया',): 592, ('हो',): 572, ('करने',): 495, ('यह',): 487, ('था',): 468, ('तो',): 465, ('साथ',): 422, ('बाद',): 419, ('कहा',): 413, ('दिया',): 389, ('रहे',): 387, ('तक',): 351, ('रहा',): 342, ('जा',): 341, ('गई',): 337, ('रही',): 331, ('हुए',): 312, ('अपने',): 291, ('लोगों',): 288, ('पुलिस',): 287, ('है,',): 286, ('लेकिन',): 278, ('थे',): 269, ('इसके',): 258, ('उन्होंने',): 256, ('गए',): 254, ('दी',): 253, ('सरकार',): 252, ('अब',): 241, ('थी',): 240, ('होने',): 237, ('पहले',): 226, ('वह',): 224, ('जो',): 221, ('बताया',): 216, ('वाले',): 214, ('कुछ',): 214, ('व',): 208, ('कोई',): 194, ('लेकर',): 192, ('दौरान',): 187, ('करते',): 184, ('दो',): 181, ('हुई',): 177, ('ये'

In [9]:
#!/usr/bin/env python3
"""
Task 3: Compute sentence log-probabilities for validation and test sets using
the Good-Turing-smoothed joint models produced in task2.

Inputs:
 - val.pkl, test.pkl (from task1)
 - gt_joint_n{1..4}.pkl (from task2)
Outputs:
 - logprobs_n{1..4}_val.txt
 - logprobs_n{1..4}_test.txt
"""

import pickle, math
from collections import Counter
SPECIAL_START = "<s>"
SPECIAL_END = "</s>"
MAX_N = 4

def load_pickle(fn):
    with open(fn, "rb") as fh:
        return pickle.load(fh)

def sentence_ngrams(tokens, n):
    padded = [SPECIAL_START] * (n-1) + tokens + [SPECIAL_END]
    ngrams = [tuple(padded[i:i+n]) for i in range(len(padded)-n+1)]
    return ngrams

def conditional_prob(ngram, joint_n, joint_n_minus1, punseen_n, punseen_n_minus1):
    # joint_n: dict of joint probs for n-grams
    # joint_n_minus1: dict for (n-1)-grams
    if len(ngram) == 1:
        return joint_n.get(ngram, punseen_n if punseen_n is not None else 1e-16)
    joint = joint_n.get(ngram, punseen_n if punseen_n is not None else 1e-16)
    history = ngram[:-1]
    denom = joint_n_minus1.get(history, punseen_n_minus1 if punseen_n_minus1 is not None else 1e-12)
    if denom <= 0:
        denom = 1e-12
    p = joint / denom
    return max(p, 1e-16)

def sentence_logprob(tokens, order, joint_models):
    """
    joint_models: dict n -> (joint_probs_dict, punseen_each)
    """
    logp = 0.0
    n = order
    padded = [SPECIAL_START]*(n-1) + tokens + [SPECIAL_END]
    for i in range(n-1, len(padded)):
        ngram = tuple(padded[i-(n-1):i+1])
        if n == 1:
            joint_n, punseen_n = joint_models[1]
            p = joint_n.get(ngram, punseen_n if punseen_n is not None else 1e-16)
        else:
            joint_n, punseen_n = joint_models[n]
            joint_nm1, punseen_nm1 = joint_models[n-1]
            p = conditional_prob(ngram, joint_n, joint_nm1, punseen_n, punseen_nm1)
        logp += math.log(p)
    return logp

def main():
    val = load_pickle("val.pkl")
    test = load_pickle("test.pkl")
    joint_models = {}
    for n in range(1, MAX_N+1):
        jp = load_pickle(f"gt_joint_n{n}.pkl")  # dict ngram->prob
        # We didn't explicitly save punseen in task2's pickle. We'll approximate punseen by tiny value for safety.
        # If you saved punseen, load it; otherwise set None so code uses fallback.
        # Here: punseen unknown -> None
        joint_models[n] = (jp, None)

    for n in range(1, MAX_N+1):
        out_val = f"logprobs_n{n}_val.txt"
        out_test = f"logprobs_n{n}_test.txt"
        with open(out_val, "w", encoding="utf8") as fv:
            for sent in val:
                lp = sentence_logprob(sent, n, joint_models)
                fv.write(f"{lp}\n")
        with open(out_test, "w", encoding="utf8") as ft:
            for sent in test:
                lp = sentence_logprob(sent, n, joint_models)
                ft.write(f"{lp}\n")
        print(f"Wrote {out_val}, {out_test}")
    print("Done.")

if __name__ == "__main__":
    main()


Wrote logprobs_n1_val.txt, logprobs_n1_test.txt
Wrote logprobs_n2_val.txt, logprobs_n2_test.txt
Wrote logprobs_n3_val.txt, logprobs_n3_test.txt
Wrote logprobs_n4_val.txt, logprobs_n4_test.txt
Done.


In [24]:
import pandas as pd
from collections import Counter

# Suppose you already have a dictionary of n-gram counts:
# e.g., counts = {('the',): 123, ('of',): 234, ('new',): 5, ...}
# Replace with your actual counts dictionary
counts=[]
with open("counts_n1.pkl", "rb") as f:
    counts = pickle.load(f)  # for unigram or bigram as per your data

# Step 1: Count frequencies of frequencies
freq_counter = Counter(counts.values())  # Nc table

# Step 2: Sort by observed count
sorted_counts = sorted(freq_counter.items())

# Step 3: Compute C* using Good-Turing formula
rows = []
for c, Nc in sorted_counts[:100]:  # top 100 frequency levels
    Nc1 = freq_counter.get(c + 1, 0)
    if Nc > 0:
        c_star = (c + 1) * (Nc1 / Nc)
    else:
        c_star = 0
    rows.append((c, Nc, round(c_star, 4)))

# Step 4: Convert to DataFrame for pretty table
gt_table = pd.DataFrame(rows, columns=["C (Observed Count)", "Nc (Frequency of Count)", "C* (Adjusted Count)"])
print(gt_table)


    C (Observed Count)  Nc (Frequency of Count)  C* (Adjusted Count)
0                    1                    11015               0.4962
1                    2                     2733               1.4259
2                    3                     1299               2.3279
3                    4                      756               3.3929
4                    5                      513               4.1637
..                 ...                      ...                  ...
95                  99                        2              50.0000
96                 100                        1             303.0000
97                 101                        3               0.0000
98                 103                        4              52.0000
99                 104                        2               0.0000

[100 rows x 3 columns]


In [10]:
#!/usr/bin/env python3


import pickle, math
from itertools import product
SPECIAL_START = "<s>"
SPECIAL_END = "</s>"
MAX_N = 4

GRID_STEP = 0.1   # default coarse; lower -> finer but slower

def load_pickle(fn):
    with open(fn, "rb") as fh:
        return pickle.load(fh)

def conditional_from_joint(ngram, joint_n, joint_nm1, punseen_n=None, punseen_nm1=None):
    if len(ngram) == 1:
        return joint_n.get(ngram, punseen_n if punseen_n is not None else 1e-16)
    joint = joint_n.get(ngram, punseen_n if punseen_n is not None else 1e-16)
    history = ngram[:-1]
    denom = joint_nm1.get(history, punseen_nm1 if punseen_nm1 is not None else 1e-12)
    if denom <= 0:
        denom = 1e-12
    return max(joint / denom, 1e-16)

def sentence_ngrams(tokens, n):
    padded = [SPECIAL_START]*(n-1) + tokens + [SPECIAL_END]
    for i in range(n-1, len(padded)):
        yield tuple(padded[i-(n-1):i+1])

def load_joint_models():
    jm = {}
    for n in range(1, MAX_N+1):
        jp = load_pickle(f"gt_joint_n{n}.pkl")
        jm[n] = (jp, None)   # None punseen; if you have it, include
    return jm

def compute_val_loglik(lambdas, val_sents, joint_models):
    total_loglik = 0.0
    for s in val_sents:
        padded = [SPECIAL_START]*(MAX_N-1) + s + [SPECIAL_END]
        for i in range(MAX_N-1, len(padded)):
            w = padded[i]
            history = padded[i-(MAX_N-1):i]
            # gather P_k for k=1..4
            probs = []
            for k in range(1, MAX_N+1):
                if k == 1:
                    ngram = (w,)
                else:
                    hist_k = history[-(k-1):] if (k-1) > 0 else []
                    ngram = tuple(hist_k + [w])
                joint_k, punseen_k = joint_models[k]
                if k == 1:
                    p_k = joint_k.get(ngram, punseen_k if punseen_k is not None else 1e-16)
                else:
                    joint_km1, punseen_km1 = joint_models[k-1]
                    p_k = conditional_from_joint(ngram, joint_k, joint_km1, punseen_k, punseen_km1)
                probs.append(p_k)
            p_interp = sum(l * p for l, p in zip(lambdas, probs))
            if p_interp <= 0:
                p_interp = 1e-16
            total_loglik += math.log(p_interp)
    return total_loglik

def grid_simplex(step=GRID_STEP):
    steps = int(1.0/step) + 1
    lambdas = []
    for i1 in range(steps):
        l1 = i1*step
        for i2 in range(steps):
            l2 = i2*step
            for i3 in range(steps):
                l3 = i3*step
                s = l1 + l2 + l3
                if s > 1.0 + 1e-12:
                    continue
                l4 = 1.0 - s
                if l4 < -1e-12:
                    continue
                lambdas.append((l1, l2, l3, l4))
    return lambdas

def main():
    val = load_pickle("val.pkl")
    joint_models = load_joint_models()
    best = (None, -1e300)
    grid = grid_simplex(GRID_STEP)
    print(f"Evaluating {len(grid)} lambda candidates (step={GRID_STEP}) on {len(val)} validation sentences...")
    for lamb in grid:
        ll = compute_val_loglik(lamb, val, joint_models)
        if ll > best[1]:
            best = (lamb, ll)
    print("Best lambdas:", best[0], "loglik:", best[1])
    with open("best_quadrigram_lambdas.txt", "w", encoding="utf8") as fh:
        fh.write(f"{best[0]}\nloglik={best[1]}\n")
    print("Saved best_quadrigram_lambdas.txt")

if __name__ == "__main__":
    main()


Evaluating 286 lambda candidates (step=0.1) on 1000 validation sentences...
Best lambdas: (0.30000000000000004, 0.4, 0.1, 0.19999999999999996) loglik: -104913.35381049472
Saved best_quadrigram_lambdas.txt
