In [7]:
import json

# Load JSON file
with open("tokenized_data_final.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Extract all tokens
all_tokens = []
for document in data:
    for sentence in document.get("sentences", []):
        tokens = sentence.get("tokens", [])
        all_tokens.extend(tokens)

print(f"Total tokens extracted: {len(all_tokens)}")
print(f"Sample tokens: {all_tokens[:20]}")

# Save all tokens to a file (space-separated for n-gram modeling)
with open("all_tokens.txt", "w", encoding="utf-8") as f:
    f.write(" ".join(all_tokens))

print("Tokens saved to 'all_tokens.txt'")


Total tokens extracted: 617616
Sample tokens: ['‡§≤‡•ã‡§ó‡•ã‡§Ç', '‡§ï‡•ã', '‡§¨‡§ø‡§≤‡•ã‡§Ç', '‡§∏‡§Ç‡§¨‡§Ç‡§ß‡•Ä', '‡§∏‡•Å‡§µ‡§ø‡§ß‡§æ', '‡§¶‡•á‡§®‡§æ', '‡§π‡•Ä', '‡§â‡§®‡§ï‡§æ', '‡§ï‡§æ‡§Æ', '‡§á‡§®‡•á‡§≤‡•ã', '1987', '‡§Æ‡•á‡§Ç', '‡§â‡§∏', '‡§µ‡§ï‡•ç‡§§', '‡§ê‡§∏‡•á', '‡§π‡•Ä', '‡§¶‡•ã‡§∞‡§æ‡§π‡•á', '‡§™‡§∞', '‡§ñ‡§°‡§º‡•Ä', '‡§•‡•Ä']
Tokens saved to 'all_tokens.txt'


In [8]:
from collections import defaultdict, Counter
import math

# -------------------------------
# Step 1: Read tokens from file
# -------------------------------
with open("all_tokens.txt", "r", encoding="utf-8") as f:
    # Assuming each token is separated by whitespace
    tokens = f.read().split()

print(f"Total tokens: {len(tokens)}")
print(f"Sample tokens: {tokens[:20]}")

# -------------------------------
# Step 2: Function to build n-gram counts
# -------------------------------
def build_ngram(tokens, n):
    """
    Build n-gram counts
    Returns: dict with n-gram tuple as key and count as value
    """
    ngrams = defaultdict(int)
    for i in range(len(tokens) - n + 1):
        ngram = tuple(tokens[i:i+n])
        ngrams[ngram] += 1
    return ngrams

# -------------------------------
# Step 3: Build Unigram, Bigram, Trigram, Quadrigram
# -------------------------------
unigrams = build_ngram(tokens, 1)
bigrams = build_ngram(tokens, 2)
trigrams = build_ngram(tokens, 3)
quadrigrams = build_ngram(tokens, 4)

print(f"Unique unigrams: {len(unigrams)}")
print(f"Unique bigrams: {len(bigrams)}")
print(f"Unique trigrams: {len(trigrams)}")
print(f"Unique quadrigrams: {len(quadrigrams)}")

# -------------------------------
# Step 4: Convert counts to probabilities
# -------------------------------
def ngram_probabilities(ngrams, lower_order_counts=None):
    """
    Convert counts to probabilities.
    For unigram: lower_order_counts=None
    For n>1: use conditional probability P(w_n | w_1,...,w_{n-1})
    """
    probs = {}
    if lower_order_counts is None:  # Unigram
        total_count = sum(ngrams.values())
        for ngram, count in ngrams.items():
            probs[ngram] = count / total_count
    else:  # Higher order
        for ngram, count in ngrams.items():
            prefix = ngram[:-1]
            probs[ngram] = count / lower_order_counts[prefix]
    return probs

# Unigram probabilities
unigram_probs = ngram_probabilities(unigrams)

# Bigram probabilities
bigram_probs = ngram_probabilities(bigrams, unigrams)

# Trigram probabilities
trigram_probs = ngram_probabilities(trigrams, bigrams)

# Quadrigram probabilities
quadrigram_probs = ngram_probabilities(quadrigrams, trigrams)

# -------------------------------
# Step 5: Example usage: print top 10 ngrams
# -------------------------------
def print_top_ngrams(probs, n=10):
    sorted_ngrams = sorted(probs.items(), key=lambda x: x[1], reverse=True)
    for gram, p in sorted_ngrams[:n]:
        print(f"{' '.join(gram)} : {p:.4f}")

print("\nTop 10 Unigrams:")
print_top_ngrams(unigram_probs)

print("\nTop 10 Bigrams:")
print_top_ngrams(bigram_probs)

print("\nTop 10 Trigrams:")
print_top_ngrams(trigram_probs)

print("\nTop 10 Quadrigrams:")
print_top_ngrams(quadrigram_probs)


Total tokens: 617616
Sample tokens: ['‡§≤‡•ã‡§ó‡•ã‡§Ç', '‡§ï‡•ã', '‡§¨‡§ø‡§≤‡•ã‡§Ç', '‡§∏‡§Ç‡§¨‡§Ç‡§ß‡•Ä', '‡§∏‡•Å‡§µ‡§ø‡§ß‡§æ', '‡§¶‡•á‡§®‡§æ', '‡§π‡•Ä', '‡§â‡§®‡§ï‡§æ', '‡§ï‡§æ‡§Æ', '‡§á‡§®‡•á‡§≤‡•ã', '1987', '‡§Æ‡•á‡§Ç', '‡§â‡§∏', '‡§µ‡§ï‡•ç‡§§', '‡§ê‡§∏‡•á', '‡§π‡•Ä', '‡§¶‡•ã‡§∞‡§æ‡§π‡•á', '‡§™‡§∞', '‡§ñ‡§°‡§º‡•Ä', '‡§•‡•Ä']
Unique unigrams: 41043
Unique bigrams: 284097
Unique trigrams: 503313
Unique quadrigrams: 584957

Top 10 Unigrams:
‡§ï‡•á : 0.0388
‡§Æ‡•á‡§Ç : 0.0300
‡§ï‡•Ä : 0.0237
, : 0.0215
‡§ï‡•ã : 0.0179
‡§∏‡•á : 0.0171
‡§®‡•á : 0.0135
. : 0.0131
‡§π‡•à : 0.0127
‡§ï‡§æ : 0.0127

Top 10 Bigrams:
‡§¶‡•ã‡§∞‡§æ‡§π‡•á ‡§™‡§∞ : 1.0000
‡§ö‡•å . : 1.0000
‡§¢‡§º‡•Ä‡§≤‡•Ä ‡§π‡•ã : 1.0000
‡§ö‡•à‡§Æ‡•ç‡§™‡§ø‡§Ø‡§Ç‡§∏ ‡§ü‡•ç‡§∞‡§æ‡§´‡•Ä : 1.0000
‡§Ö‡§°‡§ø‡§ó ‡§∞‡§π‡§®‡§æ : 1.0000
‡§™‡•à‡§ü‡•ç‡§∞‡•Ä‡§ï‡§ø‡§Ø‡•ã ‡§∞‡•ã‡§∏‡•á‡§Ç‡§°‡•á : 1.0000
‡§∞‡•ã‡§∏‡•á‡§Ç‡§°‡•á ‡§®‡•á : 1.0000
‡§Æ‡•Å‡§∞‡•ç‡§¶‡§æ‡§¨‡§æ‡§¶ ‡§ï‡•á : 1.0000
‡§¶‡•á‡§µ‡§æ‡§Ç‡§ó‡§® ‡§∏‡•á : 1.0000
‡§™‡•à‡§°‡§≤‡§∞ ‡§¶‡•ç‡§µ‡

In [11]:
from collections import defaultdict

def build_ngram_counts(tokens, n):
    counts = defaultdict(int)
    for i in range(len(tokens)-n+1):
        ngram = tuple(tokens[i:i+n])
        counts[ngram] += 1
    return counts

# Example for bigrams
tokens = open("all_tokens.txt", "r", encoding="utf-8").read().split()

unigrams = build_ngram_counts(tokens, 1)
bigrams = build_ngram_counts(tokens, 2)

vocab_size = len(unigrams)  # number of unique tokens
total_tokens = len(tokens)


In [10]:
def add_one_smoothing(ngram_counts, lower_counts, vocab_size):
    probs = {}
    for ngram, count in ngram_counts.items():
        prefix = ngram[:-1]
        probs[ngram] = (count + 1) / (lower_counts[prefix] + vocab_size)
    return probs

# Bigram probabilities with Add-One smoothing
bigram_probs_add1 = add_one_smoothing(bigrams, unigrams, vocab_size)


In [12]:
def add_k_smoothing(ngram_counts, lower_counts, vocab_size, k=0.5):
    probs = {}
    for ngram, count in ngram_counts.items():
        prefix = ngram[:-1]
        probs[ngram] = (count + k) / (lower_counts[prefix] + k * vocab_size)
    return probs

# Example: bigram with Add-K smoothing, k=0.5
bigram_probs_addk = add_k_smoothing(bigrams, unigrams, vocab_size, k=0.5)


In [13]:
def add_token_type_smoothing(ngram_counts, lower_counts):
    # Get number of unique continuations for each prefix
    prefix_types = defaultdict(set)
    for ngram in ngram_counts:
        prefix = ngram[:-1]
        prefix_types[prefix].add(ngram[-1])

    probs = {}
    for ngram, count in ngram_counts.items():
        prefix = ngram[:-1]
        num_types = len(prefix_types[prefix])
        probs[ngram] = (count + 1) / num_types
    return probs

# Example: bigram with token-type smoothing
bigram_probs_token_type = add_token_type_smoothing(bigrams, unigrams)


In [14]:
def print_top_probs(probs, n=10):
    sorted_probs = sorted(probs.items(), key=lambda x: x[1], reverse=True)
    for gram, p in sorted_probs[:n]:
        print(f"{' '.join(gram)} : {p:.4f}")

print("Top 10 bigrams with Add-One smoothing:")
print_top_probs(bigram_probs_add1)


Top 10 bigrams with Add-One smoothing:
‡§ï‡•á ‡§≤‡§ø‡§è : 0.0530
‡§π‡•à . : 0.0397
‡§π‡•à ‡§ï‡§ø : 0.0359
‡§ï‡§π‡§æ ‡§ï‡§ø : 0.0296
‡§π‡•à , : 0.0275
‡§ï‡•á ‡§∏‡§æ‡§• : 0.0216
‡§®‡•á ‡§ï‡§π‡§æ : 0.0197
‡§ï‡•á ‡§¨‡§æ‡§¶ : 0.0184
. . : 0.0173
‡§π‡•à‡§Ç . : 0.0164


In [15]:
import json, random

# Load tokenized JSON
with open("tokenized_data_final.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Extract all sentences (tokens)
all_sentences = []
for document in data:
    for sent in document.get("sentences", []):
        tokens = sent.get("tokens", [])
        if tokens:  # only keep non-empty
            all_sentences.append(tokens)

print(f"Total sentences: {len(all_sentences)}")

# Randomly select 1000 (or all if fewer than 1000)
sample_sentences = random.sample(all_sentences, min(1000, len(all_sentences)))
print(f"Selected {len(sample_sentences)} sentences")


Total sentences: 35391
Selected 1000 sentences


In [27]:
import json, random
from collections import defaultdict, Counter
import math

# ----------------------------
# Load tokenized sentences
# ----------------------------
with open("tokenized_data_final.json", "r", encoding="utf-8") as f:
    data = json.load(f)

all_sentences = []
for document in data:
    for sent in document.get("sentences", []):
        tokens = sent.get("tokens", [])
        if tokens:
            all_sentences.append(tokens)

sample_sentences = random.sample(all_sentences, min(1000, len(all_sentences)))
print(f"Selected {len(sample_sentences)} sentences")

# ----------------------------
# Build n-gram models
# ----------------------------
def build_ngram_counts(sentences, n):
    counts = defaultdict(Counter)
    for sent in sentences:
        sent = ["<s>"]*(n-1) + sent + ["</s>"]
        for i in range(len(sent)-n+1):
            context = tuple(sent[i:i+n-1])
            token = sent[i+n-1]
            counts[context][token] += 1
    return counts

unigram_counts = build_ngram_counts(all_sentences, 1)
bigram_counts = build_ngram_counts(all_sentences, 2)
trigram_counts = build_ngram_counts(all_sentences, 3)
quadrigram_counts = build_ngram_counts(all_sentences, 4)

# Vocabulary
vocab = set()
for sent in all_sentences:
    vocab.update(sent)
V = len(vocab)

# ----------------------------
# Smoothed probability functions
# ----------------------------
def unigram_prob(token, k=1):
    total = sum(unigram_counts[()][t] for t in unigram_counts[()][t])
    total = sum(unigram_counts[()][t] for t in unigram_counts[()])
    count = unigram_counts[()][token]
    return (count + k) / (total + k*V)

def ngram_prob(context, token, ngram_counts, k=1):
    context_count = sum(ngram_counts[context].values())
    token_count = ngram_counts[context][token]
    return (token_count + k) / (context_count + k*V)

# ----------------------------
# Compute sentence probabilities
# ----------------------------
def sentence_log_prob(sentence, n, ngram_counts, k=1):
    sent = ["<s>"]*(n-1) + sentence + ["</s>"]
    log_prob = 0.0
    for i in range(n-1, len(sent)):
        context = tuple(sent[i-n+1:i])
        token = sent[i]
        prob = ngram_prob(context, token, ngram_counts, k)
        log_prob += math.log(prob)
    return log_prob

# ----------------------------
# Compute for sample sentences
# ----------------------------
results = []

for sent in sample_sentences:
    unigram_lp = sentence_log_prob(sent, 1, unigram_counts, k=1)
    bigram_lp = sentence_log_prob(sent, 2, bigram_counts, k=1)
    trigram_lp = sentence_log_prob(sent, 3, trigram_counts, k=1)
    quadrigram_lp = sentence_log_prob(sent, 4, quadrigram_counts, k=1)

    results.append({
        "sentence": " ".join(sent),
        "Unigram_logP": unigram_lp,
        "Bigram_logP": bigram_lp,
        "Trigram_logP": trigram_lp,
        "Quadrigram_logP": quadrigram_lp
    })

# Optional: save results to JSON
with open("sentence_probs.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print("Probability computation done for all sampled sentences.")


Selected 1000 sentences
Probability computation done for all sampled sentences.


In [16]:
from collections import defaultdict

def build_ngram_counts(tokens, n):
    counts = defaultdict(int)
    for i in range(len(tokens)-n+1):
        ngram = tuple(tokens[i:i+n])
        counts[ngram] += 1
    return counts

# Flatten all tokens to build models
all_tokens = [tok for sent in all_sentences for tok in sent]

unigrams = build_ngram_counts(all_tokens, 1)
bigrams = build_ngram_counts(all_tokens, 2)
trigrams = build_ngram_counts(all_tokens, 3)
quadrigrams = build_ngram_counts(all_tokens, 4)

vocab_size = len(unigrams)


In [17]:
def add_one_prob(ngram, ngram_counts, lower_counts, vocab_size):
    prefix = ngram[:-1]
    return (ngram_counts.get(ngram, 0) + 1) / (lower_counts.get(prefix, 0) + vocab_size)

def add_k_prob(ngram, ngram_counts, lower_counts, vocab_size, k=0.5):
    prefix = ngram[:-1]
    return (ngram_counts.get(ngram, 0) + k) / (lower_counts.get(prefix, 0) + k * vocab_size)

def add_token_type_prob(ngram, ngram_counts):
    prefix = ngram[:-1]
    continuations = [g[-1] for g in ngram_counts if g[:-1] == prefix]
    num_types = len(set(continuations)) if continuations else 1
    return (ngram_counts.get(ngram, 0) + 1) / num_types


In [18]:
import math

def sentence_probability(sentence, method="add_one", k=0.5):
    prob_log = 0.0
    for i in range(len(sentence)-1):
        bigram = (sentence[i], sentence[i+1])
        unigram = (sentence[i],)

        if method == "add_one":
            p = add_one_prob(bigram, bigrams, unigrams, vocab_size)
        elif method == "add_k":
            p = add_k_prob(bigram, bigrams, unigrams, vocab_size, k)
        elif method == "token_type":
            p = add_token_type_prob(bigram, bigrams)
        else:
            raise ValueError("Unknown method")

        prob_log += math.log(p + 1e-12)  # avoid log(0)

    return math.exp(prob_log)  # sentence probability


In [None]:
import pandas as pd

df = pd.DataFrame(results)
df.to_csv("sentence_probabilities.csv", index=False, encoding="utf-8")
print("Saved sentence probabilities to sentence_probabilities.csv")


In [20]:
from collections import defaultdict

# Precompute continuations for token-type smoothing
prefix_continuations = defaultdict(set)
for (w1, w2), count in bigrams.items():
    prefix_continuations[(w1,)].add(w2)

def add_token_type_prob(ngram, ngram_counts, prefix_continuations):
    prefix = ngram[:-1]
    num_types = len(prefix_continuations.get(prefix, [])) or 1
    return (ngram_counts.get(ngram, 0) + 1) / num_types


In [21]:
prob_cache = {}

def sentence_probability(sentence, method="add_one", k=0.5):
    prob_log = 0.0
    for i in range(len(sentence)-1):
        bigram = (sentence[i], sentence[i+1])

        if (bigram, method) in prob_cache:
            p = prob_cache[(bigram, method)]
        else:
            if method == "add_one":
                p = add_one_prob(bigram, bigrams, unigrams, vocab_size)
            elif method == "add_k":
                p = add_k_prob(bigram, bigrams, unigrams, vocab_size, k)
            elif method == "token_type":
                p = add_token_type_prob(bigram, bigrams, prefix_continuations)
            prob_cache[(bigram, method)] = p

        prob_log += math.log(p + 1e-12)

    return math.exp(prob_log)


In [23]:
def sentence_log_probability(sentence, method="add_one", k=0.5):
    prob_log = 0.0
    for i in range(len(sentence)-1):
        bigram = (sentence[i], sentence[i+1])

        if (bigram, method) in prob_cache:
            p = prob_cache[(bigram, method)]
        else:
            if method == "add_one":
                p = add_one_prob(bigram, bigrams, unigrams, vocab_size)
            elif method == "add_k":
                p = add_k_prob(bigram, bigrams, unigrams, vocab_size, k)
            elif method == "token_type":
                p = add_token_type_prob(bigram, bigrams, prefix_continuations)
            prob_cache[(bigram, method)] = p

        prob_log += math.log(p + 1e-12)
    return prob_log


In [26]:
results = []
for sent in sample_sentences[:20]:
    length = len(sent) or 1

    log_add1 = sentence_log_probability(sent, "add_one")
    log_addk = sentence_log_probability(sent, "add_k", k=0.5)
    log_tt = sentence_log_probability(sent, "token_type")

    results.append({
        "Sentence": " ".join(sent),
        "Length": length,
        "Add-One (avg log P)": log_add1 / length,
        "Add-K=0.5 (avg log P)": log_addk / length,
        "Token-Type (avg log P)": log_tt / length
    })


In [25]:
for row in results[:5]:
    print("üìå Sentence:", row["Sentence"])
    print(f"   üîπ Add-One log P: {row['Add-One (log P)']:.2f}")
    print(f"   üîπ Add-K=0.5 log P: {row['Add-K=0.5 (log P)']:.2f}")
    print(f"   üîπ Token-Type log P: {row['Token-Type (log P)']:.2f}")

    best = max(
        [("Add-One", row['Add-One (log P)']),
         ("Add-K", row['Add-K=0.5 (log P)']),
         ("Token-Type", row['Token-Type (log P)'])],
        key=lambda x: x[1]
    )
    print(f"   ‚úÖ Most likely under: {best[0]}\n")


üìå Sentence: ‡§¨‡§æ‡§Ç‡§¶‡•á ‡§•‡§æ‡§®‡§æ ‡§Æ‡•á‡§Ç ‡§™‡§¶‡§∏‡•ç‡§• ‡§è‡§è‡§∏‡§Ü‡§à ‡§∂‡§ø‡§µ ‡§ï‡•Å‡§Æ‡§æ‡§∞ ‡§Æ‡§Ç‡§°‡§æ‡§µ‡•Ä ‡§ï‡•ã ‡§¶‡§ø‡§® ‡§¶‡§π‡§æ‡•ú‡•á ‡§¨‡§æ‡§ú‡§æ‡§∞ ‡§Æ‡•á‡§Ç ‡§ó‡•ã‡§≤‡•Ä ‡§Æ‡§æ‡§∞‡•Ä‡•§
   üîπ Add-One log P: -130.64
   üîπ Add-K=0.5 log P: -124.47
   üîπ Token-Type log P: -35.82
   ‚úÖ Most likely under: Token-Type

üìå Sentence: ‡§°‡•â .
   üîπ Add-One log P: -5.55
   üîπ Add-K=0.5 log P: -4.86
   üîπ Token-Type log P: 1.50
   ‚úÖ Most likely under: Token-Type

üìå Sentence: ‡§Ü‡•Ö‡§∏‡•ç‡§ü‡•ç‡§∞‡•á‡§≤‡§ø‡§Ø‡§æ‡§à ‡§ï‡•ç‡§∞‡§ø‡§ï‡•á‡§ü ‡§ü‡•Ä‡§Æ ‡§ï‡•á ‡§™‡•Ç‡§∞‡•ç‡§µ ‡§ï‡§™‡•ç‡§§‡§æ‡§® ‡§∏‡•ç‡§ü‡•Ä‡§µ ‡§∏‡•ç‡§Æ‡§ø‡§• ‡§®‡•á ‡§ó‡•á‡§Ç‡§¶ ‡§∏‡•á ‡§õ‡•á‡§°‡§º‡§õ‡§æ‡§°‡§º ‡§ï‡•á ‡§Æ‡§æ‡§Æ‡§≤‡•á ‡§Æ‡•á‡§Ç ‡§∏‡§æ‡§∞‡•ç‡§µ‡§ú‡§®‡§ø‡§ï ‡§§‡•å‡§∞ ‡§™‡§∞ ‡§Æ‡§æ‡§´‡•Ä ‡§Æ‡§æ‡§Ç‡§ó‡•Ä ‡§π‡•à .
   üîπ Add-One log P: -171.54
   üîπ Add-K=0.5 log P: -160.88
   üîπ Token-Type log P: -43.43
   ‚úÖ Most likely under: Token-Type

üìå Sentence: ‡§π‡§ø‡§Ç‡§¶‡•