In [1]:
import json
from collections import Counter, deque

def build_ngrams(file_path, chunk_size=100000, max_chunks=200):
    """Count unigrams, bigrams, trigrams and quadrigrams in a token file.

    The file is read line by line as UTF-8 text and each line is split on
    whitespace. N-grams are formed over the continuous token stream, so an
    n-gram may span a line break.

    Args:
        file_path: Path of the text file to read.
        chunk_size: Number of tokens after which a progress message is
            printed. The check runs once per line, so a reported chunk can
            be slightly larger than this value.
        max_chunks: Stop reading once this many progress chunks have been
            reported. Pass ``None`` to read the whole file. The default of
            200 matches the limit that used to be hard-coded.

    Returns:
        A tuple ``(unigrams, bigrams, trigrams, quadrigrams)`` of Counters.
        Unigrams are keyed by the token string; the others are keyed by
        tuples of 2, 3 and 4 consecutive tokens.
    """
    unigrams = Counter()
    bigrams = Counter()
    trigrams = Counter()
    quadrigrams = Counter()

    # Only the last four tokens are ever read, so bound the window instead of
    # keeping every token of the corpus in memory.
    window = deque(maxlen=4)
    tokens_in_chunk = 0  # tokens seen since the last progress message
    chunk_id = 0

    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # split() with no argument already ignores leading/trailing whitespace
            for token in line.split():
                window.append(token)
                tokens_in_chunk += 1

                unigrams[token] += 1

                context = tuple(window)
                available = len(context)
                if available >= 2:
                    bigrams[context[-2:]] += 1
                if available >= 3:
                    trigrams[context[-3:]] += 1
                if available >= 4:
                    quadrigrams[context] += 1

            if tokens_in_chunk >= chunk_size:
                chunk_id += 1
                print(f"Processed {tokens_in_chunk} tokens so far (chunk {chunk_id})")
                tokens_in_chunk = 0  # reset for progress log
                # chunk_id only changes here, so this is the only place the
                # limit can be reached.
                if max_chunks is not None and chunk_id >= max_chunks:
                    break
    return unigrams, bigrams, trigrams, quadrigrams


def save_ngrams_to_file(counter, file_name):
    """Write a Counter to ``file_name`` as pretty-printed UTF-8 JSON.

    The output is the list produced by ``counter.most_common()``, i.e.
    ``[key, count]`` pairs ordered from most to least frequent. Tuple keys
    are serialised as JSON arrays. Non-ASCII characters are written as-is
    rather than escaped.
    """
    with open(file_name, "w", encoding="utf-8") as out:
        ranked = counter.most_common()
        serialized = json.dumps(ranked, ensure_ascii=False, indent=2)
        out.write(serialized)


# NOTE(review): absolute, machine-specific path — this cell only runs on a
# machine where this exact file exists; consider a configurable data directory.
# The four Counters returned below are bound as module-level names and are
# read by the later cell that writes them to text files.
file_path = "C:/Users/Rani/Desktop/nlp lab/lab1/hindi_tokens.txt"   
unigrams, bigrams, trigrams, quadrigrams = build_ngrams(file_path)



Processed 100068 tokens so far (chunk 1)
Processed 100133 tokens so far (chunk 2)
Processed 100001 tokens so far (chunk 3)
Processed 100020 tokens so far (chunk 4)
Processed 100024 tokens so far (chunk 5)
Processed 100051 tokens so far (chunk 6)
Processed 100046 tokens so far (chunk 7)
Processed 100002 tokens so far (chunk 8)
Processed 100027 tokens so far (chunk 9)
Processed 100149 tokens so far (chunk 10)
Processed 100015 tokens so far (chunk 11)
Processed 100053 tokens so far (chunk 12)
Processed 100139 tokens so far (chunk 13)
Processed 100055 tokens so far (chunk 14)
Processed 100056 tokens so far (chunk 15)
Processed 100037 tokens so far (chunk 16)
Processed 100053 tokens so far (chunk 17)
Processed 104049 tokens so far (chunk 18)
Processed 100065 tokens so far (chunk 19)
Processed 100032 tokens so far (chunk 20)
Processed 100356 tokens so far (chunk 21)
Processed 100341 tokens so far (chunk 22)
Processed 100001 tokens so far (chunk 23)
Processed 100081 tokens so far (chunk 24)
P

In [2]:
def save_ngrams_to_txt(counter, file_name):
    """Write a Counter to ``file_name`` as tab-separated UTF-8 text.

    One line is written per entry, most frequent first, in the form
    ``<n-gram>\\t<count>``. Tuple keys are rendered as their tokens joined
    by single spaces; any other key is written unchanged.
    """
    def _render(key):
        # n-grams of order >= 2 are tuples of tokens; unigrams are plain strings
        return " ".join(key) if isinstance(key, tuple) else key

    with open(file_name, "w", encoding="utf-8") as out:
        out.writelines(
            f"{_render(key)}\t{count}\n" for key, count in counter.most_common()
        )

# Write each n-gram Counter to its own tab-separated text file
# (one "<n-gram>\t<count>" line per entry, most frequent first).
save_ngrams_to_txt(unigrams, "unigrams.txt")
save_ngrams_to_txt(bigrams, "bigrams.txt")
save_ngrams_to_txt(trigrams, "trigrams.txt")
save_ngrams_to_txt(quadrigrams, "quadrigrams.txt")

# The status message previously contained a mis-decoded emoji (UTF-8 bytes
# read as CP866); the literal now holds the intended check-mark character.
print("✅ N-gram models saved!")


✅ N-gram models saved!


In [None]:
 # Show only the head of the file: the full quadrigram list can be very
 # large and would flood the notebook output.
 PREVIEW_LINES = 25
 with open('quadrigrams.txt', "r", encoding="utf-8") as f:
     for line_no, line in enumerate(f):
         if line_no >= PREVIEW_LINES:
             break
         # Each line read from the file already ends with "\n"; suppress
         # print's own newline so entries are not separated by blank lines.
         print(line, end="")

का कहना है कि	5905

किया जा रहा है।	2625

जा रहा है कि	2612

की जा रही है।	2339

है। उन्होंने कहा कि	2172

ने कहा है कि	2171

हिंदी न्यूज़ उत्तर प्रदेश	1994

करते हुए कहा कि	1469

कहा गया है कि	1467

कर दिया गया है।	1423

किया जा सकता है।	1301

बताया जा रहा है	1291

आपको बता दें कि	1287

सिंह ने बताया कि	1092

लोगों की मौत हो	1011

की बात करें तो	994

की मौत हो गई।	983

शुरू कर दी है।	957

में कहा गया है	908

सिंह ने कहा कि	882

की मौत हो गई	874

ने कहा था कि	865

рдХреА рдорд╛рдВрдЧ рд