In [3]:
# we need gdown to download nepali text corpus from google drive
!pip install gdown -q

In [4]:
import gdown
import string
import random
import math
import re
import nltk
import joblib

In [5]:
#download nepali text corpus from google drive
url = "https://drive.google.com/u/0/uc?id=1WOFD6A5wkQPENLhC-5NCt75KZACefxtE"
gdown.download(url,"news.txt")

Downloading...
From (uriginal): https://drive.google.com/u/0/uc?id=1WOFD6A5wkQPENLhC-5NCt75KZACefxtE
From (redirected): https://drive.google.com/uc?id=1WOFD6A5wkQPENLhC-5NCt75KZACefxtE&confirm=t&uuid=51f1589c-29ef-4731-8ee8-de5d290545d0
To: /kaggle/working/news.txt
100%|██████████| 1.28G/1.28G [00:05<00:00, 250MB/s] 


'news.txt'

The text corpus is consist of Nepali news articles(2013-2023) from different news portals.

In [6]:
def get_sents():
    """
    Read text file and create list of sentences.
    """
    with open("news.txt","r") as f:
        text = f.read()
        sentences = text.split("।")
        sentences = [sent.strip() for sent in sentences]

    return sentences

In [7]:
sentences = get_sents()

In [8]:
def clean_sents(sents):
    """
    Perform basic text cleaning
    Args: 
        sents: List of sentences
    """
    #remove puncutations
    sents = [sent.translate(str.maketrans('', '', string.punctuation+"’")) for sent in sents]

    #replace numeric values by <num> token
    sents = [re.sub(r"[१२३४५६७८९०]","<num>",sent) for sent in sents]
    sents = [re.sub(r"(<num>)+","<num>",sent) for sent in sents] #replace multiple of occurance of <num> by single <num>
    
    # add sentence start and end token
    SOS = "<s>"
    EOS = "</s>"
    sents =  [f"{SOS} {sent} {EOS}" for sent in sents]
    
    return sents

In [9]:
sentences = clean_sents(sentences)

In [10]:
sentences[:10]

['<s> सरकारप्रतिको गिर्दो जनविश्वास </s>',
 '<s> जनताको विश्वास र बैधतामा शासन गर्ने सरकारहरू जनताका नजरमा गिर्दै गएका छन् </s>',
 '<s> लोकतान्त्रिक शासन प्रणालीमा राज्य नागरिकको हित रक्षक हित प्रबद्र्धक र हित वितरक हो </s>',
 '<s> लोकतन्त्रको केन्द्रबिन्दु नागरिक हो </s>',
 '<s> राज्य संरचना क्रियाशील हुँदा जनताले आफ्ना भावना परिचालित भएको महसुस नगरेसम्म लोकतन्त्र ‘लोक को हुँदैन </s>',
 '<s> जनप्रतिनिधिले बोल्दा भाषण गर्दा वा सभा बैठकमा बस्दा जनताका भावना बकिएको बोकिएको बोध हुनुपर्छ </s>',
 '<s> राज्यसंरचना जनताको विश्वास र बैधताको धरोहर हुनुपर्छ जसको एकमात्र आधार इमान्दारितासाथ कार्यसम्पादन हो </s>',
 '<s> भनेर होइन गरेर नै जनताको मन जित्न सकिन्छ </s>',
 '<s> केही युरोपीय मुलुकहरूलाई अपवादमा लिँदा विश्वव्यापी रूपमा नै सरकारप्रतिको जनभरोसा गिर्दै गएको छ </s>',
 '<s> विकसित मुलुकहरूको संगठन ओईसीडीको पछिल्लो सर्वेक्षणअनुसार स्वीट्जरल्यान्डमा <num> प्रतिशत जनता सरकारमाथि भरोसा गर्छन् भने त्यसपछि जनविश्वासको सूचकांकमा नर्वे <num> र फिनल्यान्ड <num> छन् </s>']

In [11]:
def create_tokens(sents):
    """Create list of tokens from list of sentences"""
    
    tokens = " ".join(sents).split()
    vocab = nltk.FreqDist(tokens)
    
    #remove tokens if it's count is less than 1
    tokens_with_unk = ["<UNK>" if vocab[token]==1 else token for token in tokens]
    
    return tokens_with_unk

In [12]:
train_tokens = create_tokens(sentences)

In [13]:
def ngrams(tokens, n=2):
    """
    Create n-grams and return unique n-grams with their corresponding counts.
    """
    ngram = nltk.ngrams(tokens,n)
    ngram_dicts = nltk.FreqDist(ngram)
    
    return ngram_dicts

In [14]:
#create n-grams with n=3 (trigram)
n = 3
trigram_dicts = ngrams(train_tokens, n)

#bigram is needed to perform Laplace smoothing of trigrams
bigram_dicts = ngrams(train_tokens, n-1)

unigram_dicts = ngrams(train_tokens,n-2)

In [15]:
#create vocabulory of tokens (tokens-count structure)
vocab = nltk.FreqDist(train_tokens)
vocab_size = len(vocab)
vocab_size

602587

In [16]:
def smoothed_bigram_prob(trigram, trigram_count, bigram_dicts, vocab_size):
    """
    Args:
        trigram (a tuple): a tuple of trigrams
        trigram_count(int): count of bigram
        bigram_dicts: dictionary containing bigrams and their corresponding counts
        vocab_size: vocab size of the corpus

    Returns:
        smoothed_prob(float): Smoothed probability of the trigram.
    """

    bigram = trigram[:-1]
    bigram_count = unigram_dicts[bigram]
    smoothed_prob = (trigram_count+1)/(bigram_count + vocab_size)

    return smoothed_prob

In [17]:
def smoothing(bigram_dicts):
    """
    Args:
        bigram_dicts (dict): dictionary items containing bigram tuple and their corresponding count.

    Returns:
        (dict) : dictionary items containing bigram tuple and thier smoothed probability.
    """
    return { n_gram: smoothed_bigram_prob(n_gram, count, unigram_dicts, vocab_size) \
            for n_gram, count in bigram_dicts.items() }

In [18]:
model = smoothing(trigram_dicts)

In [19]:
# del variables which will no longer be used (Save RAM ewewewew) 
# del sentences, train_tokens, vocab,bigram_dicts,unigram_dicts

### Just A sTuPiD text generator

In [20]:
def best_candidate(prev, i, without=[], gen=True):
    """Choose the most likely next token given the previous (n-1) tokens.
    Args:
        prev (tuple of str): the previous n-1 tokens of the sentence (bigram).
        i (int): which candidate to select if not the most probable one.
        without (list of str): tokens to exclude from the candidates list.
        gen (bool): True if function is used for sentence generation, else false
    Returns:
        A tuple with the next most probable token and its corresponding probability.
    """

    blacklist  = ["<UNK>"] + without
    if len(prev)==1: # case when prev consist of single string(starting token <s>)
      candidates = ((ngram[1], prob) for ngram, prob in model.items() if ngram[0]==prev[0])
    else:
      candidates = ((ngram[-1], prob) for ngram, prob in model.items() if ngram[:-1]==prev)
      candidates = filter(lambda candidate: candidate[0] not in blacklist, candidates)
    candidates = sorted(candidates, key=lambda candidate: candidate[1], reverse=True)
    
    n_candidates = len(candidates)
    if  n_candidates == 0:
        return ("</s>", 1)
    
    # if the task is not to generate sentence, we will return multiple word suggestions
    if not gen:
        nS = 7 if len(candidates)>6 else len(candidates)
        return random.sample(candidates[::-1],nS)
    
    candidate_index = int((random.randint(0, len(candidates)))//3)
    return candidates[candidate_index if prev != () and prev[-1] != "<s>" else i] 

def generate_sentences(num, min_len=12, max_len=40):
    """Generate random sentences using the language model.
    Args:
        num (int): the number of sentences to generate.
        min_len (int): minimum allowed sentence length.
        max_len (int): maximum allowed sentence length.
    Yields:
        A tuple with the generated sentence and the combined probability
        (log-space probability) of all of its n-grams.
    """
    for i in range(num):
        sent, prob = ["<s>"], 1
        while sent[-1] != "</s>":
            prev = tuple(sent[-(2):])
            blacklist = sent + (["</s>"] if len(sent) < min_len else [])
            next_token, next_prob = best_candidate(prev, i, without=blacklist)
            sent.append(next_token)
            prob *= next_prob

            if len(sent) >= max_len:
                sent.append("</s>")

        yield ' '.join(sent), -1/math.log(prob) if prob!=1 else 1

In [21]:
print("Generating sentences...")
for sentence, prob in generate_sentences(num = 12,max_len=20):
    print("{} ({:.5f})".format(sentence, prob),len(sentence.split()))

Generating sentences...
<s> सन् <num>– <num> का उमेदवार समेत रहेका वन तथा वातवरणमन्त्री शक्ति बस्नेतसँग जिज्ञासा राखेका थियौं ‘निर्माता कम्पनीसितको सहकार्यमा नेपालमै </s> (0.00456) 21
<s> उनले विषम स्थितिका बीच नेपालले विकासका सूचकमा फड्को मारिसक्थ्यो </s> (0.00973) 11
<s> उनका सहोदर दाइ सौभाग्य शाहका बारेमा केही खुल्न सकेको छैन् भने </s> (0.00750) 13
<s> तर जनमतसंग्रहमा फेरि पञ्चायत व्यवस्थाकै जित भयो म धेरै ठाँउमा काम गर्ने मजदुरका लागि आन्दोलन गर्दागर्दै सम्भ्रान्त नागरिक हुन </s> (0.00437) 21
<s> राष्ट्रपतिका प्रेस सल्लाहकार प्रमोद दाहालले भने ‘हामीभन्दा गतिला नेता ल्याउन सक्नुहुन्छ शनिबारभित्र </s> (0.00738) 14
<s> यसका कारण बुझ्न संशोधन प्रस्तावको विरोध नै गर्ने सोच बनाउनुभएको छ भने सरकारी कार्यालयमा तीन दिन राष्ट्रिय विदा उल्लेख </s> (0.00470) 21
<s> त्यसका विरुद्धमा सडकमा उत्रिएका विद्यार्थीहरुले बलत्कारीलाई फासी दिने कानुन ल्याउनु पर्ने हो सोको जानकारी गराउन सरकारको भूमिका निष्प्रभावी रहेको </s> (0.00448) 21
<s> पछिल्लो भ्रमणका क्रममा चीनसँग गरेका दसबुँदे सम्झौता कार्यान्वयन प

In [22]:
import pickle
with open('function.pkl', 'wb') as file:
    pickle.dump(best_candidate, file)
    
    
with open('model.pkl', 'wb') as file:
    pickle.dump(model, file)

**!!!All nuisance**  
**Hopefully, we can make it to generate more sensible sentences with more varying text corpus and higher order n-grams.**

But still, this can be useful for next word suggestion purpose.
# Next word suggestion (autocompletion)

In [23]:
mySent = "मेरो देश नेपाल"

In [24]:
def nextWord(mySent):
    prev = tuple(mySent.split()[-2:])
    suggest = best_candidate(prev, 0, without=[],gen=False)
    for sugg in suggest:
        print(f"{mySent} {sugg[0]} : {sugg[1]}")

In [25]:
nextWord(mySent)

मेरो देश नेपाल भूपरिवेष्ठित : 3.3190228133033072e-06
मेरो देश नेपाल गौतम : 3.3190228133033072e-06
मेरो देश नेपाल शीर्षकमा : 3.3190228133033072e-06
मेरो देश नेपाल भित्र : 3.3190228133033072e-06
मेरो देश नेपाल मनमनै : 3.3190228133033072e-06
मेरो देश नेपाल हो : 2.6552182506426458e-05
मेरो देश नेपाल स्वाधिनता : 3.3190228133033072e-06


In [26]:
nextWord("सूचना प्रविधिको क्षेत्रमा आएको")

सूचना प्रविधिको क्षेत्रमा आएको भूकम्पका : 3.3190228133033072e-06
सूचना प्रविधिको क्षेत्रमा आएको मुसलधारे : 3.3190228133033072e-06
सूचना प्रविधिको क्षेत्रमा आएको हिमआँधीका : 4.978534219954961e-06
सूचना प्रविधिको क्षेत्रमा आएको आँधीका : 9.957068439909921e-06
सूचना प्रविधिको क्षेत्रमा आएको प्रतिवद्धता : 3.3190228133033072e-06
सूचना प्रविधिको क्षेत्रमा आएको छ : 6.6380456266066144e-06
सूचना प्रविधिको क्षेत्रमा आएको परिवर्तन : 4.978534219954961e-06


In [31]:
# import gradio as gr


# iface = gr.Interface(fn=nextWord, 
#                      inputs="text", 
#                      outputs=["text"],
#                      title="n-gram modeling for Next word prediction in Nepali Language",
#                      description="Find Next Word")
# iface.launch(inline=False)

Collecting gradio
  Downloading gradio-4.10.0-py3-none-any.whl (16.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.6/16.6 MB[0m [31m60.3 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.3.1.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting gradio-client==0.7.3 (from gradio)
  Downloading gradio_client-0.7.3-py3-none-any.whl (304 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m304.8/304.8 kB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx (from gradio)
  Downloading httpx-0.25.2-py3-none-any.whl (74 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.0/75.0 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.19.3 (from gradio)
  Downloading huggingface_hub-0.19.4-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.7/311.7 kB[0m [31m24.2 MB/s[0m eta [36m

ImportError: cannot import name 'Doc' from 'typing_extensions' (/opt/conda/lib/python3.10/site-packages/typing_extensions.py)