In [1]:
# we need gdown to download nepali text corpus from google drive
!pip install gdown -q

In [2]:
import gdown
import string
import random
import math
import re
import nltk
import joblib

In [3]:
#download nepali text corpus from google drive
url = "https://drive.google.com/u/0/uc?id=1WOFD6A5wkQPENLhC-5NCt75KZACefxtE"
gdown.download(url,"news.txt")

Downloading...
From (uriginal): https://drive.google.com/u/0/uc?id=1WOFD6A5wkQPENLhC-5NCt75KZACefxtE
From (redirected): https://drive.google.com/uc?id=1WOFD6A5wkQPENLhC-5NCt75KZACefxtE&confirm=t&uuid=def003e2-534f-4037-a360-f97206cff09f
To: /kaggle/working/news.txt
100%|██████████| 1.28G/1.28G [00:05<00:00, 224MB/s] 


'news.txt'

The text corpus is consist of Nepali news articles(2013-2023) from different news portals.

In [4]:
def get_sents():
    """
    Read text file and create list of sentences.
    """
    with open("news.txt","r") as f:
        text = f.read()
        sentences = text.split("।")
        sentences = [sent.strip() for sent in sentences]

    return sentences

In [5]:
sentences = get_sents()

In [6]:
import gc
gc.collect()

32

In [7]:
def clean_sents(sents):
    """
    Perform basic text cleaning
    Args:
        sents: List of sentences
    """
    #remove puncutations
    sents = [sent.translate(str.maketrans('', '', string.punctuation+"’")) for sent in sents]

    #replace numeric values by <num> token
    sents = [re.sub(r"[१२३४५६७८९०]","<num>",sent) for sent in sents]
    sents = [re.sub(r"(<num>)+","<num>",sent) for sent in sents] #replace multiple of occurance of <num> by single <num>

    # add sentence start and end token
    SOS = "<s>"
    EOS = "</s>"
    sents =  [f"{SOS} {sent} {EOS}" for sent in sents]

    return sents

In [8]:
sentences = clean_sents(sentences)

In [9]:
sentences[:10]

['<s> सरकारप्रतिको गिर्दो जनविश्वास </s>',
 '<s> जनताको विश्वास र बैधतामा शासन गर्ने सरकारहरू जनताका नजरमा गिर्दै गएका छन् </s>',
 '<s> लोकतान्त्रिक शासन प्रणालीमा राज्य नागरिकको हित रक्षक हित प्रबद्र्धक र हित वितरक हो </s>',
 '<s> लोकतन्त्रको केन्द्रबिन्दु नागरिक हो </s>',
 '<s> राज्य संरचना क्रियाशील हुँदा जनताले आफ्ना भावना परिचालित भएको महसुस नगरेसम्म लोकतन्त्र ‘लोक को हुँदैन </s>',
 '<s> जनप्रतिनिधिले बोल्दा भाषण गर्दा वा सभा बैठकमा बस्दा जनताका भावना बकिएको बोकिएको बोध हुनुपर्छ </s>',
 '<s> राज्यसंरचना जनताको विश्वास र बैधताको धरोहर हुनुपर्छ जसको एकमात्र आधार इमान्दारितासाथ कार्यसम्पादन हो </s>',
 '<s> भनेर होइन गरेर नै जनताको मन जित्न सकिन्छ </s>',
 '<s> केही युरोपीय मुलुकहरूलाई अपवादमा लिँदा विश्वव्यापी रूपमा नै सरकारप्रतिको जनभरोसा गिर्दै गएको छ </s>',
 '<s> विकसित मुलुकहरूको संगठन ओईसीडीको पछिल्लो सर्वेक्षणअनुसार स्वीट्जरल्यान्डमा <num> प्रतिशत जनता सरकारमाथि भरोसा गर्छन् भने त्यसपछि जनविश्वासको सूचकांकमा नर्वे <num> र फिनल्यान्ड <num> छन् </s>']

In [10]:
def create_tokens(sents):
    """Create list of tokens from list of sentences"""

    tokens = " ".join(sents).split()
    vocab = nltk.FreqDist(tokens)

    #remove tokens if it's count is less than 1
    tokens_with_unk = ["<UNK>" if vocab[token]==1 else token for token in tokens]

    return tokens_with_unk

In [11]:
train_tokens = create_tokens(sentences)

In [12]:
del sentences

In [13]:
def ngrams(tokens, n=2):
    """
    Create n-grams and return unique n-grams with their corresponding counts.
    """
    ngram = nltk.ngrams(tokens,n)
    ngram_dicts = nltk.FreqDist(ngram)

    return ngram_dicts

In [14]:
gc.collect()

0

In [15]:
#create n-grams with n=3 (trigram)
n = 3
trigram_dicts = ngrams(train_tokens, n)

In [16]:
gc.collect()

0

In [17]:
#bigram is needed to perform Laplace smoothing of trigrams
bigram_dicts = ngrams(train_tokens, n-1)

In [18]:
#create vocabulory of tokens (tokens-count structure)
vocab = nltk.FreqDist(train_tokens)
vocab_size = len(vocab)
vocab_size

602587

In [19]:
def smoothed_bigram_prob(trigram, trigram_count, bigram_dicts, vocab_size):
    """
    Args:
        trigram (a tuple): a tuple of trigrams
        trigram_count(int): count of trigram
        bigram_dicts: dictionary containing bigrams and their corresponding counts
        vocab_size: vocab size of the corpus

    Returns:
        smoothed_prob(float): Smoothed probability of the trigram.
    """

    bigram = trigram[:-1]
    bigram_count = bigram_dicts[bigram]
    smoothed_prob = (trigram_count+1)/(bigram_count + vocab_size)

    return smoothed_prob

In [20]:
def smoothing(trigram_dicts):
    """
    Args:
        bigram_dicts (dict): dictionary items containing bigram tuple and their corresponding count.

    Returns:
        (dict) : dictionary items containing bigram tuple and thier smoothed probability.
    """
    return { n_gram: smoothed_bigram_prob(n_gram, count, bigram_dicts, vocab_size) \
            for n_gram, count in trigram_dicts.items() }

In [21]:
gc.collect()

0

In [22]:
model = smoothing(trigram_dicts)

In [23]:
# del variables which will no longer be used (Save RAM ewewewew)
# del sentences, train_tokens, vocab,bigram_dicts,unigram_dicts

In [24]:
import joblib

In [25]:
# joblib.dump(model,"trigram.bin")

### Just A sTuPiD text generator

In [26]:
def best_candidate(prev, i, without=[], gen=True):
    """Choose the most likely next token given the previous (n-1) tokens.
    Args:
        prev (tuple of str): the previous n-1 tokens of the sentence (bigram).
        i (int): which candidate to select if not the most probable one.
        without (list of str): tokens to exclude from the candidates list.
        gen (bool): True if function is used for sentence generation, else false
    Returns:
        A tuple with the next most probable token and its corresponding probability.
    """

    blacklist  = ["<UNK>"] + without
    if len(prev)==1: # case when prev consist of single string(starting token <s>)
      candidates = ((ngram[1], prob) for ngram, prob in model.items() if ngram[0]==prev[0])
    else:
      candidates = ((ngram[-1], prob) for ngram, prob in model.items() if ngram[:-1]==prev)
      candidates = filter(lambda candidate: candidate[0] not in blacklist, candidates)
    candidates = sorted(candidates, key=lambda candidate: candidate[1], reverse=True)

    n_candidates = len(candidates)
    if  n_candidates == 0:
        return ("</s>", 1)

    # if the task is not to generate sentence, we will return multiple word suggestions
    if not gen:
        nS = 7 if len(candidates)>6 else len(candidates)
        return random.sample(candidates[::-1],nS)

    candidate_index = int((random.randint(0, len(candidates)))//3)
    return candidates[candidate_index if prev != () and prev[-1] != "<s>" else i]

def generate_sentences(num, min_len=12, max_len=40):
    """Generate random sentences using the language model.
    Args:
        num (int): the number of sentences to generate.
        min_len (int): minimum allowed sentence length.
        max_len (int): maximum allowed sentence length.
    Yields:
        A tuple with the generated sentence and the combined probability
        (log-space probability) of all of its n-grams.
    """
    for i in range(num):
        sent, prob = ["<s>"], 1
        while sent[-1] != "</s>":
            prev = tuple(sent[-(2):])
            blacklist = sent + (["</s>"] if len(sent) < min_len else [])
            next_token, next_prob = best_candidate(prev, i, without=blacklist)
            sent.append(next_token)
            prob *= next_prob

            if len(sent) >= max_len:
                sent.append("</s>")

        yield ' '.join(sent), -1/math.log(prob) if prob!=1 else 1

In [27]:
print("Generating sentences...")
for sentence, prob in generate_sentences(num = 12,max_len=20):
    print("{} ({:.5f})".format(sentence, prob),len(sentence.split()))

Generating sentences...
<s> सन् <num> सेप्टेम्बर र अक्टोबर क्रान्तिको प्रभाव परेको पाइन्छ तर त्यसरी काम गर्न राखेका छोराछोरीकै उमेरका बालबालिकाबारे किन यत्ति </s> (0.00477) 21
<s> उनका अरु <num> जनालाई गिरफ्तार गरी निर्मम आत्मसमीक्षा गरे </s> (0.01014) 11
<s> उनले अन्यका हकमा तोके बमोजिम <num> सालमा गएको विनाशकारी भूकम्प पश्चात भक्तपुर र भक्तपुरबाट पुरानो ठिमी बस्ने लामिछाने </s> </s> (0.00477) 21
<s> राष्ट्रपतिका निजी चिकित्सक सिंह पनि स्वदेश फिर्ता गर्ने कुनै सङ्केत देखिएको छैन किनभने महिलाको सम्पत्ति प्राप्तिमा प्रतिबन्ध लाग्न सक्ने </s> (0.00456) 21
<s> यसका प्रस्तावकहरु परमेश्वर साह तत्कालीन निर्वाचन आयुक्त यादव भन्छन् ‘संशोधनका लागि आफूलाई छनोट गराउन पनि उनले उत्तिकै योगदान पुर्‍याएको </s> (0.00448) 21
<s> त्यसका विरुद्धमा लड्न सबै राजनीतिक दलहरूका उम्मेदवारले पहिलो चरणमा असोज <num> का शिक्षक राजेन्द्रकुमार श्रेष्ठको शव फेला पर्यो फोन </s> (0.00461) 21
<s> तर प्रत्यर्थी रवि लामिछानेलाई फेरि गृहमन्त्री बनाउन भनेपनि प्रधानमन्त्री दाहालले पहिलो चरणमा पुस <num> गतेबाट बोलाइएको छ थारु

**!!!All nuisance**  
**Hopefully, we can make it to generate more sensible sentences with more varying text corpus and higher order n-grams.**

But still, this can be useful for next word suggestion purpose.
# Next word suggestion (autocompletion)

In [28]:
mySent = "मेरो देश नेपाल"

In [29]:
def nextWord(mySent):
    prev = tuple(mySent.split()[-2:])
    suggest = best_candidate(prev, 0, without=[],gen=False)
    for sugg in suggest:
        print(f"{mySent} {sugg[0]} : {sugg[1]}")

In [30]:
nextWord(mySent)

मेरो देश नेपाल स्वाधिनता : 3.3181637945193888e-06
मेरो देश नेपाल मात्र : 3.3181637945193888e-06
मेरो देश नेपाल दर्ज : 3.3181637945193888e-06
मेरो देश नेपाल अथवा : 3.3181637945193888e-06
मेरो देश नेपाल थोरै : 3.3181637945193888e-06
मेरो देश नेपाल सुन्दरताका : 3.3181637945193888e-06
मेरो देश नेपाल यसनिम्ति : 3.3181637945193888e-06


In [31]:
nextWord("सूचना प्रविधिको क्षेत्रमा आएको")

सूचना प्रविधिको क्षेत्रमा आएको विकृतिका : 3.3184555907680566e-06
सूचना प्रविधिको क्षेत्रमा आएको व्यापक : 3.3184555907680566e-06
सूचना प्रविधिको क्षेत्रमा आएको अविरल : 3.3184555907680566e-06
सूचना प्रविधिको क्षेत्रमा आएको बाढी–पहिरो : 3.3184555907680566e-06
सूचना प्रविधिको क्षेत्रमा आएको पनि : 4.977683386152085e-06
सूचना प्रविधिको क्षेत्रमा आएको आँधीका : 9.95536677230417e-06
सूचना प्रविधिको क्षेत्रमा आएको बताउँदै : 4.977683386152085e-06


In [32]:
import sys

In [33]:
sys.getsizeof(model)

1342177368

In [35]:
model.__sizeof__()/1048576

1280.0000686645508

In [37]:
joblib.dump(model,"trigram.joblib",compress=True)

['trigram.joblib']

In [38]:
from IPython.display import FileLink

In [39]:
FileLink("/kaggle/working/trigram.joblib")