In [1]:
#import library
import numpy as np
import math
import pandas as pd
import re
from collections import Counter
import os

# If Jupyter reports "IOPub data rate exceeded" when printing the models, start it with:
#   jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10

def getAllWords(Text):
    """Tokenize a collection of documents into one flat list of words.

    Each document is split into sentences on runs of periods/newlines. Every
    sentence is lower-cased and stripped of punctuation, and each non-empty
    sentence is wrapped in ``<s>`` / ``</s>`` markers before being split on
    whitespace.

    Args:
        Text: Any iterable of strings (list, tuple, generator, pandas Series
            with any index, ...). Entries that are not strings (for example
            ``None`` or a float NaN) are skipped.

    Returns:
        A list of lower-cased tokens, including the ``<s>`` and ``</s>``
        sentence markers, in the order they appear in ``Text``.
    """
    # Compiled once, outside the loops, instead of being looked up per call.
    sentence_splitter = re.compile(r"(?<!^)\s*[.\n]+\s*(?!$)")
    punctuation = re.compile(r'[^\w\s]')

    all_words = []
    # Iterate over the values themselves rather than Text[i]: positional
    # lookups fail for a Series whose index is not 0..n-1 and for iterables
    # that do not support len().
    for document in Text:
        if not isinstance(document, str):
            continue
        for sentence in sentence_splitter.split(document):
            cleaned = punctuation.sub('', sentence.lower())
            if cleaned != '':
                all_words.extend(('<s> ' + cleaned + ' </s>').split())
    return all_words
    
def buildUnigramModel(Text):
    """Build a unigram model from a collection of texts.

    The texts are tokenized with ``getAllWords``. Each distinct token's
    probability is its count divided by the total number of tokens.

    Args:
        Text: Collection of texts, passed unchanged to ``getAllWords``.

    Returns:
        A list with one ``{'key': token, 'value': probability}`` dict per
        distinct token, in order of first appearance.
    """
    tokens = getAllWords(Text)
    total = len(tokens)
    return [
        {'key': token, 'value': count / total}
        for token, count in Counter(tokens).items()
    ]
    
def buildBigramModel(Text):
    """Build a bigram model from a collection of texts.

    The texts are tokenized with ``getAllWords``. Adjacent token pairs are
    counted, except the ``</s> <s>`` pair that joins two sentences. Each
    pair's value is its count divided by the count of its first token.

    Args:
        Text: Collection of texts, passed unchanged to ``getAllWords``.

    Returns:
        A list with one ``{'key': [first, second], 'value': probability}``
        dict per distinct pair, in order of first appearance.
    """
    tokens = getAllWords(Text)
    token_counts = Counter(tokens)

    # Count the pairs directly; the sentence-boundary pair is left out.
    pair_counts = Counter(
        (first, second)
        for first, second in zip(tokens, tokens[1:])
        if not (first == '</s>' and second == '<s>')
    )

    return [
        {'key': [first, second], 'value': count / token_counts[first]}
        for (first, second), count in pair_counts.items()
    ]

def nextBestWord(bigramModel, currentWord):
    """Return the most probable word to follow ``currentWord``.

    Args:
        bigramModel: Iterable of ``{'key': [first, second], 'value': prob}``
            entries.
        currentWord: The word whose successor is wanted.

    Returns:
        The second word of the highest-valued bigram whose first word is
        ``currentWord``. The ``</s>`` marker is never returned. When several
        bigrams share the highest value, the earliest one in ``bigramModel``
        wins.

    Raises:
        ValueError: If no bigram starts with ``currentWord``, or its only
            successor is ``</s>``.
    """
    candidates = [
        entry for entry in bigramModel
        if entry['key'][0] == currentWord and entry['key'][1] != '</s>'
    ]
    if not candidates:
        # max() on an empty list would raise an opaque ValueError; raise the
        # same exception type with a message that names the offending word.
        raise ValueError(
            "no next word found in the bigram model after %r" % (currentWord,)
        )
    return max(candidates, key=lambda entry: entry['value'])['key'][1]
    

def nextTenBestWords(bigramModel, currentWord):
    """Return up to ten most probable successors of ``currentWord``.

    Args:
        bigramModel: Iterable of ``{'key': [first, second], 'value': prob}``
            entries.
        currentWord: The word whose successors are wanted.

    Returns:
        A list of at most ten ``[word, probability]`` lists sorted by
        descending probability. Entries with equal probability keep their
        order from ``bigramModel``. The list is empty when no bigram starts
        with ``currentWord``.
    """
    followers = [
        [entry['key'][1], entry['value']]
        for entry in bigramModel
        if entry['key'][0] == currentWord
    ]
    followers.sort(key=lambda pair: pair[1], reverse=True)
    return followers[:10]


def generateSentence(bigramModel, length, start='oh'):
    """Generate a sentence by repeatedly taking the best next word.

    Args:
        bigramModel: Bigram model passed to ``nextBestWord``.
        length: Number of words wanted in the sentence.
        start: First word of the sentence. Defaults to ``'oh'``, the seed
            that was previously hard-coded.

    Returns:
        The words joined by single spaces. The result is the empty string
        when ``length`` is less than 1. It is shorter than ``length`` when a
        word with no successor is reached.
    """
    if length < 1:
        return ''

    words = [start]
    for _ in range(length - 1):
        try:
            following = nextBestWord(bigramModel, words[-1])
        except ValueError:
            # Dead end: the model has no successor for the current word, so
            # return what has been generated so far instead of crashing.
            break
        words.append(following)
    return ' '.join(words)
    

if __name__ == '__main__':
    def _pause_and_clear():
        """Run the shell commands ``pause`` and then ``cls``."""
        os.system("pause")
        os.system("cls")

    # Words used to demonstrate the next-word tasks (4 and 5).
    sample_words = ("of", "update", "hopes")

    print("TUGAS LANGUAGE MODELING NLP - SFY")
    print("SILAKAN MASUKKAN IDENTITAS ANDA")
    # Identity is hard-coded; the interactive input() prompts are disabled.
    Nama = "Shindy Trimaria Laxmi"
    NIM = "1301170092"

    _pause_and_clear()

    # Load the dataset.
    data = pd.read_csv('text.csv')

    # Task 1: show the first five rows of the dataset.
    print("TUGAS 1. TAMPILKAN 5 BARIS PERTAMA DARI DATASET")
    print()
    print("HASIL : ")
    print(data.head(5))

    _pause_and_clear()

    # Task 2: build and print the unigram model.
    print("TUGAS 2. BUAT MODEL UNIGRAM")
    print()
    print("HASIL : ")
    print(buildUnigramModel(data['text']))

    _pause_and_clear()

    # Task 3: build the bigram model, print it and its number of entries.
    print("TUGAS 3. BUAT MODEL BIGRAM")
    print()
    print("HASIL : ")
    bigramModel = buildBigramModel(data['text'])
    print(bigramModel)
    print(len(bigramModel))

    _pause_and_clear()

    # Task 4: best next word for each sample word.
    print("TUGAS 4. MENAMPILKAN NEXT BEST WORD")
    print()
    print("HASIL : ")
    for word in sample_words:
        print(word + " -> ", nextBestWord(bigramModel, word))

    _pause_and_clear()

    # Task 5: top ten next words for each sample word.
    print("TUGAS 5. TOP 10 BEST NEXT WORD")
    print()
    print("HASIL : ")
    for word in sample_words:
        print(word + " -> ", nextTenBestWords(bigramModel, word))

    _pause_and_clear()

    # Task 6: generate a sentence of the length the user asks for.
    print("TUGAS 6. GENERATE KALIMAT")
    print()
    n = int(input("Panjang Kalimat : "))
    print("HASIL : ")
    print(generateSentence(bigramModel, n))

    _pause_and_clear()

    print("SELAMAT", Nama, "ANDA SUDAH MENYELESAIKAN TUGAS LANGUAGE MODELING NLP-SFY")

TUGAS LANGUAGE MODELING NLP - SFY
SILAKAN MASUKKAN IDENTITAS ANDA
TUGAS 1. TAMPILKAN 5 BARIS PERTAMA DARI DATASET

HASIL : 
                                                text
0  Oh, how the headlines blared:\nChatbots were T...
1  If you’ve ever found yourself looking up the s...
2  Machine learning is increasingly moving from h...
3  If your understanding of A.I. and Machine Lear...
4  Want to learn about applied Artificial Intelli...
TUGAS 2. BUAT MODEL UNIGRAM

HASIL : 
TUGAS 3. BUAT MODEL BIGRAM

HASIL : 


76249
TUGAS 4. MENAMPILKAN NEXT BEST WORD

HASIL : 
of ->  the
update ->  this
hopes ->  were
TUGAS 5. TOP 10 BEST NEXT WORD

HASIL : 
of ->  [['the', 0.19190575717271519], ['a', 0.04484134524035721], ['our', 0.022420672620178606], ['this', 0.0222306669200076], ['data', 0.016910507315219456], ['these', 0.01520045601368041], ['machine', 0.011400342010260307], ['each', 0.008170245107353221], ['what', 0.008170245107353221], ['them', 0.007790233707011211]]
update ->  [['this', 0.24444444444444444], ['the', 0.13333333333333333], ['our', 0.1111111111111111], ['1', 0.06666666666666667], ['2', 0.06666666666666667], ['its', 0.06666666666666667], ['it', 0.044444444444444446], ['492017', 0.044444444444444446], ['recommendations', 0.022222222222222223], ['cluster', 0.022222222222222223]]


hopes ->  [['were', 0.5], ['to', 0.25], ['of', 0.25]]
TUGAS 6. GENERATE KALIMAT

Panjang Kalimat : 15
HASIL : 
oh how much you can be a lot of the same time to the same
SELAMAT Shindy Trimaria Laxmi ANDA SUDAH MENYELESAIKAN TUGAS LANGUAGE MODELING NLP-SFY
