# Perplexity

### How to use:
- Input 1: Corpus (Cleaned txt file)
[1]
- Input 2: The generated poem, split into a list of sentences (one string per sentence)
[12]
- Output: 
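- For a single sentence, use the per-sentence functions: `unigram_perplexity`, `bigram_perplexity`, `trigram_perplexity`, `fourgram_perplexity`
- For multiple sentences, use the `total_*` functions: `total_unigram_perplexity`, `total_bigram_perplexity`, `total_trigram_perplexity`, `total_fourgram_perplexity`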

### Data Import + Data Tokenization

In [1]:
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the read completes instead of leaving it open for the
# lifetime of the session.
with open("corpus_CGR.txt", encoding="utf-8") as file:
    corpus = file.read()

In [2]:
import re
import nltk
#nltk.download('punkt')

In [3]:
# Clean text
# corpus = re.sub(r'[^A-Za-z\s\']',"", corpus)
# corpus = corpus.lower()

In [4]:
# Tokenize the corpus text.
# The pattern r'\w+' makes every run of word characters a token, so whitespace
# and punctuation (including apostrophes) act as token boundaries and are dropped.
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
# Flat list of all corpus tokens, in order of appearance.
tokens = tokenizer.tokenize(corpus)

### N-gram Models

In [5]:
# N-gram lists built over the whole token stream.
# `tokens` is one flat list for the entire corpus, so n-grams are formed across
# any sentence or line boundaries that were present in the text.
from nltk import ngrams
# Alias of `tokens` (the same list object, not a copy).
unigramlist = tokens
# Each entry below is a tuple of 2, 3 or 4 consecutive tokens respectively.
bigramlist = list(ngrams(tokens,2))
trigramlist = list(ngrams(tokens,3))
fourgramlist = list(ngrams(tokens,4))

In [6]:
# N-gram frequency tables: map each token (unigrams) or token tuple
# (bi-/tri-/four-grams) to the number of times it occurs in the corpus.
unifreq = nltk.FreqDist(unigramlist)
bifreq =  nltk.FreqDist(bigramlist)
trifreq = nltk.FreqDist(trigramlist)
fourfreq = nltk.FreqDist(fourgramlist)

In [7]:
# N-gram probability estimates (add-one smoothed, despite the `_mle` names)
def unigram_mle(word):
    """Return the add-one (Laplace) smoothed unigram probability of ``word``.

    The estimate is ``(count(word) + 1) / (N + V)``, where ``N`` is the number
    of corpus tokens and ``V`` is the number of distinct corpus tokens.  A word
    that never occurs in the corpus has count 0 and therefore receives
    ``1 / (N + V)``.  With this denominator the estimates over the corpus
    vocabulary sum to 1.

    Despite the name, this is a smoothed estimate, not a pure
    maximum-likelihood one.
    """
    # The frequency table yields 0 for unseen words, so one expression covers
    # both seen and unseen words without scanning the token list.
    total_tokens = len(unigramlist)
    vocab_size = len(unifreq)
    return (unifreq[word] + 1) / (total_tokens + vocab_size)
def bigram_mle(A, B):
    """Return the add-one smoothed probability of word ``A`` following ``B``.

    The estimate is ``(1 + count(B, A)) / (count(B) + V)``, where ``V`` is the
    number of distinct corpus tokens.  An unseen bigram has count 0 and gets
    ``1 / (count(B) + V)``.

    Parameters
    ----------
    A : the word being predicted.
    B : the word immediately preceding ``A``.
    """
    # The frequency tables yield 0 for unseen keys, so no membership test
    # against the full bigram list (a linear scan per call) is required.
    vocab_size = len(unifreq)
    return (1 + bifreq[(B, A)]) / (unifreq[B] + vocab_size)
def trigram_mle(A, B, C):
    """Return the add-one smoothed probability of ``A`` given context ``B C``.

    The estimate is ``(1 + count(B, C, A)) / (count(B, C) + V)``, where ``V``
    is the number of distinct corpus tokens.  An unseen trigram has count 0
    and gets ``1 / (count(B, C) + V)``.

    Parameters
    ----------
    A : the word being predicted.
    B, C : the two words preceding ``A``, in text order.
    """
    # The frequency tables yield 0 for unseen keys, so no membership test
    # against the full trigram list (a linear scan per call) is required.
    vocab_size = len(unifreq)
    return (1 + trifreq[(B, C, A)]) / (bifreq[(B, C)] + vocab_size)

def fourgram_mle(A, B, C, D):
    """Return the add-one smoothed probability of ``A`` given context ``B C D``.

    The estimate is ``(1 + count(B, C, D, A)) / (count(B, C, D) + V)``, where
    ``V`` is the number of distinct corpus tokens.  An unseen four-gram has
    count 0 and gets ``1 / (count(B, C, D) + V)``.

    Parameters
    ----------
    A : the word being predicted.
    B, C, D : the three words preceding ``A``, in text order.
    """
    # Seen and unseen four-grams share the same vocabulary-size term in the
    # denominator, matching the bigram and trigram estimators.  The frequency
    # tables yield 0 for unseen keys, so a single expression covers both cases
    # without scanning the full four-gram list.
    vocab_size = len(unifreq)
    return (1 + fourfreq[(B, C, D, A)]) / (trifreq[(B, C, D)] + vocab_size)

### Perplexities

In [8]:
# Unigram Perplexities
import math
def unigram_perplexity(sentence):
    """Return the unigram perplexity of a single sentence.

    The sentence is tokenized with the same ``tokenizer`` that produced the
    corpus tokens, so punctuation attached to a word (e.g. ``"field."``) does
    not turn a known word into an unseen one.  The result is the geometric
    mean of ``1 / unigram_mle(word)`` over the sentence's tokens.

    Returns 1 when the sentence contains no tokens.
    """
    words = tokenizer.tokenize(sentence)
    if not words:
        return 1
    exponent = 1 / len(words)
    perplexity = 1
    for word in words:
        # Taking the root of each factor before multiplying keeps the running
        # product in a comfortable floating-point range.
        perplexity *= (1 / unigram_mle(word)) ** exponent
    return perplexity
def total_unigram_perplexity(test):
    """Return the combined unigram perplexity of several sentences.

    ``test`` is a sequence of sentence strings.  The result is the geometric
    mean of the per-sentence ``unigram_perplexity`` values, so every sentence
    carries equal weight regardless of its length.

    Returns 1 when ``test`` is empty.
    """
    combined = 1
    for sentence in test:
        sentence_score = unigram_perplexity(sentence)
        combined = combined * (sentence_score ** (1 / len(test)))
    return combined

In [9]:
# Bigram Perplexity
def bigram_perplexity(sentence):
    """Return the bigram perplexity of a single sentence.

    The sentence is tokenized with the same ``tokenizer`` that produced the
    corpus tokens, so punctuation attached to a word does not turn a known
    word into an unseen one.  Each token after the first contributes the
    factor ``1 / bigram_mle(token, previous_token)``.

    Returns 1 when the sentence has fewer than two tokens, because no bigram
    can be formed.
    """
    words = tokenizer.tokenize(sentence)
    if len(words) < 2:
        return 1
    # NOTE(review): the root is taken over the total token count although only
    # len(words) - 1 factors are multiplied; confirm this normalisation is
    # intended before comparing scores across n-gram orders.
    exponent = 1 / len(words)
    perplexity = 1
    for previous, current in zip(words, words[1:]):
        perplexity *= (1 / bigram_mle(current, previous)) ** exponent
    return perplexity
def total_bigram_perplexity(test):
    """Return the combined bigram perplexity of several sentences.

    ``test`` is a sequence of sentence strings.  The result is the geometric
    mean of the per-sentence ``bigram_perplexity`` values, so every sentence
    carries equal weight regardless of its length.

    Returns 1 when ``test`` is empty.
    """
    combined = 1
    for sentence in test:
        sentence_score = bigram_perplexity(sentence)
        combined = combined * (sentence_score ** (1 / len(test)))
    return combined

In [10]:
# Trigram Perplexity
def trigram_perplexity(sentence):
    """Return the trigram perplexity of a single sentence.

    The sentence is tokenized with the same ``tokenizer`` that produced the
    corpus tokens, so punctuation attached to a word does not turn a known
    word into an unseen one.  Each token from the third onward contributes the
    factor ``1 / trigram_mle(token, two_back, one_back)``.

    Returns 1 when the sentence has fewer than three tokens, because no
    trigram can be formed.
    """
    words = tokenizer.tokenize(sentence)
    if len(words) < 3:
        return 1
    # NOTE(review): the root is taken over the total token count although only
    # len(words) - 2 factors are multiplied; confirm this normalisation is
    # intended before comparing scores across n-gram orders.
    exponent = 1 / len(words)
    perplexity = 1
    for two_back, one_back, current in zip(words, words[1:], words[2:]):
        perplexity *= (1 / trigram_mle(current, two_back, one_back)) ** exponent
    return perplexity
def total_trigram_perplexity(test):
    """Return the combined trigram perplexity of several sentences.

    ``test`` is a sequence of sentence strings.  The result is the geometric
    mean of the per-sentence ``trigram_perplexity`` values, so every sentence
    carries equal weight regardless of its length.

    Returns 1 when ``test`` is empty.
    """
    combined = 1
    for sentence in test:
        sentence_score = trigram_perplexity(sentence)
        combined = combined * (sentence_score ** (1 / len(test)))
    return combined

In [11]:
# Fourgram Perplexity
def fourgram_perplexity(sen):
    """Return the four-gram perplexity of a single sentence.

    The sentence is tokenized with the same ``tokenizer`` that produced the
    corpus tokens, so punctuation attached to a word does not turn a known
    word into an unseen one.  Each token from the fourth onward contributes
    the factor ``1 / fourgram_mle(token, three_back, two_back, one_back)``.

    Returns 1 when the sentence has fewer than four tokens, because no
    four-gram can be formed.
    """
    words = tokenizer.tokenize(sen)
    if len(words) < 4:
        return 1
    # NOTE(review): the root is taken over the total token count although only
    # len(words) - 3 factors are multiplied; confirm this normalisation is
    # intended before comparing scores across n-gram orders.
    exponent = 1 / len(words)
    perplexity = 1
    windows = zip(words, words[1:], words[2:], words[3:])
    for three_back, two_back, one_back, current in windows:
        probability = fourgram_mle(current, three_back, two_back, one_back)
        perplexity *= (1 / probability) ** exponent
    return perplexity
def total_fourgram_perplexity(test):
    """Return the combined four-gram perplexity of several sentences.

    ``test`` is a sequence of sentence strings.  The result is the geometric
    mean of the per-sentence ``fourgram_perplexity`` values, so every sentence
    carries equal weight regardless of its length.

    Returns 1 when ``test`` is empty.
    """
    combined = 1
    for sentence in test:
        sentence_score = fourgram_perplexity(sentence)
        combined = combined * (sentence_score ** (1 / len(test)))
    return combined

In [28]:
# For multiple sentences, first split the poem into a list of strings,
# one string per sentence. Every entry ends with a comma: adjacent string
# literals without one are silently concatenated into a single sentence.
text1 = [
    "night was a wicked world having been tost",
    "but a wicked world quake when one tailor el",
]
text2 = [
    "You live his difficulty to the shore",
    "Like a peace to frail other Eye",
    "Desterial lamb out kindness so well",
    "And they Peach back on field. Our home, who find it as his own",
    "But where a grief cost thee they don't in youth",
    "And there is a bride in the flushing best night white",
    "Where a gown upon the fortune land",
    "Like an aspiring stroking gain, divine",
    "when maiden shall been sin",
    "On her whirls for the orition of thine eyes",
    "And was later and every mouth",
    "It was my George each other's fear",
]


In [34]:
# Perplexities for Model 1 and 2 generated poems
# Scores for `text1` under the unigram, bigram, trigram and four-gram models.
# The values are only stored in t11..t14; nothing is printed by this cell.
t11 = total_unigram_perplexity(text1)
t12 = total_bigram_perplexity(text1)
t13 = total_trigram_perplexity(text1)
t14 = total_fourgram_perplexity(text1)

In [None]:
# Scores for `text2` under the unigram, bigram, trigram and four-gram models.
# The values are only stored in t21..t24; nothing is printed by this cell.
t21 = total_unigram_perplexity(text2)
t22 = total_bigram_perplexity(text2)
t23 = total_trigram_perplexity(text2)
t24 = total_fourgram_perplexity(text2)