# Perplexity

### How to use:
- Input 1: Corpus (Cleaned txt file)
[1]
- Input 2: The poem you generate, need to split to list of sentences (string)
[12]
- Output: 
- if only one sentence, use n-gram_perplexity functions
- if multiple sentences, use total_n-gram_perplexity functions

### Data Import + Data Tokenization

In [1]:
file = open("corpus_CGR.txt", encoding="utf-8")
corpus = file.read()

In [2]:
import re
import nltk
#nltk.download('punkt')

In [3]:
# Clean text
# corpus = re.sub(r'[^A-Za-z\s\']',"", corpus)
# corpus = corpus.lower()

In [4]:
# Tokenize original text
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(corpus)

### N-gram Models

In [5]:
# N-gram List
from nltk import ngrams
unigramlist = tokens
bigramlist = list(ngrams(tokens,2))
trigramlist = list(ngrams(tokens,3))
fourgramlist = list(ngrams(tokens,4))

In [6]:
# N-gram Frequency Dict
unifreq = nltk.FreqDist(unigramlist)
bifreq =  nltk.FreqDist(bigramlist)
trifreq = nltk.FreqDist(trigramlist)
fourfreq = nltk.FreqDist(fourgramlist)

In [7]:
# N-gram MLE
def unigram_mle(word): 
    if word in unigramlist:
        return((unifreq[word]+1)/(len(unigramlist)+len(tokens)))
    else:
        return(1/(len(unigramlist)+len(tokens)))
def bigram_mle(A,B):
    if (B,A) in bigramlist :
        return((1+bifreq[(B,A)])/(unifreq[B] + len(set(unifreq.keys()))))
    else:
        return(1/(unifreq[B]+ len(set(unifreq.keys()))))
def trigram_mle(A,B,C):
    if (B,C,A) in trigramlist :
        return((1+trifreq[(B,C,A)])/(bifreq[(B,C)]+ len(set(unifreq.keys()))))    
    else:
        return((1)/(bifreq[(B,C)]+ len(set(unifreq.keys()))))

def fourgram_mle(A,B,C,D): 
    if (B,C,D,A) in fourgramlist :
        return((1+fourfreq[(B,C,D,A)])/(trifreq[(B,C,D)]+len(set(unifreq.keys()))))
    else:
        return((1)/(trifreq[(B,C,D)]+ len(set(trifreq.keys()))))

### Perplexities

In [8]:
# Unigram Perplexities
import math
def unigram_perplexity(sentence):
    words = sentence.split()
    p = 1
    for i in range(len(words)):
        k = unigram_mle(words[i])
        p = p*(1/k)**(1/len(words))
    return(p)
def total_unigram_perplexity(test):
    k = 1
    mo = 0
    for i in test: 
        temp = unigram_perplexity(i)
        k = k * (temp**(1/len(test)))
    return(k)

In [9]:
# Bigram Perplexity
def bigram_perplexity(sentence):
    words = sentence.split()
    p = 1
    for i in range(1,len(words)):
        k = bigram_mle(words[i],words[i-1])
        p = p*(1/k)**(1/len(words))
    return(p)
def total_bigram_perplexity(test):
    k = 1
    mo = 0
    for i in test: 
        temp = bigram_perplexity(i)
        k = k* (temp**(1/len(test)))
    return(k)

In [10]:
# Trigram Perplexity
def trigram_perplexity(sentence):
    p = 1
    words = sentence.split()
    for i in range(2,len(words)):
        k = trigram_mle(words[i],words[i-2],words[i-1])
        p = p*(1/k)**(1/len(words))
    return(p)
def total_trigram_perplexity(test):
    k = 1
    for i in test: 
        temp = trigram_perplexity(i)
        k = k* (temp**(1/len(test)))
    return(k)

In [11]:
# Fourgram Perplexity
def fourgram_perplexity(sen):
    p = 1
    words = sen.split()
    for i in range(3,len(words)):
        k = fourgram_mle(words[i],words[i-3],words[i-2],words[i-1])
        p = p*(1/k)**(1/len(words))
    return(p)
def total_fourgram_perplexity(test):
    k = 1
    for i in test: 
        temp = fourgram_perplexity(i)
        k = k* (temp**(1/len(test)))
    return(k)

In [12]:
# For multiple sentences, first split to list of strings
text1 = ["The flower at a wind who said the hanging albatross grapple on the sea sea couple man"]
text2 = ["Love can't be described. key deadweed deadweed adorned harrow rook pollen pollen haul sores"]
text3 = [
  "That I stands his fingers o'er",
  "Once as we will set off",
        "To do then at its care",
    "And can the deep wind course to say",
        "Leastan","greatest are both",
    "Like some wonderful words the sake for life",
      "But I know you bowed my world round down",
    "So more than a something in the days",
      "Bid me up and put on their grace",
    "Looks","What is gladly on his mile",
    "With sertity drew real",
      "The different clearly mere works fire",
    "Sage from my heart lowth in the rain",
      "To doth the cower", "and her new pleasures on my flame",
      "Go din", "my care"
]
text4 = ["The wind is silent and the grave"]

In [13]:
print("Perplexity for Unigram model of text1:", total_unigram_perplexity(text1))
print("Perplexity for Unigram model of text2:", total_unigram_perplexity(text2))
print("Perplexity for Unigram model of text3:", total_unigram_perplexity(text3))
print("Perplexity for Unigram model of text4:", total_unigram_perplexity(text4))

Perplexity for Unigram model of text1: 2140.202696993924
Perplexity for Unigram model of text2: 175613.91093758805
Perplexity for Unigram model of text3: 5069.772503608099
Perplexity for Unigram model of text4: 603.3084449512252


In [14]:
print("Perplexity for Bigram model of text1:", total_bigram_perplexity(text1))
print("Perplexity for Bigram model of text2:", total_bigram_perplexity(text2))
print("Perplexity for Bigram model of text3:", total_bigram_perplexity(text3))
print("Perplexity for Bigram model of text4:", total_bigram_perplexity(text4))

Perplexity for Bigram model of text1: 3627.303744345597
Perplexity for Bigram model of text2: 15298.700117670298
Perplexity for Bigram model of text3: 546.7865057007787
Perplexity for Bigram model of text4: 411.6899377591998


In [15]:
print("Perplexity for Trigram model of text1:", total_trigram_perplexity(text1))
print("Perplexity for Trigram model of text2:", total_trigram_perplexity(text2))
print("Perplexity for Trigram model of text3:", total_trigram_perplexity(text3))
print("Perplexity for Trigram model of text4:", total_trigram_perplexity(text4))

Perplexity for Trigram model of text1: 7204.9616845944975
Perplexity for Trigram model of text2: 7252.460250417535
Perplexity for Trigram model of text3: 205.54940176402422
Perplexity for Trigram model of text4: 1054.029697844953


In [17]:
print("Perplexity for Fourgram model of text1:", total_fourgram_perplexity(text1))
print("Perplexity for Fourgram model of text2:", total_fourgram_perplexity(text2))
print("Perplexity for Fourgram model of text3:", total_fourgram_perplexity(text3))
print("Perplexity for Fourgram model of text4:", total_fourgram_perplexity(text4))

Perplexity for Fourgram model of text1: 51995.82281591697
Perplexity for Fourgram model of text2: 31580.40489006827
Perplexity for Fourgram model of text3: 165.65727519714898
Perplexity for Fourgram model of text4: 1871.9925957309315
