In [6]:
import nltk
nltk.download('punkt')

import pandas as pd
from nltk.tokenize import word_tokenize
from csv import QUOTE_NONE

# Read the SST data
datain_sst = pd.read_csv("D:\ProbabilityDistribution\SST-2/train.tsv",delimiter="\t")
datain_sst_low = datain_sst['sentence'].str.lower()

#Read the qnli data
datain_qnli = pd.read_csv(r"D:\ProbabilityDistribution\QNLI/dev.tsv",delimiter="\t",quoting=QUOTE_NONE)
datain_qnli_low= datain_qnli['sentence'].str.lower()

sst_token=[]

#Tokenize sentences from SST dataset
for i in datain_sst_low:
    token = word_tokenize(i)
    sst_token.extend(token)

qnli_token=[]

#Tokenize sentences from qnli dataset
for i in datain_qnli_low:
    token = word_tokenize(i)
    qnli_token.extend(token)
#print first 10 from each dataset
print(sst_token[:10])
print(qnli_token[:10])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\soham\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['hide', 'new', 'secretions', 'from', 'the', 'parental', 'units', 'contains', 'no', 'wit']
['as', 'of', 'that', 'day', ',', 'the', 'new', 'constitution', 'heralding', 'the']


In [7]:
#function to find probability distribution of a token list 
def probability_distribution(token_list):
    count={}

    #count occurence of each token
    for token in token_list:    
        if token in count:
            count[token] += 1
        else:
            count[token]=1
    
    token_length= len(token_list)
    probability_d={}
    
    #Calculate probability of each token
    for i, j in count.items():
        probability = j / token_length
        probability_d[i] = probability

    return probability_d


#print the probabilities for both datasets
sst_probability_d= probability_distribution(sst_token)
sst_probability= sum(sst_probability_d.values())
print(f"The probability distribution of sst dataset is '{sst_probability}'")
qnli_probability_d= probability_distribution(qnli_token)
qnli_probability= sum(qnli_probability_d.values())
print(f"The probability distribution of qnli dataset is '{qnli_probability}'")

The probability distribution of sst dataset is '0.9999999999998211'
The probability distribution of qnli dataset is '0.9999999999998687'


In [8]:
import math

# A function to find the entropy of probability distribution
def find_entropy(probability_distribution):
    entropy = 0

    for i in probability_distribution.values():
        if i > 0:
            entropy -= i * math.log2(i)

    return entropy

#print word-level entropy for sst and qnli datasets
print(f" word level entropy of sst dataset is '{find_entropy(sst_probability_d)}'")
print(f" word level entropy of qnli dataset is '{find_entropy(qnli_probability_d)}'")

 word level entropy of sst dataset is '10.079162530566823'
 word level entropy of qnli dataset is '10.037404792966129'


In [9]:
# A function to find the kl divergence between the two probability distributions
def find_kl_divergence(pd_1, pd_2):
    kl_divergence = 0.0
    for i in pd_1:
        if pd_1[i] > 0 and i in pd_2:
            kl_divergence += pd_1[i] * math.log2(pd_1[i] / pd_2[i])
    return kl_divergence

#print KL divergence results and check for symmetry
print(f"kl divergence for sst dataset w.r.t qnli dataset is '{find_kl_divergence(sst_probability_d, qnli_probability_d)}'")
print(f"kl divergence for qnli dataset w.r.t sst dataset is '{find_kl_divergence(qnli_probability_d, sst_probability_d)}'")

print("As both the values are different KL divergence is not symmetric")

kl divergence for sst dataset w.r.t qnli dataset is '0.829548217709137'
kl divergence for qnli dataset w.r.t sst dataset is '0.7374204121557258'
As both the values are different KL divergence is not symmetric


In [10]:
# A function to find the entropy rate of a message given a probability distribution
def find_entropy_rate(message, probability_distribution):
    
    tokens = word_tokenize(message.lower())      #Tokenize and convert to lowercase

    total_entropy = 0 
    length_tokens = len(tokens)

    for i in tokens:
        if i in probability_distribution:
            token_probability = probability_distribution[i]
        else:
            token_probability = 1e-10            #small epsilon for missing tokens

        total_entropy = -token_probability * math.log2(token_probability)
        total_entropy+=total_entropy

    entropy_rate = total_entropy / length_tokens

    return entropy_rate

#Example of a movie review
movie_review = '''With its unique visual style and a story that captures the essence of the franchise's appeal,
 Teenage Mutant Ninja Turtles: Mutant Mayhem is an animated treat for the whole family.'''
print(find_entropy_rate(movie_review, sst_probability_d))
print(find_entropy_rate(movie_review, qnli_probability_d))


0.006698149655695167
0.009244245596792456
