# Experiment 2 :
<b>Perform t-Test and Chi-Square test to check whether a given sequence of words is a
collocation or not.</b>

In [None]:
import string
from collections import Counter

import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import gutenberg, stopwords

data = gutenberg.raw('austen-emma.txt')

# PREPROCESSING THE GIVEN DATA

# Tokenization and stopword removal.
# Punctuation is stripped from each sentence before it is split into words.
sent_tokens = sent_tokenize(data)
# The translation table never changes, so build it once, not once per sentence.
punctuation_table = str.maketrans('', '', string.punctuation)
word_tokens = []
for sentence in sent_tokens:
    word_tokens += word_tokenize(sentence.translate(punctuation_table))
stops = set(stopwords.words('english'))
# Stopword matching is case-insensitive; the surviving tokens keep their case.
word_tokens = [word for word in word_tokens if word.lower() not in stops]

# Frequency and probability of every distinct token.
# A single Counter pass replaces calling list.count() once per unique word,
# which rescanned the whole token list each time.
frequency = dict(Counter(word_tokens))
unique_words = set(frequency)
total_words = len(word_tokens)
print(f"TOTAL WORDS IN THE CORPUS : {total_words}")
print(f"UNIQUE WORDS : {len(unique_words)}")

# NOTE: the name `propability` (sic) is kept as-is because the t-test cell
# looks words up through it.
propability = {word: count / total_words for word, count in frequency.items()}

In [None]:
# Generate bigrams, then their frequencies and probabilities.
# The pairs are materialised as a list: a bare zip() is a one-shot iterator
# that would be left empty once the counting below has consumed it.
bigrams = list(zip(word_tokens[:-1], word_tokens[1:]))
bigram_count = len(bigrams)

# Number of occurrences of each distinct (w1, w2) pair, in first-seen order.
bigram_freq = dict(Counter(bigrams))

# Relative frequency of each pair among all bigram positions.
# (With no bigrams the comprehension is empty, so nothing is divided by zero.)
bigram_prop = {
    bigram: freq / bigram_count for bigram, freq in bigram_freq.items()
}
print("TOTAL UNIQUE BIGRAMS :", len(bigram_freq))

In [None]:
import math
from scipy.stats import t, chi2  # for critical values (feed the value directly if given)

# T-test demonstration.
# For every bigram, the observed bigram probability is compared with the
# product of the two individual word probabilities (the value expected if the
# words occurred independently of each other).
n = len(word_tokens)
# One-sided critical value: the 0.95 quantile of Student's t, n-1 degrees of freedom.
t_critical = t.ppf(1 - 0.05, n - 1)

t_colloc = []
for (first, second), observed in bigram_prop.items():
    expected = propability[first] * propability[second]
    # Standard error of the observed proportion over n trials.
    std_error = math.sqrt(observed * (1 - observed) / n)
    if (observed - expected) / std_error > t_critical:
        t_colloc.append((first, second))

print(f"{len(t_colloc)} COLLOCATIONS IN THE CORPUS DETERMINED FROM T-TEST : \n")
print(t_colloc)

In [None]:
# Chi^2 test demonstration.
# For every bigram a 2x2 table of observed counts is compared with the counts
# expected from the two word frequencies alone; bigrams whose statistic
# exceeds the critical value are collected.
n = len(word_tokens)
chi_critical = chi2.ppf(1 - 0.05, 1)  # 0.95 quantile of chi-square, 1 degree of freedom

chi_colloc = []
for pair in bigram_prop:
    first, second = pair
    count_first, count_second = frequency[first], frequency[second]

    # Observed counts: the pair itself, each word without the other, neither.
    both = bigram_freq[pair]
    only_second = count_second - both
    only_first = count_first - both
    neither = n - (both + only_second + only_first)
    observed = (both, only_second, only_first, neither)

    # Expected counts, listed in the same cell order as `observed`.
    not_first = n - count_first
    not_second = n - count_second
    expected = (
        (count_first * count_second) / n,
        (not_first * count_second) / n,
        (count_first * not_second) / n,
        (not_first * not_second) / n,
    )

    chi_stat = sum(((o - e) ** 2) / e for o, e in zip(observed, expected))
    if chi_stat > chi_critical:
        chi_colloc.append(pair)

print(f"{len(chi_colloc)} COLLOCATIONS IN THE CORPUS DETERMINED FROM CHI^2-TEST : \n")
print(chi_colloc)
    

In [None]:
# Stand-alone chi-square computation for one bigram from hard-coded counts.
# It uses the same formula as the corpus-wide chi-square loop, which takes
# f1 and f2 from the two word frequencies and n from the token count.
f1, f2, n = 15828, 4675, 14307676

# Observed counts of the 2x2 table; o_1_2 is the count of the bigram itself.
o_1_2 = 8
o_n1_2, o_1_n2 = f2 - o_1_2, f1 - o_1_2
o_n1_n2 = n - (o_1_2 + o_n1_2 + o_1_n2)
obs = [o_1_2, o_n1_2, o_1_n2, o_n1_n2]

# Expected counts, in the same cell order as `obs`.
rest_1, rest_2 = n - f1, n - f2
e_1_2 = (f1 * f2) / n
e_n1_2 = (rest_1 * f2) / n
e_1_n2 = (f1 * rest_2) / n
e_n1_n2 = (rest_1 * rest_2) / n
exp = [e_1_2, e_n1_2, e_1_n2, e_n1_n2]

# Sum of (observed - expected)^2 / expected over the four cells.
chi_stat = sum(((o - e) ** 2) / e for o, e in zip(obs, exp))
print(chi_stat)