In [2]:
import nltk
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk import word_tokenize
from scipy.stats import chi2_contingency, ttest_1samp
import math

# Step 1: Input text and tokenize
# The text is lower-cased before tokenizing so that matching against the
# (also lower-cased) bigram words is case-insensitive.
text = input("Enter a paragraph of text: ")
tokens = word_tokenize(text.lower())

# Step 2: Input bigram to check
# Strip surrounding whitespace as well: tokens never carry it, so a stray
# space typed around the word would otherwise make every count below 0.
word1 = input("Enter first word of bigram: ").strip().lower()
word2 = input("Enter second word of bigram: ").strip().lower()

# Step 3: Frequency counts
finder = BigramCollocationFinder.from_words(tokens)
# Number of times word1 is immediately followed by word2 (0 if never seen).
bigram_freq = finder.ngram_fd[(word1, word2)]
# NOTE: finder.N is the sample size NLTK uses for its association measures.
# It is the number of tokens, not the number of adjacent pairs, which is why
# "Total Bigrams" and "Total Words" print the same value.
total_bigrams = finder.N
total_words = len(tokens)
word1_count = tokens.count(word1)
word2_count = tokens.count(word2)

print(f"\nBigram: ({word1}, {word2})")
print(f"Frequency of bigram: {bigram_freq}")
print(f"Frequency of '{word1}': {word1_count}")
print(f"Frequency of '{word2}': {word2_count}")
print(f"Total Bigrams: {total_bigrams}")
print(f"Total Words: {total_words}")

# Step 4: t-Test (simplified)
# Null hypothesis: word1 and word2 occur independently, so the expected bigram
# count is P(word1) * P(word2) * N. The sample variance is approximated by the
# observed count, giving t = (observed - expected) / sqrt(observed).
print("\n--- t-Test ---")
# Guard against empty input: with no tokens the probabilities are undefined,
# so treat the expected probability as 0 instead of dividing by zero.
if total_words > 0:
    expected_prob = (word1_count / total_words) * (word2_count / total_words)
else:
    expected_prob = 0.0
expected_freq = expected_prob * total_bigrams
# For an unseen bigram the variance estimate is 0; fall back to 1 so the
# division is defined (the score is then simply -expected_freq).
t_score = (bigram_freq - expected_freq) / math.sqrt(bigram_freq if bigram_freq > 0 else 1)

print(f"Expected Frequency (under independence): {expected_freq:.2f}")
print(f"t-Score: {t_score:.4f}")
# One-sided test: only a positive score means the pair occurs MORE often than
# chance. A large negative score means the words avoid each other and must
# not be reported as a collocation, so the sign is not discarded with abs().
if t_score > 2:
    print("Conclusion: Likely a collocation (significant)")
else:
    print("Conclusion: Not a strong collocation (not significant)")

# Step 5: Chi-Square Test
print("\n--- Chi-Square Test ---")

# Build contingency table:
#        word2      ~word2
# word1    A           B
# ~word1   C           D
#
# A: word1 followed by word2
# B: word1 not followed by word2
# C: word2 not preceded by word1
# D: everything else (remaining sample size)

A = bigram_freq
B = word1_count - bigram_freq
C = word2_count - bigram_freq
D = total_bigrams - (A + B + C)

table = [[A, B], [C, D]]

# chi2_contingency raises ValueError when an expected frequency is zero
# (e.g. one of the words never occurs, leaving an empty row or column) or
# when a cell is negative (possible when word1 == word2). Report that instead
# of aborting with a traceback.
try:
    chi2, p, dof, expected = chi2_contingency(table)
except ValueError as err:
    print(f"Chi-Square test could not be computed: {err}")
    print("Conclusion: Inconclusive (chi-square test not applicable to this table)")
else:
    print(f"Chi-Square Value: {chi2:.4f}")
    print(f"p-value: {p:.4f}")
    # The chi-square statistic has no direction: a small p-value only says
    # the words are not independent. Require the observed bigram count to
    # exceed its expected count so that pairs occurring LESS often than
    # chance are not reported as collocations.
    if p < 0.05 and A > expected[0][0]:
        print("Conclusion: Likely a collocation (significant)")
    else:
        print("Conclusion: Not a strong collocation (not significant)")

Enter a paragraph of text:  The quick brown fox jumps over the lazy dog. The lazy dog did not mind the quick fox. The fox was clever and fast, always running with the dog.
Enter first word of bigram:  lazy
Enter second word of bigram:  dog



Bigram: (lazy, dog)
Frequency of bigram: 2
Frequency of 'lazy': 2
Frequency of 'dog': 3
Total Bigrams: 33
Total Words: 33

--- t-Test ---
Expected Frequency (under independence): 0.18
t-Score: 1.2856
Conclusion: Not a strong collocation (not significant)

--- Chi-Square Test ---
Chi-Square Value: 11.1907
p-value: 0.0008
Conclusion: Likely a collocation (significant)
