In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from tabulate import tabulate

def preprocess_text(text: str) -> list[str]:
    """Preprocesses the text by tokenizing, removing punctuation and stopwords."""
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    return [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words]

def calculate_mean_probability(bigram_freq: FreqDist, total_bigrams: int) -> dict:
    """Calculates the mean probability (μ-value) of each bigram."""
    mean_probabilities = {bigram: freq / total_bigrams for bigram, freq in bigram_freq.items()}
    return mean_probabilities

def main():
    # Download required NLTK data
    # nltk.download('punkt')
    # nltk.download('stopwords')

    # Load text
    with open('text3.txt', 'r') as file:
        text = file.read()

    # Preprocess text
    words = preprocess_text(text)

    # Calculate word frequency distribution
    fdist = FreqDist(words)

    # Calculate bigrams and their frequencies
    bigrams = list(nltk.bigrams(words))
    bigram_freq = FreqDist(bigrams)

    # Calculate mean probability (μ-value) for each bigram
    mean_probabilities = calculate_mean_probability(bigram_freq, len(bigrams))

    # Sort collocations by mean probability
    collocations = sorted(mean_probabilities.items(), key=lambda x: x[1], reverse=True)

    '''# Calculate mean of bigram frequencies
    total_bigram_freq = sum(bigram_freq.values())
    mean_bigram_freq = total_bigram_freq / len(bigram_freq)

    # Print mean bigram frequency
    print(f"Mean Bigram Frequency: {mean_bigram_freq:.2f}\n")'''

    # Print top N collocations with their frequencies and mean probabilities
    N = 10
    headers = ["Rank", "Bigram", "Frequency", "Mean Probability (μ-value)"]
    table = []
    for i, (bigram, mean_prob) in enumerate(collocations[:N]):
        table.append([i+1, bigram, bigram_freq[bigram], f"{mean_prob:.6f}"])
    print(tabulate(table, headers, tablefmt="orgtbl"))

if __name__ == "__main__":
    main()

|   Rank | Bigram                      |   Frequency |   Mean Probability (μ-value) |
|--------+-----------------------------+-------------+------------------------------|
|      1 | ('data', 'science')         |          15 |                     0.020243 |
|      2 | ('data', 'processing')      |           7 |                     0.009447 |
|      3 | ('predictive', 'analytics') |           5 |                     0.006748 |
|      4 | ('data', 'visualization')   |           5 |                     0.006748 |
|      5 | ('ai', 'data')              |           4 |                     0.005398 |
|      6 | ('ai', 'algorithms')        |           4 |                     0.005398 |
|      7 | ('data', 'cleaning')        |           4 |                     0.005398 |
|      8 | ('cleaning', 'preparation') |           4 |                     0.005398 |
|      9 | ('natural', 'language')     |           4 |                     0.005398 |
|     10 | ('language', 'processing')  |           4 |

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from tabulate import tabulate
from scipy.stats import chi2_contingency, ttest_1samp
import numpy as np

def preprocess_text(text: str) -> list[str]:
    """Preprocesses the text by tokenizing, removing punctuation and stopwords."""
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    return [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words]

def calculate_mean_probability(bigram_freq: FreqDist, total_bigrams: int) -> dict:
    """Calculates the mean probability (μ-value) of each bigram."""
    mean_probabilities = {bigram: freq / total_bigrams for bigram, freq in bigram_freq.items()}
    return mean_probabilities

def perform_statistical_tests(bigram_freq: FreqDist, word_freq: FreqDist, total_bigrams: int):
    """Perform t-test and chi-square test for each bigram."""
    results = []

    for bigram, observed_freq in bigram_freq.items():
        word1, word2 = bigram
        freq_w1 = word_freq.get(word1, 0)
        freq_w2 = word_freq.get(word2, 0)
        
        # Expected frequency for the bigram assuming independence
        expected_freq = (freq_w1 * freq_w2) / total_bigrams
        
        # Chi-square test
        observed = np.array([
            [observed_freq, freq_w1 - observed_freq],
            [freq_w2 - observed_freq, total_bigrams - (freq_w1 + freq_w2 - observed_freq)]
        ])
        
        try:
            chi2_stat, p_value_chi2, dof, ex = chi2_contingency(observed)
        except ValueError:
            chi2_stat, p_value_chi2 = np.nan, np.nan
        
        # Generate sample data to perform t-test
        sample_data = [observed_freq] * observed_freq + [expected_freq] * (total_bigrams - observed_freq)
        
        # Perform one-sample t-test
        t_stat, p_value_t = ttest_1samp(sample_data, expected_freq)
        
        results.append((bigram, observed_freq, t_stat, p_value_t, chi2_stat, p_value_chi2))
    
    return results

def main():
    # Download required NLTK data
    # nltk.download('punkt')
    # nltk.download('stopwords')

    # Load text
    with open("text3.txt", 'r') as file:
        text = file.read()

    # Preprocess text
    words = preprocess_text(text)

    # Calculate word frequency distribution
    word_freq = FreqDist(words)

    # Calculate bigrams and their frequencies
    bigrams = list(nltk.bigrams(words))
    bigram_freq = FreqDist(bigrams)

    # Calculate mean probability (μ-value) for each bigram
    mean_probabilities = calculate_mean_probability(bigram_freq, len(bigrams))

    # Sort collocations by mean probability
    collocations = sorted(mean_probabilities.items(), key=lambda x: x[1], reverse=True)

    # Calculate mean of bigram frequencies
    total_bigram_freq = sum(bigram_freq.values())
    mean_bigram_freq = total_bigram_freq / len(bigram_freq)

    # Print mean bigram frequency
    print(f"Mean Bigram Frequency: {mean_bigram_freq:.2f}\n")

    # Perform statistical tests for each bigram
    results = perform_statistical_tests(bigram_freq, word_freq, len(bigrams))

    # Print top N collocations with their frequencies and mean probabilities
    N = 10
    headers = ["Rank", "Bigram", "Frequency", "Mean Prob(μ)", "t-Statistic", "p-Value(t-Test)", "Chi Square", "p-Value(Chi-Square)"]
    table = []
    for i, (bigram, observed_freq, t_stat, p_value_t, chi2_stat, p_value_chi2) in enumerate(results[:N]):
        table.append([
            i + 1,
            bigram,
            observed_freq,
            f"{mean_probabilities.get(bigram, 0):.6f}",
            f"{t_stat:.4f}" if not np.isnan(t_stat) else "NaN",
            f"{p_value_t:.4f}" if not np.isnan(p_value_t) else "NaN",
            f"{chi2_stat:.4f}" if not np.isnan(chi2_stat) else "NaN",
            f"{p_value_chi2:.4f}" if not np.isnan(p_value_chi2) else "NaN"
        ])
    print(tabulate(table, headers, tablefmt="grid"))

if __name__ == "__main__":
    main()


Mean Bigram Frequency: 1.17

+--------+--------------------------------+-------------+----------------+---------------+-------------------+--------------+-----------------------+
|   Rank | Bigram                         |   Frequency |   Mean Prob(μ) |   t-Statistic |   p-Value(t-Test) |   Chi Square |   p-Value(Chi-Square) |
|      1 | ('impact', 'artificial')       |           2 |       0.002699 |        1.4152 |            0.1574 |     183.747  |                0      |
+--------+--------------------------------+-------------+----------------+---------------+-------------------+--------------+-----------------------+
|      2 | ('artificial', 'intelligence') |           3 |       0.004049 |        1.7344 |            0.0833 |     307.077  |                0      |
+--------+--------------------------------+-------------+----------------+---------------+-------------------+--------------+-----------------------+
|      3 | ('intelligence', 'data')       |           3 |       0.00404