In [69]:
import mysql.connector
import math

In [70]:
# Establish a connection to the database
cnx = mysql.connector.connect(
        host="localhost",
        port=3306,
        database="ru_cococo",
        user="root",
        password="12345678",
    )
cursor = cnx.cursor()

In [71]:
# Define the SQL query
get_bigram_and_both_unigram_freq = """
SELECT 
    bigram.id AS bigram_id,
    bigram.freq AS bigram_freq,
    unigram1.freq AS first_unigram_freq,
    unigram2.freq AS second_unigram_freq
FROM 
    bigram
JOIN 
    unigram AS unigram1 ON bigram.first_unigram_id = unigram1.id
JOIN 
    unigram AS unigram2 ON bigram.second_unigram_id = unigram2.id;
"""

In [72]:
get_unigram_freq = """
SELECT
    unigram.freq
FROM
    unigram
"""

In [73]:
cursor.execute(get_bigram_and_both_unigram_freq)
bigrams = cursor.fetchall()

In [74]:
total_bigrams = sum(bigram[1] for bigram in bigrams)
print(total_bigrams)

14229646


In [75]:
cursor.execute(get_unigram_freq)
unigrams_freq = cursor.fetchall()
total_unigrams = sum(freq[0] for freq in unigrams_freq)
print(total_unigrams)

28416709


In [76]:
def calculate_t_score(bigram):
    E_xy = float(bigram[2] + bigram[3]) / float(total_bigrams)
    t_score = (bigram[1] - E_xy) / (bigram[1] ** 0.5)
    return t_score

In [77]:
def calculate_mi_score(bigram):
    # Probability of bigram
    P_xy = float(bigram[1]) / float(total_bigrams)
    
    # Probabilities of unigrams
    P_x = float(bigram[2]) / float(total_unigrams)
    P_y = float(bigram[3]) / float(total_unigrams)
    
    # MI Score
    mi_score = math.log(P_xy / (P_x * P_y), 2)
    
    return mi_score

In [78]:
def calculate_dice_coefficient(bigram):
    # Size of intersection (i.e., bigram frequency)
    intersection_size = float(bigram[1])
    
    # Sizes of the two sets (i.e., unigram frequencies)
    set1_size = float(bigram[2])
    set2_size = float(bigram[3])
    
    # Dice Coefficient
    dice_coefficient = (2.0 * intersection_size) / (set1_size + set2_size)
    
    return dice_coefficient

In [79]:
def calculate_c_value(bigram):
    # Probability of bigram
    P_xy = float(bigram[1]) / float(total_bigrams)
    
    # Probabilities of unigrams
    P_x = float(bigram[2]) / float(total_unigrams)
    P_y = float(bigram[3]) / float(total_unigrams)
    
    # C Value
    c_value = P_xy * math.log(P_xy / (P_x * P_y), 2)
    
    return c_value

In [80]:
bigrams_tscore = [calculate_t_score(bigram) for bigram in bigrams]
max_tscore = max(bigrams_tscore)
bigrams_tscore = [float(format(tscore/max_tscore, '.4f')) for tscore in bigrams_tscore]

In [81]:
bigrams_mi_score = [calculate_mi_score(bigram) for bigram in bigrams]
max_mi_score = max(bigrams_mi_score)
bigrams_mi_score = [float(format(mi_score/max_mi_score, '.4f')) for mi_score in bigrams_mi_score]

In [82]:
bigrams_dice_coefficient = [calculate_dice_coefficient(bigram) for bigram in bigrams]
max_dice_coefficient = max(bigrams_dice_coefficient)
bigrams_dice_coefficient = [float(format(dice_coefficient/max_dice_coefficient, '.4f')) for dice_coefficient in bigrams_dice_coefficient]

In [88]:
bigrams_cvalue = [calculate_c_value(bigram) for bigram in bigrams]
max_cvalue = max(bigrams_cvalue)
bigrams_cvalue = [float(format(c_value/max_cvalue, '.4f')) for c_value in bigrams_cvalue]

In [89]:
bigrams_with_stats = [(bigrams[i][0], bigrams[i][1], bigrams[i][2], bigrams[i][3], bigrams_cvalue[i], bigrams_dice_coefficient[i], bigrams_mi_score[i], bigrams_tscore[i]) for i in range(len(bigrams_tscore))]

In [90]:
bigrams_with_stats

[(4, 19, 123, 2885, 0.0014, 0.0126, 0.487, 0.0322),
 (5, 11, 2885, 347203, -0.0, 0.0001, -0.0287, 0.0244),
 (9, 1052, 118986, 1306, 0.0579, 0.0175, 0.3614, 0.2395),
 (10, 70, 1306, 1213001, 0.0006, 0.0001, 0.0558, 0.0617),
 (11, 572, 1213001, 19998, 0.0015, 0.0009, 0.0177, 0.1766),
 (16, 5946, 1213001, 5954, 0.2113, 0.0098, 0.2334, 0.5694),
 (18, 10, 1019, 1213001, -0.0001, 0.0, -0.0473, 0.0232),
 (19, 272, 1213001, 334, 0.0092, 0.0004, 0.221, 0.1218),
 (24, 14, 14, 1376, 0.0014, 0.0201, 0.6453, 0.0276),
 (26, 4, 68, 507326, 0.0001, 0.0, 0.1144, 0.0146),
 (27, 730, 507326, 1207, 0.0284, 0.0029, 0.2559, 0.1995),
 (28, 4, 1207, 132, 0.0003, 0.006, 0.441, 0.0148),
 (29, 46, 132, 1581, 0.004, 0.0537, 0.5729, 0.0501),
 (30, 30, 1581, 1213001, -0.0, 0.0, -0.0072, 0.0403),
 (31, 467, 1213001, 725, 0.0147, 0.0008, 0.2068, 0.1596),
 (33, 222, 266, 8138, 0.0178, 0.0528, 0.5264, 0.11),
 (35, 202, 9588, 7862, 0.0094, 0.0232, 0.3051, 0.105),
 (36, 6567, 205847, 10307, 0.3138, 0.0608, 0.3139, 0.5985

In [91]:
bigrams_with_stats[:2]

[(4, 19, 123, 2885, 0.0014, 0.0126, 0.487, 0.0322),
 (5, 11, 2885, 347203, -0.0, 0.0001, -0.0287, 0.0244)]

In [92]:
cursor.close()

True