In [13]:
import sqlite3
import math

In [14]:
def freq_rel(freq, l):
    """Return ``freq`` as a percentage of ``l`` (``freq / l * 100``).

    Raises:
        ZeroDivisionError: If ``l`` is zero.
    """
    share = freq / l
    return share * 100

In [15]:
def count_ll(len1, len2, freq1, freq2):
    """Return the log-likelihood statistic G2 for one item counted in two samples.

    The combined count ``freq1 + freq2`` is split proportionally to the sample
    sizes to obtain the expected counts, and the result is
    ``2 * sum(observed * ln(observed / expected))`` over the two samples.

    Args:
        len1: Size of the first sample.
        len2: Size of the second sample.
        freq1: Observed count in the first sample.
        freq2: Observed count in the second sample.

    Returns:
        The G2 value. It is ``0.0`` when both observed counts are zero.

    Raises:
        ZeroDivisionError: If ``len1 + len2`` is zero, or if a sample of size
            zero has a non-zero observed count.
    """
    total_len = len1 + len2
    total_freq = freq1 + freq2
    E1 = len1 * total_freq / total_len
    E2 = len2 * total_freq / total_len

    # A zero observed count contributes nothing (0 * ln 0 is taken as 0).
    # Skipping such terms also avoids evaluating 0 / 0 when both counts are
    # zero, which previously raised ZeroDivisionError.
    G2 = 0.0
    if freq1 != 0:
        G2 += freq1 * math.log(freq1 / E1)
    if freq2 != 0:
        G2 += freq2 * math.log(freq2 / E2)
    return 2 * G2

In [16]:
def proc_ll(conn, name, min_total=85):
    """Compute relative frequencies and signed G2 for table ``name`` and store them.

    Reads ``lemma``, ``freq_first`` and ``freq_other`` from table ``name``,
    computes each frequency as a percentage of its column total, and writes
    the rows that pass the filter into table ``name + '_ll'`` (created if
    missing) together with a signed G2 score. The sign is positive when the
    first relative frequency is greater than or equal to the other one, and
    negative otherwise.

    NOTE(review): rows are appended on every call, so running this twice
    against the same database duplicates the contents of ``name + '_ll'``.

    Args:
        conn: Open ``sqlite3`` connection.
        name: Source table name; it is concatenated into the SQL text, so it
            must be a trusted identifier.
        min_total: Minimum ``freq_first + freq_other`` a row needs in order
            to be stored. Rows with ``freq_first == 0`` are never stored.

    Returns:
        A tuple ``(len_first, len_other, lemmas, freq_first, freq_other,
        freq_first_rel, freq_other_rel)``. ``lemmas``, ``freq_first`` and
        ``freq_other`` are lists of one-element tuples (indexed as
        ``seq[i][0]``); the two ``*_rel`` items are flat lists of
        percentages. All lists cover every row of the source table, not only
        the stored ones.
    """
    len_first = conn.execute('select sum(freq_first) from ' + name).fetchone()[0]
    len_other = conn.execute('select sum(freq_other) from ' + name).fetchone()[0]

    # One query keeps every lemma paired with its own counts. Separate SELECTs
    # without ORDER BY are not guaranteed to return rows in the same order.
    rows = conn.execute('select lemma, freq_first, freq_other from ' + name).fetchall()

    # Callers index these as seq[i][0], so keep the one-column row shape.
    lemmas = [(row[0],) for row in rows]
    freq_first = [(row[1],) for row in rows]
    freq_other = [(row[2],) for row in rows]
    freq_first_rel = [freq_rel(row[1], len_first) for row in rows]
    freq_other_rel = [freq_rel(row[2], len_other) for row in rows]

    # Create the table that is actually written to, whatever `name` is.
    conn.execute('create table if not exists ' + name + '_ll(lemma text, freq_first int,' +
                 ' freq_first_rel float, freq_other int, freq_other_rel float, G2 float)')

    records = []
    for (lemma, first, other), first_rel, other_rel in zip(rows, freq_first_rel, freq_other_rel):
        if first == 0 or first + other < min_total:
            continue
        # G2 is only needed for stored rows; computing it lazily also keeps
        # rows with no occurrences at all away from count_ll.
        sign = 1 if first_rel >= other_rel else -1
        score = sign * count_ll(len_first, len_other, first, other)
        records.append((lemma, first, first_rel, other, other_rel, score))

    conn.executemany('insert into ' + name + '_ll(lemma, freq_first, freq_first_rel, ' +
                     'freq_other, freq_other_rel, G2) values (?, ?, ?, ?, ?, ?)',
                     records)

    conn.commit()
    return len_first, len_other, lemmas, freq_first, freq_other, freq_first_rel, freq_other_rel

In [17]:
# Build the *_ll tables for adjectives and verbs, and keep the intermediate
# frequency lists as notebook globals for the PMI and weirdness cells below.
conn = sqlite3.connect('freq.db')
try:
    (
        len_first_a,
        len_other_a,
        lemmas_a,
        freq_first_a,
        freq_other_a,
        freq_first_rel_a,
        freq_other_rel_a,
    ) = proc_ll(conn, 'adjectives')
    (
        len_first_v,
        len_other_v,
        lemmas_v,
        freq_first_v,
        freq_other_v,
        freq_first_rel_v,
        freq_other_rel_v,
    ) = proc_ll(conn, 'verbs')
finally:
    # Always release the database handle, even if processing fails.
    conn.close()

In [18]:
def count_pmi(len_first, len_other, freq_first, freq_other, freq_first_rel):
    """Return ``ln(p_AB / (p_A * p_B))`` for one item.

    Args:
        len_first: Size of the first sample.
        len_other: Size of the other sample.
        freq_first: Count of the item in the first sample.
        freq_other: Count of the item in the other sample.
        freq_first_rel: Percentage value; it is divided by 100 and used as
            ``p_AB``.

    Returns:
        The natural logarithm of ``p_AB / (p_A * p_B)``, where ``p_A`` is the
        first sample's share of the combined size and ``p_B`` is the item's
        share of the combined size.

    NOTE(review): ``p_AB`` is taken from ``freq_first_rel`` as given. If the
    caller passes the item's percentage within the first sample only, this is
    not a joint probability over both samples -- confirm this is intended.
    """
    combined_len = len_first + len_other
    share_first = len_first / combined_len
    share_item = (freq_first + freq_other) / combined_len
    joint = freq_first_rel / 100
    return math.log(joint / (share_first * share_item))

In [19]:
def proc_pmi(conn, name, lemmas, len_first, len_other, freq_first, freq_other, freq_first_rel,
             freq_other_rel, min_total=85):
    """Store a PMI score for each sufficiently frequent lemma in ``name + '_pmi'``.

    The table is created if it does not exist. A row is written for index
    ``i`` only when ``freq_first[i][0]`` is non-zero and
    ``freq_first[i][0] + freq_other[i][0]`` is at least ``min_total``; its
    PMI value comes from ``count_pmi``.

    Args:
        conn: Open ``sqlite3`` connection.
        name: Table-name prefix; it is concatenated into the SQL text, so it
            must be a trusted identifier.
        lemmas: Sequence of rows whose first element is the lemma.
        len_first: Size of the first sample, forwarded to ``count_pmi``.
        len_other: Size of the other sample, forwarded to ``count_pmi``.
        freq_first: Sequence of rows whose first element is the first count.
        freq_other: Sequence of rows whose first element is the other count.
        freq_first_rel: Flat sequence of percentages, parallel to ``freq_first``.
        freq_other_rel: Flat sequence of percentages, parallel to ``freq_other``.
        min_total: Minimum combined count required for a row to be stored.
    """
    conn.execute('create table if not exists ' + name +'_pmi(lemma text, freq_first int,' +
                 ' freq_first_rel float, freq_other int, freq_other_rel float, PMI float)')

    records = []
    for i in range(len(freq_first)):
        first = freq_first[i][0]
        if first == 0:
            continue
        other = freq_other[i][0]
        if first + other < min_total:
            continue
        # count_pmi is only evaluated for rows that are actually stored.
        score = count_pmi(len_first, len_other, first, other, freq_first_rel[i])
        records.append((lemmas[i][0], first, freq_first_rel[i], other,
                        freq_other_rel[i], score))

    conn.executemany('insert into ' + name + '_pmi(lemma, freq_first, freq_first_rel, ' +
                     'freq_other, freq_other_rel, PMI) values (?, ?, ?, ?, ?, ?)',
                     records)
    conn.commit()

In [20]:
# Write the *_pmi tables from the lists produced by the proc_ll cell.
conn = sqlite3.connect('freq.db')
try:
    proc_pmi(
        conn,
        'adjectives',
        lemmas_a,
        len_first_a,
        len_other_a,
        freq_first_a,
        freq_other_a,
        freq_first_rel_a,
        freq_other_rel_a,
    )
    proc_pmi(
        conn,
        'verbs',
        lemmas_v,
        len_first_v,
        len_other_v,
        freq_first_v,
        freq_other_v,
        freq_first_rel_v,
        freq_other_rel_v,
    )
finally:
    # Always release the database handle, even if processing fails.
    conn.close()

In [21]:
def weirdness(conn, name, lemmas, freq_first_rel, freq_other_rel, len_other):
    """Store a ratio score ``w`` per lemma in table ``name + '_weirdness'``.

    For each index, ``w`` is ``freq_first_rel / freq_other_rel``; when the
    other relative frequency is zero, ``freq_first_rel * len_other`` is
    stored instead. The table is created if missing and one row is appended
    per entry of ``freq_first_rel``.

    Args:
        conn: Open ``sqlite3`` connection.
        name: Table-name prefix, concatenated into the SQL text.
        lemmas: Sequence of rows whose first element is the lemma.
        freq_first_rel: Flat sequence of relative frequencies (numerators).
        freq_other_rel: Flat sequence of relative frequencies (denominators).
        len_other: Multiplier used when the denominator is zero.
    """
    conn.execute('create table if not exists ' + name +'_weirdness(lemma text, w float)')
    insert_sql = 'insert into ' + name + '_weirdness(lemma, w) values (?, ?)'

    for idx, first_rel in enumerate(freq_first_rel):
        other_rel = freq_other_rel[idx]
        if other_rel == 0:
            # No occurrences on the other side: fall back to scaling by its size.
            score = first_rel * len_other
        else:
            score = first_rel / other_rel
        conn.execute(insert_sql, (lemmas[idx][0], score))

    conn.commit()

In [22]:
# Write the *_weirdness tables from the relative frequencies computed earlier.
conn = sqlite3.connect('freq.db')
try:
    weirdness(
        conn,
        'adjectives',
        lemmas_a,
        freq_first_rel_a,
        freq_other_rel_a,
        len_other_a,
    )
    weirdness(
        conn,
        'verbs',
        lemmas_v,
        freq_first_rel_v,
        freq_other_rel_v,
        len_other_v,
    )
finally:
    # Always release the database handle, even if processing fails.
    conn.close()