In [61]:
import numpy as np
import gensim as gn
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation, preprocess_string
from scipy.sparse import lil_matrix, save_npz
import json
import os, re, math
import multiprocessing as mp

In [62]:
def load_voc():
    with open("voc.json","r") as js:
        data = json.load(js);
        res = dict((v,k) for k,v in data.items())
    return res

def load_corp(file):
    ind = 0
    data = []
    with open("../dicts/"+file) as f:
        for line in f:
            words = line.split()

            # removes usernames from the list
            for word in words:
                if re.match(r'^@',word):
                    words.remove(word)

            line = ' '.join(words)

            # gensim preproceesing...
            # makes lowercase, strips punctuation, and removes stopwords
            CUSTOM_FILTERS = [lambda x: x.lower(), remove_stopwords]
            words = preprocess_string(line,CUSTOM_FILTERS)

            # removes usernames from the list
            for word in words:
                if re.match(r'^@',word):
                    words.remove(word)

            # removes RT from the begining of retweets
            if "rt" in words:
                words.remove("rt")

            # removes urls from the list
            for word in words:
                if re.match(r'^http',word):
                    words.remove(word)

            line = ' '.join(words)
                    
            # adds tweet to list
            data.insert(ind,line)
            ind+=1
    return data

In [63]:
def wd_count(word, corp):
    count = 0
    for i in range(len(corp)):
        if word in corp[i]:
            count+=1
    return count

# gets the word count of a word, wd1, and another, wd2, and 
# counts the number of times they occur within a window size, L
def wd_count_wnd(wd1, wd2, corp, l=5):
    count = 0
    for i in range(len(corp)):
        if wd1 in corp[i]: 
            tweet = corp[i].split()
            wd1_index = 0
            # gets the index of wrd1 so that it can be used to search
            # the windows
            for j in range(len(tweet)):
                if tweet[j] == wd1:
                    wd1_index = j
            # case that if the tweet is too small for the entire
            # window size to be search on either side
            if wd1_index < l and (len(tweet) - wd1_index) < l:
                for j in range(0,wd1_index):
                    if tweet[j] == wd2:
                        count+=1
                    else:
                        continue
                for j in range(wd1_index,len(tweet)):
                    if tweet[j] == wd2:
                        count+=1
                    else:
                        continue
            # case where left side is too small for the entire
            # window size to be searched
            elif wd1_index < l:
                for j in range(0,wd1_index):
                    if tweet[j] == wd2:
                        count+=1
                    else:
                        continue
                for j in range(wd1_index,wd1_index+5):
                    if tweet[j] == wd2:
                        count+=1
                    else:
                        continue
            # case where right side is too small for the entire 
            # window size to be searched
            elif (len(tweet) - wd1_index) < l:
                for j in range(wd1_index-5,wd1_index):
                    if tweet[j] == wd2:
                        count+=1
                    else:
                        continue
                for j in range(wd1_index,len(tweet)):
                    if tweet[j] == wd2:
                        count+=1
                    else:
                        continue
            # case where there is enough space for both window
            # sizes to be search in entirety 
            else:
                for j in range(wd1_index-5,wd1_index):
                    if tweet[j] == wd2:
                        count+=1
                    else:
                        continue               
                for j in range(wd1_index,wd1_index+5):
                    if tweet[j] == wd2:
                        count+=1
                    else:
                        continue            
    return count

In [64]:
def create_PPMI(voc,corp):
    len_voc = len(voc)
    PMI = lil_matrix((len_voc, len_voc)) # scipy sparse matrix

    for i in range(0,len_voc):
        for j in range(0,len_voc):
            wd1_cnt = wd_count(voc[i],corp) # word 1 count 
            wd2_cnt = wd_count(voc[j],corp) # word 2 cout
            wnd_cnt = wd_count_wnd(voc[i],voc[j],corp,5) # window count
            
            top = wnd_cnt * len_voc
            bot = wd1_cnt * wd2_cnt
            
            
            # One of the words may not occur in the corpus
            # if thats so then we can ignore the value because 
            # it will be 0
            if wd1_cnt > 0 and wd2_cnt > 0:
                res = math.log(top/bot) 
            
                print("added: PMI["+voc[i]+","+voc[j]+"] = ",res)
                PMI[i,j] = res
 
    save_npz("2018-01-22.npz", PMI) # saves matrix 
            
    return PMI

In [54]:
voc = load_voc()
print("vocabulary loaded")
len_voc = len(voc)

vocabulary loaded


In [55]:
corp = load_corp("../dicts/2018-01-22.txt")
print("corpus loaded")

corpus loaded


In [65]:
pmi = create_PPMI(voc,corp)

added: PMI[conservatives,conservatives] =  7.112964242690993
added: PMI[conservatives,shadow] =  0.29651243734945554
added: PMI[conservatives,banned] =  -1.2737650509535574
added: PMI[conservatives,again.] =  -1.178861005549984
added: PMI[conservatives,twitter] =  2.245684903414734
added: PMI[conservatives,removing] =  0.26239154165302636
added: PMI[conservatives,followers.] =  -1.253605595607614
added: PMI[conservatives,them.] =  0.4780835351996729
added: PMI[conservatives,i'm] =  0.399134604727471
added: PMI[conservatives,get…rt] =  -0.48428279254760204
added: PMI[conservatives,it’s] =  1.1791193984534323
added: PMI[conservatives,shark] =  -2.289883535955954
added: PMI[conservatives,week!] =  0.022402574301418638
added: PMI[conservatives,day] =  -1.5067043681920649
added: PMI[conservatives,10] =  -1.0979434204006548
added: PMI[conservatives,ending] =  -1.2430288725576089


ValueError: math domain error