Computing Similarities based on word frequency counts

In [101]:
from os import listdir
from gensim import corpora, models, similarities
import re 
import pickle

# Root folder; every file name below is appended to it by plain string
# concatenation, so it has to end with a path separator.
basepath = '/home/daniel/data/projects/datadive/data/texts/'
# Subfolder of basepath whose files are read one by one into the corpus.
subfolderfiles = 'extracted/'
# Comma-separated text file with the term/id map parsed by make_dict().
dictfn = 'termcounts-min-2-term-ids-map.txt'
# False: load the cached pickles; True: rebuild them from the raw files.
recompute = False

# File names of the pickled artifacts, all relative to basepath.
matrixfn = 'matrix.pkl'
# This name used to be assigned to dictfn as well, which overwrote the text
# map above and made make_dict() open a pickle as if it were a text file.
dictpklfn = 'dict.pkl'
corpusfn = 'corpus.pkl'
lsifn = 'lsi.pkl'
indexfn = 'index.pkl'
# Not referenced by the code visible in this notebook section.
shards_prefix  = 'shards'


In [102]:
def make_dict(dict_file=dictfn, data_folder=basepath):
    """Load the id -> term mapping from a comma-separated text file.

    Only rows with exactly two comma-separated fields are used; the first
    field is the term and the second its integer id. Rows with any other
    field count are skipped (the original note says these are numbers).
    A term is kept only when its first character is not a digit and the
    term is longer than two characters.

    Args:
        dict_file: File name of the map, appended to ``data_folder``.
        data_folder: Folder prefix; joined by plain concatenation, so it
            has to end with a path separator.

    Returns:
        dict mapping each integer id to its term string.

    Raises:
        ValueError: If the id field of a kept row is not an integer.
    """
    # Raw string: '\D' in a normal literal is an invalid escape sequence.
    starts_with_non_digit = re.compile(r'\D').match
    wdic = {}
    with open(data_folder + dict_file, 'r') as f:
        for row in f:  # stream the file instead of loading every line
            fields = row.split(',')
            if len(fields) != 2:
                continue
            term, raw_id = fields
            if len(term) > 2 and starts_with_non_digit(term):
                # The id is the last field, so it carries the line break.
                wdic[int(raw_id.rstrip())] = term
    return wdic

def transform_text(filename, word_dict=None):
    """Read one two-column integer file into a list of ``(id, value)`` pairs.

    Only rows with exactly two comma-separated fields are used; both fields
    are converted to ``int``. A pair is kept only when its first element is
    a key of ``word_dict``. The second element is presumably the term count
    (the result is fed to gensim as a bag-of-words document) -- confirm
    against the files' producer.

    Args:
        filename: Path of the file to read.
        word_dict: Container of accepted ids (anything supporting ``in``).
            When omitted, the module-level ``wdic`` is looked up at call
            time. The previous ``word_dict=wdic`` default was evaluated
            when the function was defined, which raised NameError on a
            fresh run and froze a stale dictionary in a live kernel.

    Returns:
        list of ``(int, int)`` tuples in file order.

    Raises:
        ValueError: If a field of a two-field row is not an integer.
        NameError: If ``word_dict`` is omitted and no ``wdic`` exists yet.
    """
    if word_dict is None:
        word_dict = wdic

    pairs = []
    with open(filename, 'r') as f:
        for row in f:
            fields = row.split(',')
            if len(fields) != 2:
                continue
            # Convert both fields before filtering so malformed rows still
            # raise, exactly as they did before.
            term_id, value = int(fields[0]), int(fields[1].rstrip())
            if term_id in word_dict:
                pairs.append((term_id, value))
    return pairs

In [103]:
# Either rebuild the corpus, LSI model and similarity index from the raw
# files and cache them as pickles, or load the cached pickles.
# NOTE: pickle.load executes code embedded in the file; only load pickles
# that this notebook wrote itself.
if recompute:
    wdic = make_dict()
    corpus_texts = listdir(basepath + subfolderfiles)
    corpus = []
    errcount = 0
    for ctext in corpus_texts:
        try:
            # Pass the fresh dictionary explicitly rather than relying on
            # transform_text's default.
            corpus.append(
                transform_text(basepath + subfolderfiles + ctext, wdic))
        except UnicodeDecodeError:
            # Best effort: files that cannot be decoded are left out.
            errcount += 1
    if errcount:
        print('skipped %d undecodable file(s)' % errcount)
    with open(basepath + matrixfn, 'wb') as f:
        pickle.dump(corpus, f)

    lsi = models.LsiModel(corpus, id2word=wdic, num_topics=256)
    # The original line was cut off at "similarities.Ma"; MatrixSimilarity
    # over the LSI-transformed corpus is the index that is queried below.
    index = similarities.MatrixSimilarity(lsi[corpus])

    with open(basepath + indexfn, 'wb') as f:
        pickle.dump(index, f)
    with open(basepath + corpusfn, 'wb') as f:
        pickle.dump(corpus, f)
    with open(basepath + lsifn, 'wb') as f:
        pickle.dump(lsi, f)
else:
    # Context managers close the files; the bare open() calls leaked them.
    with open(basepath + corpusfn, 'rb') as f:
        corpus = pickle.load(f)
    with open(basepath + lsifn, 'rb') as f:
        lsi = pickle.load(f)
    with open(basepath + indexfn, 'rb') as f:
        index = pickle.load(f)

# Similarity of the first corpus document against every indexed document.
sims = index[lsi[corpus[0]]]
print(list(enumerate(sims)))

[(0, 0.99999994), (1, 0.35334304), (2, 0.53770334), (3, 0.37575531), (4, 0.22521161), (5, 0.36037683), (6, 0.34621662), (7, 0.29506594), (8, 0.47267634), (9, 0.48081419), (10, 0.41026193), (11, 0.34851038), (12, 0.42212218), (13, 0.31224778), (14, 0.40180686), (15, 0.37388515), (16, 0.32446146), (17, 0.0), (18, 0.35167268), (19, 0.25154471), (20, 0.43492213), (21, 0.29808611), (22, 0.23895866), (23, 0.32281342), (24, 0.33563739), (25, 0.33162743), (26, 0.41490975), (27, 0.27583733), (28, 0.46251774), (29, 0.40729368), (30, 0.26016515), (31, 0.49001223), (32, 0.42671463), (33, 0.36769235), (34, 0.27411261), (35, 0.44618669), (36, 0.27722931), (37, 0.34435818), (38, 0.45350015), (39, 0.27204156), (40, 0.34982136), (41, 0.34349599), (42, 0.29004261), (43, 0.38898811), (44, 0.31665388), (45, 0.36915046), (46, 0.50198555), (47, 0.33967334), (48, 0.16878989), (49, 0.37308386), (50, 0.45178688), (51, 0.37263894), (52, 0.22203834), (53, 0.39722824), (54, 0.67038673), (55, 0.38074851), (56, 0.3