# Computing Similarities: An example

This notebook loads per-document term counts, builds an LSI model with gensim, and computes cosine similarities between the documents. Only a few files are loaded for this example.

In [98]:
from os import listdir
from gensim import corpora, models, similarities
import re 
import pickle

# Base folder holding the term/id map, the extracted term-count files and
# the pickled artefacts. It is concatenated directly with file names, so it
# must end with a path separator.
basepath = '/home/daniel/data/projects/datadive/data/texts/'
# Sub-folder of `basepath` containing one term-count file per document.
subfolderfiles = 'extracted/'
# Text file with the term/id map read by make_dict().
dictfn = 'termcounts-min-2-term-ids-map.txt'
# When True the corpus, LSI model and index are rebuilt and re-pickled;
# when False they are loaded from the pickles below.
recompute = True

# File names (relative to `basepath`) of the pickled artefacts.
matrixfn = 'matrix.pkl'
# NOTE: this used to be a second assignment to `dictfn`, which silently
# replaced the term/id map file name above with 'dict.pkl'.
dictpklfn = 'dict.pkl'
corpusfn = 'corpus.pkl'
lsifn = 'lsi.pkl'
indexfn = 'index.pkl'
shards_prefix  = 'shards'


## Create a dictionary:

In [85]:
def make_dict(dict_file=dictfn, data_folder=basepath):
    """Build a ``{term_id: term}`` mapping from a comma-separated term file.

    Each usable row has the form ``term,term_id``. A row is skipped when it
    does not split into exactly two comma-separated fields (presumably terms
    that themselves contain a comma, such as formatted numbers), when the
    term is empty or starts with a digit, or when the term is two characters
    or shorter.

    Args:
        dict_file: Name of the term/id map file inside ``data_folder``.
        data_folder: Directory prefix. It is concatenated directly with
            ``dict_file``, so it must end with a path separator.

    Returns:
        dict mapping integer term ids to term strings.

    Raises:
        ValueError: if the id field of a kept row is not an integer.
    """
    wdic = {}
    with open(data_folder + dict_file, 'r') as f:
        for row in f:
            fields = row.split(',')
            if len(fields) != 2:
                continue
            term, term_id = fields
            # Raw string: '\D' in a plain literal is an invalid escape sequence.
            if re.match(r'\D', term) and len(term) > 2:
                wdic[int(term_id.rstrip())] = term
    return wdic

def transform_text(filename, word_dict=None):
    """Read one term-count file into a list of ``(term_id, count)`` tuples.

    Each usable row has the form ``term_id,count``. Rows that do not split
    into exactly two comma-separated fields are skipped. Only pairs whose
    term id is a key of ``word_dict`` are returned, in file order.

    Args:
        filename: Path of the term-count file to read.
        word_dict: Container of accepted term ids (e.g. the mapping returned
            by ``make_dict``). When omitted, the notebook-level ``wdic`` is
            looked up at call time, so the function can be defined before
            ``wdic`` exists and always sees its current value.

    Returns:
        list of ``(term_id, count)`` integer tuples.

    Raises:
        ValueError: if either field of a two-field row is not an integer.
    """
    if word_dict is None:
        # Resolved on every call instead of once at definition time.
        word_dict = wdic

    pairs = []
    with open(filename, 'r') as f:
        for row in f:
            fields = row.split(',')
            if len(fields) != 2:
                continue
            # Convert both fields before filtering so malformed rows still raise.
            term_id = int(fields[0])
            count = int(fields[1].rstrip())
            if term_id in word_dict:
                pairs.append((term_id, count))
    return pairs

## Create the corpus, LSI model and similarity index

In [87]:
if recompute:
    # Rebuild everything from the raw term-count files.
    wdic = make_dict()
    corpus_dir = basepath + subfolderfiles
    corpus_texts = listdir(corpus_dir)
    corpus = []
    errcount = 0
    for ctext in corpus_texts:
        try:
            # Pass the fresh dictionary explicitly rather than relying on
            # transform_text's default.
            corpus.append(transform_text(corpus_dir + ctext, word_dict=wdic))
        except UnicodeDecodeError:
            errcount += 1
    if errcount:
        # Skipped files leave no entry in `corpus`, so positions in `corpus`
        # no longer line up one-to-one with `corpus_texts`.
        print('Skipped %d file(s) that could not be decoded' % errcount)
    with open(basepath+matrixfn, 'wb') as f:
        pickle.dump(corpus, f)

    lsi = models.LsiModel(corpus, id2word=wdic, num_topics=256)
    # Index the LSI projection of every document; the query further down
    # is projected through the same model.
    index = similarities.MatrixSimilarity(lsi[corpus], num_features=lsi.num_topics)

    with open(basepath+indexfn, 'wb') as f:
        pickle.dump(index, f)
    with open(basepath+corpusfn, 'wb') as f:
        pickle.dump(corpus, f)
    with open(basepath+lsifn, 'wb') as f:
        pickle.dump(lsi, f)
else:
    # Reload the artefacts written by a previous run with recompute=True.
    # NOTE(security): pickle.load executes arbitrary code from the file;
    # only load pickles this notebook produced itself.
    with open(basepath+corpusfn, 'rb') as f:
        corpus = pickle.load(f)
    with open(basepath+lsifn, 'rb') as f:
        lsi = pickle.load(f)
    with open(basepath+indexfn, 'rb') as f:
        index = pickle.load(f)
    

    
# Project the first document into LSI space and query the index with it;
# the printed list pairs each indexed document's position with its score.
query_lsi = lsi[corpus[0]]
sims = index[query_lsi]
print(list(enumerate(sims)))

['termIdCounts-101024_cappelletti2016_markiert.txt',
 'termIdCounts-28414_tsivian2012.txt',
 'termIdCounts-2047_bandera2007.txt']