# Computing Similarities: An example

The purpose of this notebook is to load word counts and compute cosine similarities. We will only load a few files.

In [50]:
import os
import pickle
import re
from os import listdir

from gensim import corpora, models, similarities

In [51]:
# Root folder holding the term-count data. Set the DATADIVE_TEXTS_DIR
# environment variable to point the notebook at another location; the default
# is the original hard-coded path. os.path.join(..., '') guarantees the trailing
# separator that the plain string concatenations (basepath + name) rely on.
basepath = os.path.join(
    os.environ.get('DATADIVE_TEXTS_DIR',
                   '/home/daniel/data/projects/datadive/data/texts/'),
    '',
)
subfolderfiles = 'extracted/'                 # sub-folder listed to obtain the per-text term-count files
shards_prefix = 'shards'                      # filename prefix handed to the similarity index
dictfn = 'termcounts-min-2-term-ids-map.txt'  # term / term-ID map read by make_dict
matrixfn = 'matrix.pkl'                       # pickled corpus
listdir(basepath)

['shards.1',
 'termcounts-min-2-term-ids-map.txt',
 'extracted',
 'termcounts-min-1.tar',
 'termcounts-min-6.tar',
 'matrix.pkl',
 'shards.0.index.indptr.npy',
 'termcounts-min-5.tar',
 'shards.0.index.npy',
 'shards.0',
 'shards.0.index.data.npy',
 'termcounts-min-3.tar',
 'shards.0.index.indices.npy']

## Create a dictionary:

In [52]:
def make_dict(dict_file=dictfn, data_folder=basepath):
    """Create a dictionary of unique terms keyed by their integer IDs.

    Reads ``data_folder + dict_file`` one line at a time. Only lines that split
    into exactly two comma-separated fields (``term,id``) are considered. A term
    is kept when its first character is not a digit and the term is longer than
    two characters.

    Args:
        dict_file: Name of the term/ID map file.
        data_folder: Folder prefix. It is concatenated directly with
            ``dict_file``, so it must end with a path separator.

    Returns:
        dict mapping ``int`` term ID to the ``str`` term.

    Raises:
        ValueError: If the ID field of a kept row is not an integer.
    """
    # Raw string: '\D' in a plain string literal is an invalid escape sequence.
    starts_with_non_digit = re.compile(r'\D')

    wdic = {}
    with open(data_folder + dict_file, 'r') as f:
        # Iterate lazily instead of materialising the whole file with readlines().
        for row in f:
            fields = row.split(',')
            if len(fields) != 2:
                # Skips rows with a different field count, e.g. rows with three
                # values (numbers, according to the original note).
                continue
            term, term_id = fields
            if starts_with_non_digit.match(term) and len(term) > 2:
                wdic[int(term_id.rstrip())] = term
    return wdic

In [53]:
# Build the term-ID -> term mapping from the default dictionary file and folder.
wdic = make_dict()

In [54]:
# Number of terms that survived the filtering in make_dict.
len(wdic)

1324237

## Create the corpus

In [55]:
# File names of the per-text term-count files.
# NOTE(review): os.listdir returns entries in arbitrary order, so document
# positions in the corpus are not guaranteed to be stable across machines.
corpus_texts = listdir(basepath + subfolderfiles)
# Peek at the first three names only.
corpus_texts[:3]

['termIdCounts-101024_cappelletti2016_markiert.txt',
 'termIdCounts-28414_tsivian2012.txt',
 'termIdCounts-2047_bandera2007.txt']

In [56]:
def transform_text(filename, word_dict=wdic):
    """Parse one term-count file into a list of integer pairs.

    Every line that splits into exactly two comma-separated fields is turned
    into an ``(int, int)`` tuple. Only tuples whose first element is a key of
    ``word_dict`` are returned.

    Args:
        filename: Path of the file to read.
        word_dict: Container of accepted first-field values (term IDs);
            defaults to the module-level ``wdic``.

    Returns:
        list of ``(term_id, value)`` tuples in file order. The second field is
        presumably the term count -- confirm against the data files.

    Raises:
        ValueError: If a two-field line holds a non-integer value.
    """
    with open(filename, 'r') as handle:
        lines = handle.readlines()

    pairs = []
    for line in lines:
        fields = line.split(',')
        if len(fields) != 2:
            # Lines with any other number of fields are ignored.
            continue
        second = int(fields[1].rstrip())
        first = int(fields[0])
        pairs.append((first, second))

    return [pair for pair in pairs if pair[0] in word_dict]

In [57]:
# Turn every listed file into one corpus document. Files that cannot be
# decoded are skipped and only counted in errcount.
corpus = []
errcount = 0
for ctext in corpus_texts:
    text_path = basepath + subfolderfiles + ctext
    try:
        document = transform_text(text_path)
    except UnicodeDecodeError:
        errcount += 1
    else:
        corpus.append(document)


In [58]:
# Load the pickled corpus from disk; this replaces the `corpus` built in the
# previous cell. To refresh the file from the in-memory corpus, run instead:
#with open(basepath+matrixfn, 'wb') as f:
#    pickle.dump(corpus, f)
# NOTE(review): pickle.load can execute arbitrary code -- only load files you
# created yourself.
with open(basepath + matrixfn, 'rb') as f:  # context manager closes the handle
    corpus = pickle.load(f)
    

In [59]:
# Fit a TF-IDF model on the corpus.
tfidf = models.TfidfModel(corpus)

In [60]:
# Apply the fitted TF-IDF weighting to the whole corpus.
corpus_tfidf = tfidf[corpus]

In [61]:
# Build a similarity index over the TF-IDF corpus; its files are written using
# the prefix basepath + shards_prefix.
# num_features has to exceed the highest term ID that can occur. make_dict drops
# rows, so the surviving IDs need not be contiguous and len(wdic) can be smaller
# than the highest ID -- use (max ID + 1) instead.
num_features = max(wdic) + 1
index = similarities.Similarity(basepath + shards_prefix, corpus_tfidf, num_features)
# Alternative that does not take a file prefix:
#index = similarities.MatrixSimilarity(corpus_tfidf)
# (An LSI-based index is built in a later cell.)

In [None]:
# Restrict query results to the 10 most similar documents.
index.num_best = 10
# Query the index with the TF-IDF vector of the first corpus document.
sims = index[corpus_tfidf[0]]
# NOTE(review): with num_best set, gensim is expected to return
# (document index, similarity) pairs, so enumerate() only adds a rank -- confirm.
print(list(enumerate(sims)))

## Compute similarities in LSI space:

In [65]:
# Fit a 2-topic LSI model on `corpus` (not on the TF-IDF-weighted corpus_tfidf),
# using wdic as the ID -> term mapping.
lsi = models.LsiModel(corpus, id2word=wdic, num_topics=2)
# Index the corpus projected into LSI space.
# NOTE(review): this rebinds `index`, replacing the TF-IDF index built earlier.
index = similarities.MatrixSimilarity(lsi[corpus])

In [24]:
# Project the first corpus document into LSI space to use it as the query.
# NOTE(review): the recorded NameError output stems from an earlier run
# (In [24]) made before `lsi` was defined; re-run after the LSI cell.
vec_lsi = lsi[corpus[0]]

NameError: name 'lsi' is not defined

In [68]:
# Similarity of the query document to the indexed documents.
# Assumes `index` is the LSI MatrixSimilarity from the cell above -- the name
# is also used for the TF-IDF index earlier in the notebook.
sims = index[vec_lsi]
# Show (position, similarity) pairs.
print(list(enumerate(sims)))

[(0, 1.0), (1, 0.1939158)]
