# Computing Similarities: An example

This notebook loads per-document word counts and computes the cosine similarities between the documents. To keep the example quick, we only load a few files.

In [1]:
import os
from os import listdir

from gensim import corpora, models, similarities
import pandas as pd
import re 

In [2]:
# Root data folder, read from the DATADIVE environment variable
# (raises KeyError if the variable is not set).
# Joining with '' guarantees a trailing path separator, because other cells
# build paths by plain string concatenation (home + 'some-folder/').
home = os.path.join(os.environ['DATADIVE'], '')

In [3]:
# Show what is available in the data folder (archives and extracted folders).
listdir(home)

['.DS_Store',
 'extracted-texts.tar',
 'term-counts-min-1.tar',
 'term-id-counts-min-2',
 'term-id-counts-min-2.tar',
 'termcounts-min-1.tar',
 'termcounts-min-2-term-ids-map.txt',
 'termcounts-min-2.tar',
 'termcounts-min-3.tar',
 'termcounts-min-5.tar',
 'termcounts-min-6.tar']

## Create a dictionary:

In [4]:
def make_dict(dict_file='termcounts-min-2-term-ids-map.txt', data_folder=None):
    """Create dictionary of unique terms and their IDs.

    Reads a comma-separated map file whose rows look like ``term,id`` and
    returns a ``{id: term}`` dictionary.

    A row is kept only when all of the following hold:

    * it splits into exactly two comma-separated fields (rows with more or
      fewer fields are skipped -- presumably terms that themselves contain a
      comma, such as formatted numbers);
    * the term starts with a non-digit character;
    * the term is longer than two characters.

    Parameters
    ----------
    dict_file : str
        Name of the map file inside ``data_folder``.
    data_folder : str, optional
        Folder containing ``dict_file``. Defaults to the notebook-level
        ``home`` folder, looked up at call time so that a later change of
        ``home`` is honoured.

    Returns
    -------
    dict
        Mapping of integer term id to term string.

    Raises
    ------
    ValueError
        If the id field of a kept row is not an integer.
    """
    if data_folder is None:
        data_folder = home

    # Raw string: '\D' in a normal string literal is an invalid escape sequence.
    starts_with_non_digit = re.compile(r'\D').match

    wdic = {}
    # os.path.join works whether or not data_folder ends with a separator.
    with open(os.path.join(data_folder, dict_file), 'r') as f:
        for row in f:  # stream the file instead of loading every line at once
            fields = row.split(',')
            if len(fields) != 2:
                continue
            term, term_id = fields
            if len(term) > 2 and starts_with_non_digit(term):
                # int() ignores the trailing newline left on the id field.
                wdic[int(term_id)] = term
    return wdic

In [5]:
# Build the {term id: term} lookup from the default map file in the data folder.
wdic = make_dict()

In [6]:
# Number of terms that passed the filters in make_dict.
len(wdic)

1324237

## Create the corpus

In [7]:
# Names of the per-document term-count files.
# listdir() returns entries in arbitrary order, so sort them to make any
# "first few files" selection reproducible, and skip hidden files such as
# .DS_Store, which are not documents.
corpus_texts = sorted(
    name
    for name in listdir(os.path.join(home, 'term-id-counts-min-2'))
    if not name.startswith('.')
)

In [8]:
def transform_text(filename, word_dict=None):
    """Load one term-count file as a list of ``(term_id, count)`` tuples.

    Each row of the file is expected to look like ``term_id,count``. Rows that
    do not split into exactly two comma-separated fields are skipped, and only
    term ids present in ``word_dict`` are kept. The order of rows in the file
    is preserved.

    Parameters
    ----------
    filename : str
        Path of the term-count file to read.
    word_dict : dict, optional
        Container of accepted term ids (only membership is tested). Defaults
        to the notebook-level ``wdic``, looked up at call time so that a
        rebuilt dictionary is picked up without redefining this function.

    Returns
    -------
    list of tuple
        ``(term_id, count)`` pairs, both integers.

    Raises
    ------
    ValueError
        If a two-field row contains a non-integer value.
    """
    if word_dict is None:
        word_dict = wdic

    doc = []
    with open(filename, 'r') as f:
        for row in f:  # stream the file instead of loading every line at once
            fields = row.split(',')
            if len(fields) != 2:
                continue
            # int() ignores the trailing newline left on the count field.
            count = int(fields[1])
            term_id = int(fields[0])
            if term_id in word_dict:
                doc.append((term_id, count))
    return doc

In [9]:
# Load only the first few documents to keep the example quick.
N_DOCS = 4
corpus_dir = os.path.join(home, 'term-id-counts-min-2')
corpus = [
    transform_text(os.path.join(corpus_dir, name))
    for name in corpus_texts[:N_DOCS]
]

In [10]:
# Fit a TF-IDF model on the loaded term-count corpus.
tfidf = models.TfidfModel(corpus)

In [11]:
# Apply the fitted TF-IDF model to the corpus.
# NOTE(review): corpus_tfidf is not used by the later cells shown in this
# notebook (the LSI model is built from `corpus`) -- confirm this is intended.
corpus_tfidf = tfidf[corpus]

In [12]:
# Optional inspection -- uncomment to print the TF-IDF result for each document:
# for doc in corpus_tfidf:
#     print(doc)

## Compute similarities:

In [13]:
# Fit a 2-topic LSI model on the corpus; wdic supplies the id -> term mapping.
# NOTE(review): the model is trained on the raw counts in `corpus`, not on
# `corpus_tfidf` -- confirm this is intended.
lsi = models.LsiModel(corpus, id2word=wdic, num_topics=2)
# Transform the documents with the LSI model and build a similarity index over them.
index = similarities.MatrixSimilarity(lsi[corpus])

In [14]:
# Transform the same documents with the LSI model; they are used as queries against the index.
vec_lsi = lsi[corpus]

In [15]:
# Query the index with every document at once: entry i of `sims` holds the
# similarities between document i and each indexed document.
sims = index[vec_lsi]
print([(doc_no, doc_sims) for doc_no, doc_sims in enumerate(sims)])

[(0, array([ 1.        ,  0.58858323,  0.28824052,  0.8828094 ], dtype=float32)), (1, array([ 0.58858317,  1.        ,  0.94377851,  0.89935482], dtype=float32)), (2, array([ 0.28824052,  0.94377851,  0.99999994,  0.70425642], dtype=float32)), (3, array([ 0.8828094 ,  0.89935482,  0.70425642,  0.99999994], dtype=float32))]
