# Computing Similarities: An example

The purpose of this notebook is to load word counts and compute cosine similarities. We will only load a few files.

In [2]:
from os import listdir

from gensim import corpora, models, similarities
import pandas as pd
import re 

In [3]:
home = '/Users/aliciahickey/Desktop/master/texts/'

In [4]:
listdir(home)

['.DS_Store',
 'extracted-texts',
 'extracted-texts.tar',
 'term-id-counts-min-2',
 'term-id-counts-min-2.tar',
 'termcounts-min-1.tar',
 'termcounts-min-2-term-ids-map.txt',
 'termcounts-min-2.tar',
 'termcounts-min-3.tar',
 'termcounts-min-5',
 'termcounts-min-5.tar',
 'termcounts-min-6.tar']

## Create a dictionary:

In [6]:
def make_dict(dict_file='termcounts-min-2-term-ids-map.txt', data_folder=home):
    """Create dictionary of unique terms and their IDs."""
   
    with open(data_folder + dict_file, 'r') as f:
        wdic = {} 
        for row in f.readlines():
            if len(row.split(',')) == 2:  # this get rid of rows with 3 values i.e. numbers
                (value, key) = row.split(',')
                value = re.match('\D', value)
                if value:
                    if len(value.string) > 2:
                        wdic[int(key.rstrip())] = value.string
    return wdic

In [8]:
wdic = make_dict()

In [9]:
len(wdic)

1324237

## Create the corpus

In [10]:
corpus_texts = listdir(home + 'term-id-counts-min-2/')

In [45]:
def transform_text(filename, word_dict=wdic):
    
    with open(filename, 'r') as f:

        file = []  # initiate a list
        for row in f.readlines():
            if len(row.split(',')) == 2:  # this get rid of rows with 3 values i.e. numbers
                row_val = row.split(',')
                row_val[1] = int(row_val[1].rstrip())
                row_val[0] = int(row_val[0])
                file.append(tuple(row_val))
    
    text = [i for i in file if i[0] in word_dict]
    
    return text

In [49]:
text_0 = transform_text(home + 'term-id-counts-min-2/' + corpus_texts[1])

In [50]:
text_1 = transform_text(home + 'term-id-counts-min-2/' + corpus_texts[2])

In [59]:
corpus = [text_0, text_1]

In [60]:
tfidf = models.TfidfModel(corpus)

In [61]:
corpus_tfidf = tfidf[corpus]

In [64]:
# for doc in corpus_tfidf:
#     print(doc)

## Compute similarities:

In [65]:
lsi = models.LsiModel(corpus, id2word=wdic, num_topics=2)
index = similarities.MatrixSimilarity(lsi[corpus])

In [67]:
vec_lsi = lsi[corpus[0]]

In [68]:
sims = index[vec_lsi]
print(list(enumerate(sims)))

[(0, 1.0), (1, 0.1939158)]
