# Jaccard and word2vec matrices

## Dependencies

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import scipy.spatial as sp

import matrices.loading as mload

In [2]:
# Default size for every figure created below: 16 x 16 (matplotlib measures
# figsize in inches), large enough to read the matrix heat maps.
plt.rcParams.update({"figure.figsize": (16, 16)})

## Dataset (629814 documents)

This is the path to the dataset.

In [None]:
# Directory holding the dataset (noted as 629814 documents).
data_path = "resources/aminer/v1"

Load the sparse matrix of Jaccard similarities.

In [None]:
# Read the precomputed Jaccard similarity matrix from the dataset directory.
# NOTE(review): the previous comment here said "998 documents sample" while the
# dataset is described as 629814 documents -- confirm what the loader returns.
matrix_jaccard = mload.load_matrix_jaccard_sim(data_path)

# Sanity check: number of dimensions and shape of the loaded matrix.
print(f"{matrix_jaccard.ndim} {matrix_jaccard.shape}")

Loading indexed jaccard ...
Loading matrix ...


Load the sparse matrix of word2vec similarities.

In [None]:
# Read the precomputed word2vec similarity matrix from the dataset directory.
# NOTE(review): the previous comment here said "998 documents sample" while the
# dataset is described as 629814 documents -- confirm what the loader returns.
matrix_word2vec = mload.load_matrix_word2vec_sim(data_path)

# Sanity check: number of dimensions and shape of the loaded matrix.
print(f"{matrix_word2vec.ndim} {matrix_word2vec.shape}")

In [None]:
# The row count of the Jaccard matrix is used as the number of documents.
N = matrix_jaccard.shape[0]
print(f"Number of documents {N} {matrix_jaccard.shape}")

## Jaccard similarity

In [None]:
# Heat map of the pairwise Jaccard similarities. The colour scale is pinned to
# [0, 1] so the picture can be compared with the other matrix plots.
plt.matshow(
    matrix_jaccard,
    cmap="hot",
    vmin=0,
    vmax=1,
)
plt.colorbar()

## Dice from Jaccard

In [None]:
# Convert every Jaccard value J into the Dice coefficient D = 2J / (1 + J).
matrix_dice = (2 * matrix_jaccard) / (matrix_jaccard + 1)
print(f"{matrix_dice.shape}")

# Heat map of the Dice similarities on the same fixed [0, 1] colour scale.
plt.matshow(
    matrix_dice,
    cmap="hot",
    vmin=0,
    vmax=1,
)
plt.colorbar()

## Jaccard dissimilarity

In [None]:
# Jaccard dissimilarity (distance) is the complement of the similarity: 1 - J.
matrix_jaccard_dis = 1 - matrix_jaccard
print(f"{matrix_jaccard_dis.shape}")

# Heat map of the dissimilarities on the same fixed [0, 1] colour scale.
plt.matshow(
    matrix_jaccard_dis,
    cmap="hot",
    vmin=0,
    vmax=1,
)
plt.colorbar()

## word2vec similarity

In [None]:
# Heat map of the pairwise word2vec similarities. The colour scale is pinned to
# [0, 1] so the picture can be compared with the other matrix plots.
plt.matshow(
    matrix_word2vec,
    cmap="hot",
    vmin=0,
    vmax=1,
)
plt.colorbar()

## Ordering matrices by mean Jaccard similarity

- Top-left: documents with the highest mean similarity to the rest
- Bottom-right: documents with the lowest mean similarity to the rest

In [None]:
# Mean Jaccard similarity per document: each row sum divided by N.
jaccard_similarity_mean = np.sum(matrix_jaccard, axis=1) / N
# Document indices ranked from highest to lowest mean similarity
# (ascending argsort, then reversed).
jaccard_ordered_indices = np.argsort(jaccard_similarity_mean)[::-1]

# Show the mean similarities in ranked (descending) order.
plt.plot(jaccard_similarity_mean[jaccard_ordered_indices], "b.")
plt.show()

# Apply the same permutation to columns and rows of every matrix and draw each
# one as a heat map on a fixed [0, 1] colour scale.
for similarity_matrix in (
    matrix_jaccard,
    matrix_word2vec,
    matrix_dice,
    matrix_jaccard_dis,
):
    plt.matshow(
        similarity_matrix[:, jaccard_ordered_indices][jaccard_ordered_indices, :],
        cmap="hot",
        vmin=0,
        vmax=1,
    )
    plt.colorbar()

# Sanity check: shapes of the means, the source matrix and the ordering.
print(
    jaccard_similarity_mean.shape,
    matrix_jaccard.shape,
    jaccard_ordered_indices.shape,
)

## Ordering matrices by mean word2vec similarity

- Top-left: documents with the highest mean similarity to the rest
- Bottom-right: documents with the lowest mean similarity to the rest

In [None]:
# Mean word2vec similarity per document: each row sum divided by N.
word2vec_similarity_mean = np.sum(matrix_word2vec, axis=1) / N
# Document indices ranked from highest to lowest mean similarity
# (ascending argsort, then reversed).
word2vec_ordered_indices = np.argsort(word2vec_similarity_mean)[::-1]

# Show the mean similarities in ranked (descending) order.
plt.plot(word2vec_similarity_mean[word2vec_ordered_indices], "b.")
plt.show()

# Apply the same permutation to columns and rows of every matrix and draw each
# one as a heat map on a fixed [0, 1] colour scale.
for similarity_matrix in (
    matrix_word2vec,
    matrix_jaccard,
    matrix_dice,
    matrix_jaccard_dis,
):
    plt.matshow(
        similarity_matrix[:, word2vec_ordered_indices][word2vec_ordered_indices, :],
        cmap="hot",
        vmin=0,
        vmax=1,
    )
    plt.colorbar()

# Sanity check: shapes of the means, the source matrix and the ordering.
print(
    word2vec_similarity_mean.shape,
    matrix_word2vec.shape,
    word2vec_ordered_indices.shape,
)

## Ordering matrices by mean Dice similarity

- Top-left: documents with the highest mean similarity to the rest
- Bottom-right: documents with the lowest mean similarity to the rest

In [None]:
# Mean Dice similarity per document: each row sum divided by N.
dice_similarity_mean = np.sum(matrix_dice, axis=1) / N
# Document indices ranked from highest to lowest mean similarity
# (ascending argsort, then reversed).
dice_ordered_indices = np.argsort(dice_similarity_mean)[::-1]

# Show the mean similarities in ranked (descending) order.
plt.plot(dice_similarity_mean[dice_ordered_indices], "b.")
plt.show()

# Apply the same permutation to columns and rows of every matrix and draw each
# one as a heat map on a fixed [0, 1] colour scale.
for similarity_matrix in (
    matrix_dice,
    matrix_word2vec,
    matrix_jaccard,
    matrix_jaccard_dis,
):
    plt.matshow(
        similarity_matrix[:, dice_ordered_indices][dice_ordered_indices, :],
        cmap="hot",
        vmin=0,
        vmax=1,
    )
    plt.colorbar()

# Sanity check: shapes of the means, the source matrix and the ordering.
print(
    dice_similarity_mean.shape,
    matrix_dice.shape,
    dice_ordered_indices.shape,
)