# Computing Similarity Matrix

This notebook will compute the similarity matrix given the `word_matrix` dataframe.

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.preprocessing import normalize
from gensim.models.word2vec import KeyedVectors
from tqdm.notebook import tqdm
from joblib import Parallel, delayed

# User-tunable settings: adjust these to match your environment.
# Directory that the notebook reads its input files from and writes its results to.
PATH_TO_DATA = Path('../data')
# joblib `n_jobs` setting; -1 asks joblib to use every available CPU core.
N_THREADS = -1

Get the dataframe from file.

In [3]:
# Map each story name to the CSV file that stores its word matrix.
dataframes = {
    'norway_attack': 'world_norway_word_matrix_df.csv',
    'russia_shooting': 'world_russia_word_matrix_df.csv',
    'capitol_hill': 'world_capitol_hill_word_matrix_df.csv',
    'test': 'word_matrix_df.csv',
}
# Story to process; the alternatives are listed in the trailing comment.
story_to_elaborate = 'test'  # 'norway_attack' # 'russia_shooting' # 'capitol_hill'

# Hand the path straight to pandas so it opens the file itself (UTF-8 by default)
# instead of relying on the platform's default encoding for the accented vocabulary.
# Drop the auto-generated 'Unnamed: 0' column and the unnecessary 'article_id' column
# (rows are already ordered). `columns=` is used because passing the axis positionally
# to `DataFrame.drop` is rejected by recent pandas releases.
word_matrix_df = pd.read_csv(PATH_TO_DATA / dataframes[story_to_elaborate]).drop(
    columns=['Unnamed: 0', 'article_id']
)
word_matrix_df.head()

Unnamed: 0,intanto,tarrio,votat,quando,è,banner,vota,dandolo,fiamm,si,...,l'oligarchia,dopò,gravano,sovrappongono,ascoltatori,sopravvivrà,paragonando,attaccavano,stento,un'audi
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,2.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,2.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In order to create the **Similarity Matrix**, we should _calculate the distances_ of all the articles (exploiting the simple dot product) from the loaded dataframe.
'WORD_VECTORS' must be passed through the `create_model_matrix` function that will map each document to its 300-dim vector representation using a model from `gensim`.
This model can be downloaded using the download API of `gensim`; more information can be found [here](https://github.com/RaRe-Technologies/gensim-data).
The available models are [ConceptNet Numberbatch](https://aaai.org/ocs/index.php/AAAI/AAAI17/paper/view/14972) and [Google News Word2Vec](https://code.google.com/archive/p/word2vec/).

The results of `create_model_matrix` must be normalized before computing the distance.
The fastest method for normalization is the one from _sklearn_, but a _numpy_ version is also provided; the _numpy_ version is the one actually used to build the final similarity matrix in this notebook.

Taking the dot product of the normalized vectors is equivalent to computing the _cosine similarity_ between the articles.

In [4]:
# Vocabulary of the corpus: one entry per column of the word matrix, in column order.
all_words = [*word_matrix_df.columns]

Next, the function that builds the document-vector matrix is defined; the TF-IDF score of each word is used as its weight.

In [7]:
def create_model_matrix(data, model, words=None):
    """Map every article to the TF-IDF-weighted sum of its word embeddings.

    For article ``a`` the result row is ``sum_i tf[a, i] * idf[i] * vector(words[i])``
    where ``tf[a, i] = data[a, i]`` and ``idf[i] = log(n_articles / sum(data[:, i]))``.

    Parameters
    ----------
    data : array-like of shape (n_articles, n_words)
        Word matrix; column ``i`` holds the values for ``words[i]``.
    model : object
        Embedding lookup exposing ``get_vector(word)`` and raising ``KeyError``
        for unknown words (e.g. a gensim ``KeyedVectors``). Its ``vector_size``
        attribute is used when present; otherwise 300 is assumed.
    words : sequence of str, optional
        Vocabulary aligned with the columns of ``data``. Defaults to the
        notebook-level ``all_words``.

    Returns
    -------
    numpy.ndarray of shape (n_articles, vector_size)
        One document vector per article. The result is always 2-D, even for a
        single article.

    Raises
    ------
    ValueError
        If ``data`` is not 2-D or its column count differs from ``len(words)``.
    """
    if words is None:
        words = all_words

    data = np.asarray(data, dtype=float)
    if data.ndim != 2 or data.shape[1] != len(words):
        raise ValueError(
            "data must be 2-D with one column per word: got shape {} for {} words".format(
                data.shape, len(words)
            )
        )

    n_articles = data.shape[0]
    vector_size = getattr(model, 'vector_size', 300)

    # Look every embedding up once instead of once per article.
    word_vectors = np.zeros((len(words), vector_size))
    for i, word in enumerate(words):
        try:
            word_vector = model.get_vector(word)
        except KeyError:
            # Out-of-vocabulary words keep a zero vector and contribute nothing.
            continue
        if not np.isfinite(word_vector).all():
            # Surface suspicious embeddings without aborting the run.
            print(word_vector)
        word_vectors[i] = word_vector

    # The idf term only depends on the column, so compute it once per word.
    # NOTE: as in the original formula, the denominator is the column sum,
    # not the number of articles containing the word.
    totals = data.sum(axis=0)
    idf = np.zeros_like(totals)
    seen = totals > 0
    # Words that never occur get idf 0; log(n / 0) = inf would turn 0 * inf into NaN.
    idf[seen] = np.log(n_articles / totals[seen])

    # Scale the embeddings by idf first so no extra (n_articles, n_words) array is needed.
    return data @ (word_vectors * idf[:, np.newaxis])

Loading the [ConceptNet Numberbatch](https://aaai.org/ocs/index.php/AAAI/AAAI17/paper/view/14972) model, and adapting to our words.
DON'T USE THIS!!!

In [None]:
# Load the pretrained ConceptNet Numberbatch vectors and immediately derive a
# KeyedVectors instance for the words in `all_words`.
conceptnet = (
    KeyedVectors
    .load_word2vec_format(PATH_TO_DATA/'conceptnet-numberbatch-17-06-300.gz')
    .vectors_for_all(all_words)
)
conceptnet

Loading the [Google News Word2Vec](https://code.google.com/archive/p/word2vec/) model, and adapting to our words.
USE THIS!!!

In [6]:
# Load the pretrained Google News word2vec vectors (binary format) and immediately
# derive a KeyedVectors instance for the words in `all_words`.
google_news_word2vec = (
    KeyedVectors
    .load_word2vec_format(PATH_TO_DATA/'word2vec-google-news-300.gz', binary=True)
    .vectors_for_all(all_words)
)
google_news_word2vec

<gensim.models.keyedvectors.KeyedVectors at 0x7fc12252a7a0>

Get the whole word matrix using the DataFrame and the chosen model.
DON'T USE THIS!!!

In [None]:
# Make ConceptNet the active embedding model.
model = conceptnet
# Build one document vector per article, then store the result on disk.
conceptnet_docvs = create_model_matrix(
    data=np.array(word_matrix_df),
    model=model,
)
np.savez(PATH_TO_DATA/'conceptnet_docvs.npz', conceptnet_docvs)
conceptnet_docvs.shape

USE THIS!!!

In [None]:
# Make Google News word2vec the active embedding model.
model = google_news_word2vec
# Build one document vector per article, then store the result on disk.
google_docvs = create_model_matrix(
    data=np.array(word_matrix_df),
    model=model,
)
np.savez(PATH_TO_DATA/'google_docvs.npz', google_docvs)
google_docvs.shape

In [6]:
# Choose which set of document vectors the normalization cells work on;
# swap the commented line in to use the ConceptNet vectors instead.
# docvs = conceptnet_docvs
docvs = google_docvs
docvs.shape

array([[-4.15124185e+00,  8.49358914e+00,  3.24921606e+00, ...,
        -7.54261422e+00,  4.95666061e+00, -2.39626000e+00],
       [ 2.14510083e+01,  3.21642291e+01,  5.47439096e+00, ...,
        -3.59560750e+01,  1.89494194e+01,  8.06504266e-01],
       [-2.01376798e+01,  5.91221505e+01,  1.30303238e+02, ...,
        -5.36260845e+01, -7.60163316e+00,  1.79734167e+02],
       ...,
       [-1.39637447e+01,  2.76800045e+01, -9.91592173e-02, ...,
        -2.79628069e+01, -1.12852966e+01,  1.77993376e+01],
       [-4.35520784e+00,  5.69531574e+00,  6.75323993e+00, ...,
        -6.08904045e+00,  2.29433093e+00,  1.00694237e+01],
       [-8.89013666e+00,  6.28166851e+00, -5.81385224e+00, ...,
        -4.23384561e+00, -3.51962592e+00, -5.14883912e-01]])

`numpy` normalization procedure.
USE THIS!!!

In [7]:
# Numpy normalization procedure: scale every document vector (row) to unit L2 norm.
row_norms = np.linalg.norm(docvs, axis=1, keepdims=True)
# An all-zero row has norm 0; divide it by 1 instead so it stays zero rather than becoming NaN.
row_norms[row_norms == 0] = 1.0
np_docvs_norm = (docvs / row_norms).astype('float')
print("Shape of normalized matrix is {}.".format(np_docvs_norm.shape))
print("Sum of normalized matrix is {}.".format(np.sum(np_docvs_norm)))
print("Max={}; Min={}.".format(np.max(np_docvs_norm), np.min(np_docvs_norm)))
np.savez(PATH_TO_DATA/'np_docvs_norm.npz', np_docvs_norm)
np_docvs_norm.shape

Shape of normalized matrix is (2992, 300).
Sum of normalized matrix is -2821.8530565127417.
Max=0.26247709861429375; Min=-0.24015063773379133.


(2992, 300)

`sklearn` normalization procedure (axis=1).
DON'T USE THIS!!!

In [None]:
# Scikit-learn normalization procedure (axis=1): each row is scaled to unit L2 norm.
sk_docvs_norm = normalize(docvs, norm='l2', axis=1)
print("Shape of normalized matrix is {}.".format(sk_docvs_norm.shape))
print("Sum of normalized matrix is {}.".format(sk_docvs_norm.sum()))
print("Max={}; Min={}.".format(sk_docvs_norm.max(), sk_docvs_norm.min()))
np.savez(PATH_TO_DATA/'sk_docvs_norm.npz', sk_docvs_norm)
sk_docvs_norm.shape

`sklearn` normalization procedure (axis=0).
DON'T USE THIS!!!

In [None]:
# Scikit-learn normalization procedure (axis=0): each column is scaled to unit L2 norm.
sk_docvs_norm_0 = normalize(docvs, norm='l2', axis=0)
print("Shape of normalized matrix is {}.".format(sk_docvs_norm_0.shape))
print("Sum of normalized matrix is {}.".format(sk_docvs_norm_0.sum()))
print("Max={}; Min={}.".format(sk_docvs_norm_0.max(), sk_docvs_norm_0.min()))
np.savez(PATH_TO_DATA/'sk_docvs_norm_0.npz', sk_docvs_norm_0)
sk_docvs_norm_0.shape

Computation of the distance matrix.
The simple dot product is used between the matrix and its transpose.
`scipy.sparse` matrices are used here.
DON'T USE THIS!!!

In [None]:
# Wrap the column-normalized document vectors in a CSR matrix and take its transpose.
s = sparse.csr_matrix(sk_docvs_norm_0)
s_t = s.T
# Pairwise dot products between all rows.
s_dist = s.dot(s_t)
# Keep the strictly upper triangle (k=1 leaves out the diagonal) and densify it.
dists_triu = np.array(sparse.triu(s_dist, k=1).todense())
np.savetxt(PATH_TO_DATA/'dists_triu.csv', dists_triu, delimiter=',')
dists_triu

USE THIS!!!

In [8]:
# Rows of `np_docvs_norm` are unit-normalized, so the matrix of pairwise dot
# products is the cosine-similarity matrix of the articles.
dists = np.dot(np_docvs_norm, np_docvs_norm.T)
# Keep the strictly upper triangle: k=1 zeroes the diagonal and everything below it.
dists_triu = np.triu(dists, k=1)
# The file name must be assembled before joining it to the directory: `/` binds
# tighter than `+`, so the unparenthesized form evaluates `Path + str` and raises TypeError.
np.savetxt(PATH_TO_DATA / (story_to_elaborate + '_dists_triu.csv'), dists_triu, delimiter=',')
print("Shape of similarity matrix is {}.".format(dists_triu.shape))
print("Sum of similarity matrix is {}.".format(np.sum(dists_triu)))
print("Max={}; Min={}.".format(np.max(dists_triu), np.min(dists_triu)))
dists_triu

Shape of similarity matrix is (2992, 2992).
Sum of similarity matrix is 2725212.7520491164.
Max=1.0000000000000004; Min=-0.4190984022529244.


array([[0.        , 0.71497394, 0.50444967, ..., 0.66298975, 0.61015226,
        0.65692472],
       [0.        , 0.        , 0.46490303, ..., 0.64113891, 0.7083136 ,
        0.63537908],
       [0.        , 0.        , 0.        , ..., 0.57545389, 0.56454262,
        0.46803413],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.65108241,
        0.57432244],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.59058211],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])