In [1]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm.notebook import tqdm
from joblib import Parallel, delayed
import mediacloud.api
from IPython.display import JSON
import datetime
import matplotlib.pyplot as plt
from collections import Counter
from functools import partial
from gensim.models.word2vec import KeyedVectors

## The following settings have to be customized
# directory that holds the input JSON / word2vec files and receives the generated outputs
PATH_TO_DATA = Path('../data')
# passed as `n_jobs` to joblib.Parallel (-1 asks joblib to use all available CPUs)
N_THREADS = -1

In [2]:
def process_info(story):
    """Extract the metadata of one story as a single-row DataFrame.

    Parameters
    ----------
    story : mapping
        Must provide the keys ``'stories_id'``, ``'publish_date'`` and
        ``'media_id'``.

    Returns
    -------
    pd.DataFrame
        One row with the columns ``article_id``, ``timestamp`` (the publish
        date converted with ``str``) and ``source``.
    """
    record = {
        'article_id': story['stories_id'],
        'timestamp': str(story['publish_date']),
        'source': story['media_id'],
    }
    return pd.DataFrame([record])

In [3]:
def process_article_word_matrix_json(article_words_occurences, n_words: int):
    """Expand the sparse word counts of one article into a dense vector.

    Parameters
    ----------
    article_words_occurences : mapping
        Maps a word index (an int, or its decimal string as found in JSON
        object keys) to the number of occurrences of that word.
    n_words : int
        Length of the returned vector.

    Returns
    -------
    np.ndarray
        Array of shape ``(n_words,)`` that is zero everywhere except at the
        given indices.

    Raises
    ------
    ValueError
        If a key is not an integer literal.
    IndexError
        If an index falls outside ``n_words``.
    """
    words_occurences = np.zeros(n_words)
    for key, value in article_words_occurences.items():
        # int() instead of eval(): the keys come from a JSON file and must be
        # parsed as numbers, never executed as Python code.
        words_occurences[int(key)] = value
    return words_occurences

In [4]:
# Characters that disqualify a token. The backslash is escaped explicitly:
# the former "\!" was an invalid escape sequence (same value, but it triggers
# a warning on recent Python versions).
special_characters = ".\\!@#$%^&*()+?_=,<>/"

def has_numbers(inputString):
    """Return True when at least one character of ``inputString`` is a digit."""
    for char in inputString:
        if char.isdigit():
            return True
    return False

def has_special_chars(inputString):
    """Return True when ``inputString`` contains a character listed in ``special_characters``."""
    for c in inputString:
        if c in special_characters:
            return True
    return False

def process_drop_columns(col):
    """Return ``col`` when it should be dropped, otherwise ``None``.

    A column name is flagged as soon as it contains a digit or one of the
    characters in ``special_characters``.
    """
    flagged = has_numbers(col) or has_special_chars(col)
    return col if flagged else None

In [5]:
def process_ids(id, articles_ids):
    """Return ``id`` unless it is contained in ``articles_ids``, in which case return ``None``."""
    return None if id in articles_ids else id

In [6]:
# Each event maps to [stories file, word-matrix file], both located in PATH_TO_DATA.
stories = {
    'norway_attack': ['world_norway_all_stories.json', 'world_norway_word_matrix.json'],
    'russia_shooting': ['world_russia_all_stories.json', 'world_russia_word_matrix.json'],
    'capitol_hill': ['world_capitol_hill_all_stories.json', 'world_capitol_hill_word_matrix.json'],
}
# Event processed by the rest of the notebook; any key of `stories` is valid.
story_to_elaborate = 'russia_shooting'
stories_filename, word_matrix_filename = stories[story_to_elaborate]

# Story metadata.
with open(PATH_TO_DATA / stories_filename) as json_file:
    all_stories = json.load(json_file)

# Word list and per-article word counts.
with open(PATH_TO_DATA / word_matrix_filename) as json_file:
    stories_words = json.load(json_file)

In [7]:
# Turn the sparse per-article word counts into one dense row per article.
n_words = len(stories_words['word_list'])
fn = partial(process_article_word_matrix_json, n_words=n_words)

articles_iter = tqdm(stories_words['word_matrix'].values(), leave=True, unit='articles')
results = np.array(
    Parallel(n_jobs=N_THREADS)(delayed(fn)(article_counts) for article_counts in articles_iter)
)

  0%|          | 0/3330 [00:00<?, ?articles/s]

In [8]:
# The first element of every `word_list` entry is used as the word itself.
all_words = [entry[0] for entry in stories_words['word_list']]
# Load the pretrained vectors and keep only those for the words of this corpus.
google_news_word2vec = KeyedVectors.load_word2vec_format(
    PATH_TO_DATA/'word2vec-google-news-300.gz',
    binary=True,
).vectors_for_all(all_words)

In [9]:
def create_articles_matrix(articles_words, model, words=None):
    """Build one tf-idf weighted word-embedding vector per article.

    Every article vector is the sum, over the words that occur in the article,
    of ``tf * idf * word_vector`` where ``tf`` is the count of the word in the
    article and ``idf = log(n_articles / n_articles_containing_the_word)``.

    Parameters
    ----------
    articles_words : np.ndarray, shape (n_articles, n_words)
        Word counts; column ``i`` holds the counts of ``words[i]``.
    model : object exposing ``get_vector(word)``
        Embedding lookup (the notebook passes a gensim ``KeyedVectors``).
        A word for which ``get_vector`` raises ``KeyError`` contributes a
        zero vector.
    words : sequence of str, optional
        Vocabulary aligned with the columns of ``articles_words``. Defaults to
        the notebook-level ``all_words`` so existing two-argument calls keep
        working.

    Returns
    -------
    np.ndarray
        Stacked article vectors passed through ``squeeze()``: shape
        ``(n_articles, vector_size)``, or ``(vector_size,)`` for one article.
    """
    if words is None:
        words = all_words

    # total number of articles
    n_articles = articles_words.shape[0]
    # number of articles containing that word, for every word
    art_per_word = (articles_words > 0).sum(axis=0)
    # idf of every word; words found in no article keep idf 0 instead of
    # log(n / 0) = inf (they can never be selected below because their tf is 0)
    words_idf = np.zeros(art_per_word.shape, dtype=float)
    seen = art_per_word > 0
    words_idf[seen] = np.log(n_articles / art_per_word[seen])

    # fall back to the historical hard-coded size when the model does not say
    vector_size = getattr(model, 'vector_size', 300)

    def get_word_vector(word):
        # use the `model` argument rather than a notebook global
        try:
            word_vector = np.asarray(model.get_vector(word))
        except KeyError:
            return np.zeros(vector_size)
        # explicit check instead of assert, which is stripped under `python -O`
        if not np.isfinite(word_vector).all():
            print(word_vector)
        return word_vector

    ## get all words vectors
    words_iterator = tqdm(
        words,
        leave=True,
        unit='words',
    )
    words_vectors = np.array([get_word_vector(word) for word in words_iterator])

    ## get the articles vectors
    # function to parallelize
    def get_article_vector(article):
        # tf-idf weight of every word; only strictly positive counts contribute
        weights = np.where(article > 0, article, 0) * words_idf
        # weighted sum of the word vectors in one product instead of a Python
        # loop over the whole vocabulary (same value up to float rounding)
        return weights @ words_vectors

    articles_iterator = tqdm(
        articles_words,
        leave=True,
        unit='articles',
    )
    list_of_docvs = Parallel(n_jobs=N_THREADS)(delayed(get_article_vector)(i) for i in articles_iterator)

    return np.array(list_of_docvs).squeeze()

In [10]:
# np.asarray instead of np.array: `results` is already an ndarray, so this
# avoids duplicating the whole article x word matrix just to pass it on.
articles_matrix = create_articles_matrix(np.asarray(results), google_news_word2vec)
articles_matrix.shape

  0%|          | 0/191795 [00:00<?, ?words/s]

  0%|          | 0/3330 [00:00<?, ?articles/s]

In [11]:
# Article ids in the row order of the word matrix. The JSON keys are parsed
# with int() rather than eval(), so file content is never executed as code.
articles_ids_word_matrix = np.array([int(a) for a in stories_words['word_matrix']])
articles_ids_word_matrix.shape

(3330,)

In [12]:
articles_info_iter = tqdm(all_stories, leave=True, unit='stories')

# One single-row frame per story, stacked, then ordered by publication time.
# NOTE(review): reset_index() without drop=True keeps the old per-frame index
# (always 0) as an `index` column, which is also written to the exported CSV.
# Confirm whether that column is wanted.
articles_info_df = (
    pd.concat(
        Parallel(n_jobs=N_THREADS)(delayed(process_info)(story) for story in articles_info_iter),
        axis=0,
    )
    .reset_index()
    .assign(timestamp=lambda frame: pd.to_datetime(frame.timestamp))
    .sort_values(by='timestamp')
)
articles_info_df.head()

  0%|          | 0/3332 [00:00<?, ?stories/s]

Unnamed: 0,index,article_id,timestamp,source
0,0,1926391701,2021-05-10 00:00:00,278722
4,0,1926430980,2021-05-10 00:00:00,20270
1,0,1926386660,2021-05-10 00:01:35,1123
2,0,1926388426,2021-05-10 00:01:35,63091
3,0,1926416043,2021-05-10 00:58:55,1123


In [13]:
# Number of stories that have metadata.
articles_info_df['article_id'].shape[0]

3332

In [14]:
# Number of articles that have a row in the word matrix.
articles_ids_word_matrix.shape[0]

3330

In [15]:
# Article ids that have metadata but no row in the word matrix.
info_ids = set(articles_info_df['article_id'])
matrix_ids = set(articles_ids_word_matrix)
remove_from_df = list(info_ids - matrix_ids)
remove_from_df

[1928891878,
 1927618664,
 1950003755,
 1956817933,
 1927927920,
 1952064209,
 1928082225,
 1928836306,
 1940974713]

In [16]:
# Article ids that have a row in the word matrix but no metadata.
matrix_ids = set(articles_ids_word_matrix)
info_ids = set(articles_info_df['article_id'])
remove_from_matrix = list(matrix_ids - info_ids)
remove_from_matrix

[1928431232,
 1927686400,
 1951160778,
 2139135120,
 1930100244,
 1927900885,
 1936130399]

In [17]:
# Keep only the stories that also have a row in the word matrix. A single
# boolean filter replaces the list comprehension that was used purely for its
# side effect (one drop(inplace=True) per id); the surviving rows and their
# index labels are the same.
articles_info_df = articles_info_df[~articles_info_df['article_id'].isin(remove_from_df)]
len(articles_info_df['article_id'])

3323

In [19]:
# Rows of the word-matrix-ordered arrays whose article has no metadata.
# np.isin does one vectorised lookup instead of rebuilding tolist() and
# scanning it for every id, and re-running the cell is a harmless no-op
# (list.index() raised ValueError once the ids were already gone).
indices = np.flatnonzero(np.isin(articles_ids_word_matrix, remove_from_matrix))
articles_ids_word_matrix = np.delete(articles_ids_word_matrix, indices, axis=0)
articles_matrix = np.delete(articles_matrix, indices, axis=0)
articles_matrix.shape

(3323, 300)

In [20]:
# Only the last expression of a cell is displayed, so the shape below is
# evaluated but not shown.
articles_matrix[0, :].shape
# First four components of the first article vector.
articles_matrix[0, :4]

array([388.13684672,  56.24779515,  32.1376547 , 539.17715772])

In [21]:
# Reorder the article vectors so that row k belongs to the k-th row of the
# timestamp-sorted `articles_info_df`.
# The ids come from dict keys and are therefore unique, so a dict gives the
# same row as list.index() without rescanning the list for every article.
row_of_id = {article_id: row for row, article_id in enumerate(articles_ids_word_matrix.tolist())}
new_indices = [row_of_id[article_id] for article_id in articles_info_df['article_id']]
# Fancy indexing reads every row from the untouched source matrix. The former
# in-place loop copied rows inside the array it was overwriting, so a row that
# had already been replaced could be copied again and the permutation ended up
# corrupted.
new_articles_matrix = articles_matrix[new_indices]
new_articles_matrix[0][:4]

array([102.72295375, 305.62167091, -84.02320447, 499.11175712])

In [24]:
# Numpy normalization procedure: scale every article vector to unit L2 norm.
# NOTE(review): this normalizes `articles_matrix` (word-matrix row order), not
# the `new_articles_matrix` that was reordered to match `articles_info_df`.
# Confirm which one is intended before relying on row alignment with the
# exported info CSV.
row_sums = articles_matrix.sum(axis=1)
row_norms = np.sqrt((articles_matrix ** 2).sum(-1))[..., np.newaxis]
# An all-zero article vector has norm 0; dividing by 1 instead keeps it as
# zeros rather than turning the whole row into NaN.
safe_norms = np.where(row_norms == 0, 1.0, row_norms)
np_docvs_norm = (articles_matrix / safe_norms).astype('float')
print("Shape of normalized matrix is {}.".format(np_docvs_norm.shape))
print("Sum of normalized matrix is {}.".format(np.sum(np_docvs_norm)))
print("Max={}; Min={}.".format(np.max(np_docvs_norm), np.min(np_docvs_norm)))
np.savez(PATH_TO_DATA/str(story_to_elaborate+'_np_docvs_norm.npz'), np_docvs_norm)
np_docvs_norm.shape

Shape of normalized matrix is (3323, 300).
Sum of normalized matrix is -4756.11382784786.
Max=0.2640060402203688; Min=-0.22912924849362137.


(3323, 300)

In [28]:
# Pairwise dot products of the normalized article vectors; only the strict
# upper triangle (k=1 drops the diagonal) is kept and exported.
dists = (np_docvs_norm @ np_docvs_norm.T).astype('float')
dists_triu = np.triu(dists, k=1)

dists_triu_path = PATH_TO_DATA/str(story_to_elaborate+'_dists_triu.csv')
np.savetxt(dists_triu_path, dists_triu, delimiter=',')

print("Shape of similarity matrix is {}.".format(dists_triu.shape))
print("Sum of similarity matrix is {}.".format(np.sum(dists_triu)))
print("Max={}; Min={}.".format(np.max(dists_triu), np.min(dists_triu)))
dists_triu

Shape of similarity matrix is (3323, 3323).
Sum of similarity matrix is 3977099.632397247.
Max=1.0000000000000009; Min=0.0.


array([[0.        , 1.        , 0.83808093, ..., 0.84662232, 0.84437757,
        0.903637  ],
       [0.        , 0.        , 0.83808093, ..., 0.84662232, 0.84437757,
        0.903637  ],
       [0.        , 0.        , 0.        , ..., 0.84400214, 0.8458315 ,
        0.86671762],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.99031549,
        0.96613961],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.96762526],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [26]:
# Export the filtered, timestamp-sorted story metadata.
# newline='' lets the csv writer control line endings itself; without it a
# file handle passed to to_csv() gets extra blank lines on Windows.
with open(PATH_TO_DATA/str(story_to_elaborate+'_info_df.csv'), 'w', newline='') as csv_file:
    articles_info_df.to_csv(csv_file)