In [1]:
import json
import codecs
import os
import re
import numpy as np
import os
import gensim
import nltk
import random
from datetime import datetime
from datetime import timedelta
from scipy.spatial.distance import cosine
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import *
from sklearn.metrics import euclidean_distances
from extract_entities import extract_entity_names, return_entity_list ## helper function for entity extraction
from gensim.models import KeyedVectors
from pyemd import emd ## Word mover's distance

## Use word2vec embeddings
## Code source: http://nbviewer.jupyter.org/github/vene/vene.github.io/blob/pelican/content/blog/word-movers-distance-in-python.ipynb
if not os.path.exists("data/embed.dat"):
    print("Caching word embeddings in memmapped format...")  
    wv = gensim.models.KeyedVectors.load_word2vec_format(
        "data/GoogleNews-vectors-negative300.bin.gz",
        binary=True)
    wv.init_sims(replace=True) # To load L2 normalized vectors in wv.syn0norm from wv.syn0
    fp = np.memmap("data/embed.dat", dtype=np.double, mode='w+', shape=wv.syn0.shape)
    fp[:] = wv.syn0[:]
    with open("data/embed.vocab", "w", encoding='utf-8') as f:
        for _, w in sorted((voc.index, word) for word, voc in wv.vocab.items()):
            print(w, file=f)
    del fp, wv

W = np.memmap("data/embed.dat", dtype=np.double, mode="r", shape=(3000000, 300))
with open("data/embed.vocab", encoding='utf-8') as f:
    vocab_list = map(str.strip, f.readlines())
    
vocab_dict = {w: k for k, w in enumerate(vocab_list)}



*Loading title data from The Onion, Borowitz Report and New York Times. *

In [2]:
def convert_num_in_title(ER_dict):
    for key in list(ER_dict.keys()):
        ER_dict[key]['title'] = re.sub('\d', '#', ER_dict[key]['title']) ## Numbers represented as # in word2vec
        ER_dict[key]['datetime'] = datetime.strptime(ER_dict[key]['timestamp'][0:10], '%Y-%m-%d')
    return ER_dict

with open('borowitz_report.json', 'r') as f:
    bw = json.load(f)

with open('the_onion.json', 'r') as f:
    onion = json.load(f)    

with open('NY_times.json', 'r') as f:
    ny_times = json.load(f)
    
bw, onion, ny_times = convert_num_in_title(bw), convert_num_in_title(onion), convert_num_in_title(ny_times)
    
all_combined = dict(list(bw.items()) + list(onion.items()) + list(ny_times.items()))

print('Number of Borowitz Report articles: %d' %len(bw))
print('Number of The Onion articles: %d' %len(onion))
print('Number of New York Times articles: %d' %len(ny_times))

Number of Borowitz Report articles: 356
Number of The Onion articles: 3279
Number of New York Times articles: 120869


In [3]:
## Just some quick stats of the titles

all_titles = []
for x in all_combined.values():
    all_titles.append(x['title'])

all_titles = list(set(all_titles))

print ('Unique titles from all: %d' %len(all_titles))

vect = CountVectorizer(stop_words="english", lowercase=False).fit(all_titles)

print ('Number of unique words from all titles: %d' %len(vect.get_feature_names()))
print ('Number of words found in word2vec: %d' %len([w for w in vect.get_feature_names() if w in vocab_dict]))

## Entities with 2 or more words will need to be identified and modified so they can be mapped correctly in the word embeddings
## There are also 2250 words that appear that are not in the word embeddings. 
print ('Barack_Obama: %s' %str('Barack_Obama' in vocab_dict)) 
print ('New_York: %s' %str('New_York' in vocab_dict)) 
print ('Example Borowitz Report Title: %s' %all_combined['51685819']['title'])

Unique titles from all: 107929
Number of unique words from all titles: 42204
Number of words found in word2vec: 39778
Barack_Obama: True
New_York: True
Example Borowitz Report Title: Sentiment Building to Deport Nation's Billionaires


In [4]:
## This section of the code will modify the existing titles so they are a bit more "sentence" like. Non-entity words like
## play, basketball will be converted to lower-case while entites like New York will be converted to New_York.

## Function to reformat title so non-entity words are made lower case and entity words remain upper case.
## Function also converts words like New York to New_York
def reformat_title(entities, entities_list_len2, ER_dict): ## pass entities, entites that are 
    for key in ER_dict.keys():
        title = ER_dict[key]['title']
        reformatted = ''
        for entity in entities_list_len2:
            if entity in title:
                entity_ = re.sub(' ', '_', entity)
                title = re.sub(entity, entity_, title)

        for index, string in enumerate(title.split()):
            if index != 0 and string not in entities and '_' not in string: ## make word lower case and add _
                reformatted = reformatted + ' ' + string.lower()

            elif index == 0: ## First word remains as is
                reformatted = string
                
            else:
                reformatted = reformatted + ' ' + string ## add word as is
              
        ER_dict[key]['reformatted_title'] = reformatted
    return ER_dict

with open('theonion_fulltext.json',encoding='utf-8') as f:
    full_text_json = json.load(f)
    
onion_texts = []
for key in full_text_json.keys():
    try:
        onion_texts.append(full_text_json[key]['full_text'])
    except:
        next

onion_texts = list(set(onion_texts))
entities, entities_list_len2 = return_entity_list(onion_texts) ## This is a helper script to extract named entities
onion = reformat_title(entities, entities_list_len2, onion)
del onion_texts

## Borowitz Report
with open('borowitz_fulltext.json',encoding='utf-8') as f:
    full_text_json = json.load(f)
    
borowitz_texts = []
for key in full_text_json.keys():
    try:
        borowitz_texts.append(full_text_json[key]['full_text'])
    except:
        next

borowitz_texts = list(set(borowitz_texts))
entities_, entities_list_len2_ = return_entity_list(borowitz_texts) ## This is a helper script to extract entities
entities.extend(entities_)
entities_list_len2.extend(entities_list_len2_)
bw = reformat_title(entities, entities_list_len2, bw)
del borowitz_texts, entities_, entities_list_len2_

## NY Times
with open('nytimes_fulltext.json',encoding='utf-8') as f:
    full_text_json = json.load(f)
    
nytimes_texts = []
for key in full_text_json.keys():
    try:
        nytimes_texts.append(full_text_json[key]['full_text'])
    except:
        next

nytimes_texts = list(set(nytimes_texts))
entities_, entities_list_len2_ = return_entity_list(nytimes_texts) ## This is a helper script to extract entities
ny_times = reformat_title(entities, entities_list_len2, ny_times)
entities.extend(entities_)
entities_list_len2.extend(entities_list_len2_)
del nytimes_texts, entities_, entities_list_len2_

## Re-combine all dict 
all_combined = dict(list(bw.items()) + list(onion.items()) + list(ny_times.items()))

## Also reformat titles in related_articles
for key in all_combined.keys():
    if 'related_articles' in all_combined[key].keys():
        all_combined[key]['related_articles'] = reformat_title(entities, entities_list_len2, all_combined[key]['related_articles'])

print ('Example Borowitz Report title: %s' %all_combined['61852980']['title'])
print ('Example Borowitz Report title after modification: %s' %all_combined['61852980']['reformatted_title'])

Example Borowitz Report title: Rand Paul No Longer Most Embarrassing Thing About Kentucky
Example Borowitz Report title after modification: Rand_Paul No longer most embarrassing thing about Kentucky


In [5]:
## Helper function that will generate a list of candidate articles, i.e. all related articles listed in Event Registry
## as well as all NY Times articles that are within 2 days of the satirical article.
## The output is a tuple so it can be mapped back to the original unedited article title.
def candidate_articles_list(ER_value): 
    
    candidate_articles = []
    date_time = ER_value['datetime']
    if 'related_articles' in ER_value.keys():
        for key in ER_value['related_articles'].keys():
            title = ''
            for index, word in enumerate(ER_value['related_articles'][key]['title'].split()):
                word = re.sub('\.', '', word)
                if word in vocab_dict and index != 0:
                    title = title + ' ' + word
                elif word in vocab_dict and index == 0:
                    title = word
                else:
                    next
            candidate_articles.append((key, title))

    for key, value in ny_times.items():
        if value['datetime'] <= date_time and value['datetime'] >= date_time + timedelta(days=-2):
            title = ''
            for index, word in enumerate(value['reformatted_title'].split()):
                word = re.sub('\.', '', word)
                if word in vocab_dict and index != 0:
                    title = title + ' ' + word
                elif word in vocab_dict and index == 0:
                    title = word
                else:
                    next
            candidate_articles.append((key,title))
    return list(set(candidate_articles))

In [6]:
all_titles = []
for key, values in all_combined.items():
    if values['reformatted_title'] not in all_titles:
        all_titles.append(values['reformatted_title'])
        
vect_total = CountVectorizer(stop_words="english", lowercase=False).fit(all_titles)
common = [word for word in vect_total.get_feature_names() if word in vocab_dict]

In [14]:
## Given input of the satirical article to be mapped, along with list of candidate articles this function will 
## print out the top 5 candidate articles according to cosine similarity based on tf-idf scores.

def mapped_articles_tfidf(key_related, satire_article, candidate_articles_tuples):
    candidate_articles = list(set([x[1] for x in candidate_articles_tuples]))
    vectorizer = TfidfVectorizer(vocabulary=common, stop_words="english", lowercase=False).fit([satire_article] + candidate_articles)
    cosine_similarities = linear_kernel(vectorizer.transform([satire_article]), 
                                        sparse.csr_matrix(vectorizer.transform(candidate_articles))).flatten()
    related_docs_indices = cosine_similarities.argsort()[:-6:-1]
    print ('Satire article: %s \n' %all_combined[key_related]['title'])
    print ('Candidate articles:')
    for index in related_docs_indices:
        key = [x[0] for x in candidate_articles_tuples if x[1] == candidate_articles[index]][0]
        try:
            print (all_combined[key]['title'])
        except:
            print(all_combined[key_related]['related_articles'][key]['title'])

satire_article = all_combined['25518311']['reformatted_title']
candidate_articles = candidate_articles_list(all_combined['25518311'])
mapped_articles_tfidf('25518311',satire_article, candidate_articles)

Satire article: Jeb Bush Resigns as George W. Bush's Brother 

Candidate articles:
Jeb Bush Resigns From Board Seats, Possibly Edging Closer to Presidential Run
No. ## Oklahoma Shuts Down George Mason, ##-##
Jeb Bush Won’t Attend Immigration Critic’s Event in Iowa
Jeb Bush resigns from all boards
Jeb Bush resigns from board memberships


In [15]:
## Given input of the satirical article to be mapped, along with list of candidate articles this function will 
## print out the top 5 candidate articles according to Word Mover's distance.
def mapped_articles_wmd(key_related, satire_article, candidate_articles_tuples):
    candidate_articles = list(set([x[1] for x in candidate_articles_tuples]))
    vect_wmd = CountVectorizer(stop_words="english", lowercase=False).fit([satire_article] + candidate_articles)
    common_wmd = [word for word in vect_wmd.get_feature_names()]
    W_common_wmd = W[[vocab_dict[w] for w in common_wmd]]
    D_ = euclidean_distances(W_common_wmd)
    v_1 = vect_wmd.transform([satire_article]).toarray().ravel().astype(np.double)
    v_1 /= v_1.sum()
    D_ = D_.astype(np.double)
    D_ /= D_.max()  # just for comparison purposes
    top_5_scores = {}

    for index, title in enumerate(candidate_articles):
        v_2 = vect_wmd.transform([candidate_articles[index]]).toarray().ravel().astype(np.double)
        emd_score = emd(v_1, v_2, D_)
        if len(top_5_scores) < 5:
            top_5_scores[title] = emd_score
        else:
            if emd_score < max(top_5_scores.values()):
                top_5_scores[title] = emd_score
                for key in list(top_5_scores.keys()):
                    if top_5_scores[key] == max(top_5_scores.values()):
                        del top_5_scores[key]
                
    print ('Satire article: %s \n' %all_combined[key_related]['title'])
    print ('Candidate articles:')
    for title, values in top_5_scores.items():
        key = [x[0] for x in candidate_articles_tuples if x[1] == title][0]
        try:
            print ('%s: score: %.2f' %(all_combined[key]['title'], values))
        except:
            print('%s: score: %.2f' %(all_combined[key_related]['related_articles'][key]['title'], values))  
                
satire_article = all_combined['25518311']['title']
candidate_articles = candidate_articles_list(all_combined['25518311'])
mapped_articles_wmd('25518311', satire_article, candidate_articles)

Satire article: Jeb Bush Resigns as George W. Bush's Brother 

Candidate articles:
Edith Pearlman’s ‘Honeydew’: score: 0.83
A Simple Gift: score: 1.85
Soy on the Lower East Side: score: 0.87
The Rise of Evgeny Lebedev: score: 1.83
Can Writers Still ‘Make It New’?: score: 1.84


#### Some Experiments

In [20]:
## Onion article with related articles
satire_article = all_combined['26464958']['title']
candidate_articles = candidate_articles_list(all_combined['26464958'])
print('tf-idf:')
mapped_articles_tfidf('26464958',satire_article, candidate_articles)
print("\nWord Mover's:")
mapped_articles_wmd('26464958', satire_article, candidate_articles)

tf-idf:
Satire article: Trump Unveils Sprawling New Presidential Retreat Where He Can Escape From Stresses Of Mar-A-Lago 

Candidate articles:
GAO Agrees To Review Costs Of Trump's Trips To Mar-A-Lago
Joe Girardi Trots Out a New Look for the Yankees
He Turned His Home Into a Reality Television Show
Where Everyone 'Knows Hockey': Tiny Clarkson Stands Tall Again
Mar-A-Lago Act: Bill To Force Trump To Publish Visitors As He Skips Trip No. 6

Word Mover's:
Satire article: Trump Unveils Sprawling New Presidential Retreat Where He Can Escape From Stresses Of Mar-A-Lago 

Candidate articles:
Just Bulbs: Still Burning Bright on the Upper East Side: score: 2.81
#### Dilemma for Republicans: Which Way Now on Obamacare?: score: 2.82
How Well Do You Sleep?: score: 0.80
What's Going On in This Picture? | March ##, ####: score: 2.79
'Billions' Season #, Episode # Recap: Wendy's Back: score: 2.80


In [17]:
## Onion article with no related articles
satire_article = all_combined['152378490']['title']
candidate_articles = candidate_articles_list(all_combined['152378490'])
print('tf-idf:')
mapped_articles_tfidf('152378490',satire_article, candidate_articles)
print("\nWord Mover's:")
mapped_articles_wmd('152378490', satire_article, candidate_articles)

tf-idf:
Satire article: Secret Service Adds Emotional Protection Division To Safeguard Trump’s Psyche 

Candidate articles:
Trump Seems to Side With Russia in Comments on Ukraine
How Trump Chose His Supreme Court Nominee
How Attorneys General Became Democrats' Bulwark Against Trump
A Quiet Giant of Investing Weighs In on Trump
Palestinians Fear Being Sidelined by Trump White House

Word Mover's:
Satire article: Secret Service Adds Emotional Protection Division To Safeguard Trump’s Psyche 

Candidate articles:
Today in History: score: 1.86
Amnesty: Up to ##,### Hanged in Syria's 'Slaughterhouse': score: 0.90
Well, Then, Would You Like to Dance?: score: 0.88
A Gravity-Defying Champion at Rest: score: 1.85
Norman Rockwell's : score: 0.89


In [18]:
## Borowitz Report article with no related articles
satire_article = all_combined['145463847']['title']
candidate_articles = candidate_articles_list(all_combined['145463847'])
print('tf-idf:')
mapped_articles_tfidf('145463847',satire_article, candidate_articles)
print("\nWord Mover's:")
mapped_articles_wmd('145463847', satire_article, candidate_articles)

tf-idf:
Satire article: Intel Chiefs Say Trump's Twitter Account Was Hacked by Four-Year-Old  

Candidate articles:
New York Lawmakers Start the Year Weighted With Old Tensions
Kanye West's Year of Breaking Bad
Four Movies You Should Know About Before the Golden Globes
A New Congress Is Sworn In, but With Many Old Faces
The Latest: # House Democrats Say They'll Skip Inauguration

Word Mover's:
Satire article: Intel Chiefs Say Trump's Twitter Account Was Hacked by Four-Year-Old  

Candidate articles:
Tantrum on the No. #: score: 0.85
Is Single-Sex Education Still Useful?: score: 1.79
Word + Quiz: interregnum: score: 1.84
Rake's Progress: A Look at the Well-Traveled Casanova: score: 1.83
Jobs Report: What to Watch For: score: 1.84


In [21]:
## Borowitz Report article with no related articles
satire_article = all_combined['112329250']['title']
candidate_articles = candidate_articles_list(all_combined['112329250'])
print('tf-idf:')
mapped_articles_tfidf('112329250',satire_article, candidate_articles)
print("\nWord Mover's:")
mapped_articles_wmd('112329250', satire_article, candidate_articles)

tf-idf:
Satire article: Trump Economic Plan Calls for Every American to Inherit Millions from Father 

Candidate articles:
Trump on Clinton: 'I Don't Think She's All There'
Michael Phelps Wins His ##th Gold With American Relay Team
When Every Company Is a Tech Company, Does the Label Matter?
Starting With a Bang, an American Prodigy Wins Rio's First Gold
Normandy Bar Fire Kills at Least ## at Birthday Party

Word Mover's:
Satire article: Trump Economic Plan Calls for Every American to Inherit Millions from Father 

Candidate articles:
Golf Capsules: score: 1.85
Ahmed H. Zewail, Nobel-Prize-Winning Chemist, Dies at ##: score: 1.85
Donald Trump's Diet: He'll Have Fries With That: score: 1.85
Alex Rodriguez's Orchestrated Move May Not Be His Last: score: 1.85
What's on TV Sunday: 'Inspector Lewis' and Simone Biles at the Rio Olympics: score: 1.83
