# Word2Vec

In [1]:
# imports
import sys
import os
import numpy as np
import pandas as pd
import sqlite3
import json
import re
import spacy
import math
import datetime

from gensim.models import Word2Vec

from nltk.corpus import stopwords
stop_words = stopwords.words('english')

## Model

In this approach we will use a Word2Vec model. The Word2Vec model is used for learning vector representations of words called word embeddings. We will find the word embeddings of the entire corpus. The next step will be to go through each document and find the words that has the most similar words from the entire corpus. These words will be marked as relevant words in the document.


## Dataset

For this experiment lets pull in the news articles for the last few days

In [2]:
# Connect to database
database_url = "../datastore/app_data.db"
database = sqlite3.connect(database_url)

sql = "select * from articles"
source_data = pd.read_sql_query(sql, database)

print("Shape:",source_data.shape)
source_data.head()

Shape: (166, 7)


Unnamed: 0,id,source,article_link,article_date,article_title,article_content,article_dts
0,7c66bfc6f7b115ac9ea1c443d64d9f662a3c7257d06d2a...,npr,https://www.npr.org/2019/12/24/791102803/trump...,"December 24, 2019",Trump Downplays Threat Of 'Gift' From North Ko...,President Trump did not seem concerned Tuesday...,1577146000.0
1,d5e50fa5f13830087bedc86232317ea1790d2417d4d729...,npr,https://www.npr.org/2019/12/23/790747698/newly...,"December 23, 2019",Ukraine Emails Fuel Democrats' Call For Impeac...,Party leaders in Congress continued to spar Mo...,1577059000.0
2,2b767e199bd897158dd7f8b999bef7aa592b82fd4548eb...,nymag,http://nymag.com/intelligencer/2019/12/matt-sh...,"Dec. 24, 2019",GOP Lawmaker Plotted Insurrections to Establis...,Shea’s rebellion. Photo: Ted S Warren/AP/Shutt...,1577146000.0
3,139c45cf3296a8e4f8bf50d3525b808be1620b3b670778...,nymag,http://nymag.com/intelligencer/2020/01/iran-ge...,"Jan. 2, 2020",U.S. Kills Iranian General Qasem Suleimani in ...,A Shiite Muslim pilgrim walks with a bag adorn...,1577923000.0
4,f99ee44a2f210564c2eb3bc91a781f36dd6de44142047a...,nymag,http://nymag.com/intelligencer/2019/12/a-gloss...,"Dec. 31, 2019","A Glossary of Everyday Things, According to Trump",Donald Trump. Photo: Chip Somodevilla/Getty Im...,1577750000.0


In [3]:
source_data['word_count'] = source_data['article_content'].str.split().str.len()

# View some metrics of data
print("Number of Article:",f'{source_data.shape[0]:,}')
print("Minimum Article Date:",datetime.datetime.fromtimestamp(min(source_data['article_dts'])).strftime("%b %d %Y"))
print("Maximum Article Date:",datetime.datetime.fromtimestamp(max(source_data['article_dts'])).strftime("%b %d %Y"))
print("Minimum Word Count:",min(source_data['word_count']))
print("Maximum Word Count:",f'{max(source_data["word_count"]):,}')

Number of Article: 166
Minimum Article Date: Dec 23 2019
Maximum Article Date: Jan 05 2020
Minimum Word Count: 111
Maximum Word Count: 5,195


In [4]:
# Preview some articles
print("------ Article 1--------")
print(source_data["article_title"][0],source_data["article_content"][0][:500])
print("------ Article 2--------")
print(source_data["article_title"][1],source_data["article_content"][1][:500])
print("------ Article 3--------")
print(source_data["article_title"][2],source_data["article_content"][2][:500])

------ Article 1--------
Trump Downplays Threat Of 'Gift' From North Korea: Maybe It's A 'Beautiful Vase' President Trump did not seem concerned Tuesday when asked about the threat of a "Christmas present" from North Korea if the U.S. doesn't roll back economic sanctions on the country by the end of the year. "Maybe it's a nice present," Trump told reporters at an event at his Mar-a-Lago resort in Florida. "Maybe it's a present where he sends me a beautiful vase, as opposed to a missile test." Pyongyang imposed an end-of-year deadline for concessions from the U.S. earlier this month, but the Trump a
------ Article 2--------
Ukraine Emails Fuel Democrats' Call For Impeachment Trial Witnesses Party leaders in Congress continued to spar Monday over details of an impending impeachment trial in the Senate, with newly released emails giving more ammunition to Democrats in their requests for new witnesses. The emails, released late Friday to the Center for Public Integrity, are heavily redact

## Data Preprocessing

The data preprocessing steps that we will follow inorder to feed the data to the model are:
- Combine Title with Blog Content
- Remove line breaks
- Remove Special Characters
- Remove small words < 3 letters
- Convert text to lowercase
- Remove stop words
- Tokenize
- Lemmatization
- Remove custom stop words

In [5]:
# Custom stop words
custom_stopwords_file ='../datastore/custom_stopwords.txt'
custom_stopwords_df = pd.read_csv(custom_stopwords_file, header=None)
print("Shape:",custom_stopwords_df.shape)
custom_stopwords = custom_stopwords_df[0].tolist()

Shape: (6, 1)


In [6]:
# Utilities to perfrom data cleaning and preparation

nlp = spacy.load('en', disable=['parser', 'ner'])

# function to remove stopwords
def remove_stopwords(rev):
    rev_new = " ".join([i for i in rev if i not in stop_words])
    return rev_new

def lemmatization(texts, tags=['NOUN', 'ADJ']):
    output = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        output.append([token.lemma_ for token in doc if token.pos_ in tags])
    return output

# function to remove custom stopwords
def remove_custom_stopwords(texts):
    output = []
    for sent in texts:
        output.append([word for word in sent if word not in custom_stopwords])
    return output

In [7]:
# Merge title with content
source_data['text'] = source_data['article_title'] + " " + source_data["article_content"]

# Convert column to str
source_data['text'] = source_data['text'].apply(str)

# Replace line breaks
article_text = source_data['text'].str.replace("\n", " ")

# remove unwanted characters, numbers and symbols
article_text = article_text.str.replace("[^a-zA-Z#]", " ")

# remove short words (length < 3)
article_text = article_text.apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))

# make entire text lowercase
article_text = [r.lower() for r in article_text]

# remove stopwords from the text
article_text = [remove_stopwords(r.split()) for r in article_text]

# Tokenize
tokenized_text = pd.Series(article_text).apply(lambda x: x.split())
# Lemmatize
tokenized_text = lemmatization(tokenized_text)
# Remove custom stopwords
tokenized_text = remove_custom_stopwords(tokenized_text)

flattened_text = []
for i in range(len(tokenized_text)):
    flattened_text.append(' '.join(tokenized_text[i]))

source_data['text'] = flattened_text

# Update word count
source_data['word_count'] = source_data['text'].str.split().str.len()

# Remove word count < 15
source_data = source_data[source_data['word_count'] > 14]
source_data = source_data.reset_index()

In [8]:
print("Shape:",source_data.shape)
source_data.head()

Shape: (166, 10)


Unnamed: 0,index,id,source,article_link,article_date,article_title,article_content,article_dts,word_count,text
0,0,7c66bfc6f7b115ac9ea1c443d64d9f662a3c7257d06d2a...,npr,https://www.npr.org/2019/12/24/791102803/trump...,"December 24, 2019",Trump Downplays Threat Of 'Gift' From North Ko...,President Trump did not seem concerned Tuesday...,1577146000.0,64,beautiful vase president trump threat economic...
1,1,d5e50fa5f13830087bedc86232317ea1790d2417d4d729...,npr,https://www.npr.org/2019/12/23/790747698/newly...,"December 23, 2019",Ukraine Emails Fuel Democrats' Call For Impeac...,Party leaders in Congress continued to spar Mo...,1577059000.0,149,fuel impeachment trial party leader spar detai...
2,2,2b767e199bd897158dd7f8b999bef7aa592b82fd4548eb...,nymag,http://nymag.com/intelligencer/2019/12/matt-sh...,"Dec. 24, 2019",GOP Lawmaker Plotted Insurrections to Establis...,Shea’s rebellion. Photo: Ted S Warren/AP/Shutt...,1577146000.0,219,insurrection christian state rebellion photo l...
3,3,139c45cf3296a8e4f8bf50d3525b808be1620b3b670778...,nymag,http://nymag.com/intelligencer/2020/01/iran-ge...,"Jan. 2, 2020",U.S. Kills Iranian General Qasem Suleimani in ...,A Shiite Muslim pilgrim walks with a bag adorn...,1577923000.0,250,iranian major qud force morning photo getty im...
4,4,f99ee44a2f210564c2eb3bc91a781f36dd6de44142047a...,nymag,http://nymag.com/intelligencer/2019/12/a-gloss...,"Dec. 31, 2019","A Glossary of Everyday Things, According to Trump",Donald Trump. Photo: Chip Somodevilla/Getty Im...,1577750000.0,173,glossary everyday thing image mundane element ...


In [9]:
# Preview some pre processed text
print("------ Article 1--------")
print(source_data["text"][0][:500])
print("------ Article 2--------")
print(source_data["text"][1][:500])
print("------ Article 3--------")
print(source_data["text"][2][:500])

------ Article 1--------
beautiful vase president trump threat economic sanction country end year nice present trump reporter present beautiful vase missile test end year deadline concession early month administration sign plan pressure campaign entire nuclear arsenal sanction ambition top foreign policy priority series meeting solution surprise option unclear option rocket payload space underground nuclear test testing long range missile capable trump missile test surprise deal surprise trump
------ Article 2--------
fuel impeachment trial party leader spar detail impeachment trial email ammunition new witness email late center public integrity particular bolster argument president military aid political investigation message senior official office management less hour trump phone ukrainian president matter quiet duffey halt guidance light administration plan review assistance sensitive nature request information execute direction duffey email government official issue administration 

## Build Word2Vec


In [11]:
tokenized_text_source = source_data['text'].apply(lambda x: x.split()).tolist()

# Set parameters
feature_size = 20    # Word vector dimensionality  
window_context = 30  # Context window size                                                                                    
min_word_count = 1   # Minimum word count                        
sample = 1e-3        # Downsample setting for frequent words

w2v_model = Word2Vec(tokenized_text_source, size=feature_size, 
                          window=window_context, min_count=min_word_count,
                          sample=sample, iter=100)

In [14]:
TOP_N_SIMILAR_WORDS = 10
COSINE_SIMILARITY_THRESHOLD = 0.70
NUM_SIMILAR_WORDS_THRESHOLD = 6

# Find relevant words
source_data["relevant_words"] = ""
for index, row in source_data.iterrows():
    # Get tokenized text
    tokenized_text = row["text"].split()
    # Get the unique words in the document
    unique_words = set(tokenized_text)
    relevant_words = []
    for word in unique_words:
        # Find similar words from the corpus
        similar_words = w2v_model.wv.most_similar([word], topn=TOP_N_SIMILAR_WORDS)
        similar_words = [x[0] for x in similar_words if x[1] >= COSINE_SIMILARITY_THRESHOLD]
        if len(similar_words) >= NUM_SIMILAR_WORDS_THRESHOLD:
            relevant_words.append(word)
    
    source_data.at[index, "relevant_words"]=relevant_words

print("Shape:",source_data.shape)
source_data.head()

Shape: (166, 11)


Unnamed: 0,index,id,source,article_link,article_date,article_title,article_content,article_dts,word_count,text,relevant_words
0,0,7c66bfc6f7b115ac9ea1c443d64d9f662a3c7257d06d2a...,npr,https://www.npr.org/2019/12/24/791102803/trump...,"December 24, 2019",Trump Downplays Threat Of 'Gift' From North Ko...,President Trump did not seem concerned Tuesday...,1577146000.0,64,beautiful vase president trump threat economic...,"[nice, range, sign, arsenal, test, testing, ca..."
1,1,d5e50fa5f13830087bedc86232317ea1790d2417d4d729...,npr,https://www.npr.org/2019/12/23/790747698/newly...,"December 23, 2019",Ukraine Emails Fuel Democrats' Call For Impeac...,Party leaders in Congress continued to spar Mo...,1577059000.0,149,fuel impeachment trial party leader spar detai...,"[impeachment, ukrainian, chamber, schumer, tri..."
2,2,2b767e199bd897158dd7f8b999bef7aa592b82fd4548eb...,nymag,http://nymag.com/intelligencer/2019/12/matt-sh...,"Dec. 24, 2019",GOP Lawmaker Plotted Insurrections to Establis...,Shea’s rebellion. Photo: Ted S Warren/AP/Shutt...,1577146000.0,219,insurrection christian state rebellion photo l...,"[conservative, renounce, site, writing, phone,..."
3,3,139c45cf3296a8e4f8bf50d3525b808be1620b3b670778...,nymag,http://nymag.com/intelligencer/2020/01/iran-ge...,"Jan. 2, 2020",U.S. Kills Iranian General Qasem Suleimani in ...,A Shiite Muslim pilgrim walks with a bag adorn...,1577923000.0,250,iranian major qud force morning photo getty im...,"[ready, pre, site, dangerous, phone, foreign, ..."
4,4,f99ee44a2f210564c2eb3bc91a781f36dd6de44142047a...,nymag,http://nymag.com/intelligencer/2019/12/a-gloss...,"Dec. 31, 2019","A Glossary of Everyday Things, According to Trump",Donald Trump. Photo: Chip Somodevilla/Getty Im...,1577750000.0,173,glossary everyday thing image mundane element ...,"[site, sea, dangerous, glossary, pilot, worker..."


## Model Evaluation

We will visually look at a few of the articles to see if the relevant words identified makes sense from the model

In [15]:
def evaluate_results(row):
    text = row["text"]
    relevant_words = row["relevant_words"]
    for relevant_word in relevant_words:
        text = text.replace(relevant_word, '\x1b[1;03;31;46m'+ relevant_word + '\x1b[0m')
    
    print(text)

In [16]:
# View some results
evaluate_results(source_data.loc[0])

beautiful vase president trump [1;03;31;46mthreat[0m [1;03;31;46meconomic[0m [1;03;31;46msanction[0m country end year [1;03;31;46mnice[0m [1;03;31;46mpresent[0m trump reporter [1;03;31;46mpresent[0m beautiful vase [1;03;31;46mmissile[0m [1;03;31;46mtest[0m end year deadline [1;03;31;46mconcession[0m early [1;03;31;46mmonth[0m administration [1;03;31;46msign[0m plan [1;03;31;46mpressure[0m campaign entire [1;03;31;46mnuclear[0m [1;03;31;46marsenal[0m [1;03;31;46msanction[0m ambition top [1;03;31;46mforeign[0m policy priority series meeting solution surprise option unclear option [1;03;31;46mrocket[0m payload [1;03;31;46mspace[0m underground [1;03;31;46mnuclear[0m [1;03;31;46mtest[0m [1;03;31;46mtest[0ming long [1;03;31;46mrange[0m [1;03;31;46mmissile[0m [1;03;31;46mcapable[0m trump [1;03;31;46mmissile[0m [1;03;31;46mtest[0m surprise deal surprise trump


In [17]:
# View some results
evaluate_results(source_data.loc[1])

fuel [1;03;31;46mimpeachment[0m [1;03;31;46mtrial[0m [1;03;31;46mparty[0m leader spar [1;03;31;46mdetail[0m [1;03;31;46mimpeachment[0m [1;03;31;46mtrial[0m email ammunition new [1;03;31;46mwitness[0m email late center public [1;03;31;46mintegrity[0m particular bolster argument president military [1;03;31;46maid[0m political [1;03;31;46minvestigation[0m message senior official office [1;03;31;46mmanagement[0m less hour trump [1;03;31;46mphone[0m [1;03;31;46mukrainian[0m president matter quiet [1;03;31;46mduffey[0m halt [1;03;31;46mguidance[0m [1;03;31;46mlight[0m administration plan review [1;03;31;46massistance[0m [1;03;31;46msensitive[0m nature [1;03;31;46mrequest[0m information [1;03;31;46mexecute[0m direction [1;03;31;46mduffey[0m email government official issue administration decision [1;03;31;46maid[0m [1;03;31;46mhuge[0m anxiety government [1;03;31;46maid[0m [1;03;31;46munwise[0m illegal center public [1;03;31;46mintegrity[0m pr

In [18]:
# View some results
evaluate_results(source_data.loc[2])



In [19]:
# View some results
evaluate_results(source_data.loc[3])



In [20]:
# View some results
evaluate_results(source_data.loc[5])

[1;03;31;46mperilous[0m decade [1;03;31;46mprogressivism[0m photo image [1;03;31;46mplenty[0m decade [1;03;31;46mteen[0m end [1;03;31;46mproper[0m [1;03;31;46mweight[0m shocking event world politic [1;03;31;46mteen[0m [1;03;31;46mcontinuation[0m [1;03;31;46mstrong[0m [1;03;31;46mdemocrat[0mic [1;03;31;46mtrend[0m [1;03;31;46mlandslide[0m [1;03;31;46mseat[0m [1;03;31;46mchamber[0m time special [1;03;31;46melection[0m [1;03;31;46maberration[0m [1;03;31;46mirregular[0m [1;03;31;46mturnout[0m [1;03;31;46mfeckless[0m campaign [1;03;31;46multimate[0m [1;03;31;46moman[0m win [1;03;31;46mtrauma[0m [1;03;31;46mdefcon[0m [1;03;31;46mmidterm[0m [1;03;31;46melection[0m [1;03;31;46munprepared[0m [1;03;31;46mdebacle[0m state [1;03;31;46mlegislature[0m new [1;03;31;46mcase[0m [1;03;31;46melection[0m [1;03;31;46mmap[0m [1;03;31;46mcensus[0m race governor able [1;03;31;46mtemper[0m [1;03;31;46mveto[0m [1;03;31;46mmap[0m house [1;03;