# WordFrequency

In [1]:
# imports
import datetime
import json
import math
import os
import re
import sqlite3
import sys
from collections import Counter

import numpy as np
import pandas as pd
import spacy
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.util import ngrams
stop_words = stopwords.words('english')

## Model

Word Frequencies:
A simple and robust method to find the number of times a word has been used in a document. Words with higher counts are assumed to be more important than those with lower counts.


## Dataset

For this experiment, let's pull in the news articles from the last few days.

In [2]:
# Open the SQLite datastore that holds the articles
database_url = "../datastore/app_data.db"
database = sqlite3.connect(database_url)

# Read the whole `articles` table into a DataFrame
sql = "select * from articles"
source_data = pd.read_sql_query(sql=sql, con=database)

# Report the dimensions, then display the leading rows as the cell output
print("Shape:", source_data.shape)
source_data.head()

Shape: (157, 7)


Unnamed: 0,id,source,article_link,article_date,article_title,article_content,article_dts
0,7c66bfc6f7b115ac9ea1c443d64d9f662a3c7257d06d2a...,npr,https://www.npr.org/2019/12/24/791102803/trump...,"December 24, 2019",Trump Downplays Threat Of 'Gift' From North Ko...,President Trump did not seem concerned Tuesday...,1577146000.0
1,d5e50fa5f13830087bedc86232317ea1790d2417d4d729...,npr,https://www.npr.org/2019/12/23/790747698/newly...,"December 23, 2019",Ukraine Emails Fuel Democrats' Call For Impeac...,Party leaders in Congress continued to spar Mo...,1577059000.0
2,2b767e199bd897158dd7f8b999bef7aa592b82fd4548eb...,nymag,http://nymag.com/intelligencer/2019/12/matt-sh...,"Dec. 24, 2019",GOP Lawmaker Plotted Insurrections to Establis...,Shea’s rebellion. Photo: Ted S Warren/AP/Shutt...,1577146000.0
3,139c45cf3296a8e4f8bf50d3525b808be1620b3b670778...,nymag,http://nymag.com/intelligencer/2020/01/iran-ge...,"Jan. 2, 2020",U.S. Kills Iranian General Qasem Suleimani in ...,A Shiite Muslim pilgrim walks with a bag adorn...,1577923000.0
4,f99ee44a2f210564c2eb3bc91a781f36dd6de44142047a...,nymag,http://nymag.com/intelligencer/2019/12/a-gloss...,"Dec. 31, 2019","A Glossary of Everyday Things, According to Trump",Donald Trump. Photo: Chip Somodevilla/Getty Im...,1577750000.0


In [7]:
# Words per article, counted on the whitespace-separated tokens of the raw content
source_data['word_count'] = source_data['article_content'].str.split().str.len()

# View some metrics of data.
# Series.min()/.max() skip missing values; the builtin min()/max() give
# order-dependent results as soon as a NaN is present in the column.
print("Number of Article:",f'{source_data.shape[0]:,}')
print("Minimum Article Date:",datetime.datetime.fromtimestamp(source_data['article_dts'].min()).strftime("%b %d %Y"))
print("Maximum Article Date:",datetime.datetime.fromtimestamp(source_data['article_dts'].max()).strftime("%b %d %Y"))
print("Minimum Word Count:",source_data['word_count'].min())
print("Maximum Word Count:",f'{source_data["word_count"].max():,}')

Number of Article: 166
Minimum Article Date: Dec 23 2019
Maximum Article Date: Jan 05 2020
Minimum Word Count: 111
Maximum Word Count: 5,195


## Data Preprocessing

The data preprocessing steps that we will follow in order to feed the data to the model are:
- Combine Title with Blog Content
- Remove line breaks
- Remove Special Characters
- Remove small words < 3 letters
- Convert text to lowercase
- Remove stop words
- Tokenize
- Lemmatization
- Remove custom stop words

In [8]:
# Project-specific stop words, read from a headerless file
custom_stopwords_file = '../datastore/custom_stopwords.txt'
custom_stopwords_df = pd.read_csv(custom_stopwords_file, header=None)

# Column 0 of the frame is turned into a plain Python list
custom_stopwords = custom_stopwords_df[0].tolist()
print("Shape:", custom_stopwords_df.shape)

Shape: (6, 1)


In [9]:
# Utilities to perform data cleaning and preparation

# Load the English pipeline by its full package name: the 'en' shortcut link was
# removed in spaCy v3 and no longer loads there. 'en_core_web_sm' is the model
# the 'en' shortcut resolved to by default -- NOTE(review): confirm this is the
# model installed in your environment.
# The parser and NER components are disabled; the helpers below only read
# token.lemma_ and token.pos_.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# function to remove stopwords
def remove_stopwords(rev):
    """Drop stop words from a token list and return the rest as one string.

    Parameters
    ----------
    rev : iterable of str
        Tokens of a single document.

    Returns
    -------
    str
        The tokens that are not in the module-level ``stop_words`` collection,
        in their original order, joined with single spaces.
    """
    # A set gives constant-time membership tests; checking against the list
    # rescanned it for every token.
    blocked = set(stop_words)
    return " ".join(token for token in rev if token not in blocked)

def lemmatization(texts, tags=('NOUN', 'ADJ')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Parameters
    ----------
    texts : iterable of list of str
        One token list per document. Each list is re-joined with single spaces
        and passed through the module-level spaCy pipeline ``nlp``.
    tags : collection of str, optional
        Values of ``token.pos_`` to keep. The default is an immutable tuple so
        the shared default can never be modified by accident.

    Returns
    -------
    list of list of str
        For each input document, the ``lemma_`` of every token whose ``pos_``
        is in ``tags``, in document order.
    """
    return [
        [token.lemma_ for token in nlp(" ".join(sent)) if token.pos_ in tags]
        for sent in texts
    ]

# function to remove custom stopwords
def remove_custom_stopwords(texts):
    """Filter the project-specific stop words out of tokenized documents.

    Parameters
    ----------
    texts : iterable of list of str
        One token list per document.

    Returns
    -------
    list of list of str
        The same documents, in order, without any token found in the
        module-level ``custom_stopwords`` collection.
    """
    # Built once per call rather than rescanning the list for every token.
    blocked = set(custom_stopwords)
    return [[word for word in sent if word not in blocked] for sent in texts]

In [10]:
# Merge title with content
source_data['text'] = source_data['article_title'] + " " + source_data["article_content"]

# Convert column to str
source_data['text'] = source_data['text'].apply(str)

# Replace line breaks (a literal match, so regex is switched off explicitly)
article_text = source_data['text'].str.replace("\n", " ", regex=False)

# Remove unwanted characters, numbers and symbols.
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave the text uncleaned.
article_text = article_text.str.replace("[^a-zA-Z#]", " ", regex=True)

# Remove short words (length < 3)
article_text = article_text.apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))

# Make entire text lowercase
article_text = [r.lower() for r in article_text]

# Remove stopwords from the text
article_text = [remove_stopwords(r.split()) for r in article_text]

# Tokenize
tokenized_text = pd.Series(article_text).apply(lambda x: x.split())
# Lemmatize
tokenized_text = lemmatization(tokenized_text)
# Remove custom stopwords
tokenized_text = remove_custom_stopwords(tokenized_text)

# Re-join each token list into one space-separated string per article
flattened_text = [' '.join(tokens) for tokens in tokenized_text]

source_data['text'] = flattened_text

# Update word count so it reflects the cleaned text
source_data['word_count'] = source_data['text'].str.split().str.len()

# Remove word count < 15
source_data = source_data[source_data['word_count'] > 14]
# reset_index() without drop=True keeps the previous index as an 'index' column
source_data = source_data.reset_index()

In [11]:
# Inspect the processed frame: dimensions first, then the leading rows
print("Shape:", source_data.shape)
source_data.head(5)

Shape: (166, 10)


Unnamed: 0,index,id,source,article_link,article_date,article_title,article_content,article_dts,word_count,text
0,0,7c66bfc6f7b115ac9ea1c443d64d9f662a3c7257d06d2a...,npr,https://www.npr.org/2019/12/24/791102803/trump...,"December 24, 2019",Trump Downplays Threat Of 'Gift' From North Ko...,President Trump did not seem concerned Tuesday...,1577146000.0,64,beautiful vase president trump threat economic...
1,1,d5e50fa5f13830087bedc86232317ea1790d2417d4d729...,npr,https://www.npr.org/2019/12/23/790747698/newly...,"December 23, 2019",Ukraine Emails Fuel Democrats' Call For Impeac...,Party leaders in Congress continued to spar Mo...,1577059000.0,149,fuel impeachment trial party leader spar detai...
2,2,2b767e199bd897158dd7f8b999bef7aa592b82fd4548eb...,nymag,http://nymag.com/intelligencer/2019/12/matt-sh...,"Dec. 24, 2019",GOP Lawmaker Plotted Insurrections to Establis...,Shea’s rebellion. Photo: Ted S Warren/AP/Shutt...,1577146000.0,219,insurrection christian state rebellion photo l...
3,3,139c45cf3296a8e4f8bf50d3525b808be1620b3b670778...,nymag,http://nymag.com/intelligencer/2020/01/iran-ge...,"Jan. 2, 2020",U.S. Kills Iranian General Qasem Suleimani in ...,A Shiite Muslim pilgrim walks with a bag adorn...,1577923000.0,250,iranian major qud force morning photo getty im...
4,4,f99ee44a2f210564c2eb3bc91a781f36dd6de44142047a...,nymag,http://nymag.com/intelligencer/2019/12/a-gloss...,"Dec. 31, 2019","A Glossary of Everyday Things, According to Trump",Donald Trump. Photo: Chip Somodevilla/Getty Im...,1577750000.0,173,glossary everyday thing image mundane element ...


## Word Frequency Model

In [12]:
def word_counts(all_words,cutoff=2):
    """Count word occurrences in a text and keep only the frequent ones.

    Parameters
    ----------
    all_words : str
        Text whose whitespace-separated tokens are counted.
    cutoff : int, optional
        A word is kept only when its count is strictly greater than this
        value (default 2, i.e. at least three occurrences).

    Returns
    -------
    dict
        ``{word: count}`` for every kept word, ordered by first appearance
        in the text. Empty when no word exceeds ``cutoff``.
    """
    # Counter yields the same counts and first-seen ordering as the frequency
    # distribution used before, without a DataFrame round-trip and a
    # row-by-row loop just to filter.
    counts = Counter(all_words.split())
    return {word: count for word, count in counts.items() if count > cutoff}

In [14]:
# Frequency tables for the first three articles (one loop instead of three
# copy-pasted call/print pairs). frequency_table ends up holding the table of
# the last article, as before.
for article_idx in range(3):
    frequency_table = word_counts(source_data["text"].loc[article_idx],cutoff=2)
    print(frequency_table)

{'trump': 4, 'missile': 3, 'test': 3, 'surprise': 3}
{'impeachment': 5, 'trial': 10, 'email': 3, 'new': 3, 'witness': 5, 'president': 4, 'aid': 3, 'official': 3, 'government': 3, 'document': 4}
{'state': 7, 'war': 3, 'shea': 7, 'government': 3, 'plan': 3, 'trump': 3, 'report': 4, 'occupation': 3, 'law': 3, 'enforcement': 3, 'email': 3}


## Model Evaluation

We will visually look at a few of the articles to see how the model performed

In [15]:
def evaluate_blog_results(row,frequency_table):
    """Print an article's text with its frequent words highlighted.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``"text"`` (e.g. a DataFrame row) holding the
        whitespace-separated article text.
    frequency_table : iterable of str
        Keywords to highlight; iterating a ``{word: count}`` dict yields its
        words.

    Returns
    -------
    None
        The highlighted text is written to standard output.
    """
    text = row["text"]

    # Empty keywords are skipped so the pattern can never match an empty string.
    keywords = [re.escape(keyword) for keyword in frequency_table if keyword]
    if keywords:
        # Match whole whitespace-delimited tokens in a single pass. Replacing
        # ' keyword ' literally missed a keyword at the very start or end of
        # the text and the second of two adjacent repeats, because the shared
        # space was consumed by the first match.
        pattern = re.compile(r'(?<!\S)(?:' + '|'.join(keywords) + r')(?!\S)')
        text = pattern.sub(
            lambda match: '\x1b[1;03;31;46m' + match.group(0) + '\x1b[0m',
            text,
        )

    print(text)

In [16]:
# Highlight the frequent words of the article at index label 1
frequency_table = word_counts(source_data.loc[1, "text"], cutoff=2)
evaluate_blog_results(source_data.loc[1], frequency_table)

fuel [1;03;31;46mimpeachment[0m [1;03;31;46mtrial[0m party leader spar detail [1;03;31;46mimpeachment[0m [1;03;31;46mtrial[0m [1;03;31;46memail[0m ammunition [1;03;31;46mnew[0m [1;03;31;46mwitness[0m [1;03;31;46memail[0m late center public integrity particular bolster argument [1;03;31;46mpresident[0m military [1;03;31;46maid[0m political investigation message senior [1;03;31;46mofficial[0m office management less hour trump phone ukrainian [1;03;31;46mpresident[0m matter quiet duffey halt guidance light administration plan review assistance sensitive nature request information execute direction duffey [1;03;31;46memail[0m [1;03;31;46mgovernment[0m [1;03;31;46mofficial[0m issue administration decision [1;03;31;46maid[0m huge anxiety [1;03;31;46mgovernment[0m [1;03;31;46maid[0m unwise illegal center public integrity [1;03;31;46mpresident[0m [1;03;31;46mgovernment[0m [1;03;31;46mofficial[0m money inquiry key people fact [1;03;31;46mdocument[0m c

In [17]:
# Highlight the frequent words of the article at index label 3
frequency_table = word_counts(source_data.loc[3, "text"], cutoff=2)
evaluate_blog_results(source_data.loc[3], frequency_table)



In [18]:
# Highlight the frequent words of the article at index label 4
frequency_table = word_counts(source_data.loc[4, "text"], cutoff=2)
evaluate_blog_results(source_data.loc[4], frequency_table)

glossary everyday [1;03;31;46mthing[0m image mundane element modern life rich decade gold penthouse powerful man [1;03;31;46mworld[0m person life real [1;03;31;46mworld[0m concern [1;03;31;46mgood[0m sense expert [1;03;31;46mdishwasher[0m [1;03;31;46mlight[0m [1;03;31;46mbulb[0m [1;03;31;46mgood[0m sense last month [1;03;31;46mtrump[0m subject confidence narcissist tongue [1;03;31;46mtrump[0m glossary everyday [1;03;31;46mthing[0m definition wrong obvious embarrassing airplane complex fly airplane complex fly pilot computer scientist [1;03;31;46mtime[0m many product unnecessary step old second decision complexity danger great cost little gain know pilot great professional control plane [1;03;31;46mdishwasher[0m old [1;03;31;46mdishwasher[0m press boom explosion minute open steam [1;03;31;46mdishwasher[0m press [1;03;31;46mtime[0m woman drop [1;03;31;46mwater[0m place much [1;03;31;46mwater[0m energy efficient [1;03;31;46mlight[0m [1;03;31;46mbulb