In [1]:
import json
import os
from time import sleep

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import requests
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# NYT Archive API key, read from the NYT_API_KEY environment variable so the
# secret is never stored in the notebook or committed to version control.
# NOTE(review): the key that used to be hardcoded here should be treated as
# compromised and rotated.
# An empty string means the variable is unset; requests made with it are
# expected to be rejected by the API.
api_key = os.environ.get('NYT_API_KEY', '')

In [4]:
# (year, month) pairs to download, in chronological order:
# August-December 2019 followed by January-July 2020.
time_periods = (
    [(2019, month) for month in range(8, 13)]
    + [(2020, month) for month in range(1, 8)]
)

# Day-of-month boundaries used to split each month into buckets.
monthly_segments = [0, 7, 15, 22, 31]
# One label per bucket, so this list must have exactly one entry fewer than monthly_segments.
monthly_segment_categories = ['Q1', 'Q2', 'Q3', 'Q4']

def find_month_bucket(day):
    """Return the bucket label for a day of the month.

    Bucket ``i`` matches when ``monthly_segments[i] < day <= monthly_segments[i + 1]``;
    buckets are checked in order and the first match wins.

    Args:
        day: Day of the month to classify.

    Returns:
        The matching entry of ``monthly_segment_categories``, or the string
        "ERROR" when the day falls in none of the buckets.
    """
    matching_labels = (
        label
        for position, label in enumerate(monthly_segment_categories)
        if monthly_segments[position] < day <= monthly_segments[position + 1]
    )
    return next(matching_labels, "ERROR")

In [5]:
# Collect raw data
#
# Fetches one Archive API response per (year, month) in time_periods and groups
# each article's abstract and headline under a '<year>-<month>-<bucket>' key.

abstracts = {}
headlines = {}
for year, month in time_periods:
    url = 'https://api.nytimes.com/svc/archive/v1/{}/{}.json?api-key={}'.format(year, month, api_key)
    # A timeout keeps a stalled connection from hanging the notebook indefinitely.
    r = requests.get(url, timeout=60)
    sleep(10) # NYT API only accepts 10 request/min
    # Fail loudly on an HTTP error (bad key, rate limit, server error) instead of
    # hitting a confusing KeyError on the payload below.
    r.raise_for_status()
    json_data = r.json()

    for article in json_data['response']['docs']:
        # Characters 8-9 of pub_date are read as the day of the month
        # (assumes a 'YYYY-MM-DD...' timestamp -- TODO confirm against the API docs).
        cur_publish_day = int(article['pub_date'][8:10])
        save_index = '{}-{}-{}'.format(year, month, find_month_bucket(cur_publish_day))

        # setdefault creates the list the first time a bucket key is seen.
        abstracts.setdefault(save_index, []).append(article['abstract'])
        headlines.setdefault(save_index, []).append(article['headline']['main'])

In [6]:
# TF-IDF
# Word-level vectorizer with ASCII accent stripping and the built-in English
# stop-word list; the preprocessor and tokenizer hooks are left at their defaults.
tfidf = TfidfVectorizer(strip_accents='ascii', analyzer='word', stop_words='english')

In [7]:
# Manually clean terms

# Collapse multi-word names into a single lowercase, underscore-joined token
# (e.g. "Hong Kong" -> "hong_kong") so each name can be counted as one term.
# Replacements are applied in insertion order, so the longer 'New York City'
# must stay ahead of its prefix 'New York'.
word_mappings = {
    'Hong Kong': 'hong_kong',
    'New York City': 'new_york_city',
    'New York': 'new_york',
    'El Paso': 'el_paso',
    'White House': 'white_house',
    'Jeffrey Epstein': 'jeffrey_epstein',
}

In [8]:
# Prepare document list: one document per time-period bucket.
def _apply_word_mappings(text):
    """Return text with every word_mappings phrase swapped for its token, in mapping order."""
    for phrase, token in word_mappings.items():
        text = text.replace(phrase, token)
    return text


target = headlines # or abstracts
# Join each bucket's texts into one string, then normalise the multi-word names.
documents = [_apply_word_mappings(' '.join(texts)) for texts in target.values()]

# Fit vocabulary to all documents
tfidf.fit_transform(documents)

<48x29618 sparse matrix of type '<class 'numpy.float64'>'
	with 153616 stored elements in Compressed Sparse Row format>

In [9]:
# Most important features per time period

TOP_N_TERMS = 20  # number of highest-scoring terms kept for each time period

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back only on older installs.
_get_feature_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
# The vocabulary is fixed once the vectorizer is fitted, so look it up a single time.
all_features = _get_feature_names()

top_features_per_time_period = []
# documents was built by iterating target in order, so zip pairs each
# time-period key with its own document.
for time_period, document in zip(target.keys(), documents):
    cur_doc_scores = tfidf.transform([document]).toarray().flatten()
    # argsort is ascending: take the last N indices and reverse them for highest-first.
    top_features = cur_doc_scores.argsort()[-TOP_N_TERMS:][::-1]

    for idx in top_features:
        top_features_per_time_period.append([time_period, all_features[idx], cur_doc_scores[idx]])

df = pd.DataFrame(top_features_per_time_period, columns=['time_period', 'term', 'tfidf'])

In [4]:
# Read dataframe from saved data
# NOTE(review): the file was presumably written earlier with
# df.to_csv('data/top_terms_per_period.csv') -- TODO confirm and add that step explicitly.
df = pd.read_csv('data/top_terms_per_period.csv')

In [5]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,time_period,term,tfidf
0,2019-8-Q1,shootings,0.229416
1,2019-8-Q1,el_paso,0.225219
2,2019-8-Q1,toni,0.21157
3,2019-8-Q1,trump,0.179828
4,2019-8-Q1,dayton,0.150939


In [12]:
# One set of distinct terms per time period.
all_terms = list(df.groupby('time_period')['term'].apply(set))
# Union of every period's terms, i.e. each term that appears in at least one period.
all_term_set = set().union(*all_terms)

In [14]:
import plotly.express as px

# Animated bar chart with one frame per time period.
# Plotly Express does not rescale axes between animation frames, so the y-range
# is pinned to the overall maximum score (plus a little headroom) to keep the
# bars of every frame fully visible.
fig = px.bar(
    df,
    x='term',
    y='tfidf',
    animation_frame='time_period',
    range_y=[0, df['tfidf'].max() * 1.05],
    title='Top TF-IDF terms per time period',
    labels={'term': 'Term', 'tfidf': 'TF-IDF score', 'time_period': 'Time period'},
)
fig.show()
# full_html=False writes only an embeddable <div> fragment; plotly.js is loaded from the CDN.
fig.write_html('index.html', include_plotlyjs='cdn', full_html=False)