# The New York Times articles in 2019 and 2020: what prevails.

This is the first notebook of the New York Times project series. The goal of this notebook is to analyze the headlines, keywords and the lead paragraphs of the NYT throughout the past year and a half and distinguish the most prevalent issues and enduring topics.

#### Import tools and libraries:

In [2]:
import os
import pandas as pd
import requests
import json
import time
import dateutil
import datetime
import configparser
from dateutil.relativedelta import relativedelta
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

#### Determine the timeframe of the analysis:

In [7]:
end = datetime.date.today()
start = datetime.date(2019, 1, 1)
print('Start date: ' + str(start))
print('End date: ' + str(end))

Start date: 2019-01-01
End date: 2020-08-31


#### Breaking the data into the monthly groups:

In [8]:
months_in_range = [x.split(' ') for x in pd.date_range(start, end, freq='MS').strftime("%Y %-m").tolist()]
months_in_range

[['2019', '1'],
 ['2019', '2'],
 ['2019', '3'],
 ['2019', '4'],
 ['2019', '5'],
 ['2019', '6'],
 ['2019', '7'],
 ['2019', '8'],
 ['2019', '9'],
 ['2019', '10'],
 ['2019', '11'],
 ['2019', '12'],
 ['2020', '1'],
 ['2020', '2'],
 ['2020', '3'],
 ['2020', '4'],
 ['2020', '5'],
 ['2020', '6'],
 ['2020', '7'],
 ['2020', '8']]

#### The helper functions below extract the NYT data through the API and save it into csv files:

In [18]:
def send_request(date):
    '''Sends a request to the NYT Archive API for given date.'''
    base_url = 'https://api.nytimes.com/svc/archive/v1/'
    url = base_url + '/' + date[0] + '/' + date[1] + '.json?api-key=' + 'F9FPP1mJjiX8pAEFAxBYBg08vZECa39n'
    try:
        response = requests.get(url, verify=False).json()
    except Exception:
        return None
    time.sleep(6)
    return response


def is_valid(article, date):
    '''An article is only worth checking if it is in range, and has a headline.'''
    is_in_range = date > start and date < end
    has_headline = type(article['headline']) == dict and 'main' in article['headline'].keys()
    return is_in_range and has_headline


def parse_response(response):
    '''Parses and returns response as pandas data frame.'''
    data = {'headline': [],  
        'date': [], 
        'doc_type': [],
        'material_type': [],
        'section': [],
        'keywords': [],
        'lead_paragraph': []}
    
    articles = response['response']['docs'] 
    for article in articles: # For each article, make sure it falls within our date range
        date = dateutil.parser.parse(article['pub_date']).date()
        if is_valid(article, date):
            data['date'].append(date)
            data['headline'].append(article['headline']['main']) 
            if 'section' in article:
                data['section'].append(article['section_name'])
            else:
                data['section'].append(None)
            data['doc_type'].append(article['document_type'])
            if 'type_of_material' in article: 
                data['material_type'].append(article['type_of_material'])
            else:
                data['material_type'].append(None)
            keywords = [keyword['value'] for keyword in article['keywords'] if keyword['name'] == 'subject']
            data['keywords'].append(keywords)
            if 'lead_paragraph' in article:
                data['lead_paragraph'].append(article['lead_paragraph'])
            else:
                data['lead_paragraph'].append(None)
    return pd.DataFrame(data) 


def get_data(dates):
    '''Sends and parses request/response to/from NYT Archive API for given dates.'''
    total = 0
    print('Date range: ' + str(dates[0]) + ' to ' + str(dates[-1]))
    if not os.path.exists('headlines'):
        os.mkdir('headlines')
    for date in dates:
        print('Working on ' + str(date) + '...')
        csv_path = 'headlines/' + date[0] + '-' + date[1] + '.csv'
        if not os.path.exists(csv_path): # If we don't already have this month 
            response = send_request(date)
            if response is not None:
                df = parse_response(response)
                total += len(df)
                df.to_csv(csv_path, index=False)
                print('Saving ' + csv_path + '...')
    print('Number of articles collected: ' + str(total))

In [19]:
get_data(months_in_range)

Date range: ['2019', '1'] to ['2020', '8']
Working on ['2019', '1']...
Saving headlines/2019-1.csv...
Working on ['2019', '2']...
Saving headlines/2019-2.csv...
Working on ['2019', '3']...
Saving headlines/2019-3.csv...
Working on ['2019', '4']...
Saving headlines/2019-4.csv...
Working on ['2019', '5']...
Saving headlines/2019-5.csv...
Working on ['2019', '6']...
Saving headlines/2019-6.csv...
Working on ['2019', '7']...
Saving headlines/2019-7.csv...
Working on ['2019', '8']...
Saving headlines/2019-8.csv...
Working on ['2019', '9']...
Saving headlines/2019-9.csv...
Working on ['2019', '10']...
Saving headlines/2019-10.csv...
Working on ['2019', '11']...
Saving headlines/2019-11.csv...
Working on ['2019', '12']...
Saving headlines/2019-12.csv...
Working on ['2020', '1']...
Saving headlines/2020-1.csv...
Working on ['2020', '2']...
Saving headlines/2020-2.csv...
Working on ['2020', '3']...
Saving headlines/2020-3.csv...
Working on ['2020', '4']...
Saving headlines/2020-4.csv...
Working

#### Concatenate csv monthly files:

In [6]:
import glob
import pandas as pd

# get data file names
path = "headlines/"
filenames = glob.glob("*.csv")

dfs = []
print(filenames)
for filename in filenames:
    dfs.append(pd.read_csv(filename))

# Concatenate all data into one DataFrame
big_frame = pd.concat(dfs, ignore_index=True)

['2019-1.csv', '2020-5.csv', '2020-4.csv', '2019-2.csv', '2020-6.csv', '2020-7.csv', '2019-3.csv', '2019-7.csv', '2020-3.csv', '2020-2.csv', '2019-6.csv', '2019-4.csv', '2020-1.csv', '2019-5.csv', '2019-10.csv', '2019-11.csv', '2019-12.csv', '2019-8.csv', '2019-9.csv', '2020-8.csv']


* **big_frame** constitutes all the review for 2019 and 2020 (until August 2020 - that's the timeframe I'm using)

In [7]:
big_frame

Unnamed: 0,headline,date,doc_type,material_type,section,keywords,lead_paragraph
0,Your Hopes for 2019,2019-01-02,article,Op-Ed,,"['Love (Emotion)', 'United States Politics and...","How do you define hope — a state of mind, a pl..."
1,‘The Venerable W.’ Review: A Buddhist Monk Pre...,2019-01-03,article,Review,,"['Documentary Films and Programs', 'Muslims an...","This film completes what its director, Barbet ..."
2,Yellow Vest Leader Is Arrested in France Amid ...,2019-01-03,article,News,,"['Yellow Vests Movement', 'Demonstrations, Pro...",PARIS — The French police have arrested a prom...
3,Learning With: ‘NASA’s New Horizons Spacecraft...,2019-01-02,article,News,,[],Before reading the article:
4,White Cube? These 3 Art Shows Buck Convention,2019-01-03,article,News,,['Art'],"Art galleries, especially those on the Lower E..."
...,...,...,...,...,...,...,...
135949,Big Oil Is in Trouble. Its Plan: Flood Africa ...,2020-08-30,article,News,,"['Global Warming', 'Environment', 'Greenhouse ...",Confronting a climate crisis that threatens th...
135950,The Princess vs. the Portrait in Trumpworld,2020-08-29,article,Op-Ed,,"['Presidential Election of 2020', 'Republican ...",WASHINGTON — As long as the Trumps were hijack...
135951,"Jeanette Carlson, Who Fought Apartheid, Dies a...",2020-08-30,article,News,,"['Apartheid (Policy)', 'Deaths (Obituaries)']","Jeanette Carlson, an anti-apartheid activist w..."
135952,What to Know About Colon Cancer,2020-08-29,article,News,,"['Colon and Colorectal Cancer', 'Tests (Medica...",In the wake of Chadwick Boseman’s death from c...


#### Topic modeling: 2019 - 2020

The purpose of the analysis below is to run topic modeling on headlines, keywords and lead paragraphs of The New York Times articles for the past year and a half. I want to make sure that headlines are consistent with the introductory paragraphs and keywords. Also, I wanted to make sure I am not in any way misinterpreting the Times' journalism style and the topic of my research is relevant, before I dig deeper into the archives.

In [17]:
from collections import defaultdict 
import re, string
from gensim import corpora
from gensim.models import LdaModel 
import pyLDAvis.gensim 

In [18]:
big_frame_corpus_headline = big_frame['headline']
big_frame_corpus_keywords = big_frame['keywords']
big_frame_corpus_lead = big_frame['lead_paragraph']

In [25]:
from nltk.corpus import stopwords

Data pre-processing:

In [19]:
headlines = [re.sub(r'[^\w\s]','',str(item)) for item in big_frame_corpus_headline]

In [26]:
keywords = [re.sub(r'[^\w\s]','',str(item)) for item in big_frame_corpus_keywords]

In [37]:
lead = [re.sub(r'[^\w\s]','',str(item)) for item in big_frame_corpus_lead]

In [28]:
stopwords = set(stopwords.words('english'))

In [31]:
headline_texts = [[word for word in document.lower().split() if word not in stopwords] for document in headlines]

In [32]:
headline_texts

[['hopes', '2019'],
 ['venerable', 'w', 'review', 'buddhist', 'monk', 'preaches', 'hate'],
 ['yellow', 'vest', 'leader', 'arrested', 'france', 'amid', 'crackdown'],
 ['learning',
  'nasas',
  'new',
  'horizons',
  'spacecraft',
  'triumphant',
  'encounter',
  'distant',
  'object',
  'ever',
  'visited'],
 ['white', 'cube', '3', 'art', 'shows', 'buck', 'convention'],
 ['new', 'sentences', 'nasser', 'hussains', 'sky', 'wri', 'tei', 'ngs'],
 ['12', 'dance', 'performances', 'see', 'nyc', 'weekend'],
 ['5', 'comedy', 'shows', 'catch', 'nyc', 'weekend'],
 ['chinas', 'moon', 'landing', 'lunar', 'rover', 'begins', 'exploration'],
 ['mitt', 'romney', 'faces', 'counterattacks', 'trump', 'allies'],
 ['chelsea', 'signs', 'christian', 'pulisic', '73', 'million'],
 ['saudi', 'arabia', 'denies', 'issuing', 'american', 'weapons', 'sudanese'],
 ['undocumented',
  'worker',
  'says',
  'trump',
  'resort',
  'shielded',
  'secret',
  'service'],
 ['miss', 'california', 'long', 'farewell', 'jerry', 'b

In [33]:
keywords_texts = [[word for word in document.lower().split() if word not in stopwords] for document in keywords]

In [34]:
keywords_texts

[['love',
  'emotion',
  'united',
  'states',
  'politics',
  'government',
  'health',
  'insurance',
  'managed',
  'care',
  'children',
  'childhood',
  'global',
  'warming'],
 ['documentary',
  'films',
  'programs',
  'muslims',
  'islam',
  'rohingya',
  'ethnic',
  'group',
  'buddhism',
  'discrimination'],
 ['yellow',
  'vests',
  'movement',
  'demonstrations',
  'protests',
  'riots',
  'politics',
  'government'],
 [],
 ['art'],
 ['poetry', 'poets', 'airports'],
 ['dancing'],
 ['comedy', 'humor'],
 ['moon', 'space', 'astronomy'],
 ['united', 'states', 'politics', 'government'],
 ['soccer', 'mergers', 'acquisitions', 'divestitures'],
 ['defense', 'military', 'forces'],
 ['illegal',
  'immigration',
  'identification',
  'devices',
  'foreign',
  'workers',
  'immigration',
  'emigration',
  'united',
  'states',
  'politics',
  'government'],
 ['politics',
  'government',
  'governors',
  'us',
  'united',
  'states',
  'politics',
  'government'],
 ['united', 'states', '

In [42]:
lead_texts = [[word for word in document.lower().split() if word not in stopwords] for document in lead]

In [43]:
lead_texts

[['define',
  'hope',
  'state',
  'mind',
  'plan',
  'action',
  'simply',
  'act',
  'grateful'],
 ['film',
  'completes',
  'director',
  'barbet',
  'schroeder',
  'calls',
  'trilogy',
  'evil',
  'subject',
  'first',
  'documentary',
  'informal',
  'series',
  'ugandan',
  'dictator',
  'idi',
  'amin',
  'dada',
  'enthusiastic',
  'participant',
  'project',
  '1976',
  'result',
  'wound',
  'titled',
  'general',
  'idi',
  'amin',
  'dada',
  'self',
  'portrait'],
 ['paris',
  'french',
  'police',
  'arrested',
  'prominent',
  'leader',
  'yellow',
  'vest',
  'movement',
  'second',
  'time',
  'clear',
  'sign',
  'government',
  'following',
  'pledge',
  'crack',
  'protests',
  'shaken',
  'france',
  'much',
  'past',
  'six',
  'weeks'],
 ['reading', 'article'],
 ['art',
  'galleries',
  'especially',
  'lower',
  'east',
  'side',
  'environs',
  'sometimes',
  'resemble',
  'found',
  'objects',
  'art',
  'dealers',
  'shoestring',
  'budgets',
  'take',
  's

* Removing less frequent words:

In [54]:
frequency = defaultdict(int)
for headline_text in headline_texts:
    for token in headline_text:
         frequency[token] += 1
for keywords_text in keywords_texts:
    for token in keywords_text:
         frequency[token] += 1
for lead_text in lead_texts:
    for token in lead_text:
         frequency[token] += 1
            
headline_texts = [[token for token in headline_text if frequency[token] > 1] for headline_text in headline_texts]
keywords_texts = [[token for token in keywords_text if frequency[token] > 1] for keywords_text in keywords_texts]
lead_texts = [[token for token in lead_text if frequency[token] > 1] for lead_text in lead_texts]

In [56]:
dictionary_headline = corpora.Dictionary(headline_texts)
dictionary_keywords = corpora.Dictionary(keywords_texts)
dictionary_lead = corpora.Dictionary(lead_texts)

In [57]:
headline_corpus = [dictionary.doc2bow(headline_text) for headline_text in headline_texts]
keywords_corpus = [dictionary.doc2bow(keywords_text) for keywords_text in keywords_texts]
lead_corpus = [dictionary.doc2bow(lead_text) for lead_text in lead_texts]

In [65]:
NUM_TOPICS = 5  
ldamodel_headlines = LdaModel(headline_corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=12)
ldamodel_keywords = LdaModel(keywords_corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=12)
ldamodel_lead = LdaModel(lead_corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=12)

In [66]:
topics_headlines = ldamodel_headlines.show_topics()
for topic_headlines in topics_headlines:
    print(topic_headlines)

(0, '0.019*"briefing" + 0.017*"dies" + 0.006*"trump" + 0.006*"dead" + 0.006*"police" + 0.005*"us" + 0.005*"joe" + 0.005*"man" + 0.004*"top" + 0.004*"hurricane"')
(1, '0.014*"election" + 0.012*"trump" + 0.012*"day" + 0.009*"primary" + 0.008*"results" + 0.008*"california" + 0.007*"democratic" + 0.007*"us" + 0.007*"million" + 0.006*"district"')
(2, '0.050*"new" + 0.014*"york" + 0.011*"tv" + 0.010*"convention" + 0.009*"night" + 0.008*"whats" + 0.008*"city" + 0.007*"power" + 0.006*"climate" + 0.006*"change"')
(3, '0.019*"review" + 0.012*"biden" + 0.010*"week" + 0.010*"2020" + 0.009*"2019" + 0.006*"corrections" + 0.006*"court" + 0.006*"open" + 0.005*"best" + 0.005*"republicans"')
(4, '0.025*"trump" + 0.012*"impeachment" + 0.009*"world" + 0.008*"hong" + 0.007*"says" + 0.007*"house" + 0.007*"kong" + 0.007*"trumps" + 0.006*"white" + 0.006*"us"')


In [67]:
topics_keywords = ldamodel_keywords.show_topics()
for topic_keywords in topics_keywords:
    print(topic_keywords)

(0, '0.036*"estate" + 0.036*"real" + 0.034*"coronavirus" + 0.031*"housing" + 0.028*"residential" + 0.028*"music" + 0.022*"fashion" + 0.021*"weddings" + 0.019*"travel" + 0.015*"global"')
(1, '0.082*"states" + 0.081*"united" + 0.076*"government" + 0.075*"politics" + 0.044*"2020" + 0.041*"election" + 0.041*"presidential" + 0.021*"international" + 0.018*"impeachment" + 0.018*"us"')
(2, '0.046*"books" + 0.045*"literature" + 0.023*"deaths" + 0.023*"coronavirus" + 0.023*"art" + 0.016*"theater" + 0.015*"cooking" + 0.015*"safety" + 0.015*"life" + 0.015*"cookbooks"')
(3, '0.032*"media" + 0.026*"social" + 0.025*"trumpukraine" + 0.025*"complaint" + 0.025*"inquiry" + 0.025*"whistleblower" + 0.023*"news" + 0.019*"computers" + 0.019*"shootings" + 0.018*"internet"')
(4, '0.030*"black" + 0.024*"movies" + 0.024*"people" + 0.023*"blacks" + 0.019*"television" + 0.019*"rights" + 0.018*"women" + 0.018*"girls" + 0.018*"race" + 0.016*"discrimination"')


In [68]:
topics_lead = ldamodel_lead.show_topics()
for topic_lead in topics_lead:
    print(topic_lead)

(0, '0.022*"new" + 0.013*"york" + 0.010*"city" + 0.009*"people" + 0.009*"said" + 0.008*"police" + 0.007*"school" + 0.006*"via" + 0.006*"week" + 0.006*"two"')
(1, '0.020*"president" + 0.015*"trump" + 0.011*"washington" + 0.009*"states" + 0.008*"united" + 0.006*"said" + 0.006*"would" + 0.006*"democratic" + 0.005*"national" + 0.005*"house"')
(2, '0.028*"get" + 0.022*"want" + 0.019*"new" + 0.016*"sign" + 0.015*"email" + 0.015*"times" + 0.015*"heres" + 0.013*"york" + 0.011*"signup" + 0.011*"newsletter"')
(3, '0.006*"years" + 0.005*"new" + 0.005*"first" + 0.005*"home" + 0.004*"one" + 0.004*"world" + 0.004*"two" + 0.004*"married" + 0.003*"died" + 0.003*"art"')
(4, '0.010*"like" + 0.009*"one" + 0.007*"time" + 0.005*"even" + 0.005*"would" + 0.004*"night" + 0.004*"many" + 0.004*"first" + 0.004*"us" + 0.004*"back"')


In [74]:
word_dict_headlines = {};
for i in range(NUM_TOPICS):
    words_headlines = ldamodel_headlines.show_topic(i, topn = 20)
    word_dict_headlines['Topic # ' + '{:02d}'.format(i+1)] = [i[0] for i in words_headlines]
pd.DataFrame(word_dict_headlines)

Unnamed: 0,Topic # 01,Topic # 02,Topic # 03,Topic # 04,Topic # 05
0,briefing,election,new,review,trump
1,dies,trump,york,biden,impeachment
2,trump,day,tv,week,world
3,dead,primary,convention,2020,hong
4,police,results,night,2019,says
5,us,california,whats,corrections,house
6,joe,democratic,city,court,kong
7,man,us,power,open,trumps
8,top,million,climate,best,white
9,hurricane,district,change,republicans,us


In [73]:
word_dict_keywords = {};
for i in range(NUM_TOPICS):
    words_keywords = ldamodel_keywords.show_topic(i, topn = 20)
    word_dict_keywords['Topic # ' + '{:02d}'.format(i+1)] = [i[0] for i in words_keywords]
pd.DataFrame(word_dict_keywords)

Unnamed: 0,Topic # 01,Topic # 02,Topic # 03,Topic # 04,Topic # 05
0,estate,states,books,media,black
1,real,united,literature,social,movies
2,coronavirus,government,deaths,inquiry,people
3,housing,politics,coronavirus,whistleblower,blacks
4,residential,2020,art,trumpukraine,television
5,music,election,theater,complaint,rights
6,fashion,presidential,cooking,news,women
7,weddings,international,safety,computers,girls
8,travel,impeachment,life,shootings,race
9,global,us,cookbooks,internet,discrimination


In [72]:
word_dict_lead = {};
for i in range(NUM_TOPICS):
    words_lead  = ldamodel_lead.show_topic(i, topn = 20)
    word_dict_lead ['Topic # ' + '{:02d}'.format(i+1)] = [i[0] for i in words_lead]
pd.DataFrame(word_dict_lead)

Unnamed: 0,Topic # 01,Topic # 02,Topic # 03,Topic # 04,Topic # 05
0,new,president,get,years,like
1,york,trump,want,new,one
2,city,washington,new,first,time
3,people,states,sign,home,even
4,said,united,email,one,would
5,police,said,times,world,night
6,school,would,heres,two,many
7,via,democratic,york,married,first
8,week,national,signup,died,us
9,two,house,newsletter,art,back


### Predominant topics discovered:

* Politics: elections, federal government, impeachment and immigration
* Race and gender: representation, activism and police brutality
* Crisis: pandemic, global warming and natural disaster
* Quarantine: new way of life and new pastimes