# Tagging the USA by New York Times

### To understand what are the top 10 frequently used tags in all articles every month on the New York Times, and look at overall activities around these tags on other media platforms
- Text Analytics with Sentimental Analysis
- Interpreting the media world - Google searches, Reddit, and Youtube from tags of the Times Perspective


In [1]:
import sys
sys.executable

'/usr/local/opt/python/bin/python3.7'

In [2]:
sys.path

['/Users/hh/Documents/Pratt/Adv.ProjectsinVis/Monthly-Frequency-of-NYTimes-Tag',
 '/usr/local/Cellar/python/3.7.7/Frameworks/Python.framework/Versions/3.7/lib/python37.zip',
 '/usr/local/Cellar/python/3.7.7/Frameworks/Python.framework/Versions/3.7/lib/python3.7',
 '/usr/local/Cellar/python/3.7.7/Frameworks/Python.framework/Versions/3.7/lib/python3.7/lib-dynload',
 '',
 '/Users/hh/Library/Python/3.7/lib/python/site-packages',
 '/usr/local/lib/python3.7/site-packages',
 '/usr/local/lib/python3.7/site-packages/IPython/extensions',
 '/Users/hh/.ipython']

In [3]:
import json
import requests
import pandas as pd
import numpy as np
import datetime
import operator
import time
import praw
import nltk
import configparser

from pytrends.request import TrendReq
from praw.models import MoreComments
from googleapiclient import discovery
from textblob import TextBlob
from pandas.io.json import json_normalize

import pprint
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Table, Column, Integer, String, MetaData, ForeignKey

from sqlalchemy.orm import sessionmaker

In [5]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/hh/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [6]:
# Use CofigParser to safely store the password or key
config = configparser.ConfigParser()
config.read('key_pair.ini')

Times_key = config['Times']['key']

Reddit_client_id = config['Reddit']['client_id']
Reddit_client_secret = config['Reddit']['client_secret']
Reddit_username = config['Reddit']['username']
Reddit_password = config['Reddit']['password']
Reddit_user_agent = config['Reddit']['user_agent']

Youtube_YOUTUBE_API_SERVICE_NAME = config['Youtube']['YOUTUBE_API_SERVICE_NAME']
Youtube_YOUTUBE_API_VERSION = config['Youtube']['YOUTUBE_API_VERSION']
Youtube_DEVELOPER_KEY = config['Youtube']['DEVELOPER_KEY']

### Storing Data with SQalchemy

In [7]:
Base = declarative_base()

class Init(Base): 
    __tablename__ = 'metadata'
    __table_args__ = {'extend_existing': True}

    id = Column(Integer, primary_key=True)
    Tag = Column(String(255))
    Frequency = Column(String(255))
    Title = Column(String(255))
    Date = Column(String(25))
    Url = Column(String(255))
    img_URL = Column(String(255))

### New York Times API
https://developer.nytimes.com/apis

Archive API
- Finds all articles's metadata from archive APi (about 7000 every month): title, section_name, category, url, word_count, keywords

In [21]:
int(str(datetime.datetime.now())[5:7])

4

In [23]:
def get_times_metadata():
    today = str(datetime.datetime.now())
    monthly_archive = {}
    monthly_popular_tags = {}
    for yy in range(2018, int(today[:4]) + 1):
        if str(yy) == today[:4]:
            ends = int(today[5:7]) + 1
        else:
            ends = 13
        for mm in range(1,ends):
            print(yy,mm)
            parameters = {'api-key': Times_key}           
            archived_Url = 'https://api.nytimes.com/svc/archive/v1/'+ str(yy) +'/'+ str(mm) +'.json'
            archives = requests.get(archived_Url, params=parameters).json()

            each_metadata = []
            for a in archives['response']['docs']:
                articles = {}
                articles['title'] = a['headline']['main']
                articles['pub_date'] = a['pub_date'][:10]
                articles['url'] = a['web_url']
                if len(a['multimedia']) !=0 and a['multimedia'][0]['url']:
                    articles['thm_img'] = 'https://static01.nyt.com/' + a['multimedia'][0]['url']
                else:
                    articles['thm_img'] = 'no_image_found'
                articles['tags'] = [''.join(tag['value']) for tag in a['keywords']]
                each_metadata.append(articles)

            tag_arr = []
            for m in each_metadata:
                for t in m['tags']:
                    for string in t.split(', '):
                        tag_arr.append(', '.join(string.split(', ')))
            
            count_tag = {}
            for tag in tag_arr:
                if tag in ['Trump', 'Donald J']:
                    tag = 'Donald Trump'
                if tag in ['Joseph R Jr', 'Biden']:
                    tag = 'Joe Biden'
                if tag in ['Brett M', 'Supreme Court (US)', 'Kavanaugh']:
                    tag = 'Brett Kavanaugh'
                if tag in ['Putin', 'Vladimir V']:
                    tag = 'Putin'
                if tag in ['New York City', 'NYC','NY)', 'NY']:
                    tag = 'New York City'
                if tag in ['Fla', 'Parkland']:
                    tag = 'Parkland'
                if tag in ['Coronavirus Aid', 'Relief', 'and Economic Security Act (2020)']:
                    tag = 'Coronavirus Aid, Relief, and Economic Security Act (2020)'
                if 'Trump-Ukraine' in tag:
                    tag = 'Trump-Ukraine'
                if tag in ['School Shootings and Armed Attacks']:
                    tag = 'School Shootings'
                if tag in ['Shutdowns (Institutional)']:
                    tag = 'Shutdowns'
                if tag in count_tag:
                    count_tag[tag] += 1
                else:
                    count_tag[tag] = 1   

            tags_with_frequency = sorted(count_tag.items(),key=operator.itemgetter(1),reverse=True)[:20]      
                       
            data = []
            for top_tag in tags_with_frequency:
                multi_articles = {}
                for each in reversed(each_metadata) :
                    if top_tag[0] in multi_articles.values():
                        pass      
                    else:
                        if len(each['tags']) != 0 and top_tag[0] in each['tags']:
                            # store the data into variable 'monthly_archive'
                            if len(str(mm)) == 1:
                                mm = '0' + str(mm)
                            multi_articles['Category'] = str(yy) + '-' +  str(mm)
                            multi_articles['Tag'] = top_tag[0]    
                            multi_articles['Frequency'] = str(top_tag[1])
                            multi_articles['Title'] = each['title']
                            multi_articles['Date'] = each['pub_date']
                            multi_articles['Url'] = each['url']
                            multi_articles['img_URL'] = each['thm_img']
                            data.append(multi_articles)
            monthly_archive[str(yy)+'-'+str(mm)] = data
            print('Length: ', len(monthly_archive[str(yy)+'-'+str(mm)]))
#     print('tags_with_frequency', tags_with_frequency)       
    return monthly_archive

In [24]:
times_metadata = get_times_metadata()
times_metadata

2018 1
Length:  19
2018 2
Length:  16
2018 3
Length:  17
2018 4
Length:  19
2018 5
Length:  19
2018 6
Length:  18
2018 7
Length:  17
2018 8
Length:  19
2018 9
Length:  16
2018 10
Length:  16
2018 11
Length:  19
2018 12
Length:  19
2019 1
Length:  18
2019 2
Length:  19
2019 3
Length:  17
2019 4
Length:  16
2019 5
Length:  18
2019 6
Length:  16
2019 7
Length:  16
2019 8
Length:  16
2019 9
Length:  17
2019 10
Length:  16
2019 11
Length:  17
2019 12
Length:  16
2020 1
Length:  15
2020 2
Length:  11
2020 3
Length:  15
2020 4
Length:  17


{'2018-01': [{'Category': '2018-01',
   'Tag': 'United States Politics and Government',
   'Frequency': '880',
   'Title': 'Is America Growing Less Tolerant on L.G.B.T.Q. Rights?',
   'Date': '2018-01-29',
   'Url': 'https://www.nytimes.com/2018/01/29/opinion/america-tolerant-lgbtq-rights.html',
   'img_URL': 'https://static01.nyt.com/images/2018/01/29/opinion/29Boylan2-web/merlin_133078676_ade3da63-ff97-44f7-88b2-4cbccc70b1ac-articleLarge.jpg'},
  {'Category': '2018-01',
   'Tag': 'New York City',
   'Frequency': '530',
   'Title': 'Mobility Is a Mess in New York',
   'Date': '2018-01-29',
   'Url': 'https://www.nytimes.com/2018/01/29/opinion/new-york-traffic-transit.html',
   'img_URL': 'https://static01.nyt.com/images/2018/01/09/opinion/08leonhardt-newsletter-traffic/merlin_22655794_9d2d0c35-7648-4b8d-aaad-2e51a995a829-articleLarge.jpg'},
  {'Category': '2018-01',
   'Tag': 'Books and Literature',
   'Frequency': '309',
   'Title': 'Oprah Winfrey Drops Russell Simmons From Spiritual

In [11]:
def get_specific_metadata(yy, mm): #these arguments need to be number
    parameters = {'api-key': Times_key}           
    archived_Url = 'https://api.nytimes.com/svc/archive/v1/'+ str(yy) +'/'+ str(mm) +'.json'
    archives = requests.get(archived_Url, params=parameters).json()
    articles_all = []
    for a in archives['response']['docs']:
        articles = {}
        articles['title'] = a['headline']['main']
        articles['pub_date'] = a['pub_date'][:10]
        articles['url'] = a['web_url']
        articles['word_count'] = a['word_count']
        articles['tags'] = [''.join(tag['value']) for tag in a['keywords']]# if a['keywords'].index(tag) == 0 or a['keywords'].index(tag) == 1 or a['keywords'].index(tag) == 2]
        articles_all.append(articles)
    return articles_all

In [12]:
get_specific_metadata(2020, 4)

[{'title': 'Corrections: April 1, 2020',
  'pub_date': '2020-04-01',
  'url': 'https://www.nytimes.com/2020/03/31/pageoneplus/corrections-april-1-2020.html',
  'word_count': 377,
  'tags': []},
 {'title': 'Quotation of the Day: Cases Spiral Aboard an Aircraft Carrier, and a Commander Pleads for Help',
  'pub_date': '2020-04-01',
  'url': 'https://www.nytimes.com/2020/03/31/todayspaper/quotation-of-the-day-cases-spiral-aboard-an-aircraft-carrier-and-a-commander-pleads-for-help.html',
  'word_count': 67,
  'tags': []},
 {'title': 'Coronavirus in N.Y.C.: The Latest',
  'pub_date': '2020-04-01',
  'url': 'https://www.nytimes.com/2020/04/01/nyregion/coronavirus-nyc.html',
  'word_count': 1099,
  'tags': ['New York City',
   'Coronavirus (2019-nCoV)',
   'Laundry and Laundromats',
   'Hygiene and Cleanliness',
   'New York State',
   'Cuomo, Andrew M',
   'de Blasio, Bill',
   'Cuomo, Christopher']},
 {'title': 'Coronavirus Spreads Amid Supply Shortages, Stay-at-Home Orders and Sobering Econ

In [13]:
def get_most_popular_viewedArticle():
    parameters = {'api-key': Times_key}           
    archived_Url = 'https://api.nytimes.com/svc/mostpopular/v2/viewed/7.json'
    archives = requests.get(archived_Url, params=parameters).json()
#     pprint.pprint(archives['results'])
    popular_article = []
    for a in archives['results']:
        articles = {}
        articles['title'] = a['title']
        articles['pub_date'] = a['published_date']
        articles['url'] = a['url']
        if a['des_facet']:
            articles['tags'] = a['des_facet']
        if a['media']:
            articles['thm_img'] = a['media'][0]['media-metadata'][2]['url']
        popular_article.append(articles)
    return popular_article

In [14]:
popular_article = get_most_popular_viewedArticle()
popular_article

[{'title': 'Putting Jared Kushner In Charge Is Utter Madness',
  'pub_date': '2020-04-02',
  'url': 'https://www.nytimes.com/2020/04/02/opinion/jared-kushner-coronavirus.html',
  'tags': ['Coronavirus (2019-nCoV)',
   'Epidemics',
   'United States Politics and Government',
   'Disasters and Emergencies',
   'Medical Devices',
   'Ventilators (Medical)'],
  'thm_img': 'https://static01.nyt.com/images/2020/04/02/opinion/02goldbergWeb/02goldbergWeb-mediumThreeByTwo440.jpg'},
 {'title': 'F.A.Q. on Stimulus Checks, Unemployment and the Coronavirus Plan',
  'pub_date': '2020-03-26',
  'url': 'https://www.nytimes.com/article/coronavirus-stimulus-package-questions-answers.html',
  'tags': ['Coronavirus (2019-nCoV)',
   'Coronavirus Aid, Relief, and Economic Security Act (2020)',
   'Stimulus (Economic)',
   'Federal Aid (US)',
   'Unemployment Insurance',
   'United States Economy',
   'United States Politics and Government'],
  'thm_img': 'https://static01.nyt.com/images/2020/03/27/business/

### Pytrends API
https://pypi.org/project/pytrends/

In [16]:
pytrends = TrendReq(hl='en-US', tz=360, timeout=(10,25))
tag_arr = ['Coronavirus (2019-nCoV)']
pytrends.build_payload(tag_arr, cat=0, timeframe='2018-01-01 ' + str(datetime.datetime.now())[:10], geo='', gprop='')
time.sleep(2)
df = pytrends.interest_over_time().reset_index()
df

ResponseError: The request failed: Google returned a response with code 429.

In [None]:
str(datetime.datetime.now())[:7]

In [None]:
# Get a unique tag collection for the search query
def get_trends_Tags(times_metadata):    
    result = {}
    for time_category in times_metadata:
        data = []
        for each in times_metadata[time_category]:     
            data.append(each['Tag'])
            result[time_category] = data
        
    pytrends = TrendReq(hl='en-US', tz=360)
    monthly_interests = {}
    
    for i, Category in enumerate(result):
        print(Category + ': ', len(result))
        print(result[Category])
        data = []
        for tag in result[Category]:
            if tag in ['Russian Interference in 2016 US Elections and Ties to Trump Associates']:
                tag = 'Russian Ties to Trump'
            if tag in ['Appointments and Executive Changes']:
                tag = 'Appointments and Executive'
            tag_arr = []
            tag_arr.append(tag)
            print(tag_arr)
            interest_over_time = {}
            
            if Category == str(datetime.datetime.now())[:7]:
                end_date = (str(datetime.datetime.now())[8:10])
            elif int(str(datetime.datetime.now())[5:7]) % 2 == 0:
                if str(datetime.datetime.now())[5:7][5:7] == '02':
                    end_date = 28
                else:
                    end_date = 31
            else:
                 end_date = 30

            print(end_date)
            
            pytrends.build_payload(tag_arr, cat=0, timeframe='2018-01-01 ' + str(end_date), geo='', gprop='')
            time.sleep(2)
            df = pytrends.interest_over_time().reset_index()
            interest_over_time['Tag'] = tag

            for i in range(1, 20):
                try:
                    interest_over_time['Interests_' + str(i)] = list(df[tag])[i-1]
                except Exception as e:
                    print('No-result: ', tag)
                    interest_over_time['Interests_' + str(i)] = 0
                    pass
            data.append(interest_over_time)     
        monthly_interests['Category'] = data
    return monthly_interests

In [None]:
monthly_interests = get_trends_Tags(times_metadata)
monthly_interests

In [None]:
len(interest_over_time['2020-2'])

### Reddit API
https://praw.readthedocs.io/en/latest/

In [31]:
# Reddit API
def get_reddit_comments(times_metadata):
    
    result = {}
    for time_category in times_metadata:
        data = []
        for each in times_metadata[time_category]:     
            data.append(each['Tag'])
            result[time_category] = data
            
    reddit = praw.Reddit(client_id = Reddit_client_id,
                         client_secret = Reddit_client_secret,
                         username = Reddit_username,
                         password = Reddit_password,
                         user_agent = Reddit_user_agent)
    
    for i, Category in enumerate(result):
        print(Category + ': ', len(result))
        print(result[Category])
        reddit_metadata = []
        for tag in result[Category]:
            print(tag)
            subreddit = reddit.subreddit('all')
            each_tag = {}
            for post in subreddit.search(tag, limit=5):
                each_tag['Tag'] = tag
                each_tag['Title'] = post.title
                each_tag['Url'] = post.url
                each_tag_comments = post.comments.list()
                comments_arr = []
                for comment in each_tag_comments:
                    if isinstance(comment, MoreComments):
                        continue
                    
                    comments_arr.append(comment.body)
                each_tag['Comments'] = comments_arr
        reddit_metadata.append(each_tag)
    return reddit_metadata

In [None]:
# times_metadata
# {'2018-01': [{'Category': '', 'Tag': '','Frequency': '',...},{}], '2018-02':[{},{}]}
reddit_metadata = get_reddit_comments(times_metadata)
reddit_metadata
# reddit_metadata = [{'Tag': 'tag1', 'Title': 'title1', 'url': 'url1', 'Comments': ['comment1', 'comment2',...]}]

2018-01:  28
['United States Politics and Government', 'New York City', 'Books and Literature', 'Politics and Government', 'Movies', 'Immigration and Emigration', '#MeToo Movement', 'Television', 'Sexual Harassment', 'Women and Girls', 'Republican Party', 'Senate', 'Real Estate and Housing (Residential)', 'United States International Relations', 'Deaths (Obituaries)', 'Fashion and Apparel', 'China', 'House of Representatives', 'Elections']
United States Politics and Government
New York City
Books and Literature
Politics and Government
Movies
Immigration and Emigration
#MeToo Movement
Television
Sexual Harassment
Women and Girls
Republican Party
Senate
Real Estate and Housing (Residential)
United States International Relations
Deaths (Obituaries)
Fashion and Apparel
China
House of Representatives
Elections
2018-02:  28
['United States Politics and Government', 'New York City', 'Olympic Games (2018)', 'Politics and Government', 'Movies', 'Books and Literature', 'Television', 'Gun Control

In [None]:

def get_sentimental_chart(reddit_metadata):
    tags_metadata = []
    for single_tag in reddit_metadata:
        tag_sentiment = {}
        avg_polarity = 0
        pos_count = 0
        neg_count = 0
        neutral_count = 0
        for single_comment in single_tag['Comments']: 
            
            # Get Word Sentimental analysis
            blobed_word = TextBlob(single_comment)
            if blobed_word.sentiment.polarity > 0:
                pos_count += 1
            elif blobed_word.sentiment.polarity == 0:
                neutral_count += 1
            else:
                neg_count += 1      
            avg_polarity += blobed_word.sentiment.polarity
            
            tag_sentiment['tag'] = single_tag['Tag']    
            tag_sentiment['avg_polarity'] = avg_polarity / len(single_tag['Comments'])
            tag_sentiment['pos_count'] = pos_count
            tag_sentiment['neg_count'] = neg_count
            tag_sentiment['neutral_count'] = neutral_count
            
            # Get Adjective words' frequencies
            adgs_arr = []
            for blobed_tags in blobed_word.tags:
                if blobed_tags[1] in ['JJ', 'JJR', 'JJS']:
                    adgs_arr.append(blobed_tags[0])
            adg_frequency = {}
            for adg in adgs_arr: 
                if adg in adg_frequency:
                    adg_frequency[adg] += 1
                else:
                    adg_frequency[adg] = 1 
            tag_sentiment['adg_frequency'] = sorted(adg_frequency.items(),key=operator.itemgetter(1),reverse=True)[:3]       
        tags_metadata.append(tag_sentiment)
        
    return tags_metadata

In [None]:
tags_metadata = get_sentimental_chart(reddit_metadata)
tags_metadata

### Youtube API
https://developers.google.com/youtube/v3/docs/search/list?hl=en_US

In [None]:
# creating Youtube Resource Object 
youtube_object = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey = DEVELOPER_KEY) 
   
def youtube_search_keyword(unique_tags_only):  
    # calling the search.list method to retrieve youtube search results 
    max_results = 5
    results = []
    for tag in unique_tags_only:
        print(tag)
        search_tags = youtube_object.search().list(q = tag, part = "id, snippet", order = 'viewCount', maxResults = max_results, publishedAfter = "2020-02-17T00:00:00Z").execute() 
        videos = []
        for item in search_tags.get("items", []):
            video = {}
            video["Tag"] = tag
            video["videoId"] = item["id"]["videoId"]
            video["publishedAt"] = item['snippet']['publishedAt']
            video["title"] = item['snippet']['title']
            video['description'] = item['snippet']['description']
            stats = youtube_object.videos().list(part='statistics, snippet', id=item["id"]["videoId"]).execute()
            video['statistics'] = stats.get("items", [])[0]['statistics']
            videos.append(video)
        results.append(videos)
    return results

In [None]:
youtube_search_keyword(unique_tags_only)

#### [Not using anymore] 

- Top Stories API: (about 30 articles by 7 days): title, pub_date, url, section, des_facet, geo_facet
- Newswire API: Finds by Archive API's url : des_facet
- Most Popular API
- Top Stories API, Times Tags API, Community API