In [27]:
!pip3 install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz


Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz (12.0 MB)
     ---------------------------------------- 0.0/12.0 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.0 MB ? eta -:--:--
     --------------------------------------- 0.1/12.0 MB 825.8 kB/s eta 0:00:15
     --------------------------------------- 0.1/12.0 MB 825.8 kB/s eta 0:00:15
     --------------------------------------- 0.1/12.0 MB 656.4 kB/s eta 0:00:19
     --------------------------------------- 0.1/12.0 MB 657.1 kB/s eta 0:00:19
     --------------------------------------- 0.2/12.0 MB 541.0 kB/s eta 0:00:22
      -------------------------------------- 0.2/12.0 MB 623.6 kB/s eta 0:00:19
      -------------------------------------- 0.2/12.0 MB 577.6 kB/s eta 0:00:21
     - ------------------------------------

In [36]:
#API Modules
import requests
import creds
import json

#Text transformation Modules
import string
import re

import pandas as pd

#NLP Modules
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords

import spacy
from spacy.pipeline import Sentencizer


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chaub\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [85]:
#Making file for caching and working on dataset, do not need to retain all the data
filename = 'Processed_articles.csv'
f1_file = open(filename, 'w')
colnames_1 = 'title, author, published_date, location, link, excerpt, summary, rank, topic, id, media\n'
f1_file.write(colnames_1)
f1_file.close()

#cols to save to permanent record to track user interests
f2_col_names = ['title', 'published_date', 'location', 'link']


In [86]:
endpoint = 'https://api.newscatcherapi.com/v2/search?'
places = ['Boulder, CO', 'Longmont, CO', 'Colorado Springs, Colorado']
start_date = '2024/01/20'
end_date = '2024/02/05'
headers = {'x-api-key': creds.api_key}

#initializing nlp object
nlp = spacy.load("en_core_web_sm")
sentencizer = Sentencizer()
nlp.add_pipe('sentencizer', before = "parser")

#Write column names of csv
f1_file = open(filename, 'w')
cols_to_write = 'title, author, published_date, location,link, excerpt, summary, rank, topic, id, media,\n'
f1_file.write(cols_to_write)
f1_file.close()

#Add constraint using regex to avoid any searches with Reporting by Brad Brooks in Longmont, Colorado
f1_file = open(filename, 'a',encoding = 'utf-8')
for place in places:

    params = {
        'q': place,
        'lang': 'en',
        'countries': 'US',
        'ranked_only': True,
        'sort_by': 'rank',
        'page_size': 100,
        'page':1,
        'to': end_date,
        'from': start_date
    }

    response = requests.get(endpoint, headers=headers, params=params)
    print(response)
    json_text = response.json()
    json_str = json.dumps(json_text, indent=4)
    print(json_str)
    print(json_text['total_hits'])

    
    for item in json_text['articles']:

        #INSERT condition to check item['summary'] for regex
        #if present continue to next iteration
        regex_pattern = rf'Reporting by .{{0,50}} in {place}'
        if re.search(regex_pattern, item['summary']):
            continue

        
        #title, author, published_date, location, link, excerpt, summary, rank, topic, id, media
        #Get title
        title = str(item['title']).replace(",","")
        doc = nlp(title)
        spacy_title = ' '.join([token.text \
                                  for token in doc \
                                    if not token.is_punct and not token.is_space])
        #print(spacy_title)
       
        #Get author
        author = item['author']
        
        #Get published_date
        published_date = item['published_date'][:10] #getting first 9 characters

        #Get link
        link = item['link']

        #Get excerpt
        excerpt = str(item['excerpt']).replace(",", "")
        doc = nlp(excerpt)
        spacy_excerpt = ' '.join([token.text \
                                  for token in doc \
                                    if not token.is_punct and not token.is_space])
       

        #Get summary
        summary = str(item['summary']).replace(",", "")

        #UNCOMMENT BELOW TO SEE NLTK SUMMARIES
        # tokenizer = RegexpTokenizer(r'\w+')
        # tokens = tokenizer.tokenize(summary)

        # #Removing stopwords
        # filtered_summary= [word for word in tokens if word.lower() not in stopwords.words('english')]
        # nltk_summary = ' '.join([token for token in filtered_summary])
        # print(nltk_summary)
    
        #text prep using Spacy
        doc = nlp(summary)
        spacy_summary = ' '.join([token.text \
                                  for token in doc \
                                    if not token.is_punct and not token.is_space])
        
        #Get rank
        rank = item['rank']

        #Get topic
        topic = item['topic']

        #Get id
        id = item['_id']

        #Get media for thumbnail image
        #CANNOT SAVE MEDIA BECAUSE LINKS HAVE COMMAS IN THEM
        # media = item['media']
        # print(media)

        #preparing name of place without comma
        place = place.replace(", ", "")

        #title, author, published_date, location, 
        #link, excerpt, summary, rank, topic, id
        write_row = spacy_title +','+str(author)+','+str(published_date)+','\
                    +str(place)+','+str(link)+','+spacy_excerpt+','\
                    +spacy_summary+','+str(rank)+','+str(topic)+','\
                    +str(id)+'\n'
        #print(write_row)
        f1_file.write(write_row)
    #time.sleep(0.5)  #UNCOMMENT IF API REQUEST FAILS 
f1_file.close()





<Response [200]>
{
    "status": "ok",
    "total_hits": 430,
    "page": 1,
    "total_pages": 5,
    "page_size": 100,
    "articles": [
        {
            "title": "Odesza coming to Boulder as Folsom Field will host two new concerts this summer",
            "author": "Olivia Doak",
            "published_date": "2024-01-30 04:26:00",
            "published_date_precision": "full",
            "link": "https://finance.yahoo.com/news/odesza-coming-boulder-folsom-field-042600308.html",
            "clean_url": "yahoo.com",
            "excerpt": "Jan. 29\u2014For the first time in more than two decades, Folsom Field will see new concert acts outside of Dead & Company this summer, including its first electronic music group. \"It's going to be\u2026",
            "summary": "Jan. 29\u2014For the first time in more than two decades, Folsom Field will see new concert acts outside of Dead & Company this summer, including its first electronic music group.\n\"It's going to be awesome. It'

In [30]:
def get_topic_proportion(places, model):

    proportion_dict = {}
    return proportion_dict




Match found!


In [84]:
#saving data as a json

endpoint = 'https://api.newscatcherapi.com/v2/search?'
places = ['Boulder, CO', 'Longmont, CO', 'Colorado Springs, Colorado']
start_date = '2024/01/20'
end_date = '2024/02/05'
headers = {'x-api-key': creds.api_key}

processed_articles = []

#Add constraint using regex to avoid any searches with Reporting by Brad Brooks in Longmont, Colorado
for place in places:

    params = {
        'q': place,
        'lang': 'en',
        'countries': 'US',
        'ranked_only': True,
        'sort_by': 'rank',
        'page_size': 100,
        'page':1,
        'to': end_date,
        'from': start_date
    }

    response = requests.get(endpoint, headers=headers, params=params)
    print(response)
    json_text = response.json()
    json_str = json.dumps(json_text, indent=4)
    #print(json_str)
    print(json_text['total_hits'])

    for item in json_text['articles']:
              #INSERT condition to check item['summary'] for regex
        #if present continue to next iteration
        regex_pattern = rf'Reporting by .{{0,50}} in {place}'
        if re.search(regex_pattern, item['summary']):
            continue

        
        #title, author, published_date, location, 
        #link, excerpt, summary, rank, id, media

        #Get title
        title = str(item['title']).replace(",","")
        doc = nlp(title)
        spacy_title = ' '.join([token.text \
                                  for token in doc \
                                    if not token.is_punct and not token.is_space])
        #print(spacy_title)
        
        #Get published_date
        published_date = item['published_date'][:10] #getting first 9 characters

        #Get excerpt
        excerpt = str(item['excerpt']).replace(",", "")
        doc = nlp(excerpt)
        spacy_excerpt = ' '.join([token.text \
                                  for token in doc \
                                    if not token.is_punct and not token.is_space])
       

        #Get summary
        summary = str(item['summary']).replace(",", "")
    
        #text prep using Spacy
        doc = nlp(summary)
        spacy_summary = ' '.join([token.text \
                                  for token in doc \
                                    if not token.is_punct and not token.is_space])
        
    
        #title, author, published_date, location, 
        #link, excerpt, summary, rank, topic, id, media
        processed_article = {
            "id": item['_id'],
            "rank": int(item['rank']),
            "location": place,
            "title": spacy_title,
            "excerpt": spacy_excerpt,
            "summary": spacy_summary,
            "link": item['link'],
            "author": str(item['author']),
            "published_date": item['published_date'][:10],
            "image_link": item['media']
        }

        processed_articles.append(processed_article)

processed_json = {
    "articles": processed_articles
}

with open('processed_articles.json', 'w') as json_file:
    json.dump(processed_json, json_file, indent = 4)


        

<Response [200]>
430
<Response [200]>
77
<Response [200]>
4378
