In [4]:
#API Modules
import requests
import creds
import json

#Text transformation Modules
import string
import re

#General Data processing Modules
import pandas as pd
import numpy as np
from typing import Dict, List

#NLP Modules
import spacy
from spacy.pipeline import Sentencizer

#Modules for checking for title similarity
from itertools import combinations
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [5]:
def get_cities(latitude, longitude):
    """
    Return the names of the cities shown on the Whats Happenin UI map.

    This is currently a placeholder: both arguments are accepted but not
    read, and the same fixed list is returned whatever their values are.

    Args:
        latitude: Latitude range of the visible map (currently ignored).
        longitude: Longitude range of the visible map (currently ignored).

    Returns:
        city_names (List(str)): A freshly built list of city names, so the
                                caller may mutate it without affecting
                                later calls.
    """
    # TODO: replace the fixed names below after Justin provides input
    placeholder_cities = ('Boulder, CO', 'Longmont, CO', 'Colorado Springs, Colorado')
    return list(placeholder_cities)


In [6]:
def get_unique_indices(data, cosine_sim_threshold=0.8):
    """
    Pick one representative article per group of near-duplicate titles.

    Titles are vectorised with TF-IDF and compared pairwise by cosine
    similarity. Any two articles whose similarity is strictly greater than
    ``cosine_sim_threshold`` are placed in the same group, and groups are
    merged transitively (if A~B and B~C then A, B and C form one group).
    From every group the article with the lowest ``rank`` is kept; on a
    rank tie the article with the lowest index wins. Articles that are not
    similar to anything form a group of their own and are always kept.

    Args:
        data (List[dict]): Article records; each must provide the keys
                           'title' and 'rank'.
        cosine_sim_threshold (float): Similarity above which two titles are
                                      considered duplicates. Defaults to 0.8.

    Returns:
        unique_indices (List[int]): Sorted indices into ``data``, with no
                                    index repeated, one per group.

    Note:
        NOTE(review): TfidfVectorizer is expected to raise ValueError when
        no title contains a usable token (empty vocabulary) - confirm and
        decide whether callers should handle it.
    """
    n_articles = len(data)

    # With fewer than two articles there is nothing to compare; returning
    # early also avoids fitting the vectorizer on an empty corpus.
    if n_articles < 2:
        return list(range(n_articles))

    titles = [entry['title'] for entry in data]
    ranks = [entry['rank'] for entry in data]

    # Compute similarity score between titles
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(titles)
    similarity_matrix = cosine_similarity(tfidf_matrix)

    # Union-find over article indices. Grouping by connected component
    # (instead of one group per lower index of each pair) guarantees that
    # groups never overlap, so no index can be selected twice.
    parent = list(range(n_articles))

    def find_root(idx):
        # Walk up to the representative, halving the path as we go.
        while parent[idx] != idx:
            parent[idx] = parent[parent[idx]]
            idx = parent[idx]
        return idx

    for i, j in combinations(range(n_articles), 2):
        if similarity_matrix[i, j] > cosine_sim_threshold:
            root_i, root_j = find_root(i), find_root(j)
            if root_i != root_j:
                # Always attach to the smaller root to keep this deterministic.
                parent[max(root_i, root_j)] = min(root_i, root_j)

    # Best (lowest) rank per group. Iterating indices in ascending order
    # with a strict '<' keeps the lowest index on rank ties.
    best_by_group = {}
    for idx in range(n_articles):
        root = find_root(idx)
        current_best = best_by_group.get(root)
        if current_best is None or ranks[idx] < ranks[current_best]:
            best_by_group[root] = idx

    return sorted(best_by_group.values())

In [7]:
def get_and_process_json_data(city_names, start_date, end_date):

    """
    Query the NewsCatcher search API once per city, clean the returned
    articles and drop near-duplicate titles.

    For every city the first page (up to 100 articles, sorted by rank) is
    requested. Articles whose summary contains a byline of the form
    "Reporting by <name> in <city>" are skipped. Title, excerpt and summary
    are stripped of punctuation and whitespace tokens with spaCy. Finally
    get_unique_indices() is used to keep one article per group of similar
    titles.

    Args:
        city_names (List(str)): Names of cities, e.g. from get_cities();
                                each is used verbatim as the search query.
        start_date (str): Start date of the query (sent as the 'from' parameter)
        end_date (str): End date of the query (sent as the 'to' parameter)

    Returns:
        articles_json (dict): {"articles": [...]} where every article has the
                              keys id, rank, location, title, excerpt,
                              summary, link, author, published_date and
                              image_link.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.Timeout: If the API does not answer within the timeout.
    """

    # Seconds to wait for the API before giving up; without a timeout
    # requests.get can block indefinitely.
    request_timeout_s = 30

    #initializing nlp object
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe('sentencizer', before = "parser")

    def process_string(raw_string):

        """
        Takes in raw string and makes it an nlp object
        then returns a string that can be used for NLP

        Args:
            raw_string (str): Raw string; non-string values are converted
                              with str() first.

        Returns:
            processed_string (str): Tokens joined by single spaces, with
                                    commas, punctuation tokens and
                                    whitespace tokens removed.
        """
        string_1 = str(raw_string).replace(",","")
        doc = nlp(string_1)
        processed_string = ' '.join([token.text \
                                    for token in doc \
                                    if not token.is_punct and not token.is_space])
        return processed_string

    endpoint = 'https://api.newscatcherapi.com/v2/search?'
    headers = {'x-api-key': creds.api_key}

    processed_articles = []

    for place in city_names:

        params = {
            'q': place,
            'lang': 'en',
            'countries': 'US',
            'ranked_only': True,
            'sort_by': 'rank',
            'page_size': 100,
            'page':1,
            'to': end_date,
            'from': start_date
        }

        response = requests.get(endpoint, headers=headers, params=params,
                                timeout=request_timeout_s)
        # Fail loudly on 4xx/5xx instead of hitting a KeyError further down.
        response.raise_for_status()
        json_text = response.json()
        total_hits = json_text.get('total_hits', 0)

        #REMOVE BELOW PRINT STATEMENT WHEN DONE TESTING
        print(f'Number of articles fetched for {place}: {total_hits}')

        # Byline filter, e.g. "Reporting by Brad Brooks in Longmont, Colorado".
        # Built once per city; the city name is escaped so characters such
        # as '.' or '(' in a name are matched literally.
        byline_pattern = re.compile(rf'Reporting by .{{0,50}} in {re.escape(place)}')

        # NOTE(review): tolerates a response without an 'articles' list -
        # presumably what the API sends for zero hits; confirm.
        for item in json_text.get('articles') or []:

            #Condition to continue to next iteration if str present
            # ('or ""' keeps the search from raising when summary is None)
            if byline_pattern.search(item['summary'] or ''):
                continue

            #Get title
            title = process_string(item['title'])

            #Get excerpt
            excerpt = process_string(item['excerpt'])

            #Get summary
            summary = process_string(item['summary'])

            processed_article = {
                "id": item['_id'],
                "rank": int(item['rank']),
                "location": place,
                "title": title,
                "excerpt": excerpt,
                "summary": summary,
                "link": item['link'],
                "author": str(item['author']),
                "published_date": item['published_date'][:10],
                "image_link": item['media']
            }

            processed_articles.append(processed_article)

    articles_json = {
        "articles": processed_articles
    }

    #REMOVE BELOW PRINT STATEMENT WHEN DONE TESTING
    print(len(articles_json['articles']))

    data = articles_json['articles']

    # Skip de-duplication when nothing was collected; there are no titles
    # to compare in that case.
    if data:
        unique_indices = get_unique_indices(data)
        articles_json['articles'] = [data[i] for i in unique_indices]

    #REMOVE BELOW PRINT STATEMENT WHEN DONE TESTING
    print(len(articles_json['articles']))

    return articles_json

In [8]:
# Demo run: fetch, clean and de-duplicate articles for three cities over a
# fixed date window, then print the resulting dictionary.
city_names = ['New York, NY', 'Longmont, CO', 'Colorado Springs, Colorado']
start_date, end_date = '2024/01/20', '2024/01/25'
demo_articles = get_and_process_json_data(city_names, start_date, end_date)
print(demo_articles)



Number of articles fetched for New York, NY: 2519
Number of articles fetched for Longmont, CO: 29
Number of articles fetched for Colorado Springs, Colorado: 1267
229
209
