Media Cloud: Measuring Attention
================================
- Reference: https://github.com/mediacloud/api-tutorial-notebooks/blob/main/MC02%20-%20attention.ipynb
- Media Ids Query: https://search.mediacloud.org/sources/1

In [2]:
# Set up your API key and import needed things
import os, mediacloud.api
from importlib.metadata import version

# from dotenv import load_dotenv
import datetime as dt
from IPython.display import JSON, display
import bokeh.io
from rich import print as pp
import json
from tqdm import tqdm
import pandas as pd

tqdm.pandas()
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True, nb_workers=4, verbose=0)
bokeh.io.reset_output()
bokeh.io.output_notebook(hide_banner=True)

# Resolve the Media Cloud API key: the environment variable wins, otherwise
# fall back to a key file stored next to the notebook folder.
MC_API_KEY_FILE = "../config/media.cloud.key"
MC_API_KEY = os.getenv("MEDIA_CLOUD_API_KEY")
if MC_API_KEY is None:
    try:
        with open(MC_API_KEY_FILE) as f:
            MC_API_KEY = f.read().strip()
    except FileNotFoundError:
        # Report the path that was actually tried so the message is actionable.
        pp(
            f"[bold red][ERROR] MC API key not found. Check ENV 'MEDIA_CLOUD_API_KEY' or file '{MC_API_KEY_FILE}'[/bold red]"
        )
if MC_API_KEY is not None:
    pp("[bold green][SUCCESS] MC API Key found.[/bold green]")
# The client is created even when no key was found, so later cells still
# have a `search_api` name to reference.
search_api = mediacloud.api.SearchApi(MC_API_KEY)
# pp(f"[gray][INFO] Using Media Cloud python client v{version('mediacloud')}[/gray]")

## Listing Stories

Story counts are fine, but often what you really want is the stories themselves. Note that **we cannot provide story content** due to copyright restrictions. However, you can get a list of all the URLs and then fetch them yourself. We can also return word counts down to the story level (see the "language" notebook for more info on that).

In [3]:
from email.mime import base
import requests


def qury_media_id(media_name: str = "nytimes"):
    """Look up Media Cloud source records whose name matches ``media_name``.

    Sends a GET request to the Media Cloud sources directory endpoint, asking
    for at most three matches.

    Args:
        media_name: Name (or name fragment) of the media source to search for.

    Returns:
        The ``"results"`` list from the JSON response when the server answers
        with HTTP 200; ``None`` when the request fails or any other status
        code is returned (a message is printed in both failure cases).
    """
    try:
        response = requests.get(
            url="https://search.mediacloud.org/api/sources/sources/",
            params={
                "limit": "3",
                "name": media_name,
            },
            # Authenticate with the notebook's API key instead of a browser
            # session cookie pasted into the source: a cookie expires and is
            # a credential that must not be committed.
            headers={
                "Authorization": f"Token {MC_API_KEY}",
                "Accept": "application/json",
            },
            # Without a timeout a stalled connection blocks the cell forever.
            timeout=30,
        )
        if response.status_code == 200:
            return response.json()["results"]
        pp(f"[red]ERROR: {response.status_code}[/red]")
        return None
    except requests.exceptions.RequestException:
        pp("[red]HTTP Request failed[/red]")
        return None


# Smoke test: look up the source record(s) whose name matches "Sean Hannity".
qury_media_id("Sean Hannity")

[{'id': 28136,
  'name': 'hannity.com',
  'url_search_string': None,
  'label': 'The Sean Hannity Show',
  'homepage': 'http://www.hannity.com/',
  'notes': None,
  'platform': 'online_news',
  'stories_per_week': 56,
  'first_story': None,
  'created_at': '2022-12-23T17:43:28.547804Z',
  'modified_at': '2024-08-31T13:46:53.492705Z',
  'pub_country': None,
  'pub_state': None,
  'primary_language': 'en',
  'media_type': None,
  'collection_count': 9}]

In [4]:
# Fetch every story matching the query over the whole date range, one page
# at a time, until the API stops handing back a pagination token.
# The inner double quotes make each name an exact-phrase match; without them
# the individual words are searched separately and unrelated stories match.
my_query = '"Li Qiang" OR "Qiang Li"'
start_date = dt.date(2023, 1, 1)
end_date = dt.date(2024, 9, 1)
all_stories = []
US_NATIONAL_COLLECTION = 34412234
more_stories = True
pagination_token = None

while more_stories:
    page, pagination_token = search_api.story_list(
        my_query,
        start_date,
        end_date,
        collection_ids=[US_NATIONAL_COLLECTION],
        pagination_token=pagination_token,
    )
    all_stories += page
    # A missing token means the page just received was the last one.
    more_stories = pagination_token is not None


def clean_story_dates(story):
    """Rewrite a story's date fields as ``YYYY-MM-DD`` strings.

    The ``publish_date`` and ``indexed_date`` entries are replaced in place
    with their formatted text, and the same mapping is returned.
    """
    for date_field in ("publish_date", "indexed_date"):
        formatted = story[date_field].strftime("%Y-%m-%d")
        story[date_field] = str(formatted)
    return story


# Turn the date fields of every fetched story into plain strings, report how
# many stories were collected, and preview the first three.
all_stories = list(map(clean_story_dates, all_stories))
pp(f"[bold green]Found {len(all_stories)} stories[/bold green]")
all_stories[:3]

KeyboardInterrupt: 

In [None]:
# Build a DataFrame from the list of story dicts and preview the first three rows.
df_stories = pd.DataFrame(all_stories)
df_stories.head(3)

Unnamed: 0,id,media_name,media_url,title,publish_date,url,language,indexed_date
0,7853cd2a17dea3793aa3f90f7836eae5f80d4d657c2d39...,bbc.co.uk,bbc.co.uk,A quick guide to smoking bans across the world,2023-11-27,https://www.bbc.co.uk/news/uk-67545363,en,2024-08-30
1,0fc0218db50dde7663b1b183e56a05a6b97ec6d8d4b11b...,buzzfeed.com,buzzfeed.com,"Over, Under, Or Accurately Rated ""Evermore"" So...",2023-11-27,https://www.buzzfeed.com/paigeswiftie/evermore...,en,2024-08-23
2,cc1ebfd683325dac32b7cc0bd9553ff2051b60086d5838...,foxnews.com,foxnews.com,"What to know about Legionnaires’ disease, the ...",2023-11-06,https://www.foxnews.com/health/what-know-about...,en,2024-08-15


In [None]:
# Persist the story table as CSV; index=False leaves the row index out of the file.
df_stories.to_csv("../data/stories/stories.li.qiang.by.id.csv", index=False)

## Parallelized by dates

In [21]:
def split_query(
    query: str = "Donald Trump OR Donald J Trump OR RealDonaldTrump OR donald trump OR donald j trump OR realdonaldtrump",
):
    """Reduce a search query to a short lowercase label.

    The label is used elsewhere in this notebook as a folder name and as a
    DataFrame column name, so double quotes (used in queries to mark exact
    phrases) are removed before the label is derived.

    Rules, applied in order to the lowercased, quote-free query:
      1. contains "congress"  -> "congress"
      2. contains "senate"    -> "senate"
      3. contains " or "      -> everything before the first " or "
      4. contains " and "     -> everything before the first " and "
      5. otherwise            -> the first space-separated word

    Args:
        query: The search query string.

    Returns:
        The derived label.
    """
    # Quotes would otherwise end up inside directory and column names.
    query = query.lower().replace('"', "")

    for keyword in ("congress", "senate"):
        if keyword in query:
            return keyword

    for operator in (" or ", " and "):
        if operator in query:
            return query.split(operator)[0]

    # With no space present this is simply the whole query.
    return query.split(" ")[0]


def query_by_ids(
    query: str = "the",
    start_date: dt.date = dt.date(2023, 11, 1),
    end_date: dt.date = dt.date(2023, 12, 1),
):
    """Fetch all stories matching ``query`` in a date window and save them as CSV.

    Pages through ``search_api.story_list`` for collection 34412234 until no
    pagination token is returned, converts each story's ``publish_date`` and
    ``indexed_date`` to ``YYYY-MM-DD`` strings, and writes the result to
    ``../data/stories/by.ids/<label>/<start_date>.csv`` where ``<label>`` is
    ``split_query(query)``.

    Args:
        query: Search query passed to the API.
        start_date: First day of the window.
        end_date: Last day of the window.

    Returns:
        The number of stories fetched. Any exception raised while fetching or
        saving is printed and 0 is returned, so a 0 may mean either "no
        matches" or "the call failed".
    """
    data_folder = f"../data/stories/by.ids/{split_query(query)}"
    try:
        all_stories = []
        pagination_token = None
        while True:
            page, pagination_token = search_api.story_list(
                f"{query}",
                start_date,
                end_date,
                collection_ids=[34412234],
                pagination_token=pagination_token,
            )
            all_stories += page
            # A missing token means the page just received was the last one.
            if pagination_token is None:
                break

        # Store dates as plain strings so they serialise predictably.
        for story in all_stories:
            for date_field in ("publish_date", "indexed_date"):
                story[date_field] = str(story[date_field].strftime("%Y-%m-%d"))

        pp(f"[bold green]Found {len(all_stories)}  by ids.[/bold green]")
        # exist_ok avoids the check-then-create race when several parallel
        # workers reach this point together; previously the loser raised
        # FileExistsError, which was swallowed below and reported as 0 stories
        # without writing the CSV.
        os.makedirs(data_folder, exist_ok=True)
        pd.DataFrame(all_stories).to_csv(
            f"{data_folder}/{str(start_date)}.csv",
            index=False,
        )
        return len(all_stories)
    except Exception as e:
        pp(f"[red]ERROR: {e}[/red], pass")
        return 0


# split_query("Li Qiang OR Qiang Li")
# Trial run using query_by_ids' default date window (2023-11-01 to 2023-12-01);
# the value shown is the number of stories it fetched.
# NOTE(review): the names are not wrapped in double quotes, so they are
# presumably not matched as exact phrases — confirm against the Media Cloud
# query syntax.
query_by_ids("Li Qiang OR Qiang Li")

1934

In [22]:
from datetime import timedelta

# One row per calendar day from 2023-01-01 up to (and including) the day
# before 2024-09-01. "D" is the canonical daily frequency alias; the
# lowercase spelling is deprecated in newer pandas releases.
dates = pd.date_range(
    start=dt.date(2023, 1, 1),
    end=dt.date(2024, 9, 1) - timedelta(days=1),
    freq="D",
)
df_dates = pd.DataFrame(dates, columns=["date"])
# Keep plain datetime.date objects rather than pandas timestamps.
df_dates["date"] = df_dates["date"].dt.date
pp(f"[bold green] {len(df_dates)} dates[/bold green]")
df_dates.head(3)

Unnamed: 0,date
0,2023-01-01
1,2023-01-02
2,2023-01-03


In [23]:
# Queries to run for every date; each one becomes a column of per-day story counts.
# NOTE(review): the names are not wrapped in double quotes, so they are
# presumably not matched as exact phrases — confirm against the Media Cloud
# query syntax.
queries = ["Li Qiang OR Qiang Li"]

In [25]:
# For each query, fetch the stories of every single day in parallel and record
# the per-day story count in a column labelled by split_query(query).
for query in tqdm(queries):
    daily_counts = df_dates["date"].parallel_apply(
        lambda day: query_by_ids(query=query, start_date=day, end_date=day)
    )
    df_dates[f"{split_query(query)}"] = daily_counts
df_dates.head()

  0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=153), Label(value='0 / 153'))), HB…

100%|██████████| 1/1 [04:06<00:00, 246.34s/it]


Unnamed: 0,date,li qiang
0,2023-01-01,0
1,2023-01-02,33
2,2023-01-03,51
3,2023-01-04,56
4,2023-01-05,63


In [None]:
import