Media Cloud: Measuring Attention
================================
- Reference: https://github.com/mediacloud/api-tutorial-notebooks/blob/main/MC02%20-%20attention.ipynb
- Media Ids Query: https://search.mediacloud.org/sources/1

In [1]:
# Set up your API key and import needed things
import os, mediacloud.api
from importlib.metadata import version

# from dotenv import load_dotenv
import datetime as dt
from IPython.display import JSON, display
import bokeh.io
from rich import print as pp
import json
from tqdm import tqdm
import pandas as pd

tqdm.pandas()
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=False, nb_workers=4, verbose=0)
bokeh.io.reset_output()
bokeh.io.output_notebook(hide_banner=True)

# Resolve the Media Cloud API key: the MEDIA_CLOUD_API_KEY environment variable
# takes precedence; otherwise fall back to a key file kept outside the notebook.
MC_API_KEY_FILE = "../config/media.cloud.key"
MC_API_KEY = os.getenv("MEDIA_CLOUD_API_KEY")
if not MC_API_KEY:  # unset *or* empty: an empty string is not a usable key
    try:
        with open(MC_API_KEY_FILE) as f:
            # A blank / whitespace-only file counts as "no key".
            MC_API_KEY = f.read().strip() or None
    except FileNotFoundError:
        MC_API_KEY = None

if MC_API_KEY:
    pp("[bold green][SUCCESS] MC API Key found.[/bold green]")
else:
    # Report the path that was actually tried, so the hint cannot drift from the code.
    pp(
        f"[bold red][ERROR] MC API key not found. Check ENV 'MEDIA_CLOUD_API_KEY' or file '{MC_API_KEY_FILE}'[/bold red]"
    )
# The client is constructed even when no key was found (same as before this change).
search_api = mediacloud.api.SearchApi(MC_API_KEY)
# pp(f"[gray][INFO] Using Media Cloud python client v{version('mediacloud')}[/gray]")

## Listing Stories

Story counts are fine, but often what you really want is the stories themselves. Note that **we cannot provide story content** due to copyright restrictions. However, you can get a list of all the URLs and then fetch them yourself. We can also return word counts down to the story level (see the "language" notebook for more info on that).

In [2]:
from email.mime import base
import requests


def qury_media_id(media_name: str = "nytimes"):
    """Look up Media Cloud source records by name.

    Sends a GET request to the Media Cloud ``sources`` endpoint, asking for at
    most three sources matching ``media_name``.

    Args:
        media_name: Name to search for, passed as the ``name`` query parameter.

    Returns:
        The ``"results"`` entry of the JSON response when the server answers
        with HTTP 200. ``None`` when the server answers with any other status
        or the request fails; an error line is printed in both cases.
    """
    try:
        response = requests.get(
            url="https://search.mediacloud.org/api/sources/sources/",
            params={
                "limit": "3",
                "name": media_name,
            },
            # Authenticate with the API key loaded at the top of the notebook
            # instead of a browser session cookie pasted into the source: a
            # cookie is a personal credential and stops working once it expires.
            # NOTE(review): "Token <key>" is the scheme the mediacloud client
            # sends for this API - confirm against the client source.
            headers={
                "Authorization": f"Token {MC_API_KEY}",
            },
            # Never let a stalled connection hang the notebook indefinitely.
            timeout=30,
        )
        if response.status_code == 200:
            return response.json()["results"]
        pp(f"[red]ERROR: {response.status_code}[/red]")
    except requests.exceptions.RequestException:
        pp("[red]HTTP Request failed[/red]")
    return None


# test
qury_media_id("Sean Hannity")

[{'id': 28136,
  'name': 'hannity.com',
  'url_search_string': None,
  'label': 'The Sean Hannity Show',
  'homepage': 'http://www.hannity.com/',
  'notes': None,
  'platform': 'online_news',
  'stories_per_week': 56,
  'first_story': None,
  'created_at': '2022-12-23T17:43:28.547804Z',
  'modified_at': '2024-08-31T13:46:53.492705Z',
  'pub_country': None,
  'pub_state': None,
  'primary_language': 'en',
  'media_type': None,
  'collection_count': 9}]

In [32]:
# Load the source list exported from the "United States - National" collection
# and drop the metadata columns that are not used below.
# NOTE(review): this absolute path is machine-specific; it presumably points at
# the same directory as the relative "../data/meta" output path below - confirm
# and switch to the relative form.
df_media_id = pd.read_csv(
    "/home/rongxin/data/sync/trump-exposure/scripts/media-cloud-search/data/meta/Collection-34412234-United States - National-sources-20240907062857.csv"
).drop(
    columns=[
        "url_search_string",
        "notes",
        "platform",
        "first_story",
        "pub_country",
        "media_type",
        "pub_state",
    ]
)
df_media_id["primary_language"] = df_media_id["primary_language"].fillna("")

# Short outlet codes keyed by the first label of the source's domain name.
# "theguardian" used to appear twice ("guardian", then "guard"); Python keeps
# only the last duplicate key, so "guard" is the value that was in effect.
name_mapping = {
    "abcnews": "abc",
    "cbsnews": "cbs",
    "huffingtonpost": "hp",
    "nypost": "nyp",
    "nytimes": "nyt",
    "usatoday": "usa",
    "washingtonpost": "wp",
    "breitbart": "bb",
    "businessinsider": "bi",
    "dailycaller": "caller",
    "foxnews": "fox",
    "nbcnews": "nbc",
    "politico": "pol",
    "buzzfeed": "buzz",
    "theguardian": "guard",
    "newsweek": "week",
}

# First label of the lower-cased name (e.g. "abcnews.go.com" -> "abcnews"),
# then mapped to its short code where one is defined. Computed once with the
# vectorised .str accessor, which also leaves missing names as NaN instead of
# raising.
df_media_id["name_clean"] = (
    df_media_id["name"].str.lower().str.split(".").str[0].replace(name_mapping)
)
# -1 marks sources with no stories_per_week value so the column can be integer.
df_media_id["stories_per_week"] = df_media_id["stories_per_week"].fillna(-1).astype(int)
df_media_id.to_csv(
    "../data/meta/Collection-34412234-United States - National-sources-20240907062857-cleaned.csv",
    index=False,
)
# df_media_id.head(1)

In [33]:
# Ideology table: its "media" column is lower-cased and the text after "u_"
# becomes the join key shared with the cleaned source list.
df_media_ideo = pd.read_csv(
    "/home/rongxin/data/sync/trump-exposure/scripts/media-cloud-search/data/ideology/media_ideo.csv"
)
df_media_ideo["name_clean"] = df_media_ideo["media"].parallel_apply(
    lambda x: x.lower().split("u_")[1]
)

# Join sources with the ideology table, discard every row that has a missing
# value in any column, and keep only the identifying columns.
# NOTE(review): the outer join + dropna() also drops matched outlets that merely
# lack unrelated metadata - confirm that this is intended.
df_media_id = (
    pd.merge(df_media_id, df_media_ideo, on="name_clean", how="outer")
    .dropna()
    .reset_index(drop=True)
    .astype({"id": int})[["id", "name", "medianame", "name_clean"]]
)

# Outlets added by hand with known Media Cloud ids.
# Washington Examiner ("exam") and The Hill ("hill") are intentionally left
# out: no records on media cloud.
medium_not_listed = pd.DataFrame(
    [
        (18897, "bbc.co.uk", "BBC News", "bbc"),
        (28136, "hannity.com", "Sean Hannity", "sean"),
    ],
    columns=["id", "name", "medianame", "name_clean"],
)

df_media_id = pd.concat([df_media_id, medium_not_listed])
df_media_id = df_media_id.rename(columns={"medianame": "media"})
df_media_id = df_media_id.sort_values(by="name").reset_index(drop=True)

pp(
    f"[bold yellow] [Warning] {len(df_media_id)} outlets; Washington Examiner and The Hill have no records on media cloud.[/bold yellow]"
)
df_media_id.to_csv("../data/meta/cloud.ideo.mapping.csv", index=False)
df_media_id.head(3)

Unnamed: 0,id,name,media,name_clean
0,19260,abcnews.go.com,ABC,abc
1,18897,bbc.co.uk,BBC News,bbc
2,19334,breitbart.com,Breitbart,bb


Create a function to detect whether a media id returns any stories from Media Cloud:

In [37]:
def check_media_id_working(
    id: int = 2, query='"climate change"', start_date=None, end_date=None
):
    """Return the story count Media Cloud reports for a single source id.

    Args:
        id: Media Cloud source id to probe.
        query: Query string passed to ``search_api.story_count``.
        start_date: First day of the probe window; defaults to 2023-11-01.
        end_date: Last day of the probe window; defaults to 2023-12-01.

    Returns:
        The ``"total"`` field of the count result as an ``int``, or ``0`` when
        the request raises (the error is printed, not re-raised).

    NOTE(review): the result's "total" field is read here, not a query-specific
    field - confirm against the mediacloud client docs that this is the count
    you want.
    """
    if start_date is None:
        start_date = dt.date(2023, 11, 1)
    if end_date is None:
        end_date = dt.date(2023, 12, 1)
    try:
        result = search_api.story_count(query, start_date, end_date, source_ids=[id])
        return int(result["total"])
    except Exception as e:
        # Best-effort probe: report the failure and treat the id as returning nothing.
        pp(f"[red]ERROR: {e}[/red]")
        return 0


check_media_id_working(1, '"the"')

13423010

Check if ids work:

In [38]:
# Probe every outlet id with the default query and window, then list the
# outlets for which Media Cloud reported nothing.
df_media_id["total"] = df_media_id["id"].progress_apply(check_media_id_working)
df_media_id.loc[df_media_id["total"].le(0)]

100%|██████████| 26/26 [00:09<00:00,  2.88it/s]


Unnamed: 0,id,name,media,name_clean,total
10,314,huffingtonpost.com,HuffPost,hp,0


In [40]:
# Remove the outlet whose short code is "hp", then overwrite the mapping file.
df_media_id = df_media_id.loc[df_media_id["name_clean"].ne("hp")]

df_media_id.to_csv("../data/meta/cloud.ideo.mapping.csv", index=False)
pp(
    f"[bold green] [SUCCESS] {len(df_media_id)} outlets; Washington Examiner, The Hill, and HuffPost have no records on media cloud.[/bold green]"
)

Export media ids:

In [42]:
# Ascending list of the remaining outlet ids, used as source_ids in later queries.
media_ids = df_media_id["id"].sort_values().tolist()
media_ids[:3]

[1, 2, 4]

In [53]:
# Fetch every story matching the query between start_date and end_date for the
# selected sources, following pagination tokens until none is returned.
my_query = '"the"'  # the double quotes mark the term as a whole phrase
start_date, end_date = dt.date(2023, 11, 1), dt.date(2023, 12, 1)
US_NATIONAL_COLLECTION = 34412234  # only used by the collection_ids alternative
all_stories = []
pagination_token = None
more_stories = True

while more_stories:
    # Alternative source filters: collection_ids=[US_NATIONAL_COLLECTION]
    # or source_ids=sources.
    page, pagination_token = search_api.story_list(
        my_query,
        start_date,
        end_date,
        source_ids=media_ids,
        pagination_token=pagination_token,
    )
    all_stories.extend(page)
    # Keep looping for as long as a pagination token comes back.
    more_stories = pagination_token is not None


def clean_story_dates(story):
    """Convert a story's date fields to ``YYYY-MM-DD`` strings, in place.

    Args:
        story: Mapping with ``"publish_date"`` and ``"indexed_date"`` entries.

    Returns:
        The same ``story`` object. Values that offer ``strftime`` (dates and
        datetimes) are replaced by their formatted string; anything else, such
        as ``None`` or an already-formatted string, is left unchanged so the
        function can safely be applied to a story more than once.
    """
    for field in ("publish_date", "indexed_date"):
        value = story[field]
        if hasattr(value, "strftime"):
            story[field] = value.strftime("%Y-%m-%d")
    return story


# Normalise the date fields of every fetched story, report the count, and
# preview the first three records.
all_stories = list(map(clean_story_dates, all_stories))
pp(f"[bold green]Found {len(all_stories)} stories[/bold green]")
all_stories[:3]

In [None]:
# One row per fetched story; preview the first three.
df_stories = pd.DataFrame(data=all_stories)
df_stories.head(n=3)

Unnamed: 0,id,media_name,media_url,title,publish_date,url,language,indexed_date
0,ed7820ec86e026771655874c0bba915ef2dbf721e5ff43...,bbc.co.uk,bbc.co.uk,US election 2024: A really simple guide,2023-11-02,https://www.bbc.co.uk/news/world-us-canada-672...,en,2024-04-21
1,d1cf1f89237aaff12af829813ded844caf68abf7c459da...,nbcnews.com,nbcnews.com,Alaska man charged with threatening to kidnap ...,2023-11-01,https://www.nbcnews.com/politics/congress/alas...,en,2024-02-17
2,d2b0d9be6544be30b08263bd766809cc43ec5eeaea547e...,businessinsider.com,businessinsider.com,Bad news keeps coming for Kyiv as problems far...,2023-11-01,https://www.businessinsider.com/bad-news-ukrai...,en,2024-02-17
3,85fb7df5412912edca11b777f3bb2db87c8aaa288fb027...,abcnews.go.com,abcnews.go.com,Donald Trump Jr. to testify in Trump Organizat...,2023-11-01,https://abcnews.go.com/US/donald-trump-jr-test...,en,2024-02-17
4,1bdc420d5fe2af96fbe1ddf5af9642297083219ec8a3d2...,abcnews.go.com,abcnews.go.com,Video Microsoft launches AI work assistant,2023-11-01,https://abcnews.go.com/Technology/video/micros...,en,2024-02-17


In [None]:
# Number of fetched stories per outlet, most frequent first.
df_stories.value_counts("media_name")

media_name
washingtonpost.com     59
newsweek.com           58
foxnews.com            38
cbsnews.com            28
nypost.com             27
dailycaller.com        27
abcnews.go.com         26
politico.com           23
msnbc.com              22
nbcnews.com            16
businessinsider.com    14
wsj.com                13
pbs.org                12
bbc.co.uk               9
breitbart.com           5
time.com                3
vox.com                 1
vice.com                1
Name: count, dtype: int64

In [None]:
# Persist the fetched story list (without the row index) for later use.
df_stories.to_csv(
    path_or_buf="../data/stories/stories.the.all.by.id.csv",
    index=False,
)

## Parallelized by dates

In [None]:
# The seven consecutive days 2021-01-01 .. 2021-01-07.
dates = [dt.date(2021, 1, 1) + dt.timedelta(days=offset) for offset in range(7)]