Media Cloud: Measuring Attention
================================
- Reference: https://github.com/mediacloud/api-tutorial-notebooks/blob/main/MC02%20-%20attention.ipynb
- Media Ids Query: https://search.mediacloud.org/sources/1

In [1]:
# Set up your API key and import needed things
import os, mediacloud.api
from importlib.metadata import version

# from dotenv import load_dotenv
import datetime as dt
from IPython.display import JSON, display
import bokeh.io
from rich import print as pp
import json
from tqdm import tqdm
import pandas as pd

# Register tqdm's `progress_apply` on pandas objects (used later for progress bars).
tqdm.pandas()
from pandarallel import pandarallel

# Register pandarallel's `parallel_apply` on pandas objects, using 4 workers with progress bars.
pandarallel.initialize(progress_bar=True, nb_workers=4, verbose=0)
# Reset bokeh's output state, then send bokeh output to the notebook without the load banner.
bokeh.io.reset_output()
bokeh.io.output_notebook(hide_banner=True)

# Resolve the Media Cloud API key: the environment variable wins, otherwise fall
# back to a key file stored next to the project configuration.
MC_KEY_FILE = "../config/media.cloud.key"

MC_API_KEY = os.getenv("MEDIA_CLOUD_API_KEY")
if MC_API_KEY is None:
    try:
        with open(MC_KEY_FILE) as f:
            MC_API_KEY = f.read().strip()
    except FileNotFoundError:
        # Report the path that was actually tried (the old message named a
        # different path than the one being opened).
        pp(
            f"[bold red][ERROR] MC API key not found. Check ENV 'MEDIA_CLOUD_API_KEY' or file '{MC_KEY_FILE}'[/bold red]"
        )

# Single success message for both sources of the key.
if MC_API_KEY is not None:
    pp("[bold green][SUCCESS] MC API Key found.[/bold green]")

# Client used by every search cell below.
search_api = mediacloud.api.SearchApi(MC_API_KEY)
# pp(f"[gray][INFO] Using Media Cloud python client v{version('mediacloud')}[/gray]")

## Listing Stories

Story counts are fine, but often what you really want is the stories themselves. Note that **we cannot provide story content** due to copyright restrictions. However, you can get a list of all the URLs and then fetch them yourself. We can also return word counts down to the story level (see the "language" notebook for more info on that).

In [2]:
from email.mime import base
import requests


def qury_media_id(media_name: str = "nytimes", limit: int = 3, timeout: float = 30.0):
    """Look up Media Cloud sources whose name matches ``media_name``.

    Args:
        media_name: Name fragment sent as the ``name`` filter of the sources endpoint.
        limit: Maximum number of sources requested (sent as the ``limit`` parameter).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``results`` list of the JSON response when the server answers 200,
        otherwise ``None`` (after printing the status code or a failure message).
    """
    try:
        response = requests.get(
            url="https://search.mediacloud.org/api/sources/sources/",
            params={
                "limit": str(limit),
                "name": media_name,
            },
            # Authenticate with the API token loaded at the top of the notebook
            # instead of a browser session cookie pasted into the source: the
            # cookie is a personal credential and expires.
            headers={
                "Authorization": f"Token {MC_API_KEY}",
            },
            # Without a timeout a stalled connection would block the cell forever.
            timeout=timeout,
        )
        if response.status_code == 200:
            return response.json()["results"]
        pp(f"[red]ERROR: {response.status_code}[/red]")
        return None
    except requests.exceptions.RequestException:
        pp("[red]HTTP Request failed[/red]")
        return None


# Smoke test: look up the sources matching "Sean Hannity" and display the result.
qury_media_id("Sean Hannity")

[{'id': 28136,
  'name': 'hannity.com',
  'url_search_string': None,
  'label': 'The Sean Hannity Show',
  'homepage': 'http://www.hannity.com/',
  'notes': None,
  'platform': 'online_news',
  'stories_per_week': 56,
  'first_story': None,
  'created_at': '2022-12-23T17:43:28.547804Z',
  'modified_at': '2024-08-31T13:46:53.492705Z',
  'pub_country': None,
  'pub_state': None,
  'primary_language': 'en',
  'media_type': None,
  'collection_count': 9}]

In [3]:
# Load the source list exported for Media Cloud collection 34412234
# ("United States - National") and drop the columns this analysis never uses.
# NOTE(review): the path is now relative, like the output path below; this assumes
# the notebook runs from a directory that is a sibling of `data/` - confirm.
df_media_id = pd.read_csv(
    "../data/meta/Collection-34412234-United States - National-sources-20240907062857.csv"
).drop(
    columns=[
        "url_search_string",
        "notes",
        "platform",
        "first_story",
        "pub_country",
        "media_type",
        "pub_state",
    ]
)
df_media_id["primary_language"] = df_media_id["primary_language"].fillna("")

# Short outlet codes keyed by the first label of the lowercased domain name.
# "theguardian" used to appear twice ("guardian", then "guard"); only the last
# entry of a dict literal survives, so "guard" is the value that was in effect.
name_mapping = {
    "abcnews": "abc",
    "cbsnews": "cbs",
    "huffingtonpost": "hp",
    "nypost": "nyp",
    "nytimes": "nyt",
    "usatoday": "usa",
    "washingtonpost": "wp",
    "breitbart": "bb",
    "businessinsider": "bi",
    "dailycaller": "caller",
    "foxnews": "fox",
    "nbcnews": "nbc",
    "politico": "pol",
    "buzzfeed": "buzz",
    "theguardian": "guard",
    "newsweek": "week",
}

# "abcnews.go.com" -> "abcnews" -> "abc". Computed once, with vectorized string
# methods rather than a process pool, which is overkill for a small source list.
df_media_id["name_clean"] = (
    df_media_id["name"].str.lower().str.split(".").str[0].replace(name_mapping)
)
# -1 marks outlets without a stories-per-week figure so the column can be integer.
df_media_id["stories_per_week"] = df_media_id["stories_per_week"].fillna(-1).astype(int)
df_media_id.to_csv(
    "../data/meta/Collection-34412234-United States - National-sources-20240907062857-cleaned.csv",
    index=False,
)
# df_media_id.head(1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

In [4]:
# Load the per-outlet ideology table.
# NOTE(review): the path is now relative, like the output paths in this notebook;
# this assumes the notebook runs from a directory that is a sibling of `data/` - confirm.
df_media_ideo = pd.read_csv("../data/ideology/media_ideo.csv")
# Join key: the segment that follows the first "u_" in the lowercased `media` value.
# Values without "u_" become NaN here and are removed by the dropna() below.
df_media_ideo["name_clean"] = (
    df_media_ideo["media"].str.lower().str.split("u_").str[1]
)

# NOTE(review): an outer merge followed by dropna() keeps only rows that matched
# on both sides AND have no missing value in any column - confirm that dropping
# outlets because of an unrelated empty column is intended.
df_media_id = (
    pd.merge(df_media_id, df_media_ideo, on="name_clean", how="outer")
    .dropna()
    .reset_index(drop=True)
)
df_media_id["id"] = df_media_id["id"].astype(int)
df_media_id = df_media_id[["id", "name", "medianame", "name_clean"]]

# Outlets that the merge above does not provide, added by hand with their Media Cloud ids.
# Washington Examiner and The Hill are left out: no records on media cloud.
medium_not_listed = pd.DataFrame(
    [
        [18897, "bbc.co.uk", "BBC News", "bbc"],
        [28136, "hannity.com", "Sean Hannity", "sean"],
    ],
    columns=["id", "name", "medianame", "name_clean"],
)
df_media_id = (
    pd.concat([df_media_id, medium_not_listed])
    .rename(columns={"medianame": "media"})
    .sort_values(by="name")
    .reset_index(drop=True)
)
pp(
    f"[bold yellow] [Warning] {len(df_media_id)} outlets; Washington Examiner and The Hill have no records on media cloud.[/bold yellow]"
)
df_media_id.to_csv(
    "../data/meta/cloud.ideo.mapping.csv",
    index=False,
)
df_media_id.head(3)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=8), Label(value='0 / 8'))), HBox(c…

Unnamed: 0,id,name,media,name_clean
0,19260,abcnews.go.com,ABC,abc
1,18897,bbc.co.uk,BBC News,bbc
2,19334,breitbart.com,Breitbart,bb


Create a function to detect whether a media id returns any stories from Media Cloud:

In [5]:
def check_media_id_working(
    id: int = 2,
    query='"climate change"',
    start_date: dt.date = dt.date(2023, 11, 1),
    end_date: dt.date = dt.date(2023, 12, 1),
):
    """Count the stories Media Cloud returns for one source id.

    Args:
        id: Media Cloud source id to probe.
        query: Search query; double quotes inside the string mark an exact phrase.
        start_date: First day of the probe window (previously hard-coded).
        end_date: Last day of the probe window (previously hard-coded).

    Returns:
        The ``total`` field of the story-count response as an ``int``.
        Any error is printed and reported as ``0``, so a ``0`` can mean either
        "no stories" or "the request failed".
    """
    try:
        result = search_api.story_count(query, start_date, end_date, source_ids=[id])
        return int(result["total"])
    except Exception as e:
        pp(f"[red]ERROR: {e}[/red]")
        return 0


# Sanity check: count stories matching "the" for source id 1 in the function's date window.
check_media_id_working(1, '"the"')

13423010

Check if ids work:

In [7]:
# Probe every outlet id with the default query and record how many stories came back.
story_totals = df_media_id["id"].progress_apply(check_media_id_working)
df_media_id["total"] = story_totals
# Show the outlets for which the probe returned nothing.
df_media_id.loc[df_media_id["total"] <= 0]

100%|██████████| 26/26 [00:07<00:00,  3.27it/s]


Unnamed: 0,id,name,media,name_clean,total
10,314,huffingtonpost.com,HuffPost,hp,0


In [6]:
# Remove HuffPost (short code "hp") from the outlet table, then save the mapping again.
is_not_huffpost = df_media_id["name_clean"] != "hp"
df_media_id = df_media_id[is_not_huffpost]

df_media_id.to_csv("../data/meta/cloud.ideo.mapping.csv", index=False)
pp(
    f"[bold green] [SUCCESS] {len(df_media_id)} outlets; Washington Examiner, The Hill, and HuffPost have no records on media cloud.[/bold green]"
)

Export media ids:

In [7]:
# Ascending list of the remaining outlet ids, as plain Python values.
media_ids = df_media_id["id"].sort_values().tolist()
media_ids[:3]

[1, 2, 4]

In [53]:
# Fetch every story matching the query for the selected outlets within the date
# window below, following pagination tokens until no further page is reported.
my_query = '"the"'  # the double quotes indicate use of the whole phrase
start_date = dt.date(2023, 11, 1)
end_date = dt.date(2023, 12, 1)
US_NATIONAL_COLLECTION = 34412234
all_stories = []
pagination_token = None
more_stories = True

while True:
    # The search is restricted by source ids here, not by collection.
    page, pagination_token = search_api.story_list(
        my_query,
        start_date,
        end_date,
        source_ids=media_ids,
        pagination_token=pagination_token,
    )
    all_stories.extend(page)
    more_stories = pagination_token is not None
    if not more_stories:
        break


def clean_story_dates(story):
    """Rewrite a story's date fields as ``YYYY-MM-DD`` strings, in place.

    ``publish_date`` and ``indexed_date`` must both support ``strftime``.
    The dict that was passed in is modified and returned.
    """
    for date_field in ("publish_date", "indexed_date"):
        story[date_field] = str(story[date_field].strftime("%Y-%m-%d"))
    return story


# Convert the date fields of every fetched story to strings, then preview the result.
all_stories = list(map(clean_story_dates, all_stories))
pp(f"[bold green]Found {len(all_stories)} stories[/bold green]")
all_stories[:3]

[{'id': '7853cd2a17dea3793aa3f90f7836eae5f80d4d657c2d39cc5f26210a24d66696',
  'media_name': 'bbc.co.uk',
  'media_url': 'bbc.co.uk',
  'title': 'A quick guide to smoking bans across the world',
  'publish_date': '2023-11-27',
  'url': 'https://www.bbc.co.uk/news/uk-67545363',
  'language': 'en',
  'indexed_date': '2024-08-30'},
 {'id': '0fc0218db50dde7663b1b183e56a05a6b97ec6d8d4b11b04fefa86291e3c4005',
  'media_name': 'buzzfeed.com',
  'media_url': 'buzzfeed.com',
  'title': 'Over, Under, Or Accurately Rated "Evermore" Songs Poll',
  'publish_date': '2023-11-27',
  'url': 'https://www.buzzfeed.com/paigeswiftie/evermore-songs-overrated-underrated-poll',
  'language': 'en',
  'indexed_date': '2024-08-23'},
 {'id': 'cc1ebfd683325dac32b7cc0bd9553ff2051b60086d5838b3b7e6aff7f28596bc',
  'media_name': 'foxnews.com',
  'media_url': 'foxnews.com',
  'title': 'What to know about Legionnaires’ disease, the lung infection reported in New Hampshire',
  'publish_date': '2023-11-06',
  'url': 'https:

In [54]:
# Tabulate the fetched story records, one row per story, and preview the first rows.
df_stories = pd.DataFrame(all_stories)
df_stories.head(3)

Unnamed: 0,id,media_name,media_url,title,publish_date,url,language,indexed_date
0,7853cd2a17dea3793aa3f90f7836eae5f80d4d657c2d39...,bbc.co.uk,bbc.co.uk,A quick guide to smoking bans across the world,2023-11-27,https://www.bbc.co.uk/news/uk-67545363,en,2024-08-30
1,0fc0218db50dde7663b1b183e56a05a6b97ec6d8d4b11b...,buzzfeed.com,buzzfeed.com,"Over, Under, Or Accurately Rated ""Evermore"" So...",2023-11-27,https://www.buzzfeed.com/paigeswiftie/evermore...,en,2024-08-23
2,cc1ebfd683325dac32b7cc0bd9553ff2051b60086d5838...,foxnews.com,foxnews.com,"What to know about Legionnaires’ disease, the ...",2023-11-06,https://www.foxnews.com/health/what-know-about...,en,2024-08-15


In [55]:
# Number of fetched stories per outlet.
df_stories.value_counts("media_name")

media_name
cbsnews.com            13368
foxnews.com             7699
bbc.co.uk               7633
wsj.com                 6944
nypost.com              6650
washingtonpost.com      5761
newsweek.com            3815
abcnews.go.com          3749
businessinsider.com     3035
dailycaller.com         2516
buzzfeed.com            2175
breitbart.com           1922
nbcnews.com             1580
politico.com             991
pbs.org                  852
usatoday.com             774
time.com                 575
msnbc.com                514
vice.com                 404
vox.com                  166
hannity.com              132
Name: count, dtype: int64

In [56]:
# Save the fetched story list as CSV, without the row index.
df_stories.to_csv("../data/stories/stories.the.all.by.id.csv", index=False)

## Parallelized by dates

In [9]:
def split_query(
    query: str = "Donald Trump OR Donald J Trump OR RealDonaldTrump OR donald trump OR donald j trump OR realdonaldtrump",
):
    """Reduce a search query to a short lowercase label.

    Rules, applied in order to the lowercased query:
      1. contains "congress" -> "congress"
      2. contains "senate"   -> "senate"
      3. contains " or "     -> text before the first " or "
      4. contains " and "    -> text before the first " and "
      5. contains a space    -> first word
      6. otherwise           -> the lowercased query itself
    """
    lowered = query.lower()

    # Keyword matches take priority over any splitting.
    for keyword in ("congress", "senate"):
        if keyword in lowered:
            return keyword

    # The first separator that occurs decides where the query is cut.
    for separator in (" or ", " and ", " "):
        if separator in lowered:
            return lowered.split(separator)[0]

    return lowered


def query_by_ids(
    ids: list,
    query: str = "the",
    start_date: dt.date = dt.date(2023, 11, 1),
    end_date: dt.date = dt.date(2023, 12, 1),
):
    """Fetch all stories matching ``query`` for the given source ids and save them as CSV.

    The stories are written to
    ``../data/stories/by.ids/<split_query(query)>/<start_date>.csv``.

    Args:
        ids: Media Cloud source ids to search.
        query: Search query passed to the story list endpoint.
        start_date: First day of the search window; also names the CSV file.
        end_date: Last day of the search window.

    Returns:
        Number of stories fetched. Any error during fetching or saving is
        printed and reported as ``0``.
    """
    data_folder = f"../data/stories/by.ids/{split_query(query)}"
    try:
        all_stories = []
        more_stories = True
        pagination_token = None

        # Keep requesting pages until the API stops handing back a pagination token.
        while more_stories:
            page, pagination_token = search_api.story_list(
                query,
                start_date,
                end_date,
                source_ids=ids,
                pagination_token=pagination_token,
            )
            all_stories += page
            more_stories = pagination_token is not None

        # Convert the date fields to plain YYYY-MM-DD strings before writing them out.
        for story in all_stories:
            for date_field in ("publish_date", "indexed_date"):
                story[date_field] = story[date_field].strftime("%Y-%m-%d")

        pp(f"[bold green]Found {len(all_stories)}  by ids.[/bold green]")
        # exist_ok avoids the check-then-create race when several workers
        # create the same folder at the same time.
        os.makedirs(data_folder, exist_ok=True)
        pd.DataFrame(all_stories).to_csv(
            f"{data_folder}/{str(start_date)}.csv",
            index=False,
        )
        return len(all_stories)
    except Exception as e:
        pp(f"[red]ERROR: {e}[/red], pass")
        return 0


def query_by_collections(
    collections=[34412234],
    query: str = "the",
    start_date: dt.date = dt.date(2023, 11, 1),
    end_date: dt.date = dt.date(2023, 12, 1),
):
    """Fetch all stories matching ``query`` in the given collections and save them as CSV.

    The stories are written to
    ``../data/stories/by.collections/<split_query(query)>/<start_date>.csv``.

    Args:
        collections: Media Cloud collection ids to search. The default list is
            only read, never modified.
        query: Search query passed to the story list endpoint.
        start_date: First day of the search window; also names the CSV file.
        end_date: Last day of the search window.

    Returns:
        Number of stories fetched. Errors are not caught here; they propagate
        to the caller.
    """
    data_folder = f"../data/stories/by.collections/{split_query(query)}"
    all_stories = []
    more_stories = True
    pagination_token = None

    # Keep requesting pages until the API stops handing back a pagination token.
    while more_stories:
        page, pagination_token = search_api.story_list(
            query,
            start_date,
            end_date,
            collection_ids=collections,
            pagination_token=pagination_token,
        )
        all_stories += page
        more_stories = pagination_token is not None

    # Convert the date fields to plain YYYY-MM-DD strings before writing them out.
    for story in all_stories:
        for date_field in ("publish_date", "indexed_date"):
            story[date_field] = story[date_field].strftime("%Y-%m-%d")

    # This function is run from parallel workers that all target the same folder.
    # A separate "exists?" check followed by makedirs can fail with FileExistsError
    # when two workers race; exist_ok makes the creation safe.
    os.makedirs(data_folder, exist_ok=True)
    pd.DataFrame(all_stories).to_csv(
        f"{data_folder}/{str(start_date)}.csv",
        index=False,
    )
    pp(f"[bold green]Found {len(all_stories)} stories by collections. [/bold green]")
    return len(all_stories)


# Example: with the default query the label is the text before the first " or ".
split_query()

'donald trump'

In [10]:
from datetime import timedelta

# One row per calendar day from 2020-01-07 up to the day before 2022-01-07.
# "D" is the canonical daily frequency alias; the lowercase "d" spelling is
# deprecated in recent pandas releases.
dates = pd.date_range(
    start=dt.date(2020, 1, 7),
    end=dt.date(2022, 1, 7) - timedelta(days=1),
    freq="D",
)
df_dates = pd.DataFrame(dates, columns=["date"])
# Keep plain `datetime.date` values rather than timestamps.
df_dates["date"] = df_dates["date"].dt.date
pp(f"[bold green] {len(df_dates)} dates[/bold green]")
df_dates.head(3)

Unnamed: 0,date
0,2020-01-07
1,2020-01-08
2,2020-01-09


In [13]:
# Search strings run once per day in the loop below; each string is passed as the
# `query` argument and also becomes a column name of `df_dates`.
# NOTE(review): the multi-word names are not wrapped in double quotes, while an
# earlier cell notes that double quotes indicate use of the whole phrase - confirm
# that these unquoted terms match as intended.
queries = [
    "Donald Trump OR Donald J Trump OR RealDonaldTrump OR donald trump OR donald j trump OR realdonaldtrump",
    "Joe Biden OR joebiden",
    "Mike Pence OR mikepence",
    "Howie Hawkins OR howiehawkins2020",
    "Kamala Harris OR kamalaharris",
    "Jo Jorgensen OR JoForLiberty",
    "Angela Nicole Walker OR Angela Walker",
    "Spike Cohen OR literallyspikecohe",
    "Republican Party OR GOP OR republicanparty OR The Republican",
    "Democratic Party OR Democrats OR TheDemocrats OR The Democratic",
    "2020 United States presidential election OR #election OR presidential election",
    "white house OR whitehouse",
    "United States Congress OR congress",
    "United States Senate OR senate",
]

In [14]:
# For every query, fetch the stories of each single day (start and end date are
# the same day) in parallel, and store the per-day story count in a column named
# after the query. Each call also writes that day's stories to disk.
for query in tqdm(queries):
    df_dates[query] = df_dates["date"].parallel_apply(
        lambda x: query_by_collections(query=query, start_date=x, end_date=x)
    )

# Preview the counts once all queries are done; inside the loop body this
# expression was evaluated and discarded without displaying anything.
df_dates.head()

  0%|          | 0/14 [00:00<?, ?it/s]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=183), Label(value='0 / 183'))), HB…

  7%|▋         | 1/14 [04:58<1:04:38, 298.35s/it]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=183), Label(value='0 / 183'))), HB…

 14%|█▍        | 2/14 [38:42<4:22:42, 1313.56s/it]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=183), Label(value='0 / 183'))), HB…

 21%|██▏       | 3/14 [55:02<3:32:53, 1161.24s/it]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=183), Label(value='0 / 183'))), HB…

 29%|██▊       | 4/14 [58:14<2:09:45, 778.51s/it] 

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=183), Label(value='0 / 183'))), HB…

 36%|███▌      | 5/14 [1:04:57<1:36:29, 643.27s/it]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=183), Label(value='0 / 183'))), HB…

 43%|████▎     | 6/14 [1:08:35<1:06:29, 498.66s/it]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=183), Label(value='0 / 183'))), HB…

 50%|█████     | 7/14 [1:11:47<46:29, 398.45s/it]  

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=183), Label(value='0 / 183'))), HB…

 57%|█████▋    | 8/14 [1:18:49<40:34, 405.69s/it]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=183), Label(value='0 / 183'))), HB…

 64%|██████▍   | 9/14 [1:41:01<57:56, 695.35s/it]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=183), Label(value='0 / 183'))), HB…

 71%|███████▏  | 10/14 [1:59:13<54:31, 817.87s/it]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=183), Label(value='0 / 183'))), HB…

 79%|███████▊  | 11/14 [2:03:26<32:14, 644.93s/it]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=183), Label(value='0 / 183'))), HB…

 86%|████████▌ | 12/14 [2:43:04<39:04, 1172.06s/it]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=183), Label(value='0 / 183'))), HB…

 93%|█████████▎| 13/14 [3:14:57<23:16, 1396.74s/it]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=183), Label(value='0 / 183'))), HB…

100%|██████████| 14/14 [3:45:47<00:00, 967.66s/it] 
100%|██████████| 14/14 [3:45:47<00:00, 967.66s/it] 
