Media Cloud: Measuring Attention
================================
- Reference: https://github.com/mediacloud/api-tutorial-notebooks/blob/main/MC02%20-%20attention.ipynb
- Media Ids Query: https://search.mediacloud.org/sources/1

In [43]:
# Set up your API key and import needed things
import os, mediacloud.api
from importlib.metadata import version

# from dotenv import load_dotenv
import datetime as dt
from IPython.display import JSON, display
import bokeh.io
from rich import print as pp
import json
from tqdm import tqdm
import pandas as pd

tqdm.pandas()
from pandarallel import pandarallel

# Start the pandarallel worker pool and route Bokeh output to the notebook.
pandarallel.initialize(progress_bar=True, nb_workers=8, verbose=0)
bokeh.io.reset_output()
bokeh.io.output_notebook(hide_banner=True)

# Resolve the Media Cloud API key: the environment variable wins, otherwise
# fall back to a local key file. The path lives in one constant so the error
# message below always names the file that was actually tried.
MC_API_KEY_FILE = "../config/media.cloud.key"
MC_API_KEY = os.getenv("MEDIA_CLOUD_API_KEY")
if not MC_API_KEY:
    # An unset OR empty environment variable both count as "no key".
    try:
        with open(MC_API_KEY_FILE) as f:
            MC_API_KEY = f.read().strip()
    except FileNotFoundError:
        MC_API_KEY = None

if MC_API_KEY:
    pp("[bold green][SUCCESS] MC API Key found.[/bold green]")
else:
    # Best-effort: report the problem but still build the client so the rest
    # of the cell does not raise; API calls will fail until a key is supplied.
    pp(
        f"[bold red][ERROR] MC API key not found. Check ENV 'MEDIA_CLOUD_API_KEY' or file '{MC_API_KEY_FILE}'[/bold red]"
    )
search_api = mediacloud.api.SearchApi(MC_API_KEY)
# pp(f"[gray][INFO] Using Media Cloud python client v{version('mediacloud')}[/gray]")

## Listing Stories

Story counts are fine, but often what you really want is the stories themselves. Note that **we cannot provide story content** due to copyright restrictions. However, you can get a list of all the URLs and then fetch them yourself. We can also return word counts down to the story level (see the "language" notebook for more info on that).

In [33]:
from email.mime import base
import requests

# curl 'https://search.mediacloud.org/api/sources/sources/?limit=100&name=nytimes' \
#   -H 'accept: */*' \
#   -H 'accept-language: zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7' \
#   -H 'baggage: sentry-environment=prod,sentry-public_key=8fad60f457894e6485492dadaf593a55,sentry-trace_id=f19cb62c5a18487f9a05b2ebcc6de967' \
#   -H 'cookie: csrftoken=qqQVYfPizRLDITAtYCMn4ShmotfrK69T; sessionid=nk9ykemged6ukcvfjgfsg63p8l93p7ra' \
#   -H 'dnt: 1' \
#   -H 'priority: u=1, i' \
#   -H 'referer: https://search.mediacloud.org/directory' \
#   -H 'sec-ch-ua: "Not;A=Brand";v="24", "Chromium";v="128"' \
#   -H 'sec-ch-ua-mobile: ?0' \
#   -H 'sec-ch-ua-platform: "macOS"' \
#   -H 'sec-fetch-dest: empty' \
#   -H 'sec-fetch-mode: cors' \
#   -H 'sec-fetch-site: same-origin' \
#   -H 'sentry-trace: f19cb62c5a18487f9a05b2ebcc6de967-b4a3cdbb323d7396' \
#   -H 'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36' \
#   -H 'x-csrftoken: qqQVYfPizRLDITAtYCMn4ShmotfrK69T'


def qury_media_id(media_name: str = "nytimes", limit: int = 3, timeout: float = 30.0):
    """Look up Media Cloud source records by name.

    Sends a GET request to the Media Cloud sources endpoint and returns the
    ``results`` entry of the JSON response.

    Args:
        media_name: Value sent as the ``name`` query parameter.
        limit: Value sent as the ``limit`` query parameter (defaults to 3,
            the value that was previously hard-coded).
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The ``results`` value of the JSON payload when the server answers
        with HTTP 200; ``None`` when the status is anything else or the
        request itself fails (an error line is printed in both cases).
    """
    try:
        response = requests.get(
            url="https://search.mediacloud.org/api/sources/sources/",
            params={
                "limit": str(limit),
                "name": media_name,
            },
            headers={
                # NOTE(review): hard-coded browser session cookie. It is a
                # credential and will expire; consider loading it from the
                # environment or a config file instead of committing it.
                "Cookie": "csrftoken=qqQVYfPizRLDITAtYCMn4ShmotfrK69T; sessionid=nk9ykemged6ukcvfjgfsg63p8l93p7ra",
            },
            timeout=timeout,
        )
        if response.status_code == 200:
            return response.json()["results"]
        pp(f"[red]ERROR: {response.status_code}[/red]")
    except requests.exceptions.RequestException:
        # Covers connection errors and timeouts raised by requests.
        pp("[red]HTTP Request failed[/red]")
    # Explicit failure value so callers can test `is None`.
    return None


# Smoke test: look up the source records whose name matches "nytimes".
# The cell output below is the return value of this call.
qury_media_id("nytimes")

[{'id': 1,
  'name': 'nytimes.com',
  'url_search_string': '',
  'label': 'New York Times',
  'homepage': 'http://nytimes.com',
  'notes': None,
  'platform': 'online_news',
  'stories_per_week': 1245,
  'first_story': None,
  'created_at': '2022-12-23T17:43:28.547804Z',
  'modified_at': '2024-08-31T13:43:19.039389Z',
  'pub_country': 'USA',
  'pub_state': 'US-NY',
  'primary_language': None,
  'media_type': 'print_native',
  'collection_count': 45},
 {'id': 111803,
  'name': 'nytimes.com',
  'url_search_string': 'cn.nytimes.com',
  'label': 'cn.nytimes.com',
  'homepage': 'http://cn.nytimes.com/',
  'notes': None,
  'platform': 'online_news',
  'stories_per_week': 24,
  'first_story': None,
  'created_at': '2022-12-23T17:43:28.547804Z',
  'modified_at': '2024-08-31T13:52:13.435650Z',
  'pub_country': 'USA',
  'pub_state': 'US-NY',
  'primary_language': 'zh',
  'media_type': 'print_native',
  'collection_count': 3},
 {'id': 897236,
  'name': 'www.nytimes.com',
  'url_search_string': No

In [48]:
# Source-list CSV; judging by the file name it is an export of Media Cloud
# collection 34412234 ("United States - National") — TODO confirm provenance.
# NOTE(review): absolute, machine-specific path; point this at your own copy.
MEDIA_SOURCES_CSV = "/home/rongxin/data/sync/trump-exposure/scripts/media-cloud-search/data/meta/Collection-34412234-United States - National-sources-20240907062857.csv"

# Load the sources and drop the columns this notebook does not use.
df_media_id = pd.read_csv(MEDIA_SOURCES_CSV).drop(
    columns=[
        "url_search_string",
        "notes",
        "platform",
        "first_story",
        "pub_country",
        "media_type",
        "pub_state",
    ]
)
# Missing language becomes an empty string.
df_media_id["primary_language"] = df_media_id["primary_language"].fillna("")
# Short name: lower-case, keep everything before the first ".com".
# Vectorized string ops replace the per-row parallel lambda: no worker
# processes for a trivial transform, and a missing name stays NaN instead of
# raising. `partition` always treats ".com" literally (no regex), matching
# the behaviour of `x.split(".com")[0]`.
df_media_id["name"] = df_media_id["name"].str.lower().str.partition(".com")[0]
# -1 marks "unknown" so the column can be cast to a plain int dtype.
df_media_id["stories_per_week"] = df_media_id["stories_per_week"].fillna(-1).astype(int)
df_media_id.head()

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=32), Label(value='0 / 32'))), HBox…

Unnamed: 0,id,name,label,homepage,stories_per_week,primary_language
0,1,nytimes,New York Times,http://nytimes.com,1245,
1,2,washingtonpost,Washington Post,http://washingtonpost.com,1940,
2,3,csmonitor,Christian Science Monitor,http://csmonitor.com,93,en
3,4,usatoday,USA Today,http://www.usatoday.com,1422,
4,6,latimes,LA Times,http://www.latimes.com/,1310,


In [30]:
# Fetch every story matching the query between start_date and end_date,
# following the pagination token until the API stops returning one.
US_NATIONAL_COLLECTION = 34412234
sources = [1, 4, 111803, 897236]
my_query = '"trump"'  # the double quotes ask for the whole phrase
start_date = dt.date(2023, 11, 1)
end_date = dt.date(2023, 11, 2)

all_stories = []
pagination_token = None
more_stories = True
while more_stories:
    # Each call returns one page of stories plus the token for the next page.
    page, pagination_token = search_api.story_list(
        my_query,
        start_date,
        end_date,
        collection_ids=[US_NATIONAL_COLLECTION],
        source_ids=sources,
        pagination_token=pagination_token,
    )
    all_stories.extend(page)
    # A missing token ends the loop.
    more_stories = pagination_token is not None


def clean_story_dates(story):
    """Serialise one story record to a JSON string.

    The ``publish_date`` and ``indexed_date`` entries are overwritten in place
    with their ``YYYY-MM-DD`` text form (date/datetime objects are not JSON
    serialisable), then the whole record is dumped with ``json.dumps``.

    Args:
        story: Mapping holding at least ``publish_date`` and ``indexed_date``
            values that provide ``strftime``. It is mutated by this call.

    Returns:
        The JSON text of the updated record.
    """
    for date_field in ("publish_date", "indexed_date"):
        story[date_field] = story[date_field].strftime("%Y-%m-%d")
    return json.dumps(story)


# Turn each fetched story into its JSON-string form, report how many there
# are, and leave the list as the cell's displayed value.
all_stories = list(map(clean_story_dates, all_stories))
pp(f"[bold green]Found {len(all_stories)} stories[/bold green]")
all_stories

['{"id": "21498eb52c9c22bc16f0002a025abf9538fd04623f63cda8368c70943b85c967", "media_name": "npr.org", "media_url": "npr.org", "title": "Eric and Donald Trump Jr. Take the Stand", "publish_date": "2023-11-02", "url": "https://www.npr.org/2023/11/02/1198908958/consider-this-from-npr-draft-11-02-2023", "language": "en", "indexed_date": "2024-05-03"}',
 '{"id": "13ff9c66a66e93172de8abc934633226cf3da83a872946d2149adce765306ad3", "media_name": "npr.org", "media_url": "npr.org", "title": "Gaza War Escalates, Pakistan Deportation Deadline, Trump Organization Trial", "publish_date": "2023-11-01", "url": "https://www.npr.org/2023/11/01/1198909020/up-first-draft-11-01-2023", "language": "en", "indexed_date": "2024-05-02"}',
 '{"id": "32c6f9332d6625be0e27ca2deb6b777879b19f6654bd30de3c7416b4a85efb79", "media_name": "recorder.com", "media_url": "recorder.com", "title": "My Turn: Time for words", "publish_date": "2023-11-01", "url": "https://www.recorder.com/my-turn-Charney-Time-For-Words-49494418", 

### Writing a CSV of Story Data

What you probably want is a CSV of all this story data. Here's a quick example of dumping that data to a CSV (like our Search tool does).

In [None]:
import csv

# Columns written to the CSV, in order; any other keys on a story are ignored.
fieldnames = [
    "id",
    "publish_date",
    "title",
    "url",
    "language",
    "media_name",
    "media_url",
    "indexed_date",
]
# UTF-8 keeps non-ASCII titles intact regardless of the platform default;
# newline="" lets the csv module control line endings.
with open("story-list.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, extrasaction="ignore")
    writer.writeheader()
    # `clean_story_dates` leaves `all_stories` as JSON *strings*, which
    # DictWriter cannot write (it needs mappings). Decode strings back to
    # dicts and pass through anything that is already a mapping.
    writer.writerows(
        json.loads(s) if isinstance(s, str) else s for s in all_stories
    )

In [None]:
# and let's make sure it worked by loading the CSV back in as a pandas DataFrame
import pandas

# Read the file written above back in and show its first five rows.
df = pandas.read_csv(filepath_or_buffer="story-list.csv")
df.head(n=5)