# Batik Resilience Project 

In [2]:
import mediacloud, datetime, time, json, re, random, os, time, csv, mediacloud.api
from datetime import date, timedelta
# Build the Media Cloud API client used by every query cell below. The API key is read from the
# MC_API_KEY environment variable, so this line raises KeyError if that variable is not set.
mc = mediacloud.api.AdminMediaCloud(os.environ['MC_API_KEY'])
# Echo the installed client version so it is recorded in the notebook output.
mediacloud.__version__

'3.7.4'

In [7]:
# Solr filter passed to every query below: only stories whose publish_day falls in this range.
timespan = "publish_day:[2017-01-01T00:00:00Z TO 2019-12-01T00:00:00Z]"
# Media tag ids searched for the US-side counts.
us_media = ['34412234', '38379429']
# Maps each place search term (inserted verbatim into the query text) to the media tag ids that
# are searched for that place. Each id must appear only once per place: the "Source Counts" cell
# pages through every id in the list and sums the results, so a repeated id inflates the total.
places = {
    'Sydney': ['34412282', '38378024'],
    'Chennai': ['34412118', '38379954'],
    'Florida': ['38379430'],
    'Alaska': ['38381315'],
    'Kenya': ['34412126', '38380260'],
    'Singapore': ['34412474'],
    'London': ['34412476', '38381111'],
    'Brazil': ['34412257', '38379250'],
    # '34412204' used to be listed twice here; the duplicate entry has been removed.
    'Islands': ['34412175', '34412204', '34412411', '38381481', '34412109', '34412168', '34412399', '38380877', '34412137'],
    # The double quotes are part of the search term, presumably so both words match as a phrase.
    '"Saudi Arabia"': ['34412050', '38380804'],
    # Places below are disabled.
    # NOTE(review): Rome/Milan share the same ids, as do Amsterdam/Rotterdam - confirm before re-enabling.
    # 'Argentina': ['34412043', '38376412'],
    # 'Paris': ['34412146', '38379799'],
    # 'Amsterdam': ['34412382', '38380454'],
    # 'Rome': ['34412372', '38380117'],
    # 'Milan': ['34412372', '38380117'],
    # 'Madrid': ['34412356', '38002034'],
    # 'Athens': ['34412477', '38379845'],
    #'Mexico': ['34412427', '38380322'],
    # 'Rotterdam': ['34412382', '38380454'],
}
# Maps each topic name to the query clause that is AND-ed with the place term.
# The "..."~40 form is presumably Solr proximity syntax (both words within 40 terms) - TODO confirm.
queries = {
    'security': '"security climate"~40',
    'resilience': '"resilience climate"~40',
    'migration': '(("migration climate"~40) OR  ("migrant climate"~40) OR  ("migrate climate"~40)  OR  ("immigration climate"~40)  OR  ("immigrate climate"~40))'
}

### Compute Batik Length Data

In [262]:
# Build one row per (place, topic) with story counts from the place's own media and from US media.
length_data = []
relevant_query = "{} AND {} AND language:en AND tags_id_media:({})"
total_query = "{} AND language:en AND tags_id_media:({})"
us_media_ids = " ".join(us_media)
for p, m in places.items():
    local_media_ids = " ".join(m)
    # The "total" queries do not involve the topic, so they are fetched once per place rather than
    # once per (place, topic) pair; this avoids repeating identical API calls in the inner loop.
    local_total = mc.storyCount(total_query.format(p, local_media_ids), solr_filter=timespan)['count']
    us_total = mc.storyCount(total_query.format(p, us_media_ids), solr_filter=timespan)['count']
    for t, q in queries.items():
        local_relevant = mc.storyCount(relevant_query.format(p, q, local_media_ids), solr_filter=timespan)['count']
        us_relevant = mc.storyCount(relevant_query.format(p, q, us_media_ids), solr_filter=timespan)['count']
        length_data.append({
            'place': p,
            'topic': t,
            'local_relevant': local_relevant,
            'local_total': local_total,
            'us_relevant': us_relevant,
            'us_total': us_total,
            'combined_relevant': local_relevant + us_relevant,
            'combined_total': local_total + us_total,
        })

In [263]:
import csv, json
# Persist the length data twice: as JSON and as a CSV with one row per (place, topic).
with open('batik-data/batik-length.json', 'w') as f:
    f.write(json.dumps(length_data))
# newline='' is what the csv module requires of files it writes to; without it, platforms that
# translate line endings emit an extra blank line after every row.
with open('batik-data/batik-length.csv', 'w', newline='') as csvfile:
    # A plain csv.writer used to be created here and immediately replaced; only the DictWriter is needed.
    length_writer = csv.DictWriter(csvfile, fieldnames=['place', 'topic', 'local_relevant', 'local_total',
                                                        'us_relevant', 'us_total', 'combined_relevant', 'combined_total'])
    length_writer.writeheader()
    length_writer.writerows(length_data)

### Compute Batik Themes

In [None]:
NYT_LABELS_TAG_SET_ID = 1963  # the tag set all the descriptor tags are in
# Collect, for every (place, topic) pair, the tag counts returned by storyTagCount for that tag set.
theme_query = "{} AND {} AND language:en AND tags_id_media:({})"
theme_data = []
for p, m in places.items():
    media_ids = " ".join(m)
    for t, q in queries.items():
        tag_counts = mc.storyTagCount(
            theme_query.format(p, q, media_ids),
            solr_filter=timespan,
            tag_sets_id=NYT_LABELS_TAG_SET_ID,
        )
        # Keep only the three fields the report needs; the tag's description is stored as its name.
        theme_data.append({
            'place': p,
            'topic': t,
            'top_themes': [
                {'count': entry['count'], 'name': entry['description'], 'tags_id': entry['tags_id']}
                for entry in tag_counts
            ],
        })

In [None]:
import csv
import json
# Persist the theme data as JSON, plus a CSV with one column per (place, topic) pair
# and one row per theme rank (top 20).
with open('batik-data/batik-themes.json', 'w') as f:
    f.write(json.dumps(theme_data))
# newline='' is what the csv module requires of files it writes to.
with open('batik-data/batik-themes.csv', 'w', newline='') as csvfile:
    theme_writer = csv.writer(csvfile)
    theme_writer.writerow(["{} - {}".format(i['place'], i['topic']) for i in theme_data])
    for idx in range(0, 20):
        # Emit an empty cell when a pair has fewer than idx+1 themes. Skipping the pair instead
        # would shift every later value one column to the left, under the wrong header.
        theme_writer.writerow([
            i['top_themes'][idx]['name'] if idx < len(i['top_themes']) else ''
            for i in theme_data
        ])

## Compute Climate Coverage Counts

In [260]:
# One row per place: story counts for general climate coverage, each topic query, and all stories.
climate_data = []
topic_query = '{} AND {} AND language:en AND tags_id_media:({})'
for p, m in places.items():
    media_ids = " ".join(m)
    climate_story_count = mc.storyCount(
        '{} AND ("climate change" OR "global warming") AND language:en AND tags_id_media:({})'.format(p, media_ids),
        solr_filter=timespan)
    # Same query shape for each topic; only the topic clause differs.
    topic_counts = {}
    for topic in ('security', 'resilience', 'migration'):
        topic_counts[topic] = mc.storyCount(topic_query.format(p, queries[topic], media_ids), solr_filter=timespan)
    total_story_count = mc.storyCount(
        "{} AND language:en AND tags_id_media:({})".format(p, media_ids),
        solr_filter=timespan)
    climate_data.append({
        'place': p,
        'security-stories': topic_counts['security']['count'],
        'resilience-stories': topic_counts['resilience']['count'],
        'migration-stories': topic_counts['migration']['count'],
        'climate-change-stories': climate_story_count['count'],
        'total-stories': total_story_count['count'],
    })

In [261]:
import csv, json
# Persist the per-place coverage counts as JSON and as CSV (the report cell reads the CSV back).
with open('batik-data/batik-climate-coverage.json', 'w') as f:
    f.write(json.dumps(climate_data))
# newline='' is what the csv module requires of files it writes to; without it, platforms that
# translate line endings emit an extra blank line after every row.
with open('batik-data/batik-climate-coverage.csv', 'w', newline='') as csvfile:
    coverage_writer = csv.DictWriter(csvfile, fieldnames=["place", "security-stories", "resilience-stories", "migration-stories", "climate-change-stories", "total-stories"])
    coverage_writer.writeheader()
    coverage_writer.writerows(climate_data)

## Compute Climate Coverage Over Time

In [264]:
# One row per place holding the full storyCount responses, split by week, for general climate
# coverage, each topic query, and all stories.
climate_attention_data = []
weekly_topic_query = '{} AND {} AND language:en AND tags_id_media:({})'
for p, m in places.items():
    media_ids = " ".join(m)
    climate_story_count = mc.storyCount(
        '{} AND ("climate change" OR "global warming") AND language:en AND tags_id_media:({})'.format(p, media_ids),
        solr_filter=timespan, split=True, split_period='week')
    # Same query shape for each topic; only the topic clause differs.
    weekly_topic_counts = {}
    for topic in ('security', 'resilience', 'migration'):
        weekly_topic_counts[topic] = mc.storyCount(
            weekly_topic_query.format(p, queries[topic], media_ids),
            solr_filter=timespan, split=True, split_period='week')
    total_story_count = mc.storyCount(
        "{} AND language:en AND tags_id_media:({})".format(p, media_ids),
        solr_filter=timespan, split=True, split_period='week')
    # Unlike the plain coverage counts, the whole response object is stored for each series.
    climate_attention_data.append({
        'place': p,
        'security-stories': weekly_topic_counts['security'],
        'resilience-stories': weekly_topic_counts['resilience'],
        'migration-stories': weekly_topic_counts['migration'],
        'climate-change-stories': climate_story_count,
        'total-stories': total_story_count,
    })

In [265]:
import csv, json
# Persist the weekly attention data as JSON only (no CSV is written for this dataset).
attention_json = json.dumps(climate_attention_data)
with open('batik-data/batik-climate-attention.json', 'w') as f:
    f.write(attention_json)


## Top Words for each Pair

In [183]:
# Collect the word counts for every (place, topic) pair.
word_data = []
word_query = "{} AND {} AND language:en AND tags_id_media:({})"
for p, m in places.items():
    media_ids = " ".join(m)
    # Terms dropped from the results: the lowercased place search term and 'climate'.
    # NOTE(review): for a quoted key such as '"Saudi Arabia"' the lowercased term still contains
    # the quotes and a space, so it presumably never matches a returned term - confirm intent.
    skipped_terms = [p.lower(), 'climate']
    for t, q in queries.items():
        top_words = mc.wordCount(word_query.format(p, q, media_ids), solr_filter=timespan)
        kept_words = [
            {'word': w['term'], 'freq': w['count']}
            for w in top_words
            if w['term'] not in skipped_terms
        ]
        word_data.append({'place': p, 'topic': t, 'top_words': kept_words})

In [185]:
import csv, json
# Persist the word data as JSON, plus a CSV with one column per (place, topic) pair
# and one row per word rank (top 40).
with open('batik-data/batik-words.json', 'w') as f:
    f.write(json.dumps(word_data))
# newline='' is what the csv module requires of files it writes to.
with open('batik-data/batik-words.csv', 'w', newline='') as csvfile:
    word_writer = csv.writer(csvfile)
    word_writer.writerow(["{} - {}".format(i['place'], i['topic']) for i in word_data])
    for idx in range(0, 40):
        # Emit an empty cell when a pair has fewer than idx+1 words. Skipping the pair instead
        # would shift every later value one column to the left, under the wrong header.
        word_writer.writerow([
            i['top_words'][idx]['word'] if idx < len(i['top_words']) else ''
            for i in word_data
        ])

## Source Counts

In [188]:
# Count the media sources behind each place by paging through mediaList for every tag id.
counts = []
for p, m in places.items():
    # Collect distinct media ids rather than summing page lengths, so a source reachable through
    # a repeated tag id, or through more than one of the place's tags, is counted only once.
    place_media_ids = set()
    for tags_id in m:
        last_id = 0
        while True:
            media_page = mc.mediaList(tags_id=tags_id, rows=100, last_media_id=last_id)
            if len(media_page) == 0:
                break  # an empty page means this tag has no more media
            place_media_ids.update(media['media_id'] for media in media_page)
            # The next page starts after the last media id seen on this one.
            last_id = media_page[-1]['media_id']
    counts.append({
        'place': p,
        'media_count': len(place_media_ids),
    })

In [189]:
import csv, json
# Persist the per-place media counts as JSON (read back by the report cell) and as CSV.
with open('batik-data/batik-media-counts.json', 'w') as f:
    f.write(json.dumps(counts))
# newline='' is what the csv module requires of files it writes to; without it, platforms that
# translate line endings emit an extra blank line after every row.
with open('batik-data/batik-media-counts.csv', 'w', newline='') as csvfile:
    media_count_writer = csv.DictWriter(csvfile, fieldnames=["place", "media_count"])
    media_count_writer.writeheader()
    media_count_writer.writerows(counts)

--------------
## Write Summary Report

In [267]:
# initialize report
from yattag import Doc, indent
import slug
import json
import plotly.express as px
import pandas as pd 
# Reload the datasets written by the earlier cells, so the report can be rebuilt from the files
# on disk without re-running the API queries.
with open('batik-data/batik-words.json') as words_file:
    word_data = json.loads(words_file.read())
with open('batik-data/batik-media-counts.json', 'r') as media_counts_file:
    media_counts = json.loads(media_counts_file.read())
with open('batik-data/batik-themes.json', 'r') as themes_file:
    theme_data = json.loads(themes_file.read())
with open('batik-data/batik-climate-attention.json', 'r') as attention_file:
    attention_data = json.loads(attention_file.read())

In [279]:
REGEN_CHARTS = True

# NOTE: this cell calls solr_to_date(), story_stat_card() and topic_section(), which are defined in
# cells further down the notebook; those cells have to be run before this one.

# Read the per-place story counts once, up front, instead of re-opening the CSV for every place.
# csv.DictReader yields every value as a string, so counts are converted with int() where used.
with open('batik-data/batik-climate-coverage.csv', 'r', newline='') as csvfile:
    story_counts_by_place = {row['place']: row for row in csv.DictReader(csvfile)}

# Write one HTML report per place to reports/batik-report-<slug>.html.
for p, m in places.items():
    # doc/tag/text are module-level names: story_stat_card() and topic_section() write into
    # whichever document they currently refer to.
    doc, tag, text = Doc().tagtext()
    doc.asis('<!DOCTYPE html>')
    # The place key may contain double quotes (search syntax); strip them for display and file names.
    place_name = p.replace('"', '')
    title = "{}: Climate in the News".format(place_name)

    story_counts = story_counts_by_place[p]
    climate_stories = int(story_counts['climate-change-stories'])
    total_stories = int(story_counts['total-stories'])
    # Guard against a place with no stories at all, which would otherwise divide by zero.
    pct = climate_stories / total_stories if total_stories else 0.0
    place_media_count = int([r for r in media_counts if r['place'] == p][0]['media_count'])

    # total count summary
    # The media count is the number of sources searched; it used to be printed as "stories in total".
    content = "Using the Media Cloud database, we searched {:n} English-language sources published in {}. We gathered all stories published in 2017, 2018, and 2019.  ".format(
        place_media_count, place_name)
    content += 'This included {:n} total stories. {:.2%} mentioned "climate change" or "global warming" in some way ({:n} stories).'.format(
        total_stories, pct, climate_stories)

    with tag('html', lang="en"):
        with tag('head'):
            doc.stag('meta', charset="utf-8")
            doc.stag('meta', name="viewport", content="width=device-width, initial-scale=1")
            doc.stag('link', rel="stylesheet", href="https://stackpath.bootstrapcdn.com/bootstrap/4.4.1/css/bootstrap.min.css", integrity="sha384-Vkoo8x4CGsO3+Hhxv8T/Q5PaXtkKtu6ug5TOeNV6gBiFeWPGFN9MuhOf23Q9Ifjh", crossorigin="anonymous")
            doc.stag('link', rel="stylesheet", href="report.css")
            # Script tags are opened with tag() and given an empty text() body.
            with tag('script', src="https://ajax.googleapis.com/ajax/libs/jquery/3.4.1/jquery.min.js"):
                text()
            with tag('script', src="https://code.highcharts.com/highcharts.js"):
                text()
            with tag('script', src="https://code.highcharts.com/modules/wordcloud.js"):
                text()
            with tag('script', src="report.js"):
                text()
            with tag('title'):
                text(title)
        with tag('body'):
            with tag('div', klass="container"):
                with tag('header'):
                    with tag('div', klass="row"):
                        with tag('div', klass="col"):
                            with tag('h1'):
                                text(title)
                with tag('div', klass="row"):
                    with tag('div', klass="col"):
                        with tag('p', klass="lead"):
                            text(content)
                    # climate story pie chart
                    div_id = "{}-climate-stories-chart".format(slug.slug(place_name))
                    with tag('div', klass="col"):
                        with tag('div', id=div_id, klass="piechart"):
                            text('')
                        with tag('script'):
                            data = [
                                {'name': 'climate stories', 'y': climate_stories, 'color': '#300D28'},
                                {'name': 'other stories', 'y': total_stories - climate_stories, 'color': '#cccccc'},
                            ]
                            doc.asis('$(function() {{piechart("Climate Stories", "{}", {}, 300)}})'.format(div_id, json.dumps(data)))
                # attention over time chart
                div_id = "{}-climate-attention-chart".format(slug.slug(place_name))
                with tag('div', klass="row"):
                    with tag('div', klass="col"):
                        with tag('div', id=div_id, klass="linechart"):
                            with tag('script'):
                                data = [r for r in attention_data if r['place'] == p][0]['climate-change-stories']['counts']
                                # Convert each entry to [epoch milliseconds, count] for the chart.
                                data = [[solr_to_date(r['date']).timestamp()*1000, int(r['count'])] for r in data]
                                doc.asis('$(function() {{linechart("Stories", "{}", {}, 200)}})'.format(div_id, json.dumps(data)))
                # story summary cards
                with tag('div', klass="row"):
                    story_stat_card('resilience', story_counts)
                    story_stat_card('security', story_counts)
                    story_stat_card('migration', story_counts)
                # theme sections (keyed by the original place key, quotes included)
                topic_section(p, 'resilience', story_counts)
                topic_section(p, 'security', story_counts)
                topic_section(p, 'migration', story_counts)

    with open('reports/batik-report-{}.html'.format(slug.slug(place_name)), 'w') as f:
        f.write(indent(doc.getvalue()))


In [272]:
from datetime import datetime as dt
SOLR_DATE_FORMAT = '%Y-%m-%d'
def solr_to_date(date_str):
    """Parse the leading YYYY-MM-DD part of a date string into a naive datetime.

    Only the first 10 characters are used, so any time-of-day suffix is ignored and the
    result is always at midnight. Raises ValueError if those characters are not a valid date.
    """
    day_part = date_str[:10]
    parsed = dt.strptime(day_part, SOLR_DATE_FORMAT)
    return parsed

def story_stat_card(theme_name, story_counts):
    """Append a summary card for one theme to the report currently being built.

    Writes through the module-level `tag` and `text` callables, so it must be called while a
    document is being assembled.

    theme_name   -- theme key such as 'security'; '<theme_name>-stories' is looked up in story_counts.
    story_counts -- mapping holding '<theme_name>-stories' and 'climate-change-stories'; values are
                    passed through int(), so numeric strings are accepted.
    """
    theme_story_count = int(story_counts['{}-stories'.format(theme_name)])
    climate_story_count = int(story_counts['climate-change-stories'])
    # A place with no climate stories would otherwise raise ZeroDivisionError; show 0% instead.
    pct = theme_story_count / climate_story_count if climate_story_count else 0.0
    with tag('div', klass="col"):
        with tag('div', klass='card stat'):
            with tag('div', klass='card-body'):
                with tag('h5', klass='card-title'):
                    text("Climate & {}".format(theme_name.capitalize()))
                with tag('p', klass='card-text stat'):
                    text("{:.0%}".format(pct))
                with tag('p', klass="card-subtitle text-muted"):
                    text('stories about climate change also mention {}'.format(theme_name))

In [278]:
def topic_section(place_name, topic_name, story_counts):
    """Append the report section for one topic: summary, attention chart, word cloud, themes table.

    Writes through the module-level `doc`, `tag` and `text` objects and reads the module-level
    `attention_data`, `word_data` and `theme_data` lists.

    place_name   -- place key exactly as stored in the 'place' field of those lists.
    topic_name   -- topic key such as 'security'.
    story_counts -- mapping holding '<topic_name>-stories' and 'climate-change-stories'; values are
                    passed through int(), so numeric strings are accepted.

    Raises IndexError if a data list has no entry for the place/topic.
    """
    topic_story_count = int(story_counts['{}-stories'.format(topic_name)])
    climate_story_count = int(story_counts['climate-change-stories'])
    # Guard the ratio so a place with no climate stories does not raise ZeroDivisionError.
    topic_share = topic_story_count / climate_story_count if climate_story_count else 0.0
    # summary
    with tag('div', klass="row"):
        with tag('div', klass="col"):
            with tag('h2'):
                text('Climate and {}'.format(topic_name.capitalize()))
            with tag('p', klass='lead'):
                text('There were {:n} stories about climate and {}. This represents {:.2%} of the stories about climate in some way.'.format(
                    topic_story_count,
                    topic_name,
                    topic_share
                ))
    # attention over time chart
    div_id = "{}-{}-attention-chart".format(slug.slug(place_name), topic_name)
    with tag('div', klass="row"):
        with tag('div', klass="col"):
            with tag('div', id=div_id, klass="linechart"):
                with tag('script'):
                    # Look the place up via the parameter; this used to read the global loop variable `p`.
                    data = [r for r in attention_data if r['place'] == place_name][0]['{}-stories'.format(topic_name)]['counts']
                    # Convert each entry to [epoch milliseconds, count] for the chart.
                    data = [[solr_to_date(r['date']).timestamp()*1000, int(r['count'])] for r in data]
                    doc.asis('$(function() {{linechart("{} Stories", "{}", {}, 200)}})'.format(topic_name.capitalize(), div_id, json.dumps(data)))
    with tag('div', klass="row"):
        # word cloud
        top_words = [d for d in word_data if (d['place'] == place_name) and (d['topic'] == topic_name)][0]['top_words']
        # Drop 'brazilian' and the topic word itself, then keep at most 100 words.
        top_words = [{'name': t['word'], 'weight': t['freq']} for t in top_words if t['word'] not in ['brazilian', topic_name]][:100]
        with tag('div', klass="col"):
            with tag('h3'):
                text('Top Words')
            with tag('p', klass='headline'):
                text('Here are the top 100 words used in articles about climate and {}.'.format(topic_name))
            chart_id = "{}-{}-top-words".format(slug.slug(place_name), topic_name)
            with tag('div', id=chart_id, klass="wordcloud"):
                text('')
            with tag('script'):
                doc.asis('$(function() {{wordcloud("{}", {}, 300)}})'.format(
                    chart_id, json.dumps(top_words)))
        # top themes (first 7 only)
        top_themes = [d for d in theme_data if (d['place'] == place_name) and (d['topic'] == topic_name)][0]['top_themes'][:7]
        with tag('div', klass="col"):
            with tag('h3'):
                text('Top Themes')
            with tag('p', klass='headline'):
                text('Here are the top "themes" detected by our computational model in stories about climate and {}.'.format(topic_name))
            # The table is a sibling of the paragraph: a <table> nested inside a <p> is invalid HTML.
            with tag('table', klass='table'):
                with tag('thead'):
                    with tag('tr'):
                        with tag('th'):
                            text('Theme')
                        with tag('th'):
                            text('Percent of Stories')
                with tag('tbody'):
                    for theme in top_themes:
                        with tag('tr'):
                            with tag('td'):
                                text(theme['name'])
                            with tag('td'):
                                # Guard against a zero topic count so the share cannot divide by zero.
                                theme_share = int(theme['count']) / topic_story_count if topic_story_count else 0.0
                                text("{:.0%}".format(theme_share))
        
