In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from os.path import basename
import regex as re
import numpy as np
import math
import time
#NOTE: change this according to your own data_io.py file
import data_io
import utils as u
import datetime as dt
import openpyxl

### Direct replications of Silver et al.


To directly replicate the results of Silver et al. (2021), read in their published dataset so that only the articles included in their analysis are selected. Setting `direct_rep = True` in the next cell does this; set it to `False` to build the 2019 article lists from your own scraped link files instead.

In [17]:
# When True, the article list is read from the published dataset
# (article_data_cleaned.csv); when False, the per-university link files
# referenced by data_io.LINK_FNAME are used instead.
direct_rep = True

In [22]:
if direct_rep:
    # Published dataset: keep only the columns the scraping loops read.
    all_links = pd.read_csv(f"{data_io.DATA}article_data_cleaned.csv", encoding = "utf-8-sig")
    all_links = all_links[["article_date", "article_title", "article_link", "article_id", "uni"]]
    # Boolean filtering keeps the row labels of the combined frame. Reset them
    # to 0..n-1 (as the else-branch does) so that loops which address rows as
    # .loc[i] for i in range(len(frame)) reach every row of each subset.
    princeton_2019 = all_links[all_links.uni=="princeton"].reset_index(drop = True)
    harvard_2019 = all_links[all_links.uni=="harvard"].reset_index(drop = True)
    mit_2019 = all_links[all_links.uni=="mit"].reset_index(drop = True)
    columbia_2019 = all_links[all_links.uni=="columbia"].reset_index(drop = True)
    yale_2019 = all_links[all_links.uni=="yale"].reset_index(drop = True)
else:
    princeton_links = pd.read_csv(data_io.LINK_FNAME.replace("uni", 'princeton'))
    princeton_2019 = princeton_links[princeton_links["article_date"].str.contains("2019")]
    princeton_2019 = princeton_2019.reset_index(drop = True)

    harvard_links = pd.read_csv(data_io.LINK_FNAME.replace("uni", 'harvard'))
    harvard_2019 = harvard_links[harvard_links['article_date'].str.contains('2019')]
    harvard_2019 = harvard_2019.reset_index(drop = True)

    # MIT is filtered on the URL path rather than on the date column.
    mit_links = pd.read_csv(data_io.LINK_FNAME.replace("uni", 'mit'))
    mit_2019 = mit_links[mit_links['article_link'].str.contains('/2019/')]
    # NOTE(review): .sample(100) is unseeded, so the subset differs between
    # runs, and it raises if fewer than 100 rows match.
    mit_2019 = mit_2019.sample(100)
    mit_2019 = mit_2019.reset_index(drop = True)

    columbia_links = pd.read_csv(data_io.LINK_FNAME.replace("uni", 'columbia'))
    columbia_2019 = columbia_links[columbia_links['article_date'].str.contains('2019')]
    # NOTE(review): unseeded sample, same caveat as for MIT above.
    columbia_2019 = columbia_2019.sample(100)
    columbia_2019 = columbia_2019.reset_index(drop = True)

    yale_links = pd.read_csv(data_io.LINK_FNAME.replace("uni", 'yale'))
    yale_2019 = yale_links[yale_links['article_date'].str.contains('2019')]
    yale_2019 = yale_2019.reset_index(drop = True)
    #The first story is just the greatest hits of 2019--drop it
    yale_2019 = yale_2019.loc[1:, :]
    #just include stories from Yale's main campus
    yale_2019 = yale_2019[yale_2019['article_link'].str.startswith("https://westcampus")==False]
    yale_2019 = yale_2019.reset_index(drop = True)
    

## Get Princeton data

In [None]:

def get_princeton_data(link, article_id='TEST', fix_text = False):
    """Scrape one Princeton news article into an article container.

    Parameters
    ----------
    link : str
        URL of the article page; passed to ``u.process_request``.
    article_id : optional
        Identifier written to the ``article_id`` column of the result.
    fix_text : bool
        When True, return only the processed article text and skip all
        image handling.

    Returns
    -------
    The value of ``u.process_article_text`` when ``fix_text`` is True.
    Otherwise the container created by ``u.setup_article_container``, with
    each image / YouTube embed registered through ``u.process_image`` and its
    caption stored in ``image_captions``.
    """
    soup = u.process_request(link)
    article = soup.find("article", {'role':'article'})
    # The headline is taken from the <h1>, falling back to the <h2> when the
    # article has no <h1>.
    heading = article.find('h1')
    if heading is None:
        heading = article.find("h2")
    title = heading.get_text().strip()

    date = article.find("div",
                        class_= re.compile("published-date")).get_text().strip()
    text = article.find('div',class_="node__content")
    article_text = u.process_article_text(text.find_all("p"))
    if fix_text:
        return article_text
    article_container = u.setup_article_container(title = title,
                                                 date = date,
                                                 text = article_text,
                                                 link = link)

    # images[i] is paired with captions[i] further down.
    # NOTE(review): slideshow images and figcaptions are collected
    # independently, so the pairing assumes one figcaption per slideshow image.
    images = []
    captions = []
    for s in article.find_all("article", class_=re.compile("media-group view-mode-slideshow")):
        images.extend(s.find_all("img", alt=True))
        captions.extend(s.find_all("figcaption"))

    main_image = article.find("div",class_=re.compile("news-cover-image"))
    if main_image is not None:
        cover_img = main_image.find('img', alt=True)
        if cover_img is not None and cover_img not in images:
            images.append(cover_img)
            # The cover-caption wrapper may be absent; treat that the same as
            # a missing caption instead of failing on None.find(...).
            cover_block = article.find("div",class_=re.compile('cover-caption'))
            main_caption = None
            if cover_block is not None:
                main_caption = cover_block.find("div",class_=re.compile('image-caption'))
            captions.append(main_caption if main_caption is not None else 'NO CAPTION')

    for o in article.find_all("article", class_=re.compile("media media-image")):
        img = o.find("img", alt=True)
        if img is None or img in images:
            continue
        images.append(img)
        caption_node = o.find("div", class_=re.compile("caption"))
        if caption_node is None:
            caption_node = o.find("figcaption")
        captions.append(caption_node if caption_node is not None else "NO CAPTION")

    if soup.find("iframe"):
        print("video present: ", link)
    img_link_stem = "https://princeton.edu"
    for idx, image in enumerate(images):
        article_container = u.process_image(image, 'princeton_',
                                           article_container, idx,
                                           img_stem = img_link_stem)
        # A caption entry is either a tag, the 'NO CAPTION' placeholder string,
        # or missing entirely when fewer captions than images were found.
        caption_node = captions[idx] if idx < len(captions) else None
        if caption_node is None or isinstance(caption_node, str):
            caption = 'NO CAPTION'
        else:
            caption = caption_node.get_text().strip().replace("\xa0", "")
        article_container.loc[idx, 'image_captions'] = caption
    # Embedded videos are numbered after the images.
    idx = len(images)
    for e in article.find_all("iframe"):
        if e.has_attr('src'):
            img = e['src']
            if 'youtube' in img or 'yt' in img:
                article_container = u.process_image(img, 'princeton_',
                                               article_container, idx,
                                               image_type = 'youtube')
                idx += 1
    article_text_fmt = u.remove_captions(article_container['image_captions'].to_list(),
                                            article_text)
    article_container.loc[0, 'article_text'] = article_text_fmt
    article_container['article_id'] = article_id
    return article_container


In [None]:
# Start from the empty template defined in utils (imported as `u`); the bare
# name ARTICLE_CONTAINER is not defined in this notebook.
princeton_data = u.ARTICLE_CONTAINER.copy()

In [None]:
# Scrape each selected Princeton article and append its rows to princeton_data.
for row_label in princeton_2019.index:
    article_url = princeton_2019.loc[row_label, 'article_link']
    article_key = princeton_2019.loc[row_label, 'article_id']
    scraped = get_princeton_data(article_url, article_key)
    # Keep the date and title from the link listing next to the scraped ones.
    scraped.loc[0, 'article_date_scrape'] = princeton_2019.loc[row_label, 'article_date']
    scraped.loc[0, 'article_title_scrape'] = princeton_2019.loc[row_label, 'article_title']
    princeton_data = pd.concat([princeton_data, scraped], ignore_index = True)

In [None]:
# Output paths are derived from the data_io templates by filling in the
# university (and year) placeholders.
princeton_raw_path = data_io.DATA_FNAME.replace("uni", 'princeton')
princeton_fmt_path = data_io.FMTD_DATA_FNAME.replace('uni', 'princeton').replace('year','2019')
princeton_clean = u.clean_save_dataset(princeton_data,
                                       princeton_raw_path,
                                       princeton_fmt_path,
                                       sample_size = 100)

## Get MIT data

In [None]:
def get_mit_data(link, article_id='TEST', fix_text = False):
    """Scrape one MIT News article into an article container.

    Parameters
    ----------
    link : str
        URL of the article page; passed to ``u.process_request``.
    article_id : optional
        Identifier written to the ``article_id`` column of the result.
    fix_text : bool
        When True, return only the processed article text.

    Returns
    -------
    The value of ``u.process_article_text`` when ``fix_text`` is True.
    Otherwise the container created by ``u.setup_article_container``, with
    header images first and in-body images after them, each registered
    through ``u.process_image``.
    """
    soup = u.process_request(link)
    article_body = soup.find("div", class_="news-article--content--body--inner")
    article_text = u.process_article_text(article_body.find_all("p"))
    if fix_text:
        return article_text

    date_container = soup.find("div", class_="news-article--publication-date")
    date = date_container.find("time").get_text().strip()
    title = soup.find("h1").get_text().strip()
    article_container = u.setup_article_container(title = title, text = article_text,
                                                 date = date, link = link)

    image_url_stem = "https://news.mit.edu"
    idx = 0
    header = soup.find("div", class_='news-article--full-width-wrapper')
    if header:
        top_images = header.find_all("div", class_="news-article--media--image--file")
        top_captions = header.find_all("div", class_="news-article--media--image--caption")
    else:
        top_images = []
        top_captions = []

    #Get images at the top of the page
    for position, image_box in enumerate(top_images):
        img = image_box.find('img', alt=True)
        if img is None:
            continue
        img_link = img['data-src']
        # Captions are matched to images by position; an image without a
        # caption at its position gets the placeholder instead of an IndexError.
        if position < len(top_captions):
            caption = top_captions[position].get_text().strip().replace("\xa0", " ")
            if 'Caption:\n' in caption:
                caption = caption.replace("Caption:\n", "")
        else:
            caption = 'NO CAPTION'
        article_container.loc[idx, 'image_captions'] = caption
        if 'yt' in img_link or 'youtube' in img_link:
            article_container = u.process_image(img_link, 'mit_', article_container, idx,
                                                   image_type = 'youtube')
            idx += 1
            print('found youtube video: ', img_link)
        else:
            article_container = u.process_image(img, 'mit_', article_container, idx,
                                               img_stem = image_url_stem, src_key = 'data-src')
            idx += 1

    #Get images in the article body--these seem to be mostly thumbnails
    for body_img in article_body.find_all("img", alt = True):
        # The URL may sit under 'src' or 'data-src' (process_image is given
        # both keys below), so a missing 'src' must not raise here.
        img_link = body_img.get('src') or body_img.get('data-src') or ''
        if 'yt' in img_link or 'youtube' in img_link:
            article_container = u.process_image(img_link, 'mit_', article_container, idx,
                                                       image_type = 'youtube')
            idx += 1
            print('found youtube video: ', img_link)
        else:
            article_container = u.process_image(body_img, 'mit_', article_container, idx,
                                                img_stem = image_url_stem, src_key = ['src','data-src'])
            idx += 1
    article_text_fmt = u.remove_captions(article_container['image_captions'].to_list(),
                                            article_text)
    article_container.loc[0, 'article_text'] = article_text_fmt
    article_container['article_id'] = article_id

    return article_container
        
    

In [None]:
# Progress flag: the scraping loop sets this to True for each article once it
# has been fetched.
mit_2019['scraped'] = False

In [None]:
# Start from the empty template defined in utils (imported as `u`); the bare
# name ARTICLE_CONTAINER is not defined in this notebook.
mit_data = u.ARTICLE_CONTAINER.copy()
for i in mit_2019.index:
    print(i)
    tmp = get_mit_data(mit_2019.loc[i, 'article_link'],
                      mit_2019.loc[i, 'article_id'])
    # Keep the listing's date, link and title next to the scraped values.
    tmp.loc[0, 'article_date_scrape'] = mit_2019.loc[i, 'article_date']
    tmp.loc[0, 'article_link'] = mit_2019.loc[i, 'article_link']
    tmp.loc[0, 'article_title_scrape'] = mit_2019.loc[i, 'article_title']
    mit_2019.loc[i, 'scraped'] = True
    mit_data = pd.concat([mit_data, tmp], ignore_index=True)


In [None]:
# clean_save_dataset lives in utils (imported as `u`), as in the other
# universities' cells; the unqualified name is not defined in this notebook.
mit_data = u.clean_save_dataset(mit_data, data_io.DATA_FNAME.replace("uni", 'mit'),
                                    data_io.FMTD_DATA_FNAME.replace('uni', 'mit').replace('year','2019'),
                                     sample_size = 100, return_type = 'subset')

## Get Yale data

In [None]:
def get_yale_seas_data(link, article_id='TEST', fix_text = False):
    """Scrape an article hosted on the Yale SEAS site (seas.yale.edu).

    Returns the processed article text when ``fix_text`` is True. Otherwise
    returns the container from ``u.setup_article_container``, with the final
    request URL stored in ``url_redirect`` and the feature image/iframe,
    in-body images and YouTube ``<object>`` embeds registered through
    ``u.process_image``.
    """
    # This plain request is only used to record the final URL (response.url).
    response = requests.get(link)
    page = u.process_request(link)
    article_area = page.find("article")
    title = article_area.find("h1").get_text().strip().replace("\xa0", " ")
    date = article_area.find("div", class_="news-date").get_text().strip().replace("\xa0", " ")

    body = article_area.find('div',class_="news-body")
    article_text = u.process_article_text(body.find_all("p"))
    if fix_text:
        return article_text

    site_root = "https://seas.yale.edu"
    article_container = u.setup_article_container(title = title,
                                                 date = date,
                                                 text = article_text,
                                                 link = link)
    article_container.loc[0, 'url_redirect'] = response.url

    row = 0
    # Feature slot: an <img> takes precedence; an <iframe> is only considered
    # when there is no image.
    lead = article_area.find("div", class_="news-image")
    if lead:
        lead_img = lead.find("img", alt = True)
        if lead_img:
            article_container = u.process_image(lead_img, 'yale_', article_container,
                                                row, img_stem = site_root)
            row += 1
        else:
            lead_frame = lead.find("iframe")
            if lead_frame and lead_frame.has_attr("src"):
                article_container = u.process_image(lead_frame['src'],
                                                    'yale', article_container,
                                                    row, image_type = 'youtube')
                row += 1

    for body_img in body.find_all("img", alt=True):
        article_container = u.process_image(body_img, 'yale_', article_container,
                                            row, img_stem = site_root)
        row += 1

    for video in body.find_all("object", class_=re.compile("youtube")):
        article_container = u.process_image(video['data'], 'yale', article_container,
                                            row, image_type = 'youtube')
        row += 1

    cleaned_text = u.remove_captions(article_container['image_captions'].to_list(),
                                     article_text)
    article_container.loc[0, 'article_text'] = cleaned_text
    article_container['article_id'] = article_id
    return article_container
        
    

In [None]:
def get_yale_data(link, article_id = 'TEST', fix_text = False):
    """Scrape one Yale article, dispatching on where the link ends up.

    Parameters
    ----------
    link : str
        URL from the link listing.
    article_id : optional
        Identifier written to the ``article_id`` column of the result.
    fix_text : bool
        When True and the page is a regular news.yale article, return only
        the processed article text.

    Returns
    -------
    A container whose ``article_text`` starts with ``'FAILED--'`` when the
    link leaves yale, lands on a non-news site, or is a video-only story;
    the result of ``get_yale_seas_data`` for seas.yale.edu pages; otherwise
    the container for the news.yale article with its figures and embedded
    YouTube videos registered through ``u.process_image``.
    """
    r = requests.get(link)

    def _failure(reason):
        # Placeholder container that records why the article was skipped.
        failed = u.ARTICLE_CONTAINER.copy()
        failed.loc[0, 'article_link'] = link
        failed.loc[0, 'url_redirect'] = r.url
        failed.loc[0, 'article_text'] = reason
        failed['article_id'] = article_id
        return failed

    if 'yale' not in r.url:
        return _failure('FAILED--OUTSIDE REDIRECT')

    if r.url.startswith("https://seas."):
        print("SEAS article: ", article_id)
        article_container = get_yale_seas_data(link, article_id, fix_text = fix_text)
        return article_container

    if not r.url.startswith("https://news.yale"):
        return _failure('FAILED--REDIRECT TO NON-NEWS SITE')

    soup = u.process_request(link)
    eyebrow = soup.find('p', class_="eyebrow")
    if eyebrow is not None and 'Video' in eyebrow.get_text():
        # Previously this branch wrote to a container that had never been
        # created (NameError); build the failure container first.
        article_container = _failure('FAILED--VIDEO ONLY')
        article_container.loc[0, 'image_links'] = 'VIDEO'
        return article_container

    title = soup.find("h1").get_text().strip()
    date = soup.find("div", class_="date").get_text().strip()
    text_area = soup.find("div", class_=re.compile("story clearfix"))
    text = u.process_article_text(text_area.find_all("p"))
    if fix_text:
        return text
    article_container = u.setup_article_container(text = text, date = date, title = title,
                                                 link = link)
    idx = 0
    for f in soup.find_all("figure"):
        # Figures nested anywhere inside a <footer> are skipped.
        if any(p.name == 'footer' for p in f.parents):
            continue
        images = f.find_all("img", alt=True)
        figcaption = f.find("figcaption")
        if len(images) > 0:
            # The caption is written once, on the row of the figure's first image.
            if figcaption is not None:
                article_container.loc[idx, 'image_captions'] = figcaption.get_text().strip()
            for i in images:
                article_container = u.process_image(i, 'yale_', article_container,
                                                    idx, img_stem = 'https://news.yale.edu')
                idx += 1
            continue
        frame = f.find("iframe")
        if frame is not None and frame.has_attr('src'):
            src = frame['src']
            if 'youtube' in src or 'yt' in src:
                # The original printed an undefined name (`fv`) here.
                print('found video: ', src)
                article_container = u.process_image(src, 'yale_', article_container,
                                                   idx, image_type = 'youtube')
                if figcaption is not None:
                    article_container.loc[idx,
                                          'image_captions'] = figcaption.get_text().strip()
                idx += 1

    for v in soup.find_all("div", class_=re.compile("embedded-video")):
        frame = v.find("iframe")
        if frame is not None and frame.has_attr('src'):
            src = frame['src']
            if 'youtube' in src or 'yt' in src:
                print('found video: ', src)
                article_container = u.process_image(src, 'yale_', article_container,
                                                   idx, image_type = 'youtube')
                idx += 1
    # `text` is this function's processed article text; the original referred
    # to an undefined `article_text` here.
    article_text_fmt = u.remove_captions(article_container['image_captions'].to_list(),
                                            text)
    article_container.loc[0, 'article_text'] = article_text_fmt
    article_container['article_id'] = article_id
    return article_container

In [None]:
# Iterate over the frame's own row labels (as the Princeton and MIT loops do)
# so that .loc[i] finds each row even if the index is not 0..n-1.
for position, i in enumerate(yale_2019.index):
    print(i)
    tmp = get_yale_data(yale_2019.loc[i, 'article_link'],
                       yale_2019.loc[i, 'article_id'])
    tmp.loc[0, 'article_date_scrape'] = yale_2019.loc[i, 'article_date']
    tmp.loc[0, 'article_link'] = yale_2019.loc[i, 'article_link']
    tmp.loc[0, 'article_title_scrape'] = yale_2019.loc[i, 'article_title']
    # The first article starts the result frame; later ones are appended.
    if position == 0:
        yale_data = tmp.copy()
    else:
        yale_data = pd.concat([yale_data, tmp], ignore_index=True)

In [None]:
# Display the scraped Yale rows for inspection before they are cleaned and saved.
yale_data

In [None]:
# Output paths are derived from the data_io templates by filling in the
# university (and year) placeholders.
yale_raw_path = data_io.DATA_FNAME.replace('uni', 'yale')
yale_fmt_path = data_io.FMTD_DATA_FNAME.replace('uni', 'yale').replace('year','2019')
yale_subset = u.clean_save_dataset(yale_data,
                                   yale_raw_path,
                                   yale_fmt_path,
                                   sample_size = 100,
                                   return_type='subset')

## Get Columbia data

In [None]:
def get_columbia_data(link, article_id='TEST', existing_titles = None,
                     fix_text = False):
    """Scrape one Columbia news article into an article container.

    Parameters
    ----------
    link : str
        URL of the article page.
    article_id : optional
        Identifier written to the ``article_id`` column of the result.
    existing_titles : iterable of str, optional
        Titles already scraped. If this page's title is contained in any of
        them, a container titled ``'FAILED'`` is returned instead.
    fix_text : bool
        When True, return only the processed article text.

    Returns
    -------
    The value of ``u.process_article_text`` when ``fix_text`` is True, the
    ``'FAILED'`` container described above, or the container from
    ``u.setup_article_container`` with the page's video thumbnail / images
    registered through ``u.process_image``.
    """
    # allow_redirects=False: the response is parsed as returned, without
    # following redirects.
    r = requests.get(link, allow_redirects=False, headers ={'User-Agent': '...'})
    soup = BeautifulSoup(r.content, features = 'html')
    article_area = soup.find("div", {"id": 'left-area'})
    title = article_area.find("h1").get_text().strip()
    # Only compare titles when some were supplied. The previous condition could
    # reach the loop with existing_titles=None and raise a TypeError.
    if existing_titles and any(title in e for e in existing_titles):
        print("FAILED")
        article_container = u.ARTICLE_CONTAINER.copy()
        article_container.loc[0, 'article_link'] = link
        article_container.loc[0, 'article_title'] = 'FAILED'
        print(soup.find("div", class_=re.compile("et_pb_section")))
        return article_container

    date_node = article_area.find("span", class_="date")
    if date_node is None:
        date_node = article_area.find("time", class_="datetime")
    date = date_node.get_text().strip()

    # Keep every <p> except those with a class containing one of bad_classes.
    all_paragraphs = article_area.find_all("p")
    bad_classes = ['caption']
    ps = []
    for p in all_paragraphs:
        is_excluded = p.has_attr("class") and any(b in c for c in p['class'] for b in bad_classes)
        if not is_excluded:
            ps.append(p)

    # Diagnostic output for pages that have a single paragraph only.
    if len(all_paragraphs) == 1:
        print(link)
        print(all_paragraphs)

    article_text = u.process_article_text(ps)

    if fix_text:
        return article_text
    article_container = u.setup_article_container(text = article_text,
                                                 link = link,
                                                 date = date,
                                                 title = title)

    figs = article_area.find_all("div", class_=re.compile("wp-caption"))
    idx = 0
    iframe = soup.find("iframe")
    if iframe:
        # An iframe without a src attribute is treated as "no video" rather
        # than raising a KeyError.
        l2 = iframe.get('src', '')
        if 'youtube' in l2:
            article_container = u.process_image(l2, 'columbia', article_container, idx,
                                                  src_key = 'src', img_stem = None,
                                                 image_type = 'youtube')
            article_container.loc[idx, 'article_id'] = article_id
            idx += 1
        elif 'vimeo' in l2:
            # Fetch the embedded player page and hand its placeholder markup
            # (as a string) to process_image.
            if_soup = u.process_request(l2)
            if_body = if_soup.find('body')
            ph = str(if_body.find('div', class_='vp-placeholder'))
            # Assign back to article_container; a misspelled target name
            # previously discarded this result.
            article_container = u.process_image(ph, 'columbia_', article_container, idx,
                                          src_key = 'src', img_stem = '',
                                         image_type = 'vimeo')
            idx += 1
    else:
        if len(figs) == 0:
            images = article_area.find_all("img", alt=True)
            if len(images) == 0:
                print('no figures: ', link)
                idx += 1
            else:
                for image in images:
                    # Gravatar images are skipped.
                    if "https://secure.gravatar" not in image['src']:
                        article_container = u.process_image(image, 'columbia_', article_container, idx,
                                                      src_key = 'src')
                        article_container.loc[idx, 'article_id'] = article_id
                        idx += 1
    for f in figs:
        image = f.find("img", alt=True)
        if image:
            image_caption = f.find("p", class_=re.compile("caption"))
            if image_caption:
                image_caption = image_caption.get_text().strip().replace("\xa0", '')
            else:
                image_caption = 'NO CAPTION'
            article_container = u.process_image(image, 'columbia_', article_container, idx,
                                                      src_key = 'src')
            article_container.loc[idx, 'image_captions'] = image_caption
            article_container.loc[idx, 'article_id'] = article_id
            idx += 1
    article_text_fmt = u.remove_captions(article_container['image_captions'].to_list(),
                                            article_text)
    article_container.loc[0, 'article_text'] = article_text_fmt
    article_container['article_id'] = article_id
    return article_container

In [None]:
# Empty accumulator; the Columbia scraping loop concatenates each article's
# rows onto it.
cu_data = pd.DataFrame()


In [None]:
# Iterate over the frame's own row labels (as the Princeton and MIT loops do)
# so that .loc[i] finds each row even if the index is not 0..n-1.
for i in columbia_2019.index:
    tmp = get_columbia_data(columbia_2019.loc[i, 'article_link'],
                           columbia_2019.loc[i, 'article_id'])
    # Pause between requests.
    time.sleep(3)
    # Repeated article_text values within one article's rows mark it as failed.
    if tmp['article_text'].dropna().duplicated(keep=False).sum() > 0:
        print('duplicate: ', i)
        tmp.loc[0, 'article_text'] = 'FAILED'
        tmp.loc[0, 'article_title'] = 'FAILED'
    cu_data = pd.concat([cu_data, tmp], ignore_index=True)

In [None]:
# Take the first row of every article, discard the ones marked FAILED, and keep
# all rows belonging to the remaining article ids.
first_row_per_id = cu_data.drop_duplicates(subset=['article_id'], keep='first')
not_failed = first_row_per_id['article_title'] != 'FAILED'
id_subset = first_row_per_id[not_failed]

keep_ids = id_subset['article_id'].to_list()
in_kept_ids = cu_data['article_id'].isin(keep_ids)
columbia_subset = cu_data[in_kept_ids]

In [None]:
# Output paths are derived from the data_io templates by filling in the
# university (and year) placeholders.
columbia_raw_path = data_io.DATA_FNAME.replace('uni', 'columbia')
columbia_fmt_path = data_io.FMTD_DATA_FNAME.replace('uni', 'columbia').replace('year','2019')
columbia_subset = u.clean_save_dataset(columbia_subset,
                                       columbia_raw_path,
                                       columbia_fmt_path,
                                       sample_size = 100,
                                       return_type='subset')

## Get Harvard article data

In [None]:
def get_harvard_data(link_to_article, article_id = 'TEST', fix_text = False):
    """Scrape one Harvard article into an article container.

    Returns the processed article text when ``fix_text`` is True. Otherwise
    returns the container from ``u.setup_article_container``, with one row per
    image, YouTube/Vimeo embed or ``<video>`` source found in the article's
    ``<figure>`` elements (registered through ``u.process_image``), plus the
    figure caption where one exists.
    """
    page = u.process_request(link_to_article)

    # Collect body paragraphs, leaving out those with a class containing one
    # of the excluded fragments.
    excluded = ['byline', 'posted-on', 'caption', 'explore']
    paragraphs = []
    for block in page.find_all('div', class_= "article-content"):
        for para in block.find_all("p"):
            flagged = para.has_attr("class") and any(
                frag in cls for cls in para['class'] for frag in excluded)
            if not flagged:
                paragraphs.append(para)
    article_text = u.process_article_text(paragraphs)
    if fix_text:
        return article_text

    date = page.find("time", class_=re.compile("timestamp--published")).get_text().strip()
    title = page.find('h1', class_=re.compile('title')).get_text()
    article_container = u.setup_article_container(text = article_text,
                                                 date = date,
                                                 link = link_to_article,
                                                 title = title)

    row = 0
    figures = page.find("main").find("article").find_all("figure")
    for fig in figures:
        fig_caption = fig.find("figcaption")
        pictures = fig.find_all("img", alt = True)
        if len(pictures) > 0:
            # Several images can share one caption; each gets its own row.
            for picture in pictures:
                article_container = u.process_image(picture, 'harvard_', article_container,
                                                    row)
                if fig_caption:
                    article_container.loc[row, 'image_captions'] = fig_caption.get_text().strip()
                row += 1
            continue

        embed = fig.find("iframe")
        if embed:
            if embed.has_attr("src"):
                src = embed['src']
                if 'youtube' in src or 'yt' in src or 'vimeo' in src:
                    print("thumbnail: ", link_to_article)
                    # Vimeo takes precedence when the URL matches both.
                    provider = 'vimeo' if 'vimeo' in src else 'youtube'
                    article_container = u.process_image(src, 'harvard_',
                                                        article_container, row,
                                                        image_type=provider)
                    if fig_caption:
                        article_container.loc[row, 'image_captions'] = fig_caption.get_text().strip()
                    row += 1
            continue

        video = fig.find("video")
        if video:
            source = video.find("source")
            if source:
                article_container = u.process_image(source, 'harvard_',
                                                    article_container, row,
                                                    src_key = ['src', 'source'])
                if fig_caption:
                    article_container.loc[row, 'image_captions'] = fig_caption.get_text().strip()
                row += 1

    cleaned_text = u.remove_captions(article_container['image_captions'].to_list(),
                                     article_text)
    article_container.loc[0, 'article_text'] = cleaned_text
    article_container['article_id'] = article_id
    return article_container

In [None]:
harvard_data = u.ARTICLE_CONTAINER.copy()
# Iterate over the frame's own row labels (as the Princeton and MIT loops do)
# so that .loc[i] finds each row even if the index is not 0..n-1.
for i in harvard_2019.index:
    print(i)
    tmp = get_harvard_data(harvard_2019.loc[i, 'article_link'],
                           harvard_2019.loc[i, 'article_id'])
    # Keep the listing's date, title and link next to the scraped values.
    tmp.loc[0, 'article_date_scrape'] = harvard_2019.loc[i, 'article_date']
    tmp.loc[0, 'article_title_scrape'] = harvard_2019.loc[i, 'article_title']
    tmp.loc[0, 'article_link'] = harvard_2019.loc[i, 'article_link']
    harvard_data = pd.concat([harvard_data, tmp], ignore_index = True)

In [None]:
# Output paths are derived from the data_io templates by filling in the
# university (and year) placeholders.
harvard_raw_path = data_io.DATA_FNAME.replace('uni', 'harvard')
harvard_fmt_path = data_io.FMTD_DATA_FNAME.replace('uni', 'harvard').replace('year','2019')
all_harvard_data = u.clean_save_dataset(harvard_data,
                                        harvard_raw_path,
                                        harvard_fmt_path)