### Project Goal:



The goal of this project is to webscrape Meta's newsroom for articles tagged with

* Safety and Expression
* Combatting Misinformation
* Data and Privacy

### Import packages

In [13]:
from bs4 import BeautifulSoup
from google.colab import files
import requests
import pandas as pd
import re
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime


### Write function to scrape article metadata, text from Meta's newsroom

In [1]:
def scrape_meta_section(url, page_range):
  data = [ ]
  for i in range(0, page_range):
    request = requests.get(f"{url}/{i}/")
    parser = BeautifulSoup(request.content, "html.parser")

    area_section = parser.find("div", class_="archive-articles-container")
    article_section = area_section.find_all("div", class_="uk-width-3-5 article-excerpt")
    for article in article_section:
      try:
        header = article.find("header", class_="entry-header")
        link_source = header.find("a", href=True)

        article_title = link_source.get_text()
        link = link_source.get("href")

        date_source = header.find("time", class_="entry-date published")
        date_source2 = header.find("time", class_="updated")
        date = date_source.get_text()
        date2 = date_source2.get_text()

        data.append((link, article_title, date, date2))

      except:
        header = article.find("header", class_="entry-header")
        link_source = header.find("a", href=True)

        article_title = link_source.get_text()
        link = link_source.get("href")

        date_source = header.find("time", class_="entry-date published updated")
        date = date_source.get_text()
        date2 = None

        data.append((link, article_title, date, date2))
  return pd.DataFrame(data, columns=["url", "article_title" , "date", "updated_date"])


In [8]:

def process_row(row, session):
    article_request = session.get(url=row['url'])
    parser = BeautifulSoup(article_request.content, "html.parser")

    author_name, author_pos = "na", "na"
    author = parser.find("div", class_="post-custom_author")
    if author:
        author_text = author.get_text(strip=True)
        match_val = re.match(r"By\s+(.*?),\s*(.*)", author_text)
        if match_val:
            author_name = match_val.group(1)
            author_pos = match_val.group(2)

    text_section = parser.find("div", class_="uk-width-2-3@m article-container")
    paragraphs = text_section.find_all("p") if text_section else []
    plain_text = " ".join([para.get_text(strip=True) for para in paragraphs])

    tag_s = parser.find("div", class_="entry-tags")
    tags_text_cleaned = ""
    if tag_s:
        tags = tag_s.find_all("a")
        tags_text_cleaned = ", ".join([tag.get_text() for tag in tags])

    cat_lst = ["Meta", "Facebook", "Instagram", "WhatsApp"]
    category_text = []
    category_source = parser.find("div", class_="entry-categories")
    if category_source:
        categories = category_source.find_all("a")
        for cat in categories:
            cat_text = cat.get_text()
            if cat_text in cat_lst:
              category_text.append(cat_text)
            else:
              tags_text_cleaned += f", {cat_text}"

    category_text_cleaned = " ".join(category_text)

    return {
        "author": author_name,
        "author_pos": author_pos,
        "text": plain_text,
        "category": category_text_cleaned,
        "tags": tags_text_cleaned,
    }

def append_text_category(dataframe):
    session = requests.Session()
    results = []
    with ThreadPoolExecutor(max_workers=5) as executor:
        results = list(executor.map(lambda row: process_row(row, session), dataframe.to_dict(orient='records')))


    for idx, result in enumerate(results):
        dataframe.at[idx, "author"] = result["author"]
        dataframe.at[idx, "author_pos"] = result["author_pos"]
        dataframe.at[idx, "text"] = result["text"]
        dataframe.at[idx, "category"] = result["category"]
        dataframe.at[idx, "tags"] = result["tags"]

    return dataframe

### Scrape metadata section of the Meta blog

In [4]:
safety_and_expression = scrape_meta_section(url="https://about.fb.com/news/category/safety-and-expression/page/", page_range=22)

In [5]:
combat_misinfo = scrape_meta_section(url ="https://about.fb.com/news/tag/misinformation/page", page_range=8)

In [6]:
data_and_privacy = scrape_meta_section(url = "https://about.fb.com/news/category/data-and-privacy/page/", page_range=17)

### Scrape article dataset and download as XLSX file

In [9]:
append_text_category(safety_and_expression)

Unnamed: 0,url,article_title,date,updated_date,author,author_pos,text,category,tags
0,https://about.fb.com/news/2024/12/2024-global-...,What We Saw on Our Platforms During 2024’s Glo...,"December 3, 2024","December 3, 2024",na,na,The recent presidential election in the United...,Meta,"Artificial Intelligence and Machine Learning, ..."
1,https://about.fb.com/news/2024/11/cracking-dow...,Cracking Down on Organized Crime Behind Scam C...,"November 21, 2024","January 6, 2025",na,na,"Every day, criminals target people across the ...",Meta,"Safety, Scams, Security News, Well-Being, Inte..."
2,https://about.fb.com/news/2024/11/introducing-...,Reshape Your Instagram With a Recommendations ...,"November 19, 2024","November 19, 2024",na,na,We want to make sure everyone on Instagram – e...,Instagram,"Safety, Well-Being, Product News, Safety and E..."
3,https://about.fb.com/news/2024/10/testing-comb...,Testing New Ways to Combat Scams and Help Rest...,"October 21, 2024","December 10, 2024",na,na,"We know security matters, and that includes be...",Facebook Instagram,"Creators, Product News, Safety, Scams, Public ..."
4,https://about.fb.com/news/2024/10/instagram-ca...,Our New Education Campaign to Help Protect Tee...,"October 17, 2024","December 10, 2024",na,na,"Sextortion is a horrific crime, where financia...",Instagram,"Safety, Scams, Well-Being, Public Policy, Safe..."
...,...,...,...,...,...,...,...,...,...
215,https://about.fb.com/news/2018/12/take-down-in...,Taking Down Coordinated Inauthentic Behavior i...,"December 20, 2018","March 24, 2021",na,na,"ByNathaniel Gleicher, Head of Cybersecurity Po...",Meta,"Community Standards and Enforcement, Coordinat..."
216,https://about.fb.com/news/2018/12/guardian-fac...,Responding to The Guardian: A Fact-Check on Fa...,"December 13, 2018","June 22, 2020",na,na,"ByMeredith Carden,Head of News Integrity Partn...",Meta,"Combating Misinformation, Community Standards ..."
217,https://about.fb.com/news/2018/11/content-stan...,Product Policy Forum Minutes,"November 15, 2018","January 26, 2022",na,na,We recognize how important it is for Facebook ...,Meta,"Community Standards and Enforcement, Public Po..."
218,https://about.fb.com/news/2018/11/enforcing-ou...,How Are We Doing at Enforcing Our Community St...,"November 15, 2018","December 13, 2019",na,na,"ByGuy Rosen, VP of Product Management People w...",Meta,"Community Standards and Enforcement, Community..."


In [10]:
append_text_category(combat_misinfo)

Unnamed: 0,url,article_title,date,updated_date,author,author_pos,text,category,tags
0,https://about.fb.com/news/2024/12/2024-global-...,What We Saw on Our Platforms During 2024’s Glo...,"December 3, 2024","December 3, 2024",na,na,The recent presidential election in the United...,Meta,"Artificial Intelligence and Machine Learning, ..."
1,https://about.fb.com/news/2022/11/metas-progre...,Sharing Our Progress on Combating Climate Change,"November 6, 2022","November 11, 2022",na,na,"Update on November 11, 2022 at 8:00AM PT: Meta...",Facebook Meta,"Climate, Combating Misinformation, Sustainability"
2,https://about.fb.com/news/2022/08/how-meta-is-...,How Meta Is Preparing for Brazil’s 2022 Elections,"August 12, 2022","August 16, 2022",na,na,"Update on August 16, 2022 at 10:30AM PT: As pa...",Meta,"Combating Misinformation, Elections, Election ..."
3,https://about.fb.com/news/2022/07/oversight-bo...,Meta Asks Oversight Board to Advise on COVID-1...,"July 26, 2022","June 16, 2023",Nick Clegg,"President, Global Affairs","Update on June 16, 2023 at 6:00AM PT: Today, w...",Meta,"Combating Misinformation, COVID-19 Response, H..."
4,https://about.fb.com/news/2022/07/how-metas-pr...,How Meta is Preparing for Kenya’s 2022 General...,"July 20, 2022","July 20, 2022",Mercy Ndegwa,Director of Public Policy East & Horn of Africa,"Today, we’re sharing an update on our work to ...",Meta,"Combating Hate Speech, Combating Misinformatio..."
...,...,...,...,...,...,...,...,...,...
75,https://about.fb.com/news/2018/10/removing-ina...,Removing Additional Inauthentic Activity from ...,"October 11, 2018","June 22, 2020",na,na,"ByNathaniel Gleicher, Head of Cybersecurity Po...",Meta,"Combating Misinformation, Community Standards ..."
76,https://about.fb.com/news/2018/09/expanding-fa...,Expanding Fact-Checking to Photos and Videos,"September 13, 2018","June 22, 2020",na,na,"By Antonia Woodford, Product Manager We know t...",Facebook,"Combating Misinformation, False News, Photos, ..."
77,https://about.fb.com/news/2018/06/hard-questio...,How Is Facebook’s Fact-Checking Program Working?,"June 14, 2018","June 22, 2020",na,na,Hard Questions isa seriesfrom Facebook that ad...,Meta,"Combating Misinformation, False News, Hard Que..."
78,https://about.fb.com/news/2018/05/facing-facts...,Facing Facts: Facebook’s Fight Against Misinfo...,"May 23, 2018","June 22, 2020",na,na,"ByJohn Hegeman, Head of News Feed Over the las...",Meta,"Combating Misinformation, False News, Company ..."


In [11]:
append_text_category(data_and_privacy)

Unnamed: 0,url,article_title,date,updated_date,author,author_pos,text,category,tags
0,https://about.fb.com/news/2024/03/end-to-end-e...,End-to-End Encryption on Messenger Explained,"March 28, 2024","March 28, 2024",na,na,"In December, we startedrolling out default end...",,", Data and Privacy, Messenger, Product News"
1,https://about.fb.com/news/2024/01/investing-in...,Investing In Privacy,"January 25, 2024","February 29, 2024",Michel Protti,"Chief Privacy Officer, Product","Update on February 29, 2024 at 9:30AM PT Learn...",Meta,"Privacy Matters, Data and Privacy, Public Policy"
2,https://about.fb.com/news/2023/12/default-end-...,Launching Default End-to-End Encryption on Mes...,"December 6, 2023","December 11, 2023",Loredana Crisan,Head of Messenger,Today I’m delighted to announce that we are ro...,,", Data and Privacy, Messenger, Product News"
3,https://about.fb.com/news/2023/11/new-tools-to...,New Tools to Support Independent Research,"November 21, 2023","September 26, 2024",Nick Clegg,"President, Global Affairs","Update on September 26, 2024 at 9:00AM PT: Thr...",Meta,", Data and Privacy"
4,https://about.fb.com/news/2023/10/collaboratin...,Feedback Is a Gift: How We Collaborate to Buil...,"October 5, 2023","January 24, 2024",Erin Egan,"Chief Privacy Officer, Policy",This week Meta hosted nearly 100 privacy acade...,Meta,"Privacy Conversations, Privacy Matters, Data a..."
...,...,...,...,...,...,...,...,...,...
163,https://about.fb.com/news/2019/02/addressing-c...,What Is Facebook Doing to Address the Challeng...,"February 4, 2019","November 11, 2019",na,na,The most frequently asked questions we receive...,Meta,"Ads and Pages Transparency, Community Standard..."
164,https://about.fb.com/news/2019/01/a-discussion...,A Discussion with Nick Clegg,"January 28, 2019","January 31, 2021",na,na,Vice President of Global Affairs and Communica...,Meta,"Ads and Pages Transparency, Community Standard..."
165,https://about.fb.com/news/2019/01/data-privacy...,Marking Data Privacy Day 2019,"January 27, 2019","November 7, 2019",na,na,"ByErin Egan, Chief Privacy Officer, Policy Mon...",Meta,"Controls, Resources, Data and Privacy, Public ..."
166,https://about.fb.com/news/2019/01/designing-se...,Designing Security for Billions,"January 25, 2019","August 29, 2022",na,na,"By Collin Greene, Manager of Product Security ...",Meta,"Security News, Data and Privacy, Inside Feed, ..."


In [None]:
safety_and_expression['date'] = safety_and_expression['date'].map(lambda x: datetime.strptime(x, "%B %d, %Y").strftime("%Y-%m-%d"))
safety_and_expression['updated_date'] = safety_and_expression['updated_date'].map(lambda x: datetime.strptime(x, "%B %d, %Y").strftime("%Y-%m-%d") if x else None)

In [18]:
combat_misinfo['date'] = combat_misinfo['date'].map(lambda x: datetime.strptime(x, "%B %d, %Y").strftime("%Y-%m-%d"))
combat_misinfo['updated_date'] = combat_misinfo['updated_date'].map(lambda x: datetime.strptime(x, "%B %d, %Y").strftime("%Y-%m-%d") if x else None)

In [19]:
data_and_privacy['date'] = data_and_privacy['date'].map(lambda x: datetime.strptime(x, "%B %d, %Y").strftime("%Y-%m-%d"))
data_and_privacy['updated_date'] = data_and_privacy['updated_date'].map(lambda x: datetime.strptime(x, "%B %d, %Y").strftime("%Y-%m-%d") if x else None)

In [20]:
safety_and_expression.to_excel('meta_safety_expression.xlsx', index=False)
files.download('meta_safety_expression.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [21]:
combat_misinfo.to_excel('meta_content_misinfo.xlsx', index=False)
files.download('meta_content_misinfo.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [22]:
data_and_privacy.to_excel('meta_data_privacy.xlsx', index=False)
files.download("meta_data_privacy.xlsx")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>