### Project Goal:



The goal of this project is to webscrape Meta's newsroom for articles tagged with

* Safety and Expression
* Combatting Misinformation
* Data and Privacy

### Import packages

In [1]:
from bs4 import BeautifulSoup
from google.colab import files
import requests
import pandas as pd

### Write function to scrape article metadata, text from Meta's newsroom

In [85]:
def scrape_meta_section(url, page_range):
  data = [ ]
  for i in range(0, page_range):
    request = requests.get(f"{url}/{i}/")
    parser = BeautifulSoup(request.content, "html.parser")

    area_section = parser.find("div", class_="archive-articles-container")
    article_section = area_section.find_all("div", class_="uk-width-3-5 article-excerpt")
    for article in article_section:
      try:
        header = article.find("header", class_="entry-header")
        link_source = header.find("a", href=True)

        article_title = link_source.get_text()
        link = link_source.get("href")

        date_source = header.find("time", class_="entry-date published")
        date_source2 = header.find("time", class_="updated")
        date = date_source.get_text()
        date2 = date_source2.get_text()

        data.append((link, article_title, date, date2))

      except:
        header = article.find("header", class_="entry-header")
        link_source = header.find("a", href=True)

        article_title = link_source.get_text()
        link = link_source.get("href")

        date_source = header.find("time", class_="entry-date published updated")
        date = date_source.get_text()
        date2 = None

        data.append((link, article_title, date, date2))
  return pd.DataFrame(data, columns=["Link", "Article Title" , "Date", "Updated Date"])




def append_text_category(dataframe):
  for idx, row in dataframe.iterrows():
      article_request = requests.get(url=row['Link'])
      parser = BeautifulSoup(article_request.content, "html.parser")

      text_section = parser.find("div", class_="uk-width-2-3@m article-container")
      paragraphs = text_section.find_all("p")

      cleaned_text = [para.get_text(strip=True) for para in paragraphs]
      plain_text = "\n".join(cleaned_text)

      category_source = parser.find("div", class_="entry-categories")
      category = category_source.find_all("a")
      category_text = [cat.get_text() for cat in category]
      category_text_cleaned =" ".join(category_text)
      dataframe.at[idx, "Text"] = plain_text
      dataframe.at[idx, "Category"] = category_text_cleaned

      print(dataframe.iloc[idx])

  return dataframe



### Scrape metadata section of the Meta blog

In [93]:
safety_and_expression = scrape_meta_section(url="https://about.fb.com/news/category/safety-and-expression/page/", page_range=21)

safety_and_expression

Unnamed: 0,Link,Article Title,Date,Updated Date
0,https://about.fb.com/news/2024/10/testing-comb...,Testing New Ways to Combat Scams and Help Rest...,"October 21, 2024","October 21, 2024"
1,https://about.fb.com/news/2024/10/instagram-ca...,Our New Education Campaign to Help Protect Tee...,"October 17, 2024","October 21, 2024"
2,https://about.fb.com/news/2024/09/instagram-te...,Introducing Instagram Teen Accounts: Built-In ...,"September 17, 2024","September 17, 2024"
3,https://about.fb.com/news/2024/09/preventing-s...,Preventing Suicide and Self-Harm Content Sprea...,"September 12, 2024","September 12, 2024"
4,https://about.fb.com/news/2024/07/combating-fi...,Combating Financial Sextortion Scams From Nigeria,"July 24, 2024","July 24, 2024"
...,...,...,...,...
205,https://about.fb.com/news/2019/02/addressing-c...,What Is Facebook Doing to Address the Challeng...,"February 4, 2019","November 11, 2019"
206,https://about.fb.com/news/2019/01/taking-down-...,Taking Down Coordinated Inauthentic Behavior i...,"January 31, 2019","March 24, 2021"
207,https://about.fb.com/news/2019/01/a-discussion...,A Discussion with Nick Clegg,"January 28, 2019","January 31, 2021"
208,https://about.fb.com/news/2019/01/oversight-bo...,Charting a Course for an Oversight Board for C...,"January 28, 2019","May 19, 2021"


In [90]:
combat_misinfo = scrape_meta_section(url ="https://about.fb.com/news/tag/misinformation/page", page_range=8)

combat_misinfo

Unnamed: 0,Link,Article Title,Date,Updated Date
0,https://about.fb.com/news/2022/11/metas-progre...,Sharing Our Progress on Combating Climate Change,"November 6, 2022","November 11, 2022"
1,https://about.fb.com/news/2022/08/how-meta-is-...,How Meta Is Preparing for Brazil’s 2022 Elections,"August 12, 2022","August 16, 2022"
2,https://about.fb.com/news/2022/07/oversight-bo...,Meta Asks Oversight Board to Advise on COVID-1...,"July 26, 2022","June 16, 2023"
3,https://about.fb.com/news/2022/07/how-metas-pr...,How Meta is Preparing for Kenya’s 2022 General...,"July 20, 2022","July 20, 2022"
4,https://about.fb.com/news/2022/02/metas-ongoin...,Meta’s Ongoing Efforts Regarding Russia’s Inva...,"February 26, 2022","April 18, 2022"
...,...,...,...,...
74,https://about.fb.com/news/2018/10/removing-ina...,Removing Additional Inauthentic Activity from ...,"October 11, 2018","June 22, 2020"
75,https://about.fb.com/news/2018/09/expanding-fa...,Expanding Fact-Checking to Photos and Videos,"September 13, 2018","June 22, 2020"
76,https://about.fb.com/news/2018/06/hard-questio...,How Is Facebook’s Fact-Checking Program Working?,"June 14, 2018","June 22, 2020"
77,https://about.fb.com/news/2018/05/facing-facts...,Facing Facts: Facebook’s Fight Against Misinfo...,"May 23, 2018","June 22, 2020"


In [94]:
data_and_privacy = scrape_meta_section(url = "https://about.fb.com/news/category/data-and-privacy/page/", page_range=17)

data_and_privacy

Unnamed: 0,Link,Article Title,Date,Updated Date
0,https://about.fb.com/news/2024/03/end-to-end-e...,End-to-End Encryption on Messenger Explained,"March 28, 2024","March 28, 2024"
1,https://about.fb.com/news/2024/01/investing-in...,Investing In Privacy,"January 25, 2024","February 29, 2024"
2,https://about.fb.com/news/2023/12/default-end-...,Launching Default End-to-End Encryption on Mes...,"December 6, 2023","December 11, 2023"
3,https://about.fb.com/news/2023/11/new-tools-to...,New Tools to Support Independent Research,"November 21, 2023","September 26, 2024"
4,https://about.fb.com/news/2023/10/collaboratin...,Feedback Is a Gift: How We Collaborate to Buil...,"October 5, 2023","January 24, 2024"
...,...,...,...,...
163,https://about.fb.com/news/2019/02/addressing-c...,What Is Facebook Doing to Address the Challeng...,"February 4, 2019","November 11, 2019"
164,https://about.fb.com/news/2019/01/a-discussion...,A Discussion with Nick Clegg,"January 28, 2019","January 31, 2021"
165,https://about.fb.com/news/2019/01/data-privacy...,Marking Data Privacy Day 2019,"January 27, 2019","November 7, 2019"
166,https://about.fb.com/news/2019/01/designing-se...,Designing Security for Billions,"January 25, 2019","August 29, 2022"


### Scrape article dataset and download as XLSX file

In [95]:
append_text_category(safety_and_expression)
safety_and_expression.to_excel('meta_safety_expression.xlsx', index=False)
files.download('meta_safety_expression.xlsx')

Link             https://about.fb.com/news/2024/10/testing-comb...
Article Title    Testing New Ways to Combat Scams and Help Rest...
Date                                              October 21, 2024
Updated Date                                      October 21, 2024
Text             We know security matters, and that includes be...
Category         Facebook Instagram Public Policy Safety and Ex...
Name: 0, dtype: object
Link             https://about.fb.com/news/2024/10/instagram-ca...
Article Title    Our New Education Campaign to Help Protect Tee...
Date                                              October 17, 2024
Updated Date                                      October 21, 2024
Text             Sextortion is a horrific crime, where financia...
Category             Instagram Public Policy Safety and Expression
Name: 1, dtype: object
Link             https://about.fb.com/news/2024/09/instagram-te...
Article Title    Introducing Instagram Teen Accounts: Built-In ...
Date            

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [92]:
combat_misinfo = append_text_category(combat_misinfo)
combat_misinfo.to_excel('meta_content_misinfo.xlsx', index=False)
files.download('meta_content_misinfo.xlsx')

Link             https://about.fb.com/news/2022/11/metas-progre...
Article Title     Sharing Our Progress on Combating Climate Change
Date                                              November 6, 2022
Updated Date                                     November 11, 2022
Text             Update on November 11, 2022 at 8:00AM PT:\nMet...
Category                              Facebook Meta Sustainability
Name: 0, dtype: object
Link             https://about.fb.com/news/2022/08/how-meta-is-...
Article Title    How Meta Is Preparing for Brazil’s 2022 Elections
Date                                               August 12, 2022
Updated Date                                       August 16, 2022
Text             Update on August 16, 2022 at 10:30AM PT:\nAs p...
Category         Election Integrity Integrity and Security Meta...
Name: 1, dtype: object
Link             https://about.fb.com/news/2022/07/oversight-bo...
Article Title    Meta Asks Oversight Board to Advise on COVID-1...
Date            

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [96]:
append_text_category(data_and_privacy)
data_and_privacy.to_excel('meta_data_privacy.xlsx', index=False)
files.download("meta_data_privacy.xlsx")

Link             https://about.fb.com/news/2024/03/end-to-end-e...
Article Title         End-to-End Encryption on Messenger Explained
Date                                                March 28, 2024
Updated Date                                        March 28, 2024
Text             In December, we startedrolling out default end...
Category                   Data and Privacy Messenger Product News
Name: 0, dtype: object
Link             https://about.fb.com/news/2024/01/investing-in...
Article Title                                 Investing In Privacy
Date                                              January 25, 2024
Updated Date                                     February 29, 2024
Text             Update on February 29, 2024 at 9:30AM PT\nLear...
Category                       Data and Privacy Meta Public Policy
Name: 1, dtype: object
Link             https://about.fb.com/news/2023/12/default-end-...
Article Title    Launching Default End-to-End Encryption on Mes...
Date            

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>