# Scraping content

In [None]:
# Import packages
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
from dateutil import relativedelta
from datetime import datetime
from dateutil.relativedelta import relativedelta
import time

In [None]:
# Load the outcomes spreadsheet and collect the values of its "link" column,
# which the scraping step below iterates over.
outcomes_path = "../databases/Pipeline2_Outcome.xlsx"
df_outcomes = pd.read_excel(outcomes_path)
links = df_outcomes["link"].tolist()

In [None]:
# Scrape content
#
# For every URL in `links`, download the page and collect the text of the <p>
# elements that are direct children of the "post__content" and "post__body"
# containers. Results are appended to `data` as {"Content": [...], "Link": url}.
# URLs that still fail after `max_retries` attempts are appended to
# `missed_data`.
data = []
missed_data = []

# Loop-invariant settings, built once instead of on every URL.
headers = {"User-Agent": "Mozilla/5.0"}
max_retries = 5  # prevent infinite loops
request_timeout = 30  # seconds; without a timeout a stalled server hangs the run
container_classes = ("post__content", "post__body")

for url in links:
    for _attempt in range(max_retries):
        # Only the network call lives in the try block, so parsing problems are
        # never mistaken for (and retried as) request failures.
        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)

            if response.status_code == 429:
                print(f"Rate limited (429) on {url}, retrying...")
                time.sleep(1)
                continue

            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            # Covers connection errors, timeouts and HTTP error statuses.
            print(f"Error fetching {url}: {e}, retrying...")
            time.sleep(1)
            continue

        soup = BeautifulSoup(response.text, "html.parser")

        content_text = []
        found_container = False
        for class_name in container_classes:
            container = soup.find(class_=class_name)
            if container is None:
                continue
            found_container = True
            # recursive=False: only paragraphs that are direct children of the
            # container, not ones nested deeper inside it.
            paragraphs = container.find_all("p", recursive=False)
            content_text.extend(p.get_text(strip=True) for p in paragraphs)

        # Warn only when neither container exists; a page that has just one of
        # the two is a normal case and must not be reported as missing content.
        if not found_container:
            print(f"No 'content' section found on {url}")

        data.append({"Content": content_text, "Link": url})
        break  # Success: stop retrying this URL
    else:
        # The retry loop ran out without reaching `break`.
        print(f"Failed after retries: {url}")
        missed_data.append(url)

In [None]:
# Turn the scraped records into a two-column table and write it to an Excel
# workbook.
output_columns = ["Content", "Link"]
df = pd.DataFrame(data, columns=output_columns)
df.to_excel("shelfens/Pipeline2_Matched_dataset.xlsx")