In [1]:
import os
import time
import pandas as pd
from google.cloud import bigquery
from newspaper import Article
from bs4 import BeautifulSoup
import requests

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/reppmazc/Documents/IRONHACK/quests/final_project/fourth-epigram-440716-f1-502459dd1bae.json"

# Initialize the BigQuery client
client = bigquery.Client()



In [3]:
# Define the list of outlets for which BeautifulSoup is preferred
beautifulsoup_outlets = [
    'www.tagesspiegel.de',
    'orf.at',
    'www.welt.de',
    'www.diepresse.com',
    'www.focus.de']

In [7]:
# Function to scrape full content using `newspaper`
def scrape_article_content(url):
    try:
        article = Article(url)
        article.download()
        article.parse()
        return article.text
    except Exception as e:
        print(f"Error scraping {url} with `newspaper`: {e}")
        return None

# Function to scrape content using BeautifulSoup
def scrape_content(url):
    try:
        response = requests.get(url, timeout=5)
        response.raise_for_status()  # Raise an exception for HTTP errors
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Extract main article text
        paragraphs = soup.find_all('p')
        content = ' '.join([p.get_text() for p in paragraphs])
        return content
    except Exception as e:
        print(f"Failed to scrape {url} with BeautifulSoup: {e}")
        return None  # Return None if scraping failed

# Function to select scraping method based on URL
def get_scraping_method(url):
    # If URL contains any of the specified outlets, use BeautifulSoup method
    if any(outlet in url for outlet in beautifulsoup_outlets):
        return scrape_content  # Use BeautifulSoup method from script 2
    else:
        return scrape_article_content  # Default to newspaper method from script 1

# Parameters for batch processing
start_line = 5000
end_line = 904360
batch_size = 2000

# Main loop with conditional scraping
for offset in range(start_line, end_line, batch_size):
    # Query the next chunk
    query = f"""
    SELECT DocumentIdentifier
    FROM `fourth-epigram-440716-f1.news_analysis.gdelt`
    LIMIT {batch_size} OFFSET {offset}
    """
    urls_df = client.query(query).to_dataframe()
    
    # Scrape content based on selected method
    contents = []
    for index, row in urls_df.iterrows():
        url = row['DocumentIdentifier']
        scrape_method = get_scraping_method(url)
        
        if scrape_method:
            content = scrape_method(url)
        else:
            content = None
            print(f"No scraping method available for {url}")
        
        contents.append(content)
        time.sleep(1)  # Optional delay to avoid rate-limiting

    # Save results
    results_df = pd.DataFrame({
        'DocumentIdentifier': urls_df['DocumentIdentifier'],
        'Content': contents})
    #csv_filename = f"/Users/reppmazc/Documents/IRONHACK/quests/final_project/scraped_content_{offset}_{offset + batch_size}.csv"
    csv_filename = f"/scraped_content_{offset}_{offset + batch_size}.csv"
    results_df.to_csv(csv_filename, index=False)
    print(f"Saved {csv_filename}")



Error scraping https://www.stern.de/gesellschaft/regional/rheinland-pfalz-saarland/wetter--steigende-temperaturen-erwartet---bis-zu-32-grad-am-freitag-34885656.html with `newspaper`: Article `download()` failed with 404 Client Error: Not Found for url: https://www.stern.de/gesellschaft/regional/rheinland-pfalz-saarland/wetter--steigende-temperaturen-erwartet---bis-zu-32-grad-am-freitag-34885656.html on URL https://www.stern.de/gesellschaft/regional/rheinland-pfalz-saarland/wetter--steigende-temperaturen-erwartet---bis-zu-32-grad-am-freitag-34885656.html
Error scraping https://www.stern.de/gesellschaft/regional/hessen/unfall--feuerwehrwagen-prallt-in-lkws---drei-schwerverletzte-34857254.html with `newspaper`: Article `download()` failed with 404 Client Error: Not Found for url: https://www.stern.de/gesellschaft/regional/hessen/unfall--feuerwehrwagen-prallt-in-lkws---drei-schwerverletzte-34857254.html on URL https://www.stern.de/gesellschaft/regional/hessen/unfall--feuerwehrwagen-prallt-



Error scraping https://www.stern.de/gesellschaft/regional/rheinland-pfalz-saarland/wald--neues-jagdgesetz-kommt-spaeter---ueberarbeiteter-entwurf-steht-35017422.html with `newspaper`: Article `download()` failed with 404 Client Error: Not Found for url: https://www.stern.de/gesellschaft/regional/rheinland-pfalz-saarland/wald--neues-jagdgesetz-kommt-spaeter---ueberarbeiteter-entwurf-steht-35017422.html on URL https://www.stern.de/gesellschaft/regional/rheinland-pfalz-saarland/wald--neues-jagdgesetz-kommt-spaeter---ueberarbeiteter-entwurf-steht-35017422.html
Error scraping https://www.dw.com/sr/uvode-se-kontrole-na-svim-nema%C4%8Dkim-granicama/a-70173371 with `newspaper`: Article `download()` failed with HTTPSConnectionPool(host='www.dw.com', port=443): Read timed out. (read timeout=7) on URL https://www.dw.com/sr/uvode-se-kontrole-na-svim-nema%C4%8Dkim-granicama/a-70173371
Error scraping https://www.stern.de/gesellschaft/regional/sachsen/munitionsfund--80-panzerfaeuste-bei-bauarbeiten-i

Building prefix dict from /opt/anaconda3/lib/python3.9/site-packages/jieba/dict.txt ...
Loading model from cache /var/folders/4j/klr6l0gn60b9blsxb73r3jq40000gn/T/jieba.cache
Loading model cost 0.8341422080993652 seconds.
Prefix dict has been built succesfully.


Error scraping https://www.stern.de/kultur/versteigerung-gescheitert--sammlungen-erwerben-skizzenbuch-von-caspar-david-friedrich-34869498.html with `newspaper`: Article `download()` failed with 404 Client Error: Not Found for url: https://www.stern.de/kultur/versteigerung-gescheitert--sammlungen-erwerben-skizzenbuch-von-caspar-david-friedrich-34869498.html on URL https://www.stern.de/kultur/versteigerung-gescheitert--sammlungen-erwerben-skizzenbuch-von-caspar-david-friedrich-34869498.html
Failed to scrape https://www.focus.de/wissen/diverses/tiere-doch-nicht-so-kolossal-zweifel-an-errechnetem-wal-gewicht_id_259718359.html with BeautifulSoup: 404 Client Error: Not Found for url: https://www.focus.de/wissen/diverses/tiere-doch-nicht-so-kolossal-zweifel-an-errechnetem-wal-gewicht_id_259718359.html
Error scraping https://www.stern.de/politik/deutschland/parteien--lindner-ueber-appelle-der-union--keine--koalitionsspielchen--34651454.html with `newspaper`: Article `download()` failed with 40



Error scraping https://www.stern.de/gesellschaft/regional/hessen/wetter--herbstwetter-in-hessen--schauer-und-meist-unter-20-grad-35051196.html with `newspaper`: Article `download()` failed with 404 Client Error: Not Found for url: https://www.stern.de/gesellschaft/regional/hessen/wetter--herbstwetter-in-hessen--schauer-und-meist-unter-20-grad-35051196.html on URL https://www.stern.de/gesellschaft/regional/hessen/wetter--herbstwetter-in-hessen--schauer-und-meist-unter-20-grad-35051196.html
Error scraping https://www.stern.de/gesellschaft/regional/berlin-brandenburg/terrorismus--raf-fahndung--polizei-durchsucht-wagenburg---keine-festnahme-34512434.html with `newspaper`: Article `download()` failed with 404 Client Error: Not Found for url: https://www.stern.de/gesellschaft/regional/berlin-brandenburg/terrorismus--raf-fahndung--polizei-durchsucht-wagenburg---keine-festnahme-34512434.html on URL https://www.stern.de/gesellschaft/regional/berlin-brandenburg/terrorismus--raf-fahndung--polizei



Failed to scrape https://www.welt.de/regionales/sachsen-anhalt/article250006960/Termine-fuer-Gerichtsprozess-gegen-Hoecke-stehen-fest.html with BeautifulSoup: 404 Client Error: Not Found for url: https://www.welt.de/regionales/sachsen-anhalt/article250006960/Termine-fuer-Gerichtsprozess-gegen-Hoecke-stehen-fest.html
Error scraping https://www.stern.de/gesellschaft/regional/bayern/muehldorf-am-inn--motorradfahrer-stirbt-nach-zusammenstoss-mit-auto-33973102.html with `newspaper`: Article `download()` failed with 404 Client Error: Not Found for url: https://www.stern.de/gesellschaft/regional/bayern/muehldorf-am-inn--motorradfahrer-stirbt-nach-zusammenstoss-mit-auto-33973102.html on URL https://www.stern.de/gesellschaft/regional/bayern/muehldorf-am-inn--motorradfahrer-stirbt-nach-zusammenstoss-mit-auto-33973102.html
Error scraping https://www.stern.de/gesellschaft/regional/baden-wuerttemberg/verkehr--jugendlicher-bei-unfall-mit-e-scooter-verletzt-34754290.html with `newspaper`: Article `do



Error scraping https://www.stern.de/politik/ausland/klimagipfel-in-dubai--cop28--fossiler-ausstieg-fehlt-im-entwurf-fuer-abschlusstext-34278040.html with `newspaper`: Article `download()` failed with 404 Client Error: Not Found for url: https://www.stern.de/politik/ausland/klimagipfel-in-dubai--cop28--fossiler-ausstieg-fehlt-im-entwurf-fuer-abschlusstext-34278040.html on URL https://www.stern.de/politik/ausland/klimagipfel-in-dubai--cop28--fossiler-ausstieg-fehlt-im-entwurf-fuer-abschlusstext-34278040.html


KeyboardInterrupt: 