In [3]:
import os
import time
import pandas as pd
from google.cloud import bigquery
from newspaper import Article
from bs4 import BeautifulSoup
import requests

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/reppmazc/Documents/IRONHACK/quests/final_project/fourth-epigram-440716-f1-502459dd1bae.json"

# Initialize the BigQuery client
client = bigquery.Client()

In [4]:
# Define the list of outlets for which BeautifulSoup is preferred
beautifulsoup_outlets = [
    'www.tagesspiegel.de',
    'orf.at',
    'www.welt.de',
    'www.diepresse.com',
    'www.focus.de']

In [5]:
# Function to scrape full content using `newspaper`
def scrape_article_content(url):
    try:
        article = Article(url)
        article.download()
        article.parse()
        return article.text
    except Exception as e:
        print(f"Error scraping {url} with `newspaper`: {e}")
        return None

# Function to scrape content using BeautifulSoup
def scrape_content(url):
    try:
        response = requests.get(url, timeout=5)
        response.raise_for_status()  # Raise an exception for HTTP errors
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Extract main article text
        paragraphs = soup.find_all('p')
        content = ' '.join([p.get_text() for p in paragraphs])
        return content
    except Exception as e:
        print(f"Failed to scrape {url} with BeautifulSoup: {e}")
        return None  # Return None if scraping failed

# Function to select scraping method based on URL
def get_scraping_method(url):
    # If URL contains any of the specified outlets, use BeautifulSoup method
    if any(outlet in url for outlet in beautifulsoup_outlets):
        return scrape_content  # Use BeautifulSoup method from script 2
    else:
        return scrape_article_content  # Default to newspaper method from script 1

# Parameters for batch processing
start_line = 510001
end_line = 904360
batch_size = 2000

# Main loop with conditional scraping
for offset in range(start_line, end_line, batch_size):
    # Query the next chunk
    query = f"""
    SELECT DocumentIdentifier
    FROM `fourth-epigram-440716-f1.news_analysis.gdelt`
    LIMIT {batch_size} OFFSET {offset}
    """
    urls_df = client.query(query).to_dataframe()
    
    # Scrape content based on selected method
    contents = []
    for index, row in urls_df.iterrows():
        url = row['DocumentIdentifier']
        scrape_method = get_scraping_method(url)
        
        if scrape_method:
            content = scrape_method(url)
        else:
            content = None
            print(f"No scraping method available for {url}")
        
        contents.append(content)
        time.sleep(1)  # Optional delay to avoid rate-limiting

    # Save results
    results_df = pd.DataFrame({
        'DocumentIdentifier': urls_df['DocumentIdentifier'],
        'Content': contents})
    csv_filename = f"/Users/reppmazc/Documents/IRONHACK/quests/final_project/scraped_content_{offset}_{offset + batch_size}.csv"
    #csv_filename = f"/scraped_content_{offset}_{offset + batch_size}.csv"
    results_df.to_csv(csv_filename, index=False)
    print(f"Saved {csv_filename}")



Error scraping https://www.stern.de/gesellschaft/regional/thueringen/kriminalitaet--voigt--attacken-auf-politiker-inakzeptabel-34771408.html with `newspaper`: Article `download()` failed with 404 Client Error: Not Found for url: https://www.stern.de/gesellschaft/regional/thueringen/kriminalitaet--voigt--attacken-auf-politiker-inakzeptabel-34771408.html on URL https://www.stern.de/gesellschaft/regional/thueringen/kriminalitaet--voigt--attacken-auf-politiker-inakzeptabel-34771408.html
Error scraping https://www.stern.de/gesellschaft/regional/baden-wuerttemberg/fussball-bundesliga--im-zweiten-anlauf--nach-anton-verliert-der-vfb-auch-guirassy-34895234.html with `newspaper`: Article `download()` failed with 404 Client Error: Not Found for url: https://www.stern.de/gesellschaft/regional/baden-wuerttemberg/fussball-bundesliga--im-zweiten-anlauf--nach-anton-verliert-der-vfb-auch-guirassy-34895234.html on URL https://www.stern.de/gesellschaft/regional/baden-wuerttemberg/fussball-bundesliga--im-

Building prefix dict from /opt/anaconda3/lib/python3.9/site-packages/jieba/dict.txt ...
Loading model from cache /var/folders/4j/klr6l0gn60b9blsxb73r3jq40000gn/T/jieba.cache
Loading model cost 0.9159789085388184 seconds.
Prefix dict has been built succesfully.


Error scraping https://www.stern.de/gesellschaft/regional/hessen/ausbildung--mehr-studienabschluesse-im-fach-mathematik-34537738.html with `newspaper`: Article `download()` failed with 404 Client Error: Not Found for url: https://www.stern.de/gesellschaft/regional/hessen/ausbildung--mehr-studienabschluesse-im-fach-mathematik-34537738.html on URL https://www.stern.de/gesellschaft/regional/hessen/ausbildung--mehr-studienabschluesse-im-fach-mathematik-34537738.html
Error scraping https://www.dw.com/zh/%E5%93%88%E9%A9%AC%E6%96%AF%E7%A7%B0%E4%B8%AD%E5%9B%BD%E6%9C%89%E6%84%8F%E4%BB%BF%E6%95%88%E6%94%BB%E5%8F%B0-%E8%A7%A3%E6%94%BE%E5%86%9B%E5%B0%86%E9%A2%86%E9%A9%B3%E6%96%A5/a-67275163 with `newspaper`: Article `download()` failed with HTTPSConnectionPool(host='www.dw.com', port=443): Read timed out. (read timeout=7) on URL https://www.dw.com/zh/%E5%93%88%E9%A9%AC%E6%96%AF%E7%A7%B0%E4%B8%AD%E5%9B%BD%E6%9C%89%E6%84%8F%E4%BB%BF%E6%95%88%E6%94%BB%E5%8F%B0-%E8%A7%A3%E6%94%BE%E5%86%9B%E5%B0%86%E9%



Error scraping https://www.stern.de/politik/ausland/spitzentreffen-in-washington--am-ende-noch-ein-eklat--ungarn-provoziert-bei-nato-gipfel-34875020.html with `newspaper`: Article `download()` failed with 404 Client Error: Not Found for url: https://www.stern.de/politik/ausland/spitzentreffen-in-washington--am-ende-noch-ein-eklat--ungarn-provoziert-bei-nato-gipfel-34875020.html on URL https://www.stern.de/politik/ausland/spitzentreffen-in-washington--am-ende-noch-ein-eklat--ungarn-provoziert-bei-nato-gipfel-34875020.html
Error scraping https://www.stern.de/gesellschaft/regional/hamburg-schleswig-holstein/nahverkehr--schaden-an-bruecke---u1-faehrt-auf-teilstrecke-weiter-nicht-34875088.html with `newspaper`: Article `download()` failed with 404 Client Error: Not Found for url: https://www.stern.de/gesellschaft/regional/hamburg-schleswig-holstein/nahverkehr--schaden-an-bruecke---u1-faehrt-auf-teilstrecke-weiter-nicht-34875088.html on URL https://www.stern.de/gesellschaft/regional/hamburg-



Error scraping https://www.stern.de/politik/deutschland/1--mai--hamburg--linksextreme-starten-im-schanzenviertel-zur-demo-34675380.html with `newspaper`: Article `download()` failed with 404 Client Error: Not Found for url: https://www.stern.de/politik/deutschland/1--mai--hamburg--linksextreme-starten-im-schanzenviertel-zur-demo-34675380.html on URL https://www.stern.de/politik/deutschland/1--mai--hamburg--linksextreme-starten-im-schanzenviertel-zur-demo-34675380.html
Error scraping https://www.stern.de/politik/deutschland/europawahlen--datenschutzbeauftragte-warnt-vor-gezielter-wahlwerbung-34651832.html with `newspaper`: Article `download()` failed with 404 Client Error: Not Found for url: https://www.stern.de/politik/deutschland/europawahlen--datenschutzbeauftragte-warnt-vor-gezielter-wahlwerbung-34651832.html on URL https://www.stern.de/politik/deutschland/europawahlen--datenschutzbeauftragte-warnt-vor-gezielter-wahlwerbung-34651832.html
Error scraping https://www.stern.de/gesellsch



Error scraping https://www.stern.de/gesellschaft/regional/niedersachsen-bremen/abitur--ministerium-kuendigt-konsequenzen-nach-verzoegerter-pruefung-an-34630590.html with `newspaper`: Article `download()` failed with 404 Client Error: Not Found for url: https://www.stern.de/gesellschaft/regional/niedersachsen-bremen/abitur--ministerium-kuendigt-konsequenzen-nach-verzoegerter-pruefung-an-34630590.html on URL https://www.stern.de/gesellschaft/regional/niedersachsen-bremen/abitur--ministerium-kuendigt-konsequenzen-nach-verzoegerter-pruefung-an-34630590.html
Error scraping https://www.stern.de/gesellschaft/regional/sachsen-anhalt/zeugen-gesucht--giftangriff--loecher-in-75-jahre-alten-baum-gebohrt-34866714.html with `newspaper`: Article `download()` failed with 404 Client Error: Not Found for url: https://www.stern.de/gesellschaft/regional/sachsen-anhalt/zeugen-gesucht--giftangriff--loecher-in-75-jahre-alten-baum-gebohrt-34866714.html on URL https://www.stern.de/gesellschaft/regional/sachsen



Error scraping https://www.stern.de/gesellschaft/regional/hamburg-schleswig-holstein/bundeslaender--dgb-will-von-kuestenlaendern-gemeinsame-energiewende-politik-34339006.html with `newspaper`: Article `download()` failed with 404 Client Error: Not Found for url: https://www.stern.de/gesellschaft/regional/hamburg-schleswig-holstein/bundeslaender--dgb-will-von-kuestenlaendern-gemeinsame-energiewende-politik-34339006.html on URL https://www.stern.de/gesellschaft/regional/hamburg-schleswig-holstein/bundeslaender--dgb-will-von-kuestenlaendern-gemeinsame-energiewende-politik-34339006.html
Error scraping https://www.stern.de/gesellschaft/regional/niedersachsen-bremen/verden--junger-wolf-bei-achim-von-zug-ueberfahren-34190370.html with `newspaper`: Article `download()` failed with 404 Client Error: Not Found for url: https://www.stern.de/gesellschaft/regional/niedersachsen-bremen/verden--junger-wolf-bei-achim-von-zug-ueberfahren-34190370.html on URL https://www.stern.de/gesellschaft/regional/n



Error scraping https://www.dw.com/sw/maporomoko-ya-udongo-yaua-zaidi-ya-watu-30-afghanistan/a-68822980 with `newspaper`: Article `download()` failed with HTTPSConnectionPool(host='www.dw.com', port=443): Read timed out. (read timeout=7) on URL https://www.dw.com/sw/maporomoko-ya-udongo-yaua-zaidi-ya-watu-30-afghanistan/a-68822980
Error scraping https://www.stern.de/gesellschaft/regional/baden-wuerttemberg/gesundheit--landgericht-weist-impfschadenklage-gegen-biontech-ab-34262014.html with `newspaper`: Article `download()` failed with 404 Client Error: Not Found for url: https://www.stern.de/gesellschaft/regional/baden-wuerttemberg/gesundheit--landgericht-weist-impfschadenklage-gegen-biontech-ab-34262014.html on URL https://www.stern.de/gesellschaft/regional/baden-wuerttemberg/gesundheit--landgericht-weist-impfschadenklage-gegen-biontech-ab-34262014.html
Error scraping https://www.stern.de/politik/deutschland/elbtower--ren%C3%A9-benko-droht-zu-scheitern--der-alte-deal-koennte-olaf-scholz



Error scraping https://www.stern.de/gesellschaft/regional/hessen/dfb-pokal--frankfurt-mit-pokalpruefung-in-braunschweig-34989738.html with `newspaper`: Article `download()` failed with 404 Client Error: Not Found for url: https://www.stern.de/gesellschaft/regional/hessen/dfb-pokal--frankfurt-mit-pokalpruefung-in-braunschweig-34989738.html on URL https://www.stern.de/gesellschaft/regional/hessen/dfb-pokal--frankfurt-mit-pokalpruefung-in-braunschweig-34989738.html
Error scraping https://www.stern.de/news/fussball-bundesliga--leverkusen-zurueck-in-der-erfolgsspur---erster-vfb-sieg-35064336.html with `newspaper`: Article `download()` failed with 404 Client Error: Not Found for url: https://www.stern.de/news/fussball-bundesliga--leverkusen-zurueck-in-der-erfolgsspur---erster-vfb-sieg-35064336.html on URL https://www.stern.de/news/fussball-bundesliga--leverkusen-zurueck-in-der-erfolgsspur---erster-vfb-sieg-35064336.html
Error scraping https://www.stern.de/lifestyle/einschaltquoten--roland-ka

KeyboardInterrupt: 