In [1]:
!pip install --quiet bs4
!pip install --quiet selenium
!pip install --quiet openpyxl

In [64]:
from bs4 import BeautifulSoup
from selenium import webdriver
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, col, avg, concat, lit, from_csv
from pyspark.sql.types import StructType, StructField, FloatType, LongType, StringType, IntegerType, DateType
from time import sleep
from datetime import datetime, timedelta
import pandas as pd
import re
import time

In [55]:
def scrape_data(year=2021):
    """Scrape the Peilingwijzer "latest figures" page for the poll download.

    Loads the page through the remote Selenium hub, then extracts the
    fieldwork start date from the ``PublicationInfo`` element and the link to
    the Excel download from the ``downloads`` element.

    Args:
        year: Year appended to the scraped date before parsing. The regex
            capture is parsed as ``%d-%m`` plus this year; the default keeps
            the value that used to be hard-coded.

    Returns:
        dict with two string entries:
            ``download_link``: href of the Excel download anchor.
            ``publication_date``: fieldwork start date as ``YYYY-MM-DD``.

    Raises:
        ValueError: if the fieldwork date or the Excel link cannot be found
            on the page, or if the scraped date does not parse.
    """
    options = webdriver.ChromeOptions()
    # Pass the flag directly; the `options.headless` setter is deprecated.
    options.add_argument("--headless")
    options.add_argument("--window-size=1920,1200")
    driver = webdriver.Remote("http://selenium:4444/wd/hub", options=options)
    try:
        driver.get("https://peilingwijzer.tomlouwerse.nl/p/laatste-cijfers.html")
        page_source = driver.page_source
    finally:
        # Always release the remote browser session, even if the page load fails.
        driver.quit()

    soup = BeautifulSoup(page_source, "html.parser")

    publication_info = soup.find(id="PublicationInfo")
    publications = publication_info.find("div") if publication_info is not None else None
    # The original searched str(pub), a name that is not defined in this
    # function; the element to search is `publications`.
    publication_data = re.search("start veldwerk: (.*), einde veldwerk", str(publications))
    if publication_data is None:
        raise ValueError("Could not find the fieldwork start date on the page")
    publication_date = publication_data.group(1) + "-" + str(year)
    publication_date_obj = datetime.strptime(publication_date, "%d-%m-%Y")
    publication_date = publication_date_obj.strftime("%Y-%m-%d")

    downloads = soup.find(id="downloads")
    find_links = []
    if downloads is not None:
        find_links = downloads.find_all(
            "a", string="hier te downloaden (Excel-formaat)")
    if not find_links:
        raise ValueError("Could not find the Excel download link on the page")
    download_link = find_links[0]["href"]

    return {
        "download_link": download_link,
        "publication_date": publication_date
    }

In [59]:
def run_batch_pipe(download_link, publication_date):
    """Correlate poll seat counts with the past week's average sentiment.

    Reads the polls workbook with pandas, reads the ``avg_sentiment`` Kafka
    topic as a batch, keeps the sentiment rows whose ``window_end`` lies in
    the seven days up to ``publication_date``, left-joins the polls onto them
    by ``party`` and computes the correlation between ``sentiment`` and
    ``seats``.

    Args:
        download_link: Path or URL of the polls Excel file, passed to
            ``pandas.read_excel``.
        publication_date: Upper bound of the sentiment window, formatted as
            ``YYYY-MM-DD``.

    Returns:
        The value of ``DataFrame.stat.corr("sentiment", "seats")`` on the
        joined data.

    Raises:
        ValueError: if ``publication_date`` does not match ``%Y-%m-%d``.
    """
    pDF = pd.read_excel(download_link)

    # Sentiment rows are kept when date_min < window_end <= date_max.
    # Named date_min/date_max so the imported `window` function is not shadowed.
    publication_day = datetime.strptime(publication_date, "%Y-%m-%d")
    date_min = (publication_day - timedelta(days=7)).strftime("%Y-%m-%d")
    date_max = publication_day.strftime("%Y-%m-%d")

    sparkConf = SparkConf()
    sparkConf.setMaster("spark://spark-master:7077")
    sparkConf.setAppName("polls-pipeline")
    sparkConf.set("spark.driver.memory", "2g")
    sparkConf.set("spark.executor.cores", "1")
    sparkConf.set("spark.driver.cores", "1")

    spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
    try:
        sparkPolls = spark.createDataFrame(pDF)

        # NOTE(review): "Datum" is dropped without filtering on it; if the
        # workbook holds several dates per party, all of them take part in
        # the join below -- confirm this is intended.
        drop_poll_columns = ["Datum", "Percentage", "PercentageLaag", "PercentageHoog", "ZetelsLaag", "ZetelsHoog"]
        # Loop variable is `name`, not `col`, so the imported `col` function
        # is not shadowed inside the comprehension.
        keep_poll_columns = [name for name in sparkPolls.columns if name not in drop_poll_columns]
        sparkPolls = sparkPolls.select(keep_poll_columns)
        sparkPolls = sparkPolls.withColumnRenamed("Partij", "party").withColumnRenamed("Zetels", "seats")

        sentimentDataSchema = StructType([
            StructField("party", StringType(), True),
            StructField("window_start", StringType(), True),
            StructField("window_end", DateType(), True),
            StructField("sentiment", FloatType(), True)])

        sparkSentiment_raw = spark.read.format("kafka")\
                                  .option("kafka.bootstrap.servers", "kafka1:9093")\
                                  .option("subscribe", "avg_sentiment")\
                                  .load()

        sparkSentiment_lines = sparkSentiment_raw.selectExpr("CAST(value AS STRING)")
        # Alias the parsed struct so expanding it does not depend on the
        # auto-generated column name "from_csv(value)".
        sparkSentiment_csv = sparkSentiment_lines.select(
            from_csv(sparkSentiment_lines.value,
                     sentimentDataSchema.simpleString()).alias("sentiment_row"))
        sparkSentiment = sparkSentiment_csv.select("sentiment_row.*")
        sparkSentiment = sparkSentiment\
            .where(sparkSentiment.window_end <= date_max)\
            .where(sparkSentiment.window_end > date_min)

        combined = sparkSentiment.join(sparkPolls, "party", how="left")
        correlation = combined.stat.corr("sentiment", "seats")
    finally:
        # Release the Spark session even when a step above fails.
        spark.stop()

    return correlation

In [65]:
# Scrape the latest poll, compute its correlation with the sentiment stream,
# print it, then wait one week (7 * 24 * 60 * 60 seconds) before repeating.
while True:
    scraped_data = scrape_data()
    correlation = run_batch_pipe(
        download_link=scraped_data["download_link"],
        publication_date=scraped_data["publication_date"],
    )
    print(correlation)
    time.sleep(7 * 24 * 60 * 60)

0.32561668092191476


AttributeError: 'NoneType' object has no attribute 'sc'