In [None]:
from bs4 import BeautifulSoup
from selenium import webdriver

def scrape_data():
    """Return the download link of the Excel file on the Peilingwijzer page.

    Opens the "laatste cijfers" page in a Chrome session on the remote Selenium
    server, parses the rendered HTML with BeautifulSoup and returns the ``href``
    of the first link inside the element with id ``downloads`` whose text is
    exactly "hier te downloaden (Excel-formaat)".

    Returns:
        The value of the matching link's ``href`` attribute.

    Raises:
        LookupError: if the page has no element with id ``downloads``.
        IndexError: if that element contains no matching download link.
    """
    options = webdriver.ChromeOptions()
    # Use the command-line switch rather than the `headless` property: the
    # property was deprecated and later removed in newer Selenium 4 releases.
    options.add_argument("--headless")
    options.add_argument("--window-size=1920,1200")
    driver = webdriver.Remote("http://selenium:4444/wd/hub", options=options)

    # Always release the remote browser session, even when loading the page fails.
    try:
        driver.get("https://peilingwijzer.tomlouwerse.nl/p/laatste-cijfers.html")
        soup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        driver.quit()

    downloads = soup.find(id="downloads")
    if downloads is None:
        raise LookupError('no element with id "downloads" found on the page')

    find_links = downloads.find_all(
        "a", string="hier te downloaden (Excel-formaat)")
    if not find_links:
        # Same exception type as indexing an empty result, but with a usable message.
        raise IndexError("no Excel download link found in the downloads section")

    return find_links[0]["href"]

In [25]:
# Imports
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, col, avg, concat, lit, from_csv
from pyspark.sql.types import StructType, StructField, FloatType, LongType, StringType, IntegerType
from time import sleep
from scrape_data import scrape_data
import pandas as pd

# Look up the download link of the poll spreadsheet and load it with pandas
scraped_file = scrape_data()
pDF = pd.read_excel(scraped_file)

# Spark configuration: master URL, application name and driver/executor resources
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("test")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# Obtain the Spark session for this configuration
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
# Columns of the poll sheet that are discarded
drop_poll_columns = ["Datum", "Percentage", "PercentageLaag", "PercentageHoog", "ZetelsLaag", "ZetelsHoog"]

# Convert the pandas frame to Spark, keep only the remaining columns and
# rename "Partij" to "party" so it matches the join key of the sentiment frame
polls_full = spark.createDataFrame(pDF)
kept_columns = [name for name in polls_full.columns if name not in drop_poll_columns]
sparkPolls = polls_full.select(kept_columns).withColumnRenamed("Partij", "party")

sparkPolls.printSchema()
sparkPolls.show()

# Hard-coded (party, sentiment) pairs used to build a small sentiment frame
sentiment_rows = [("VVD", 2), ("D66", 1), ("PVV", 0)]
sentiment_columns = ["party", "sentiment"]

sparkSentiment = spark.createDataFrame(sentiment_rows, sentiment_columns)
sparkSentiment.printSchema()
sparkSentiment.show()

# Attach the polled seat count to every party that has a sentiment score.
# The left join keeps all sentiment rows, including parties absent from the polls.
combined = sparkSentiment.join(sparkPolls, "party", how="left")
combined.printSchema()
combined.show()

# Correlation between sentiment and polled seats. The value is stored and printed
# explicitly: a bare expression in the middle of a cell is evaluated but never shown.
# The column is referenced with the casing it has in the schema ("Zetels"), so the
# lookup does not depend on case-insensitive column resolution.
sentiment_seat_corr = combined.stat.corr("sentiment", "Zetels")
print(f"Correlation between sentiment and seats: {sentiment_seat_corr}")

# Fields of one CSV record read from the "avg_sentiment" topic; all are nullable
csv_fields = [
    ("count", FloatType()),
    ("party", StringType()),
    ("sentiment", FloatType()),
    ("window", StringType()),
]
dataSchema = StructType([StructField(name, dtype, True) for name, dtype in csv_fields])

# Batch read of the topic from the Kafka broker
kafka_reader = spark.read.format("kafka")
kafka_reader = kafka_reader.option("kafka.bootstrap.servers", "kafka1:9093")
kafka_reader = kafka_reader.option("subscribe", "avg_sentiment")
df_raw = kafka_reader.load()

# Cast the message value to a string and parse it as CSV into a single struct column
lines = df_raw.selectExpr("CAST(value AS STRING)")
parsed_value = from_csv(lines.value, dataSchema.simpleString())
df = lines.select(parsed_value)
df.printSchema()

# Expand the struct into one top-level column per CSV field
df_gs = df.select(col("from_csv(value).*"))
df_gs.printSchema()
df_gs.show()

root
 |-- party: string (nullable = true)
 |-- Zetels: long (nullable = true)

+------+------+
| party|Zetels|
+------+------+
|   VVD|    34|
|   D66|    18|
|   PVV|    18|
|   CDA|     9|
|    SP|     8|
|  PvdA|     9|
|    GL|     9|
|   FvD|     5|
|  PvdD|     7|
|    CU|     6|
|  Volt|     7|
|  JA21|     6|
|   SGP|     3|
|  Denk|     3|
|50PLUS|     1|
|   BBB|     6|
|  BIJ1|     1|
+------+------+

root
 |-- party: string (nullable = true)
 |-- sentiment: long (nullable = true)

+-----+---------+
|party|sentiment|
+-----+---------+
|  VVD|        2|
|  D66|        1|
|  PVV|        0|
+-----+---------+

root
 |-- party: string (nullable = true)
 |-- sentiment: long (nullable = true)
 |-- Zetels: long (nullable = true)

+-----+---------+------+
|party|sentiment|Zetels|
+-----+---------+------+
|  D66|        1|    18|
|  VVD|        2|    34|
|  PVV|        0|    18|
+-----+---------+------+

root
 |-- from_csv(value): struct (nullable = true)
 |    |-- count: float (nulla

In [26]:
# Stop the Spark session that was created earlier in the notebook
spark.stop()

In [None]:
# Parse the "avg_sentiment" topic again, independently of earlier cells.
# The preceding cell calls spark.stop(), and the earlier df_raw belongs to that
# stopped session, so a live session is obtained and the topic is read again here.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
df_raw = (
    spark.read.format("kafka")
    .option("kafka.bootstrap.servers", "kafka1:9093")
    .option("subscribe", "avg_sentiment")
    .load()
)

# Cast the message value to a string and parse it as CSV into a single struct column
lines = df_raw.selectExpr("CAST(value AS STRING)")
df = lines.select(from_csv(lines.value, dataSchema.simpleString()))
df.printSchema()

# Expand the struct into one top-level column per CSV field
df_gs = df.select(col("from_csv(value).*"))
df_gs.printSchema()
df_gs.show()

# Leave the notebook with the session stopped, as it was before this cell ran
spark.stop()