### Retrive hourly production data

In [11]:
import requests
import pandas as pd
from datetime import datetime, timedelta
import time

# === API SETTINGS ===
BASE_URL = "https://api.elhub.no/energy-data/v0/price-areas"
DATASET = "PRODUCTION_PER_GROUP_MBA_HOUR"

#== FUNCTION TO FORMAT DATES ===
def format_date(dt_obj):
    """Formats datetime with timezone offset for Elhub (%2B02:00).""" 
    return dt_obj.strftime("%Y-%m-%dT%H:%M:%S%%2B02:00") # formatted for URL encoding. The time is always in +02:00 for MBA data. Which is summertime all year round.

all_records = []

# === FETCH EACH MONTH OF 2021 ===
for month in range(1, 13):
    start = datetime(2021, month, 1)
    next_month = (start + timedelta(days=32)).replace(day=1)
    end = next_month - timedelta(seconds=1)

    start_str = format_date(start)
    end_str = format_date(end)

    url = f"{BASE_URL}?dataset={DATASET}&startDate={start_str}&endDate={end_str}"
    print(f"=== Fetching {start.date()} → {end.date()} ===")

    response = requests.get(url)
    if response.status_code != 200:
        print(f"❌ Error {response.status_code}")
        continue

    data = response.json()
    month_records = []

    for entry in data.get("data", []):
        attrs = entry.get("attributes", {})
        recs = attrs.get("productionPerGroupMbaHour", [])
        # Filter out placeholders. Filtering them out ensures your DataFrame contains only meaningful production data (real energy values per group and price area).
        recs = [r for r in recs if r.get("productionGroup") != "*"]
        month_records.extend(recs)

    all_records.extend(month_records)
    print(f"✅ {len(month_records)} records added")

    # Be nice to the API
    time.sleep(1)

print(f"\nTotal records collected: {len(all_records)}")


=== Fetching 2021-01-01 → 2021-01-31 ===
✅ 17856 records added
=== Fetching 2021-02-01 → 2021-02-28 ===
✅ 16128 records added
=== Fetching 2021-03-01 → 2021-03-31 ===
✅ 17832 records added
=== Fetching 2021-04-01 → 2021-04-30 ===
✅ 17280 records added
=== Fetching 2021-05-01 → 2021-05-31 ===
✅ 17856 records added
=== Fetching 2021-06-01 → 2021-06-30 ===
✅ 17976 records added
=== Fetching 2021-07-01 → 2021-07-31 ===
✅ 18600 records added
=== Fetching 2021-08-01 → 2021-08-31 ===
✅ 18600 records added
=== Fetching 2021-09-01 → 2021-09-30 ===
✅ 18000 records added
=== Fetching 2021-10-01 → 2021-10-31 ===
✅ 18625 records added
=== Fetching 2021-11-01 → 2021-11-30 ===
✅ 18000 records added
=== Fetching 2021-12-01 → 2021-12-31 ===
✅ 18600 records added

Total records collected: 215353


In [12]:
# === CONVERT TO DATAFRAME ===
df = pd.DataFrame(all_records)

# Convert data types
df['startTime'] = pd.to_datetime(df['startTime'], utc=True) # Ensure UTC timezone
df['endTime'] = pd.to_datetime(df['endTime'], utc=True)
df['quantityKwh'] = pd.to_numeric(df['quantityKwh'], errors='coerce')

# Keep only relevant columns
df = df[['priceArea', 'productionGroup', 'startTime', 'quantityKwh']]

# Sort by time for readability
df.sort_values('startTime', inplace=True)

# Optional: set startTime as index
df.set_index('startTime', inplace=True)

print(df.info())
# Print first 50 rows
print(df.head(50))
# Shape of DataFrame
print(f"DataFrame shape: {df.shape}")


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 215353 entries, 2020-12-31 23:00:00+00:00 to 2021-12-31 22:00:00+00:00
Data columns (total 3 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   priceArea        215353 non-null  object 
 1   productionGroup  215353 non-null  object 
 2   quantityKwh      215353 non-null  float64
dtypes: float64(1), object(2)
memory usage: 6.6+ MB
None
                          priceArea productionGroup  quantityKwh
startTime                                                       
2020-12-31 23:00:00+00:00       NO1           hydro  2507716.800
2020-12-31 23:00:00+00:00       NO2           other        4.346
2020-12-31 23:00:00+00:00       NO5           solar        3.720
2020-12-31 23:00:00+00:00       NO2            wind      706.206
2020-12-31 23:00:00+00:00       NO3           hydro  2836774.000
2020-12-31 23:00:00+00:00       NO4            wind   381065.000
2020-12-31 23:00:00+00:00       

### Spark and cassandra

In [13]:
import os
import subprocess
import time
import socket
import pandas as pd
import datetime
from pyspark.sql import SparkSession
from cassandra.cluster import Cluster

# 1️⃣ Set Java environment for PySpark
os.environ["JAVA_HOME"] = "/Library/Java/JavaVirtualMachines/microsoft-11.jdk/Contents/Home"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]

# 2️⃣ Start Cassandra container if not running
container_name = "cassandra"
status = subprocess.run(
    ["docker", "inspect", "-f", "{{.State.Running}}", container_name],
    capture_output=True, text=True
)

if "true" not in status.stdout:
    print("Starting Cassandra container...")
    subprocess.run(["docker", "rm", "-f", container_name])
    subprocess.run([
        "docker", "run", "-d",
        "--name", container_name,
        "-p", "9042:9042",
        "cassandra:4.1"
    ])
else:
    print("Cassandra container is already running.")

Cassandra container is already running.


In [14]:
# 4️⃣ Connect to Cassandra using Python driver (protocol v4)
cluster = Cluster(['127.0.0.1'], port=9042, protocol_version=4)
session = cluster.connect()

# 5️⃣ Create keyspace and table
session.execute("""
CREATE KEYSPACE IF NOT EXISTS elhub
WITH REPLICATION = { 'class': 'SimpleStrategy', 'replication_factor': 1 };
""")
session.set_keyspace('elhub')

session.execute("DROP TABLE IF EXISTS production_data;")
session.execute("""
CREATE TABLE IF NOT EXISTS production_data (
    starttime timestamp,
    pricearea text,
    productiongroup text,
    quantitykwh double,
    PRIMARY KEY ((pricearea), starttime, productiongroup)
);
""")

<cassandra.cluster.ResultSet at 0x12a29d950>

In [15]:
df = df.reset_index()  # bring startTime back as a column
df.columns = [c.lower() for c in df.columns]  # all lowercase


# Shape of DataFrame
print(f"DataFrame shape: {df.shape}")

DataFrame shape: (215353, 4)


In [16]:
# ...existing code...
import time
import socket
import subprocess
from pyspark.sql import SparkSession

def wait_for_cassandra(host='127.0.0.1', port=9042, timeout=180):
    start = time.time()
    while time.time() - start < timeout:
        try:
            with socket.create_connection((host, port), timeout=3):
                print(f"Cassandra reachable at {host}:{port}")
                return
        except OSError:
            print("Waiting for Cassandra...", end="\r")
            time.sleep(2)
    # timed out — print recent container logs to help debug
    try:
        print("\n=== Docker logs (last 200 lines) ===")
        subprocess.run(["docker", "logs", "cassandra", "--tail", "200"], check=False)
    except Exception as e:
        print("Could not fetch docker logs:", e)
    raise TimeoutError(f"Cassandra not reachable at {host}:{port} after {timeout}s")

# ensure Cassandra is reachable from host (use 127.0.0.1 if you started container with -p 9042:9042)
wait_for_cassandra('127.0.0.1', 9042, timeout=180)

# restart Spark cleanly and use localhost as contact point
try:
    spark.stop()
except Exception:
    pass

spark = SparkSession.builder \
    .appName("SparkCassandraApp") \
    .master("local[*]") \
    .config("spark.jars.packages", "com.datastax.spark:spark-cassandra-connector_2.12:3.5.1") \
    .config("spark.cassandra.connection.host", "127.0.0.1") \
    .config("spark.cassandra.connection.port", "9042") \
    .getOrCreate()

# read table
spark_df = spark.read \
    .format("org.apache.spark.sql.cassandra") \
    .options(keyspace="elhub", table="production_data") \
    .load() \
    .select("pricearea", "productiongroup", "starttime", "quantitykwh")

spark_df.show(10)
print(f"Total rows extracted: {spark_df.count()}")
# ...existing code...

Cassandra reachable at 127.0.0.1:9042
+---------+---------------+---------+-----------+
|pricearea|productiongroup|starttime|quantitykwh|
+---------+---------------+---------+-----------+
+---------+---------------+---------+-----------+

Total rows extracted: 0


In [17]:
spark_df = spark.read \
    .format("org.apache.spark.sql.cassandra") \
    .options(keyspace="elhub", table="production_data") \
    .load() \
    .select("pricearea", "productiongroup", "starttime", "quantitykwh")

spark_df.show(10)
print(f"Total rows extracted: {spark_df.count()}")


+---------+---------------+---------+-----------+
|pricearea|productiongroup|starttime|quantitykwh|
+---------+---------------+---------+-----------+
+---------+---------------+---------+-----------+

Total rows extracted: 0


In [18]:
print("Checking Spark JARs:")
print(spark.sparkContext._jsc.sc().listJars())


Checking Spark JARs:
Vector(spark://mac.home:58906/jars/org.apache.cassandra_java-driver-mapper-runtime-4.18.1.jar, spark://mac.home:58906/jars/org.apache.cassandra_java-driver-query-builder-4.18.1.jar, spark://mac.home:58906/jars/com.thoughtworks.paranamer_paranamer-2.8.jar, spark://mac.home:58906/jars/org.reactivestreams_reactive-streams-1.0.3.jar, spark://mac.home:58906/jars/com.datastax.spark_spark-cassandra-connector_2.12-3.5.1.jar, spark://mac.home:58906/jars/com.datastax.spark_spark-cassandra-connector-driver_2.12-3.5.1.jar, spark://mac.home:58906/jars/com.typesafe_config-1.4.1.jar, spark://mac.home:58906/jars/org.apache.cassandra_java-driver-core-shaded-4.18.1.jar, spark://mac.home:58906/jars/org.scala-lang.modules_scala-collection-compat_2.12-2.11.0.jar, spark://mac.home:58906/jars/org.scala-lang_scala-reflect-2.12.19.jar, spark://mac.home:58906/jars/com.datastax.oss_native-protocol-1.5.1.jar, spark://mac.home:58906/jars/org.apache.commons_commons-lang3-3.10.jar, spark://mac.h

In [19]:
import socket
sock = socket.create_connection(("127.0.0.1", 9042), timeout=5)
print("Cassandra reachable from host!")
sock.close()


Cassandra reachable from host!


### Plots

In [20]:
# A pie chart for the total production of the year from a chosen price area, where each piece of the pie is one of the production groups
import plotly.express as px

# Choose your price area
chosen_pricearea = "NO1"

# Filter Spark DataFrame for that price area
df_selected = spark_df.filter(spark_df.pricearea == chosen_pricearea)

# Aggregate total production per production group for the entire year
df_pie = (
    df_selected.groupBy("productiongroup")
    .sum("quantitykwh")
    .toPandas()
)

# Rename column for readability
df_pie.rename(columns={"sum(quantitykwh)": "total_quantitykwh"}, inplace=True)

# Create interactive pie chart
fig = px.pie(
    df_pie,
    names="productiongroup",
    values="total_quantitykwh",
    title=f"Total Production by Group – Price Area {chosen_pricearea} (2021)"
)

fig.show()


In [21]:
# A pie chart for the total production of the year from a chosen price area, where each piece of the pie is one of the production groups.

# 1️⃣ Choose price area
chosen_pricearea = "NO1"

# 2️⃣ Filter Spark DataFrame for chosen area and January 2021
df_january = (
    spark_df.filter(spark_df.pricearea == chosen_pricearea)
    .filter((spark_df.starttime >= '2021-01-01') & (spark_df.starttime < '2021-02-01'))
)

# 3️⃣ Convert to Pandas for Plotly
df_january_pd = df_january.toPandas()

# 4️⃣ Plot 1 — All production groups
fig_all = px.line(
    df_january_pd,
    x="starttime",
    y="quantitykwh",
    color="productiongroup",
    title=f"Hourly Production – January 2021 (Price Area {chosen_pricearea}, All Groups)",
    labels={"starttime": "Time", "quantitykwh": "Production (kWh)", "productiongroup": "Group"}
)
fig_all.update_layout(template="plotly_white")
fig_all.show()

# 5️⃣ Extract color mapping used by Plotly in the first figure
color_map = {
    trace.name: trace.line.color for trace in fig_all.data
}

# 6️⃣ Filter out hydro
df_no_hydro = df_january_pd[df_january_pd["productiongroup"].str.lower() != "hydro"]

# 7️⃣ Create consistent color map for remaining groups
consistent_colors = {
    group: color_map[group]
    for group in df_no_hydro["productiongroup"].unique()
    if group in color_map
}

# 8️⃣ Plot 2 — Without hydro, using same colors
fig_no_hydro = px.line(
    df_no_hydro,
    x="starttime",
    y="quantitykwh",
    color="productiongroup",
    title=f"Hourly Production – January 2021 (Price Area {chosen_pricearea}, Without Hydro)",
    labels={"starttime": "Time", "quantitykwh": "Production (kWh)", "productiongroup": "Group"},
    color_discrete_map=consistent_colors
)
fig_no_hydro.update_layout(template="plotly_white")
fig_no_hydro.show()


### Insert the Spark-extracted data into your MongoDB.

In [None]:
from pymongo.mongo_client import MongoClient

# MongoDB Atlas connection
uri = ("mongodb+srv://{}:{}@cluster0.qwrlccf.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0")

# Read credentials
USR, PWD = open('/Users/sarahorte/Documents/IND320/Personlig/No_sync/MongoDB').read().splitlines()

# Connect
client = MongoClient(uri.format(USR, PWD))

# Create your own database and collection
database = client['elhub']
collection = database['production_data']

ModuleNotFoundError: No module named 'pymongo'

25/10/21 13:01:35 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1068929 ms exceeds timeout 120000 ms
25/10/21 13:01:35 WARN SparkContext: Killing executors is not supported by current scheduler.
25/10/21 13:19:26 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$

In [None]:
# Convert Spark DataFrame to Pandas
df_mongo = df_selected.toPandas()

# Convert to list of dictionaries
records = df_mongo.to_dict("records")

# Insert all records (or in batches if very large)
collection.insert_many(records)


<pymongo.results.InsertManyResult at 0x15b6f2080>

In [None]:
print("Documents inserted:", collection.count_documents({}))
print("Example document:", collection.find_one())
print("Distinct price areas:", collection.distinct("pricearea"))


Documents inserted: 131400
Example document: {'_id': ObjectId('68f3c59253b0d15fa463cc84'), 'pricearea': 'NO1', 'starttime': datetime.datetime(2021, 1, 1, 0, 0), 'productiongroup': 'hydro', 'quantitykwh': 2507716.8}
Distinct price areas: ['NO1']


25/10/21 09:25:44 WARN ChannelPool: [s0|/127.0.0.1:9042]  Error while opening new channel (ConnectionInitException: [s0|id: 0x1e6d38ca, L:/127.0.0.1:54360 - R:localhost/127.0.0.1:9042] Protocol initialization request, step 1 (STARTUP {CQL_VERSION=3.0.0, DRIVER_NAME=Apache Cassandra Java Driver, DRIVER_VERSION=4.18.1, CLIENT_ID=e0849d40-7446-4809-81cf-bf720c39f1c9, APPLICATION_NAME=Spark-Cassandra-Connector-local-1761031248134}): unexpected failure (com.datastax.oss.driver.api.core.connection.ClosedConnectionException: Unexpected error on channel))
25/10/21 09:25:44 WARN ChannelPool: [s0|/127.0.0.1:9042]  Error while opening new channel (ConnectionInitException: [s0|id: 0x3bf99641, L:/127.0.0.1:54361 - R:localhost/127.0.0.1:9042] Protocol initialization request, step 1 (STARTUP {CQL_VERSION=3.0.0, DRIVER_NAME=Apache Cassandra Java Driver, DRIVER_VERSION=4.18.1, CLIENT_ID=e0849d40-7446-4809-81cf-bf720c39f1c9, APPLICATION_NAME=Spark-Cassandra-Connector-local-1761031248134}): unexpected fa