In [1]:
#buat pandasrawclient yang bentuknya dah langsung table data 
import os 
from dotenv import load_dotenv
import requests
from google.oauth2 import service_account
import xmltodict
import json
import pandas as pd 
from requests import request
import pytz
from bs4 import BeautifulSoup
from typing import Dict
from google.oauth2 import service_account
from google.cloud import storage
from entsoe import EntsoeRawClient

import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import os
import sys
import re

import csv

from pyspark.sql.types import StructType, StructField, ArrayType, FloatType, BooleanType, TimestampType
from pyspark.sql.types import DoubleType, IntegerType, StringType, DataType
from pyspark.sql import functions as F
from pyspark.sql.window import Window

from entsoe import EntsoeRawClient
from entsoe import EntsoePandasClient

from pyspark.sql import SparkSession

# Load the playground env file (overriding values already set) and set the TZ
# environment variable to UTC.
load_dotenv('../playground/jupyter.env', verbose=True, override=True)
os.environ['TZ'] = 'UTC'

# ----------------
# INIT VARIABLES
# ----------------

# ENTSO-E settings
ENTSOE_URL = 'https://transparency.entsoe.eu/api'
security_token = os.getenv("ENTSOE_SECURITY_TOKEN")

# GCP settings: the service-account key file path is printed, then used to build
# the credentials object shared by the GCS helpers.
service_account_file = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
print(service_account_file)
credentials = service_account.Credentials.from_service_account_file(service_account_file)
gcs_bucket = os.getenv("GCP_GCS_BUCKET")

print(os.getenv("PYSPARK_PYTHON"))


/home/rafzul/.google/credentials/google_credentials.json
/usr/bin/python3.10


In [2]:
#setting up entsoe client
entsoe_client = EntsoeRawClient(api_key=security_token)

In [3]:


'''
----------------
SETTING UP FUNCTION CALLS 
----------------
'''

# upload data to GCS
def upload_blob_to_gcs(bucket_name, contents, destination_blob_name):
    """Upload an in-memory payload to a Google Cloud Storage object.

    The storage client is built from the module-level ``credentials``.

    Args:
        bucket_name: ID of the target GCS bucket.
        contents: The data held in memory that is written to the object.
        destination_blob_name: ID of the GCS object to write.
    """
    target_blob = (
        storage.Client(credentials=credentials)
        .bucket(bucket_name)
        .blob(destination_blob_name)
    )
    target_blob.upload_from_string(contents)


'''
----------------
EXTRACTION
----------------
'''
#for test, we'll be querying

# Test query parameters: a six-hour window in Europe/Berlin time, the area code and
# the dataset label. The empty-string variables below are not used in this cell.
start=pd.Timestamp('202101010000', tz='Europe/Berlin')
end=pd.Timestamp('202101010600', tz='Europe/Berlin')
country_code= 'DE_TENNET'
country_code_from=''
country_code_to=''
type_marketagreement_type=''
contract_marketagreement_type=''
label_data='total_generation'

# The window actually sent to the API is shifted one hour later than start/end.
# NOTE(review): the reason for the shift is not documented here — confirm it is intended.
actual_start = start + pd.Timedelta(hours=1)
actual_end = end + pd.Timedelta(hours=1)

# Fetch the raw response, parse it with xmltodict and serialise the resulting dict to JSON.
entsoe_data = entsoe_client.query_generation(country_code, start=actual_start, end=actual_end, psr_type=None)
entsoe_dict = xmltodict.parse(entsoe_data)
#if header is already correct, header = True. if not, header = entsoe_header_list
entsoe_json = json.dumps(entsoe_dict)
# try:

# except Exception as e:
#     print("An exception occurred:", e)


'''
----------------
LOAD
----------------
'''
#upload to GCS
# start/end are rebound here from Timestamps to "YYYYmmddHHMM" strings; the landing
# filename is built from the un-shifted start/end, not from actual_start/actual_end.
start = start.strftime("%Y%m%d%H%M")
end = end.strftime("%Y%m%d%H%M")
landing_filename=f"TEST_{label_data}__{country_code}__{start}__{end}.json"
upload_blob_to_gcs(bucket_name=gcs_bucket, contents=entsoe_json, destination_blob_name=landing_filename)

In [2]:
import os
from helpers.parsers import parse_datetimeindex, parse_generation_timeseries


In [3]:
# Parameters identifying the landed test file. start/end are the window bounds
# (Europe/Berlin) rendered as "YYYYmmddHHMM" strings.
label_data = 'total_generation'
country_code = 'DE_TENNET'
start, end = (
    pd.Timestamp(stamp, tz='Europe/Berlin').strftime("%Y%m%d%H%M")
    for stamp in ('202101010000', '202101010600')
)


In [5]:
from pyspark.sql import SparkSession
import os
from helpers.parsers import parse_datetimeindex, parse_generation_timeseries, _parse_resolution_to_timedelta
from pyspark.sql import functions as F
#TAROH DI DAG

#TAROH DI FILE CLASSNYA

class EntsoeRawTS:
    """Transform raw ENTSO-E JSON landed in GCS into a staged BigQuery table with Spark.

    The constructor loads ``/opt/airflow/.env`` and builds a Spark session configured
    with the GCS and BigQuery connector jars found under ``SPARK_HOME``.
    """

    # Maps a metrics label to the root document column of its landed JSON file.
    _DOCUMENT_COLUMNS = {"total_generation": "GL_MarketDocument"}

    def __init__(self):
        """Load the env file, remember the GCS bucket and create the Spark session."""
        load_dotenv("/opt/airflow/.env", verbose=True)
        SPARK_HOME = os.environ["SPARK_HOME"]
        SERVICE_ACCOUNT_FILE = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
        # Kept on the instance so _base_timeseries does not depend on a notebook-level global.
        self.gcs_bucket = os.environ.get("GCP_GCS_BUCKET")
        # set up the spark session
        self.spark = SparkSession.builder.appName("gcp_playground") \
            .config("spark.jars", f"{SPARK_HOME}/jars/gcs-connector-hadoop3-latest.jar, {SPARK_HOME}/jars/spark-bigquery-with-dependencies_2.13-0.27.1.jar") \
            .config("spark.sql.session.timeZone", "UTC") \
            .config("spark.hadoop.fs.AbstractFileSystem.gs.impl", "google.cloud.hadoop.fs.gcs.GoogleHadoopFS") \
            .config("spark.hadoop.google.cloud.auth.service.account.enable", "true") \
            .config("spark.hadoop.google.cloud.auth.service.account.json.keyfile", SERVICE_ACCOUNT_FILE) \
            .getOrCreate()

    def _base_timeseries(self, metrics_label: str, start: str, end: str, country_code: str):
        """Read the landed JSON file and split it into time-series and document-level frames.

        Args:
            metrics_label: Dataset label used in the landing filename; must be a key of
                ``_DOCUMENT_COLUMNS``.
            start: Start label used in the landing filename.
            end: End label used in the landing filename.
            country_code: Area code used in the landing filename.

        Returns:
            Tuple ``(df_ts, df_nonts)``: the exploded and flattened ``TimeSeries`` rows, and
            the flattened remaining document columns with their timestamp columns cast.

        Raises:
            ValueError: If ``metrics_label`` is not supported or no GCS bucket is configured.
        """
        # Validate before any Spark work instead of failing later on an unbound variable.
        document_column = self._DOCUMENT_COLUMNS.get(metrics_label)
        if document_column is None:
            raise ValueError(
                f"Unsupported metrics_label '{metrics_label}'; "
                f"expected one of {sorted(self._DOCUMENT_COLUMNS)}"
            )
        gcs_bucket = getattr(self, "gcs_bucket", None)
        if not gcs_bucket:
            raise ValueError("GCP_GCS_BUCKET is not set; cannot build the source path")

        # source path of the landed file
        landing_filename = f"TEST_{metrics_label}__{country_code}__{start}__{end}.json"
        path = f"gs://{gcs_bucket}/{landing_filename}"

        # create the dataframe; read errors propagate unchanged to the caller
        df_spark = (
            self.spark.read.format("json")
            .option("inferSchema", "true")
            .option("multiLine", "true")
            .option("timestampFormat", "yyyy-MM-dd'T'HH:mm'Z'")
            .load(path)
        )

        # clean the field names and expand the root document column
        df_spark = self._clean_columns_n_casttypes(df_spark, document_column)

        # separate df into ts and non ts df
        df_ts = df_spark.select("TimeSeries")
        df_nonts = df_spark.drop("TimeSeries")

        # non-TS dataframe: two flatten passes (one nesting level each), then cast the
        # timestamp columns to timestamp type
        df_nonts = self._flatten_struct(self._flatten_struct(df_nonts))
        df_nonts = df_nonts.withColumn("time_Period_timeInterval_end", F.to_timestamp("time_Period_timeInterval_end",  "yyyy-MM-dd'T'HH:mm'Z'")) \
                    .withColumn("time_Period_timeInterval_start", F.to_timestamp("time_Period_timeInterval_start",  "yyyy-MM-dd'T'HH:mm'Z'")) \
                    .withColumn("createdDateTime", F.to_timestamp("createdDateTime",  "yyyy-MM-dd'T'HH:mm:ss'Z'"))

        # TS dataframe: explode the TimeSeries array so every element becomes its own row,
        # expand the struct, then flatten the nested structs (two passes)
        df_ts = df_ts.withColumn("TimeSeries", F.explode("TimeSeries"))
        df_ts = df_ts.select("TimeSeries.*")
        df_ts = self._flatten_struct(self._flatten_struct(df_ts))

        return df_ts, df_nonts

    ##
    ## BASE TIMESERIES UTILITIES

    def _clean_columns_n_casttypes(self, df, parent_column_name):
        """Clean the nested field names of ``parent_column_name`` and expand it to columns.

        Every dot in the column's type string becomes an underscore and ``@``/``#``
        characters are removed; the column is cast to that rewritten type and its fields
        are selected as the columns of the returned DataFrame.
        """
        df_schema = df.select(parent_column_name).dtypes[0][1]
        # Raw strings: '\.' in a plain literal is an invalid escape sequence.
        replacements = [(r'\.', '_'), (r'[@#]', '')]
        for old, new in replacements:
            df_schema = re.sub(old, new, df_schema)
        # The cast with the cleaned schema must happen before the fields are selected out.
        df = df.withColumn(parent_column_name, F.col(parent_column_name).cast(df_schema)).select(f"{parent_column_name}.*")
        return df

    def _flatten_struct(self, nested_struct_df):
        """Promote the fields of each top-level struct column to ``<column>_<field>`` columns.

        Non-struct columns are kept first, unchanged. One nesting level is flattened per call.
        """
        flat_cols = [c[0] for c in nested_struct_df.dtypes if c[1][:6] != 'struct']
        nested_struct_cols = [c[0] for c in nested_struct_df.dtypes if c[1][:6] == 'struct']
        flat_df = nested_struct_df.select(flat_cols + [F.col(f"{nc}.{c}").alias(f"{nc}_{c}") for nc in nested_struct_cols for c in nested_struct_df.select(f"{nc}.*").columns])
        return flat_df

    def _stage_to_bq(self, df_spark):
        """Append ``df_spark`` to the BigQuery staging table via the temporary GCS bucket."""
        (
            df_spark.write.format("bigquery")
            .option("project", "rafzul-analytics-1009")
            .option("temporaryGcsBucket", "entsoe_temp_1009")
            .mode("append")
            .save("rafzul-analytics-1009.entsoe_playground.TEST_total_generation_staging")
        )

    ##
    ## BASE TIMESERIES UTILITIES END
    ##----------------------------------
    ## TRANSFORM METHOD
    ##

    def transform_generation(self, metrics_label: str, start: str, end: str, country_code: str, **params):
        """Build the generation table for one landed file and append it to BigQuery.

        Each ``TimeSeries`` entry is parsed by ``parse_generation_timeseries`` and
        inner-joined onto the datetime index on ``position``.

        Args:
            metrics_label: Dataset label of the landed file.
            start: Start label of the landed file.
            end: End label of the landed file.
            country_code: Area code of the landed file.
            **params: Accepted for interface compatibility; not used by this method.
        """
        # get the time-series and document-level dataframes
        df_ts, df_nonts = self._base_timeseries(metrics_label, start, end, country_code)

        # per_plant & include_eic are placeholder defaults
        per_plant = False
        include_eic = False

        # The parsed resolution is not used here; the call is kept so a resolution the
        # helper rejects still surfaces before the joins
        # (NOTE(review): assumes the imported helper raises on unknown codes — confirm).
        resolution_col = df_ts.select(F.col("Period_resolution")).collect()[0][0]
        _parse_resolution_to_timedelta(resolution_col)

        # build the datetime index once so it is not recomputed for every series
        all_df = parse_datetimeindex(self.spark, df_ts, df_nonts, tz=None)

        # columns specific to the generation transform, collected to the driver
        periods_col = df_ts.select(F.col("Period_point")).collect()
        psrtype_col = df_ts.select(F.col("MktPSRType_psrType")).collect()
        metric_col = df_ts.select(F.col("inBiddingZone_Domain_mRID_text")).collect()

        # one joined dataframe per TimeSeries entry
        for entry in range(len(periods_col)):
            ts_data = parse_generation_timeseries(self.spark, entry, periods_col, psrtype_col, metric_col, per_plant=per_plant, include_eic=include_eic)
            all_df = all_df.join(ts_data, ["position"], how="inner")
        all_df = all_df.orderBy(F.asc("position")).drop("position")

        # stage data to bigquery
        self._stage_to_bq(all_df)
        print("MANTEP!!!!")
        
# metrics_methods = {
#         "total_generation": transform_generation    
#     }

def main(metrics_label: str, start: str, end: str, country_code: str, **params):
    """Run the generation transform for one landed file.

    Args:
        metrics_label: Dataset label of the landed file.
        start: Start label of the landed file.
        end: End label of the landed file.
        country_code: Area code of the landed file.
        **params: Extra keyword arguments, forwarded to
            ``EntsoeRawTS.transform_generation`` instead of being silently dropped.
    """
    entsoe = EntsoeRawTS()
    entsoe.transform_generation(metrics_label, start, end, country_code, **params)
    

# Run the transform for the window defined earlier, echoing the start/end labels first.
metrics_label = label_data
print(start, end, sep="\n")
main(metrics_label, start, end, country_code)
        

202101010000
202101010600


                                                                                

MANTEP!!!!


In [None]:

# path ="/home/rafzul/projects/entsoe-pipelines/sample.xml"

#coba spark gcs connector
#setup sparksession for entry point - COBA GCS CONNECTOR
####
# DAG INITIALIZATION
####

# load env variables
load_dotenv("/opt/airflow/.env", verbose=True)
    
#pasang di DAG
gcs_bucket = os.environ.get("GCP_GCS_BUCKET")    

#####
# Class for Entsoe Data Initialization
#####
# load env variables
load_dotenv("/opt/airflow/.env", verbose=True)

def sparksession_init():
    """Build (or reuse) the ``gcp_playground`` Spark session with GCS and BigQuery connectors.

    ``SPARK_HOME`` is read from the environment to locate the connector jars, and the
    module-level ``service_account_file`` is used as the service-account JSON key file.

    Returns:
        The session returned by ``getOrCreate()``.
    """
    jars_dir = f"{os.environ['SPARK_HOME']}/jars"
    connector_jars = ", ".join(
        [
            f"{jars_dir}/gcs-connector-hadoop3-latest.jar",
            f"{jars_dir}/spark-bigquery-with-dependencies_2.13-0.27.1.jar",
        ]
    )
    spark_settings = {
        "spark.jars": connector_jars,
        "spark.sql.session.timeZone": "UTC",
        "spark.hadoop.fs.AbstractFileSystem.gs.impl": "google.cloud.hadoop.fs.gcs.GoogleHadoopFS",
        "spark.hadoop.google.cloud.auth.service.account.enable": "true",
        "spark.hadoop.google.cloud.auth.service.account.json.keyfile": service_account_file,
    }
    builder = SparkSession.builder.appName("gcp_playground")
    for setting_key, setting_value in spark_settings.items():
        builder = builder.config(setting_key, setting_value)
    return builder.getOrCreate()


def base_getraw_timeseries(metrics_label, start, end, country_code, gcs_bucket):
    """Read the landed raw JSON file for one query window from GCS into a Spark DataFrame.

    Args:
        metrics_label: Dataset label used in the landing filename.
        start: Start label used in the landing filename.
        end: End label used in the landing filename.
        country_code: Area code used in the landing filename.
        gcs_bucket: Name of the GCS bucket holding the landed file.

    Returns:
        The DataFrame read from ``gs://<gcs_bucket>/<landing filename>``. Read errors
        propagate to the caller instead of being swallowed.
    """
    # load env variables
    load_dotenv("/opt/airflow/.env", verbose=True)

    # Build the source path from the arguments (same pattern the upload step uses)
    # rather than from a notebook-level `landing_filename`.
    landing_filename = f"TEST_{metrics_label}__{country_code}__{start}__{end}.json"
    path = f"gs://{gcs_bucket}/{landing_filename}"
    print(path)

    # getOrCreate() inside sparksession_init reuses an already running session.
    spark = sparksession_init()
    return (
        spark.read.format("json")
        .option("inferSchema", "true")
        .option("multiLine", "true")
        .load(path)
    )

# Draft, cell-level copy of the generation pipeline.
# NOTE(review): df_spark is never assigned at cell level in this cell (the read above
# happens inside base_getraw_timeseries), so this code relies on a value from another cell.
if label_data in ["total_generation"]:
    document_column = "GL_MarketDocument"
df_spark = clean_columns_n_casttypes(df_spark, document_column)

#separate df into ts and non ts df
df_ts = df_spark.select("TimeSeries")
df_nonts = df_spark.drop("TimeSeries")

#processing non TS dataframe
df_nonts = flatten_struct(flatten_struct(df_nonts))

#cast the timestamp column to timestamp type
df_nonts = df_nonts.withColumn("time_Period_timeInterval_end", F.to_timestamp("time_Period_timeInterval_end",  "yyyy-MM-dd'T'HH:mm'Z'")) \
            .withColumn("time_Period_timeInterval_start", F.to_timestamp("time_Period_timeInterval_start",  "yyyy-MM-dd'T'HH:mm'Z'")) \
            .withColumn("createdDateTime", F.to_timestamp("createdDateTime",  "yyyy-MM-dd'T'HH:mm:ss'Z'"))
df_nonts.printSchema()
df_nonts.show()

##processing TS dataframe
# explode the TimeSeries column so every element of the array becomes its own row
df_ts = df_ts.withColumn("TimeSeries", F.explode("TimeSeries"))
df_ts = df_ts.select("TimeSeries.*")
# flatten the nested structs (two passes), leaving the period data
df_ts = flatten_struct(flatten_struct(df_ts))
df_ts.printSchema()

#set the loop count
period_row = 2
per_plant = False
include_eic = False

# build the initial dataframe once so it is not recomputed repeatedly
# NOTE(review): elsewhere in this notebook parse_datetimeindex is called with the Spark
# session as its first argument — confirm the arguments of this call.
all_df = parse_datetimeindex(df_ts, df_nonts, tz=None)
all_df.show()

periods_col = df_ts.select(F.col("Period_point")).collect()
psrtype_col =  df_ts.select(F.col("MktPSRType_psrType")).collect()
metric_col = df_ts.select(F.col("inBiddingZone_Domain_mRID_text")).collect()

# parse every TimeSeries entry and inner-join it onto the index by position
for entry in range(len(periods_col)):
    ts_data = _parse_generation_timeseries(entry, periods_col, psrtype_col, metric_col, per_plant=per_plant, include_eic=include_eic)
    print("ts berhasil diparse")
    all_df = all_df.join(ts_data, ["position"], how="inner")

all_df = all_df.orderBy(F.asc("position")).drop("position")

# write the data to BigQuery via the temporary GCS bucket
all_df.write \
  .format("bigquery") \
  .option("project","rafzul-analytics-1009") \
  .option("temporaryGcsBucket","entsoe_temp_1009") \
  .mode("append") \
  .save("rafzul-analytics-1009.entsoe_playground.total_generation_staging")


In [6]:

# path ="/home/rafzul/projects/entsoe-pipelines/sample.xml"

# Try out the Spark GCS connector: set up the Spark session used as the entry point,
# with the GCS and BigQuery connector jars from SPARK_HOME.
SPARK_HOME = os.environ["SPARK_HOME"]
spark = (
    SparkSession.builder.appName("gcp_playground")
    .config("spark.jars", f"{SPARK_HOME}/jars/gcs-connector-hadoop3-latest.jar, {SPARK_HOME}/jars/spark-bigquery-with-dependencies_2.13-0.27.1.jar")
    .config("spark.sql.session.timeZone", "UTC")
    .config("spark.hadoop.fs.AbstractFileSystem.gs.impl", "google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
    .config("spark.hadoop.google.cloud.auth.service.account.enable", "true")
    .config("spark.hadoop.google.cloud.auth.service.account.json.keyfile", service_account_file)
    .getOrCreate()
)




In [None]:

# def download_blob_from_gcs(bucket_name, source_blob_name, local_source_blob_name):
#     # Upload file to bucket"""

#     # ID of GCS bucket
#     # bucket_name =

#     # the contents from memory to be uploaded to file
#     # contents =

#     # the ID of your GCS object
#     # destination_blob_name =

#     storage_client = storage.Client(credentials=credentials)
#     bucket = storage_client.bucket(bucket_name)
#     blob = bucket.blob(source_blob_name)

#     blob.download_to_filename(local_source_blob_name)
    
# #ambil schema
# raw_schema_filename = f"{label_data}__rawschema.txt"
# source_rawschema_filename = f"schema_source/{raw_schema_filename}"
# local_rawschema_filename = f"/home/rafzul/projects/entsoe-pipelines/schemas/source/{raw_schema_filename}"
# download_blob_from_gcs(bucket_name=gcs_bucket, source_blob_name=source_rawschema_filename, local_source_blob_name=local_rawschema_filename)

# #setting up schema - block

# with open(local_rawschema_filename, "r") as local_source:
#     rawschema_data = local_source.read()


In [7]:
# Set up the source path of the landed JSON file.
path = f"gs://{gcs_bucket}/{landing_filename}"
print(path)

# Read errors are allowed to propagate: swallowing them left df_spark undefined (or
# stale from an earlier run) and only failed later, in the cells that use it.
df_spark = (
    spark.read.format("json")
    .option("inferSchema", "true")
    .option("multiLine", "true")
    .option("timestampFormat", "yyyy-MM-dd'T'HH:mm'Z'")
    .load(path)
)

gs://entsoe_analytics_1009/TEST_total_generation__DE_TENNET__202101010000__202101010600.json


In [8]:
#Cleaning column names & casting data type
#-------------------------------------------------------------------

#cleaning documents

def clean_columns_n_casttypes(df, parent_column_name):
    """Clean the nested field names of ``parent_column_name`` and expand it to columns.

    Every dot in the column's type string becomes an underscore and ``@``/``#``
    characters are removed; the column is cast to that rewritten type and its fields
    are then selected as the columns of the returned DataFrame.

    Args:
        df: DataFrame containing ``parent_column_name``.
        parent_column_name: Name of the column to clean and expand.

    Returns:
        DataFrame whose columns are the cleaned fields of ``parent_column_name``.
    """
    df_schema = df.select(parent_column_name).dtypes[0][1]
    # Raw strings: '\.' in a plain literal is an invalid escape sequence.
    replacements = [(r'\.', '_'), (r'[@#]', '')]
    for old, new in replacements:
        df_schema = re.sub(old, new, df_schema)
    # The cast with the cleaned schema must happen before the fields are selected out.
    df = df.withColumn(parent_column_name, F.col(parent_column_name).cast(df_schema)).select(f"{parent_column_name}.*")
    return df

# define flatten struct function
def flatten_struct(nested_struct_df):
    """Promote the fields of each top-level struct column to ``<column>_<field>`` columns.

    Non-struct columns come first, unchanged. One nesting level is flattened per call.
    """
    plain_columns = []
    struct_columns = []
    for column_name, column_type in nested_struct_df.dtypes:
        if column_type.startswith('struct'):
            struct_columns.append(column_name)
        else:
            plain_columns.append(column_name)

    promoted_fields = []
    for struct_name in struct_columns:
        for field_name in nested_struct_df.select(f"{struct_name}.*").columns:
            promoted_fields.append(
                F.col(f"{struct_name}.{field_name}").alias(f"{struct_name}_{field_name}")
            )

    return nested_struct_df.select(plain_columns + promoted_fields)

#Extraction from Non TS

if label_data in ["total_generation"]:
    document_column = "GL_MarketDocument"
else:
    # Without this branch an unknown label leaves document_column unset, or stale
    # from an earlier run of the cell.
    raise ValueError(f"Unsupported label_data '{label_data}'")
df_spark = clean_columns_n_casttypes(df_spark, document_column)


In [13]:
#separate df into ts and non ts df
df_ts = df_spark.select("TimeSeries")
df_nonts = df_spark.drop("TimeSeries")

#processing non TS dataframe (two flatten passes, one nesting level each)
df_nonts = flatten_struct(flatten_struct(df_nonts))

#cast the timestamp column to timestamp type
df_nonts = df_nonts.withColumn("time_Period_timeInterval_end", F.to_timestamp("time_Period_timeInterval_end",  "yyyy-MM-dd'T'HH:mm'Z'")) \
            .withColumn("time_Period_timeInterval_start", F.to_timestamp("time_Period_timeInterval_start",  "yyyy-MM-dd'T'HH:mm'Z'")) \
            .withColumn("createdDateTime", F.to_timestamp("createdDateTime",  "yyyy-MM-dd'T'HH:mm:ss'Z'"))

##processing TS dataframe
# explode the TimeSeries column so every element of the array becomes its own row
df_ts = df_ts.withColumn("TimeSeries", F.explode("TimeSeries"))
df_ts = df_ts.select("TimeSeries.*")
# flatten the nested structs (two passes), leaving the period data
df_ts = flatten_struct(flatten_struct(df_ts))

In [14]:
df_nonts.show()
df_ts.show()

+--------------------+-------------------+--------------------+-------------------+------------------------------------------+--------------+----------------------------------------+----+------------------------------------+--------------------------------------------+----------------------------------+------------------------------------------+----------------------------+------------------------------+
|               xmlns|    createdDateTime|                mRID|process_processType|receiver_MarketParticipant_marketRole_type|revisionNumber|sender_MarketParticipant_marketRole_type|type|receiver_MarketParticipant_mRID_text|receiver_MarketParticipant_mRID_codingScheme|sender_MarketParticipant_mRID_text|sender_MarketParticipant_mRID_codingScheme|time_Period_timeInterval_end|time_Period_timeInterval_start|
+--------------------+-------------------+--------------------+-------------------+------------------------------------------+--------------+----------------------------------------+--

In [10]:
# Lookup from psrType code to its display name. _parse_generation_timeseries uses it
# to build the output column name for each time series.
PSRTYPE_MAPPINGS = {
    'A03': 'Mixed',
    'A04': 'Generation',
    'A05': 'Load',
    'B01': 'Biomass',
    'B02': 'Fossil Brown coal/Lignite',
    'B03': 'Fossil Coal-derived gas',
    'B04': 'Fossil Gas',
    'B05': 'Fossil Hard coal',
    'B06': 'Fossil Oil',
    'B07': 'Fossil Oil shale',
    'B08': 'Fossil Peat',
    'B09': 'Geothermal',
    'B10': 'Hydro Pumped Storage',
    'B11': 'Hydro Run-of-river and poundage',
    'B12': 'Hydro Water Reservoir',
    'B13': 'Marine',
    'B14': 'Nuclear',
    'B15': 'Other renewable',
    'B16': 'Solar',
    'B17': 'Waste',
    'B18': 'Wind Offshore',
    'B19': 'Wind Onshore',
    'B20': 'Other',
    'B21': 'AC Link',
    'B22': 'DC Link',
    'B23': 'Substation',
    'B24': 'Transformer'}

In [17]:

def _parse_resolution_to_timedelta(resolution_column: str) -> str:
    resolutions = {
        'PT60M': 'INTERVAL 1 HOUR',
        'P1Y': 'INTERVAL 12 MONTH',
        'PT15M': 'INTERVAL 15 MINUTES',
        'PT30M': 'INTERVAL 30 MINUTES',
        'P1D': 'INTERVAL 1 DAY',
        'P7D': 'INTERVAL 7 DAY',
        'P1M': 'INTERVAL 1 MONTH',
    }
    delta = resolutions.get(resolution_column)
    if delta is None:
        raise NotImplementedError(f"Sorry, I don't know what to do with the "
                                  "resolution '{resolution_column}', because there was no "
                                  "documentation to be found of this format. "
                                  "Everything is hard coded. Please open an "
                                  "issue.")
    return delta
    
#parsing datetime
def _parse_datetimeindex(df_ts, df_nonts, tz=None):
    """Build the timestamp index covering the document's time interval.

    Uses the module-level ``spark`` session.

    Args:
        df_ts: Flattened time-series DataFrame; the first row's ``Period_resolution``
            sets the step of the index.
        df_nonts: Flattened document-level DataFrame; the first row's
            ``time_Period_timeInterval_start``/``_end`` bound the index.
        tz: Optional time zone. When given, both bounds are passed through
            ``to_utc_timestamp`` with this zone before the index is generated.

    Returns:
        DataFrame with ``ts_index`` (one row per step from start to end) and
        ``position`` (row number ordered by ``ts_index``).
    """
    start_col = F.col("time_Period_timeInterval_start")
    end_col = F.col("time_Period_timeInterval_end")
    if tz is not None:
        start_col = F.to_utc_timestamp(start_col, tz)
        end_col = F.to_utc_timestamp(end_col, tz)
    # Both bounds are collected once, from the first row. The tz branch previously
    # indexed `.collect` without calling it, which raised TypeError.
    start, end = df_nonts.select(start_col, end_col).collect()[0]
    print(start)
    print(end)

    # get the resolution and parse it
    resolution_col = df_ts.select(F.col("Period_resolution")).collect()[0][0]
    delta = _parse_resolution_to_timedelta(resolution_col)
    print(delta)

    # generate the date index
    date_index = spark.sql(f"SELECT sequence(to_timestamp('{start}'), to_timestamp('{end}'), {delta}) as ts_index").withColumn("ts_index", F.explode(F.col("ts_index")))
    # TODO: when tz is given, weekly granularity gains an extra index element because of
    # Daylight Saving Time and needs to be trimmed; skipped for now.

    # generate the row number used as join key
    w = Window.partitionBy(F.lit(1)).orderBy("ts_index")
    date_index = date_index.select("ts_index").distinct().withColumn("position", F.row_number().over(w))
    return date_index

#parsing generation timeseries function
def _parse_generation_timeseries(period_row, df_periods, df_psrtype, df_metric,  per_plant: bool = False, include_eic: bool = False):
    """Turn one collected time-series row into a two-column Spark DataFrame.

    Uses the module-level ``spark`` session and ``PSRTYPE_MAPPINGS``.

    Args:
        period_row: Index of the time-series row to parse.
        df_periods: Collected rows whose first field holds the list of points; each
            point is read through its ``position`` and ``quantity`` attributes.
        df_psrtype: Collected rows whose first field is the psrType code, or None.
        df_metric: Collected rows whose first field is the inBiddingZone value, or None.
        per_plant: Placeholder; per-plant naming is not implemented.
        include_eic: Placeholder; not implemented.

    Returns:
        DataFrame with ``position`` and one value column named
        ``"<psrType name> - <metric>"``, or just ``"<metric>"`` when psrType is None.

    Raises:
        KeyError: If the psrType code is not in ``PSRTYPE_MAPPINGS``.
    """
    # Start from an empty list so a missing psrType no longer leaves `name` unbound.
    name = []
    psrtype = df_psrtype[period_row][0]
    if psrtype is not None:
        name.append(PSRTYPE_MAPPINGS[psrtype])

    # No inBiddingZone means the series is a consumption element; otherwise it is an
    # aggregated generation element.
    metric_check = df_metric[period_row][0]
    metric = "Actual Consumption" if metric_check is None else "Actual Aggregated"
    name.append(metric)

    # The per-plant case (per_plant / include_eic) is skipped for now.

    # name of the value column
    name = name[0] if len(name) == 1 else " - ".join(name)

    # quantities of this series, keyed by position
    df_periodrow = df_periods[period_row][0]
    datas = [(int(point.position), float(point.quantity)) for point in df_periodrow]
    return spark.createDataFrame(datas, ["position", name])

#set the loop count
# (period_row is not read by the loop below, which walks every collected row)
period_row = 2
per_plant = False
include_eic = False

# build the initial dataframe (the datetime index) once so it is not recomputed repeatedly
all_df = _parse_datetimeindex(df_ts, df_nonts, tz=None)

# collect the per-series columns to the driver
periods_col = df_ts.select(F.col("Period_point")).collect()
psrtype_col =  df_ts.select(F.col("MktPSRType_psrType")).collect()
metric_col = df_ts.select(F.col("inBiddingZone_Domain_mRID_text")).collect()

# parse every TimeSeries entry and inner-join it onto the index by position
for entry in range(len(periods_col)):
    ts_data = _parse_generation_timeseries(entry, periods_col, psrtype_col, metric_col, per_plant=per_plant, include_eic=include_eic)
    all_df = all_df.join(ts_data, ["position"], how="inner")

# order by position, then drop the join key
all_df = all_df.orderBy(F.asc("position")).drop("position")



#skip net and redundant calculation
#skip tz convert



                                                                                

2021-01-01 07:00:00
2021-01-01 13:00:00
INTERVAL 15 MINUTES


                                                                                

+-------------------+---------------------------+---------------------------------------------+------------------------------+------------------------------------+------------------------------+------------------------------+----------------------------------------+-----------------------------------------+---------------------------------------------------+-----------------------------------------+---------------------------+-------------------------+-----------------------------------+-------------------------+-------------------------+---------------------------------+--------------------------------+
|           ts_index|Biomass - Actual Aggregated|Fossil Brown coal/Lignite - Actual Aggregated|Fossil Gas - Actual Aggregated|Fossil Hard coal - Actual Aggregated|Fossil Oil - Actual Aggregated|Geothermal - Actual Aggregated|Hydro Pumped Storage - Actual Aggregated|Hydro Pumped Storage - Actual Consumption|Hydro Run-of-river and poundage - Actual Aggregated|Hydro Water Reservoir - Actu

In [None]:
all_df.show(100)

In [None]:
# upload to BQ: write the data to BigQuery via the temporary GCS bucket
# NOTE(review): this writes df_spark, while the other write cell in this notebook
# sends all_df to the same table — confirm which dataframe is intended here.
df_spark.write \
  .format("bigquery") \
  .option("project","rafzul-analytics-1009") \
  .option("temporaryGcsBucket","entsoe_temp_1009") \
  .mode("append") \
  .save("rafzul-analytics-1009.entsoe_playground.total_generation_staging")

In [None]:
#-------------------------------------- old code ----------------------------------

In [None]:
# #create schema to be enforced in subsequent json load operation
# with open("schema_raw_j", "w") as schrawjson:
#     schrawjson.write(df_spark_orig.schema.json())

In [None]:
# with open("schema_raw2.json", "r") as schrawjson:
#     json_schema_data = schrawjson.read()
#     json_enforced_schema = StructType.fromJson(json.loads(json_schema_data))
    

In [None]:
# #create dataframe from gcs
# path = f"gs://{gcs_bucket}/{landing_filename}"
# print(path)
# df_spark = spark.read.format("json").schema(json_enforced_schema) \
#    .option("header","true") \
#    .option("multiLine","true") \
#    .load(path) \
#    .select("GL_MarketDocument.*")
   

In [None]:
# clean the non-timeseries column names: change dots to underscores
df_spark = df_spark.toDF(*(c_name.replace(".", "_") for c_name in df_spark.columns))

# clean the timeseries column: cast TimeSeries to a schema where 1. dots are replaced
# with underscores and 2. the characters '@' and '#' are removed
ts_schema = df_spark.select("TimeSeries").dtypes[0][1]
# Raw strings: '\.' in a plain literal is an invalid escape sequence.
replacements = [(r'\.', '_'), (r'[@#]', '')]
for old, new in replacements:
    ts_schema = re.sub(old, new, ts_schema)
df_spark = df_spark.withColumn("TimeSeries", (F.col("TimeSeries").cast(ts_schema)))

In [None]:
df_spark.select("TimeSeries").dtypes

In [None]:
#flatten column

# explode timeseries column into struct
df_spark = df_spark.withColumn("TimeSeries", F.explode("TimeSeries"))


In [None]:
df_spark.printSchema()

In [None]:


# flatten TimeSeries if there is TimeSeries, flatten AttributeInstanceComponent if there is AttributeInstanceComponent
def unpack_df(nested_df):
    """Flatten the component struct column of ``nested_df`` into ``<component>_<field>`` columns.

    ``TimeSeries`` is flattened when present; otherwise ``AttributeInstanceComponent``
    is flattened when present. The general (non-component) columns are always kept first.

    Args:
        nested_df: DataFrame that may contain a ``TimeSeries`` or
            ``AttributeInstanceComponent`` struct column.

    Returns:
        DataFrame with the general columns followed by the flattened component fields,
        or only the general columns when no component column exists.
    """
    component_columns = ["TimeSeries", "AttributeInstanceComponent"]
    general_cols = [c for c in nested_df.columns if c not in component_columns]
    # First component column present, in priority order.
    data_cols_name = next((c for c in component_columns if c in nested_df.columns), None)
    if data_cols_name is None:
        # Nothing to flatten; this path used to fail on an unbound local.
        return nested_df.select(general_cols)
    data_cols = nested_df.select(f"{data_cols_name}.*").columns
    return nested_df.select(
        general_cols
        + [F.col(f"{data_cols_name}.{c}").alias(f"{data_cols_name}_{c}") for c in data_cols]
    )
    

df_spark = unpack_df(df_spark)
df_spark.printSchema()

In [None]:
# Write the data to BigQuery via a temporary GCS bucket.
(
    all_df.write
    .format("bigquery")
    .option("project", "rafzul-analytics-1009")
    .option("temporaryGcsBucket", "entsoe_temp_1009")
    .mode("append")
    .save("rafzul-analytics-1009.entsoe_playground.total_generation_staging")
)

In [None]:
# if df_spark2.select("TimeSeries"):
#     # rename all column (specifically replace dot with underscore) on every struct nested in TimeSeries array column. use transform
#     df_spark3 = df_spark3.withColumn("TimeSeries", F.transform \
#     ("TimeSeries", lambda el,ind: \
#     F.struct \
#     ("abc" for c_name in el.columns)))
    
# # # #get list of names of all columns in every struct inside TimeSeries array column
# # a = [x for x in df_spark2.select(("TimeSeries"))]
# # print(a)

# df_spark3.select("TimeSeries").show(10, truncate=False)
# a = F.struct(F.col("TimeSeries").getItem(c_name).alias(c_name.replace(".", "_")) for i, c_name in enumerate(df_spark2.schema["TimeSeries"].dataType.elementType.fieldNames()))
# # print(a


# F.struct(F.col("TimeSeries") for c_name in df_spark2.schema[x].dataType.names


# df_spark2.show()
# df_spark2.printSchema()

# # #get list of names of all columns in every struct inside TimeSeries array column
# a = [(c_name,i) for (c_name,i) in enumerate(df_spark2.schema["TimeSeries"].dataType.elementType.fieldNames())]
# print(a)


# df_spark2 = df_spark2.withColumn("TimeSeries", F.transform("TimeSeries", lambda x: F.struct(*[F.col("TimeSeries." + c_name.replace(".", "_")).alias(c_name) for c_name in x])))

In [None]:
# #for non timeseries column
# df_spark2 = df_spark2.toDF(*(c_name.replace(".", "_") for c_name in df_spark2.columns))
# b = df_spark2.select(F.col("TimeSeries"))
# #for timeseries one
# a = F.struct(F.col("TimeSeries").getItem(c_name).alias(c_name.replace(".", "_")) for i, c_name in enumerate(df_spark2.schema["TimeSeries"].dataType.elementType.fieldNames()))
# print(a)
# if df_spark2.select("TimeSeries"):
#     # rename all column (specifically replace dot with underscore) on every struct nested in TimeSeries array column. use transform, use Timeseries schema and its datatype.name o
#     df_spark2 = df_spark2.withColumn("TimeSeries", F.transform("TimeSeries", lambda x: (a)))
    
# # #get list of names of all columns in every struct inside TimeSeries array column
# # a = [c_name for c_name in enumerate(df_spark2.schema["TimeSeries"].dataType.elementType.fieldNames())]


# df_spark3.show()
# df_spark3.printSchema()



# # df_spark2 = df_spark2.withColumn("TimeSeries", F.transform("TimeSeries", lambda x: F.struct(*[F.col("TimeSeries." + c_name.replace(".", "_")).alias(c_name) for c_name in x])))

In [None]:
# #cast schema with the dots replaced with underscores, programmatically


# ---- solution for  column names beside TimeSeries:
# changed_column_general = [(column_name, column_name.replace(".", "_")) for column_name in df_schema.fieldNames() if "." in x]
# for column_name, changed_column_name in changed_column_general:
#     df_schema = df_spark2.select([F.col(c).alias(mappingcolumn_name, changed_column_name)
# # UPDATE: there is even better solution

# #changed non timeseries column
# df_spark2 = df_spark2.toDF(*(c.replace(".", "_") for c in df_spark2.columns))

In [None]:
# Dump df_spark2 to a local file, one JSON document per line.
# DataFrame.toJSON() returns an RDD of JSON strings rather than a str, so it
# cannot be handed to write() directly; iterate over the rows and write each one.
with open("example_destination.json", "w", encoding="utf-8") as output_file:
    for json_row in df_spark2.toJSON().toLocalIterator():
        output_file.write(json_row + "\n")

In [None]:
# #inferring schema and get the data type of each column and turn it into spark dataframe
# datatype_infer = pd.DataFrame.from_dict(xml_file_dict[0], orient='index')

In [None]:
# #flatten nested df at every layer
# from pyspark.sql.types import *
# from pyspark.sql.import functions as f

# def flatten_structs(nested_df):
#     stack = [(), nested_df]
#     columns = []
    
#     while len(stack) > 0:
#         parents
        
    

In [None]:
# df_schema = df_spark.dtypes

# firstrow_orig = df_spark.collect()[0]
# new_header = [f"{x} - {firstrow_orig.__getitem__(x)}" for x in firstrow_orig.__fields__]

# df_spark = df_spark.where(F.col("_c0").isNotNull())
# df_spark.show()

# # #drop header and first column


In [None]:
#original code to read json file 

try:
    df_spark = (
        spark.read.format("json")
        .option("inferSchema", "true")
        .option("multiLine", "true")
        .load(path)
        .select("GL_MarketDocument.*")
    )
except Exception as e:
    # Report the failure instead of swallowing it silently. df_spark is left
    # untouched here, so later cells would otherwise keep using a stale or
    # undefined frame without any hint of why.
    print(f"Could not read the JSON landing file: {e!r}")
    # TODO: fallback that is not enabled yet:
    # df_spark = spark.read.format("csv") \
    #    .option("inferSchema","true") \
    #    .option("header","true") \
    #    .load(path)
    # schema_version = source_schema_version + 0.1
    # new_source_schema = { "version": float(schema_version),"schema": df_spark.schema.jsonValue()}
    # new_source_schema_json = json.dumps(new_source_schema)
    # new_source_schema_filename = f"schema_source/{label_data}__schema__{schema_version}.json"
    # upload_blob_to_gcs(bucket_name=gcs_bucket, contents=new_source_schema_json, destination_blob_name=new_source_schema_filename)

In [None]:
# Parse the generation time series.

# Accumulators for the parsed values.
positions, quantities, example_row = [], [], []

# Example for a single row, equivalent to a parse_generation_timeseries function.
# This part is meant to turn into parse_generation: instead of picking one
# collected row, loop over all of them (collect()[x][0] for x in range(len())).
df_periods = df_ts.select("Period_point").collect()
df_periods_1 = df_periods[0][0]

In [None]:
# Collect the quantity of every point of one collected row as floats.
# Comprehension form of the same loop, without the missing-quantity check:
# quantities = [float(point.quantity) for point in df_periodrow]
quantities = []
# period_row selects which collected row to read; it is not assigned in this
# cell -- presumably set earlier in the notebook, TODO confirm.
df_periodrow = df_periods[period_row][0]
for point in df_periodrow:
    quantity = point.quantity
    # A point without a quantity is treated as an error rather than skipped.
    if quantity is None:
        raise LookupError(
        f'No quantity found in this point, it should have one: {point}')
    quantities.append(float(point.quantity))