In [26]:
import os 
from dotenv import load_dotenv
import requests
from google.oauth2 import service_account
import xmltodict
import json
import pandas as pd 
from requests import request
import pytz
from bs4 import BeautifulSoup
from typing import Dict
from google.oauth2 import service_account
from google.cloud import storage

import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import os
import sys

from pyspark.sql.types import StructType, StructField, FloatType, BooleanType
from pyspark.sql.types import DoubleType, IntegerType, StringType, DataType
from pyspark.sql import functions as F

from entsoe import EntsoeRawClient
from entsoe import EntsoePandasClient





In [3]:
#load env variables
load_dotenv('./creds/.env', verbose=True, override=True)

True

In [4]:
print(os.environ.get("PYSPARK_DRIVER_PYTHON"))

None


In [5]:


'''
----------------
INIT VARIABLES
----------------
'''

#setting up entsoe variables
security_token = os.environ.get("SECURITY_TOKEN")
ENTSOE_URL = 'https://transparency.entsoe.eu/api'

#setting up GCP variables
service_account_file = os.environ.get("SERVICE_ACCOUNT_FILE")
credentials = service_account.Credentials.from_service_account_file(
    service_account_file
)
gcs_bucket = os.environ.get("CLOUD_STORAGE_BUCKET")
print(gcs_bucket)

entsoe_analytics_1009


In [6]:



# #setting up session
# entsoe_session = requests.Session()

# '''
# ----------------
# SETTING UP FUNCTION CALLS 
# ----------------
# '''

# #format dates
# def datetime_to_str(dtm: pd.Timestamp) -> str:
#     #convert timezone to UTC if it's exist and it's not on UTC already
#     if dtm.tzinfo is not None and dtm.tzinfo != pytz.UTC:
#         dtm = dtm.tz_convert("UTC")
#     fmt = '%Y%m%d%H%M'
#     dtm_str = dtm.strftime(fmt)
#     return dtm_str

# #basic requests
# def basic_requests(params: Dict, start:pd.Timestamp, end: pd.Timestamp) -> requests.Response:
#     #setting up time intervals start and stop
#     start_str = datetime_to_str(start)
#     end_str = datetime_to_str(end)

#     #setting up params and extending with custom parameter based
#     base_params = {
#         'securityToken': security_token,
#         'periodStart': start_str,
#         'periodEnd': end_str,
#     }
#     params.update(base_params)
    
#     #seting up sesssion
#     session = requests.Session()
#     response = session.get(url=ENTSOE_URL, params=params)

#     return response

# # upload data to GCS
# def upload_blob_to_gcs(bucket_name, contents, destination_blob_name):
#     # Upload file to bucket"""

#     # ID of GCS bucket
#     # bucket_name =

#     # the contents from memory to be uploaded to file
#     # contents =

#     # the ID of your GCS object
#     # destination_blob_name =

#     storage_client = storage.Client(credentials=credentials)
#     bucket = storage_client.bucket(bucket_name)
#     blob = bucket.blob(destination_blob_name)

#     blob.upload_from_string(contents)


# '''
# ----------------
# EXTRACTION
# ----------------
# '''
# #for test, we'll be querying Actual Generation Output per Generation Unit

# domain = '10YCZ-CEPS-----N'
# params_requests = {
#     'documentType': 'A73',
#     'processType': 'A16',
#     'in_Domain': {domain},
# }
# start=pd.Timestamp('202101011200', tz='Europe/Berlin')
# end=pd.Timestamp('202101011300', tz='Europe/Berlin')

# entsoe_data = basic_requests(params=params_requests, start=start, end=end)
# # entsoe_dict = xmltodict.parse(entsoe_data.text)
# # entsoe_json = json.dumps(entsoe_dict, indent=4)
# # print(entsoe_json)
# print(entsoe_data.text)

# '''
# ----------------
# LOAD
# ----------------
# '''
# # #upload to GCS
# # landing_filename=f"entsoe_data_{start}.json"
# # upload_blob_to_gcs(bucket_name=gcs_bucket, contents=entsoe_json, destination_blob_name=landing_filename)

# #upload to GCS - XML
# landing_filename=f"entsoe_data_{start}_{domain}.xml"
# upload_blob_to_gcs(bucket_name=gcs_bucket, contents=entsoe_data.text, destination_blob_name=landing_filename)



In [6]:



#setting up session
entsoe_client = EntsoeRawClient(security_token)
'''
----------------
SETTING UP FUNCTION CALLS 
----------------
'''

# upload data to GCS
def upload_blob_to_gcs(bucket_name, contents, destination_blob_name):
    # Upload file to bucket"""

    # ID of GCS bucket
    # bucket_name =

    # the contents from memory to be uploaded to file
    # contents =

    # the ID of your GCS object
    # destination_blob_name =

    storage_client = storage.Client(credentials=credentials)
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    blob.upload_from_string(contents)


'''
----------------
EXTRACTION
----------------
'''
#for test, we'll be querying Actual Generation Output per Generation Unit

start=pd.Timestamp('202101011200', tz='Europe/Berlin')
end=pd.Timestamp('202101011300', tz='Europe/Berlin')
country_code= 'DE_TENNET'
country_code_from=''
country_code_to=''
type_marketagreement_type=''
contract_marketagreement_type=''

start2=pd.Timestamp('20210101', tz='Europe/Berlin')
end2=pd.Timestamp('20211231', tz='Europe/Berlin')

pd.set_option('display.max_rows', None)

try:
    entsoe_data = entsoe_client.query_generation(country_code, start=start, end=end)
    # entsoe_data3 = entsoe_client.query_load(country_code, start=start, end=end)
    # entsoe_data4 = entsoe_client.query_day_ahead_prices(country_code, start=start, end=end)
    # entsoe_data5 = entsoe_client.query_installed_generation_capacity(country_code, start=start2, end=end2)
    # entsoe_data6 = entsoe_client.query_installed_generation_capacity_per_unit(country_code, start=start2, end=end2)
    # display(entsoe_data2)
    # display(entsoe_data3)
    entsoe_dict = xmltodict.parse(entsoe_data)
    entsoe_json = json.dumps(entsoe_dict, indent=4, ensure_ascii=False)
except Exception as e:
    print("An exception occurred:", e)


'''
----------------
LOAD
----------------
'''
#upload to GCS
landing_filename=f"entsoe_data_{country_code}.json"
upload_blob_to_gcs(bucket_name=gcs_bucket, contents=entsoe_json, destination_blob_name=landing_filename)



In [7]:
## COBA SPARK GCS CONNECTOR ##


# berangkat pak haji
# setup parameternya
gcs_bucket = gcs_bucket
path = f"gs://{gcs_bucket}/{landing_filename}"
# path ="/home/rafzul/projects/entsoe-pipelines/sample.xml"

#coba spark gcs connector
#setup sparksession for entry point - COBA GCS CONNECTOR
SPARK_HOME = os.environ["SPARK_HOME"]
spark = SparkSession.builder.appName("gcp_playground") \
    .config("spark.jars", f"{SPARK_HOME}/jars/gcs-connector-hadoop3-latest.jar, {SPARK_HOME}/jars/spark-bigquery-with-dependencies_2.13-0.27.1.jar") \
    .config("spark.hadoop.fs.AbstractFileSystem.gs.impl", "google.cloud.hadoop.fs.gcs.GoogleHadoopFS") \
    .config("spark.hadoop.google.cloud.auth.service.account.enable", "true") \
    .config("spark.hadoop.google.cloud.auth.service.account.json.keyfile", service_account_file) \
    .getOrCreate()



22/12/30 14:02:07 WARN Utils: Your hostname, pop-os resolves to a loopback address: 127.0.1.1; using 192.168.18.2 instead (on interface wlp58s0)
22/12/30 14:02:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/12/30 14:02:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [8]:
print(spark.conf.get("spark.hadoop.fs.AbstractFileSystem.gs.impl"))

google.cloud.hadoop.fs.gcs.GoogleHadoopFS


In [46]:
path = f"gs://{gcs_bucket}/{landing_filename}"
print(path)

#create dataframe from gcs
df_spark = spark.read.format("json") \
   .option("header","true") \
   .option("multiLine","true") \
   .option("inferSchema","true") \
   .load(path) \
   

gs://entsoe_analytics_1009/entsoe_data_DE_TENNET.json


In [10]:
print(type(df_spark))
df_spark.printSchema()
df_spark.show()

<class 'pyspark.sql.dataframe.DataFrame'>
root
 |-- @xmlns: string (nullable = true)
 |-- TimeSeries: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- MktPSRType: struct (nullable = true)
 |    |    |    |-- psrType: string (nullable = true)
 |    |    |-- Period: struct (nullable = true)
 |    |    |    |-- Point: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- position: string (nullable = true)
 |    |    |    |    |    |-- quantity: string (nullable = true)
 |    |    |    |-- resolution: string (nullable = true)
 |    |    |    |-- timeInterval: struct (nullable = true)
 |    |    |    |    |-- end: string (nullable = true)
 |    |    |    |    |-- start: string (nullable = true)
 |    |    |-- businessType: string (nullable = true)
 |    |    |-- curveType: string (nullable = true)
 |    |    |-- inBiddingZone_Domain.mRID: struct (nullable = true)
 |    |    |    |-- #text: s

                                                                                

In [52]:
#create schema to be enforced in subsequent json load operation
with open("schema_raw.json", "w") as schrawjson:
    schrawjson.write(df_spark.schema.json())

In [8]:
with open("schema_raw2.json", "r") as schrawjson:
    json_schema_data = schrawjson.read()
    json_enforced_schema = StructType.fromJson(json.loads(json_schema_data))
    

print(json_enforced_schema)

StructType([StructField('GL_MarketDocument', StructType([StructField('@xmlns', StringType(), True), StructField('mRID', StringType(), True), StructField('revisionNumber', StringType(), True), StructField('type', StringType(), True), StructField('process.processType', StringType(), True), StructField('sender_MarketParticipant.mRID', StructType([StructField('#text', StringType(), True), StructField('@codingScheme', StringType(), True)]), True), StructField('sender_MarketParticipant.marketRole.type', StringType(), True), StructField('receiver_MarketParticipant.mRID', StructType([StructField('#text', StringType(), True), StructField('@codingScheme', StringType(), True)]), True), StructField('receiver_MarketParticipant.marketRole.type', StringType(), True), StructField('createdDateTime', StringType(), True), StructField('time_Period.timeInterval', StructType([StructField('end', StringType(), True), StructField('start', StringType(), True)]), True), StructField('TimeSeries', ArrayType(Struct

In [9]:
#create dataframe from gcs
path = f"gs://{gcs_bucket}/{landing_filename}"
print(path)
df_spark2 = spark.read.format("json").schema(json_enforced_schema) \
   .option("header","true") \
   .option("multiLine","true") \
   .load(path) \
   .select("GL_MarketDocument.*")
   

gs://entsoe_analytics_1009/entsoe_data_DE_TENNET.json


In [11]:
print(type(df_spark2))
df_spark2.printSchema()
df_spark2.show()


#    

<class 'pyspark.sql.dataframe.DataFrame'>
root
 |-- @xmlns: string (nullable = true)
 |-- mRID: string (nullable = true)
 |-- revisionNumber: string (nullable = true)
 |-- type: string (nullable = true)
 |-- process.processType: string (nullable = true)
 |-- sender_MarketParticipant.mRID: struct (nullable = true)
 |    |-- #text: string (nullable = true)
 |    |-- @codingScheme: string (nullable = true)
 |-- sender_MarketParticipant.marketRole.type: string (nullable = true)
 |-- receiver_MarketParticipant.mRID: struct (nullable = true)
 |    |-- #text: string (nullable = true)
 |    |-- @codingScheme: string (nullable = true)
 |-- receiver_MarketParticipant.marketRole.type: string (nullable = true)
 |-- createdDateTime: string (nullable = true)
 |-- time_Period.timeInterval: struct (nullable = true)
 |    |-- end: string (nullable = true)
 |    |-- start: string (nullable = true)
 |-- TimeSeries: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Mkt

In [33]:
#temporary fix to solve 
df_spark2.cast()

DataFrame[@xmlns: string, mRID: string, revisionNumber: string, type: string, process.processType: string, sender_MarketParticipant.mRID: struct<#text:string,@codingScheme:string>, sender_MarketParticipant.marketRole.type: string, receiver_MarketParticipant.mRID: struct<#text:string,@codingScheme:string>, receiver_MarketParticipant.marketRole.type: string, createdDateTime: string, time_Period.timeInterval: struct<end:string,start:string>, TimeSeries: array<struct<MktPSRType:struct<psrType:string>,Period:struct<Point:array<struct<position:string,quantity:string>>,resolution:string,timeInterval:struct<end:string,start:string>>,businessType:string,curveType:string,inBiddingZone_Domain.mRID:struct<#text:string,@codingScheme:string>,mRID:string,objectAggregation:string,outBiddingZone_Domain.mRID:struct<#text:string,@codingScheme:string>,quantity_Measure_Unit.name:string>>]

In [18]:
#tulis data ke bigquery via temporary gcs bucket
df_spark2.write \
  .format("bigquery") \
  .option("project","rafzul-analytics-1009") \
  .option("temporaryGcsBucket","entsoe_temp_1009") \
  .mode("append") \
  .save("rafzul-analytics-1009.entsoe_playground.fact_test")

                                                                                

22/12/30 14:29:10 ERROR BigQueryClient: Unable to create the job to load to rafzul-analytics-1009.entsoe_playground.fact_test


Py4JJavaError: An error occurred while calling o98.save.
: com.google.cloud.bigquery.connector.common.BigQueryConnectorException: Failed to write to BigQuery
	at com.google.cloud.spark.bigquery.write.BigQueryWriteHelper.writeDataFrameToBigQuery(BigQueryWriteHelper.java:110)
	at com.google.cloud.spark.bigquery.write.BigQueryDeprecatedIndirectInsertableRelation.insert(BigQueryDeprecatedIndirectInsertableRelation.java:43)
	at com.google.cloud.spark.bigquery.write.CreatableRelationProviderHelper.createRelation(CreatableRelationProviderHelper.java:51)
	at com.google.cloud.spark.bigquery.BigQueryRelationProvider.createRelation(BigQueryRelationProvider.scala:106)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:47)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:75)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:73)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:84)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:94)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:584)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:176)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:584)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:560)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:94)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:81)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:79)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:116)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:860)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:390)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:363)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:239)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.BigQueryException: Character '.' found in field name: inBiddingZone_Domain.mRID, parquet file: /bigstore/entsoe_temp_1009/.spark-bigquery-local-1672383730380-091c3ab1-73d6-4a16-a900-3a419433c3f1/part-00000-8eb3d8b0-e648-4eff-bc11-108961cc2d5e-c000.snappy.parquet.Reading such fields is not yet supported.
	at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.Job.reload(Job.java:419)
	at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.Job.waitFor(Job.java:252)
	at com.google.cloud.bigquery.connector.common.BigQueryClient.createAndWaitFor(BigQueryClient.java:333)
	at com.google.cloud.bigquery.connector.common.BigQueryClient.createAndWaitFor(BigQueryClient.java:323)
	at com.google.cloud.bigquery.connector.common.BigQueryClient.loadDataIntoTable(BigQueryClient.java:553)
	at com.google.cloud.spark.bigquery.write.BigQueryWriteHelper.loadDataToBigQuery(BigQueryWriteHelper.java:130)
	at com.google.cloud.spark.bigquery.write.BigQueryWriteHelper.writeDataFrameToBigQuery(BigQueryWriteHelper.java:107)
	... 44 more


In [76]:
with open("example_destination.json", "w+") as output_file:
    output_file.write(df_spark2.toJSON())

TypeError: write() argument must be str, not RDD

In [None]:
#gimana cara ngirim doi ke bigquery?

In [171]:
# # abc.show(50,truncate=False)
# print(df_spark.dtypes)
# array_cols = [c[0] for c in df_spark.dtypes if c[1][:5] == 'array'] 
# print(array_cols)

# for array_col in array_cols:
            
#     # cols_to_select = [x for x in df.columns if x != array_col ]
#     df_spark = df_spark.withColumn(array_col, F.explode(F.col(array_col)))
    
# df_spark.show(50)
# df_spark.dtypes

[('@xmlns', 'string'), ('TimeSeries', 'array<struct<MktPSRType:struct<PowerSystemResources:struct<mRID:struct<#text:string,@codingScheme:string>,name:string>,psrType:string>,Period:struct<Point:struct<position:string,quantity:string>,resolution:string,timeInterval:struct<end:string,start:string>>,businessType:string,curveType:string,inBiddingZone_Domain.mRID:struct<#text:string,@codingScheme:string>,mRID:string,objectAggregation:string,quantity_Measure_Unit.name:string,registeredResource.mRID:struct<#text:string,@codingScheme:string>>>'), ('createdDateTime', 'string'), ('mRID', 'string'), ('process.processType', 'string'), ('receiver_MarketParticipant.mRID', 'struct<#text:string,@codingScheme:string>'), ('receiver_MarketParticipant.marketRole.type', 'string'), ('revisionNumber', 'string'), ('sender_MarketParticipant.mRID', 'struct<#text:string,@codingScheme:string>'), ('sender_MarketParticipant.marketRole.type', 'string'), ('time_Period.timeInterval', 'struct<end:string,start:string>')

[('@xmlns', 'string'),
 ('TimeSeries',
  'struct<MktPSRType:struct<PowerSystemResources:struct<mRID:struct<#text:string,@codingScheme:string>,name:string>,psrType:string>,Period:struct<Point:struct<position:string,quantity:string>,resolution:string,timeInterval:struct<end:string,start:string>>,businessType:string,curveType:string,inBiddingZone_Domain.mRID:struct<#text:string,@codingScheme:string>,mRID:string,objectAggregation:string,quantity_Measure_Unit.name:string,registeredResource.mRID:struct<#text:string,@codingScheme:string>>'),
 ('createdDateTime', 'string'),
 ('mRID', 'string'),
 ('process.processType', 'string'),
 ('receiver_MarketParticipant.mRID',
  'struct<#text:string,@codingScheme:string>'),
 ('receiver_MarketParticipant.marketRole.type', 'string'),
 ('revisionNumber', 'string'),
 ('sender_MarketParticipant.mRID',
  'struct<#text:string,@codingScheme:string>'),
 ('sender_MarketParticipant.marketRole.type', 'string'),
 ('time_Period.timeInterval', 'struct<end:string,start:

In [128]:
# abc = df_spark.select("GL_MarketDocument.*").select("@xmlns", F.explode("TimeSeries").alias("Nilai"))

In [105]:
# print(abc.dtypes)
# abc.show(50)
# abc.printSchema()

[('@xmlns', 'string'), ('Nilai', 'struct<MktPSRType:struct<PowerSystemResources:struct<mRID:struct<#text:string,@codingScheme:string>,name:string>,psrType:string>,Period:struct<Point:struct<position:string,quantity:string>,resolution:string,timeInterval:struct<end:string,start:string>>,businessType:string,curveType:string,inBiddingZone_Domain.mRID:struct<#text:string,@codingScheme:string>,mRID:string,objectAggregation:string,quantity_Measure_Unit.name:string,registeredResource.mRID:struct<#text:string,@codingScheme:string>>')]
+--------------------+--------------------+
|              @xmlns|               Nilai|
+--------------------+--------------------+
|urn:iec62325.351:...|{{{{27W-GU-ECHVG1...|
|urn:iec62325.351:...|{{{{27W-GU-ECHVG2...|
|urn:iec62325.351:...|{{{{27W-GU-ECHVG3...|
|urn:iec62325.351:...|{{{{27W-GU-ECHVG4...|
|urn:iec62325.351:...|{{{{27W-GU-EDALG1...|
|urn:iec62325.351:...|{{{{27W-GU-EDALG2...|
|urn:iec62325.351:...|{{{{27W-GU-EDALG3...|
|urn:iec62325.351:...|{{{{2

In [None]:
# #inferring schema and get the data type of each column and turn it into spark dataframe
# datatype_infer = pd.DataFrame.from_dict(xml_file_dict[0], orient='index')

In [None]:
#  df_spark.schema.prettyJson()

In [None]:
# #flatten nested df at every layer
# from pyspark.sql.types import *
# from pyspark.sql.import functions as f

# def flatten_structs(nested_df):
#     stack = [(), nested_df]
#     columns = []
    
#     while len(stack) > 0:
#         parents
        
    