In [149]:
#buat pandasrawclient yang bentuknya dah langsung table data 
import os 
from dotenv import load_dotenv
import requests
from google.oauth2 import service_account
import xmltodict
import json
import pandas as pd 
from requests import request
import pytz
from bs4 import BeautifulSoup
from typing import Dict
from google.oauth2 import service_account
from google.cloud import storage

import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import os
import sys
import re

import csv

from pyspark.sql.types import StructType, StructField, ArrayType, FloatType, BooleanType, TimestampType
from pyspark.sql.types import DoubleType, IntegerType, StringType, DataType
from pyspark.sql import functions as F

from entsoe import EntsoeRawClient
from entsoe import EntsoePandasClient


#load env variables
load_dotenv('./creds/.env', verbose=True, override=True)
os.environ['TZ'] = 'UTC'

'''
----------------
INIT VARIABLES
----------------
'''

#setting up entsoe variables
security_token = os.environ.get("SECURITY_TOKEN")
ENTSOE_URL = 'https://transparency.entsoe.eu/api'

#setting up GCP variables
service_account_file = os.environ.get("SERVICE_ACCOUNT_FILE")
credentials = service_account.Credentials.from_service_account_file(
    service_account_file
)
gcs_bucket = os.environ.get("CLOUD_STORAGE_BUCKET")

#setting up session
entsoe_client = EntsoeRawClient(security_token)


In [53]:

'''
----------------
SETTING UP FUNCTION CALLS 
----------------
'''

# upload data to GCS
def upload_blob_to_gcs(bucket_name, contents, destination_blob_name):
    # Upload file to bucket"""

    # ID of GCS bucket
    # bucket_name =

    # the contents from memory to be uploaded to file
    # contents =

    # the ID of your GCS object
    # destination_blob_name =

    storage_client = storage.Client(credentials=credentials)
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    blob.upload_from_string(contents)


'''
----------------
EXTRACTION
----------------
'''
#for test, we'll be querying

start=pd.Timestamp('202101010000', tz='Europe/Berlin')
end=pd.Timestamp('202101010600', tz='Europe/Berlin')
country_code= 'DE_TENNET'
country_code_from=''
country_code_to=''
type_marketagreement_type=''
contract_marketagreement_type=''
label_data='total_generation'

try:
    entsoe_data = entsoe_client.query_generation(country_code, start=start, end=end)
    entsoe_dict = xmltodict.parse(entsoe_data)
    #if header is already correct, header = True. if not, header = entsoe_header_list
    entsoe_json = json.dumps(entsoe_dict)
except Exception as e:
    print("An exception occurred:", e)


'''
----------------
LOAD
----------------
'''
#upload to GCS
start = start.strftime("%Y%m%d%H%M%S")
end = end.strftime("%Y%m%d%H%M%S")
landing_filename=f"{label_data}__{country_code}__{start}__{end}.json"
upload_blob_to_gcs(bucket_name=gcs_bucket, contents=entsoe_json, destination_blob_name=landing_filename)

In [18]:

# path ="/home/rafzul/projects/entsoe-pipelines/sample.xml"

#coba spark gcs connector
#setup sparksession for entry point - COBA GCS CONNECTOR
SPARK_HOME = os.environ["SPARK_HOME"]
spark = SparkSession.builder.appName("gcp_playground") \
    .config("spark.jars", f"{SPARK_HOME}/jars/gcs-connector-hadoop3-latest.jar, {SPARK_HOME}/jars/spark-bigquery-with-dependencies_2.13-0.27.1.jar") \
    .config("spark.sql.session.timeZone", "UTC") \
    .config("spark.hadoop.fs.AbstractFileSystem.gs.impl", "google.cloud.hadoop.fs.gcs.GoogleHadoopFS") \
    .config("spark.hadoop.google.cloud.auth.service.account.enable", "true") \
    .config("spark.hadoop.google.cloud.auth.service.account.json.keyfile", service_account_file) \
    .getOrCreate()



In [19]:

# def download_blob_from_gcs(bucket_name, source_blob_name, local_source_blob_name):
#     # Upload file to bucket"""

#     # ID of GCS bucket
#     # bucket_name =

#     # the contents from memory to be uploaded to file
#     # contents =

#     # the ID of your GCS object
#     # destination_blob_name =

#     storage_client = storage.Client(credentials=credentials)
#     bucket = storage_client.bucket(bucket_name)
#     blob = bucket.blob(source_blob_name)

#     blob.download_to_filename(local_source_blob_name)
    
# #ambil schema
# raw_schema_filename = f"{label_data}__rawschema.txt"
# source_rawschema_filename = f"schema_source/{raw_schema_filename}"
# local_rawschema_filename = f"/home/rafzul/projects/entsoe-pipelines/schemas/source/{raw_schema_filename}"
# download_blob_from_gcs(bucket_name=gcs_bucket, source_blob_name=source_rawschema_filename, local_source_blob_name=local_rawschema_filename)

# #setting up schema - block

# with open(local_rawschema_filename, "r") as local_source:
#     rawschema_data = local_source.read()


In [173]:
# berangkat pak haji
# setup parameternya
gcs_bucket = gcs_bucket
path = f"gs://{gcs_bucket}/{landing_filename}"
print(path)

try: 
   df_spark = spark.read.format("json") \
      .option("inferSchema","true") \
      .option("multiLine","true") \
      .load(path)
except Exception as e:
   pass

gs://entsoe_analytics_1009/total_generation__DE_TENNET__20210101000000__20210101060000.json


In [174]:
#Cleaning column names & casting data type
#-------------------------------------------------------------------

#cleaning documents

def clean_columns_n_casttypes(df, raw_schema, parent_column_name):
    #cleaning the dots, changed it into namespaces, casting new columns names
    df_schema = df.select(parent_column_name).dtypes[0][1]
    replacements = [('\.', '_'), ('[@#]', '')]
    for old, new in replacements: 
        df_schema = re.sub(old, new, df_schema)
    #casting the DF with the cleaned schema (must be done first before column name got changed)
    df = df.withColumn(parent_column_name, F.col(parent_column_name).cast(df_schema)).select(f"{parent_column_name}.*")
    # #casting the DF with correct datatype schema, selecting the column inside the big parent column name
    # df = df.withColumn(parent_column_name, F.col(parent_column_name).cast(raw_schema))
    return df

# define flatten struct function
def flatten_struct(nested_struct_df): 
    flat_cols = [c[0] for c in nested_struct_df.dtypes if c[1][:6] != 'struct']
    nested_struct_cols = [c[0] for c in nested_struct_df.dtypes if c[1][:6] == 'struct']
    flat_df = nested_struct_df.select(flat_cols + [F.col(f"{nc}.{c}").alias(f"{nc}_{c}") for nc in nested_struct_cols for c in nested_struct_df.select(f"{nc}.*").columns])
    return flat_df

#Extraction from Non TS

if label_data in ["total_generation"]:
    document_column = "GL_MarketDocument"
df_spark = clean_columns_n_casttypes(df_spark, rawschema_data, document_column)


In [178]:
#separate df into ts and non ts df
df_ts = df_spark.select("TimeSeries")
df_nonts = df_spark.drop("TimeSeries")

#processing non TS dataframe
df_nonts = flatten_struct(flatten_struct(df_nonts))

#cast the timestamp column to timestamp type
df_nonts = df_nonts.withColumn("time_Period_timeInterval_end", F.to_timestamp("time_Period_timeInterval_end",  "yyyy-MM-dd'T'HH:mm'Z'")) \
            .withColumn("time_Period_timeInterval_start", F.to_timestamp("time_Period_timeInterval_start",  "yyyy-MM-dd'T'HH:mm'Z'")) \
            .withColumn("createdDateTime", F.to_timestamp("createdDateTime",  "yyyy-MM-dd'T'HH:mm:ss'Z'"))
df_nonts.printSchema()
df_nonts.show()

##processing TS dataframe
# explode timeseries column into struct, jadiin semua elemen di dalam array TimeSeries jadi satu row
df_ts = df_ts.withColumn("TimeSeries", F.explode("TimeSeries"))
df_ts = df_ts.select("TimeSeries.*")
#flatten nested struct sampe ke dalem, nyisain si period 
df_ts = flatten_struct(flatten_struct(df_ts))
df_ts.printSchema()

root
 |-- xmlns: string (nullable = true)
 |-- createdDateTime: timestamp (nullable = true)
 |-- mRID: string (nullable = true)
 |-- process_processType: string (nullable = true)
 |-- receiver_MarketParticipant_marketRole_type: string (nullable = true)
 |-- revisionNumber: string (nullable = true)
 |-- sender_MarketParticipant_marketRole_type: string (nullable = true)
 |-- type: string (nullable = true)
 |-- receiver_MarketParticipant_mRID_text: string (nullable = true)
 |-- receiver_MarketParticipant_mRID_codingScheme: string (nullable = true)
 |-- sender_MarketParticipant_mRID_text: string (nullable = true)
 |-- sender_MarketParticipant_mRID_codingScheme: string (nullable = true)
 |-- time_Period_timeInterval_end: timestamp (nullable = true)
 |-- time_Period_timeInterval_start: timestamp (nullable = true)

+--------------------+-------------------+--------------------+-------------------+------------------------------------------+--------------+---------------------------------------

In [49]:
df_nonts.show()
df_ts.show()

+--------------------+--------------------+--------------------+-------------------+------------------------------------------+--------------+----------------------------------------+----+------------------------------------+--------------------------------------------+----------------------------------+------------------------------------------+----------------------------+------------------------------+
|               xmlns|     createdDateTime|                mRID|process_processType|receiver_MarketParticipant_marketRole_type|revisionNumber|sender_MarketParticipant_marketRole_type|type|receiver_MarketParticipant_mRID_text|receiver_MarketParticipant_mRID_codingScheme|sender_MarketParticipant_mRID_text|sender_MarketParticipant_mRID_codingScheme|time_Period_timeInterval_end|time_Period_timeInterval_start|
+--------------------+--------------------+--------------------+-------------------+------------------------------------------+--------------+----------------------------------------

In [None]:
df_ts.select()

In [261]:

def parse_resolution_to_timedelta(resolution_column: str) -> str:
    resolutions = {
        'PT60M': 'INTERVAL 1 HOUR',
        'P1Y': 'INTERVAL 12 MONTH',
        'PT15M': 'INTERVAL 15 MINUTES',
        'PT30M': 'INTERVAL 30 MINUTES',
        'P1D': 'INTERVAL 1 DAY',
        'P7D': 'INTERVAL 7 DAY',
        'P1M': 'INTERVAL 1 MONTH',
    }
    delta = resolutions.get(resolution_column)
    if delta is None:
        raise NotImplementedError(f"Sorry, I don't know what to do with the "
                                  "resolution '{resolution_column}', because there was no "
                                  "documentation to be found of this format. "
                                  "Everything is hard coded. Please open an "
                                  "issue.")
    return delta
    
#parsing datetime
def parse_datetimeindex(df_ts, df_nonts, tz=None):
    start = df_nonts.select(F.col("time_Period_timeInterval_start")).collect()[0][0]
    end = df_nonts.select(F.col("time_Period_timeInterval_end")).collect()[0][0]
    if tz is not None:
        start = df_nonts.select(F.from_utc_timestamp(F.col("time_Period_timeInterval_start"), tz)).collect[0][0]
        end = df_nonts.select(F.from_utc_timestamp(F.col("time_Period_timeInterval_end"), tz)).collect()[0][0]
    
    # ambil resolution dan parse
    resolution_col = df_ts.select(F.col("Period_resolution")).collect()[0][0]
    delta = parse_resolution_to_timedelta(resolution_col)
    
    #generate index
    index = spark.createDataFrame([{'date':1}]).select(F.explode(F.sequence(F.lit(start),F.lit(end), F.expr(delta))).alias("ts_index"))
    if tz is not None:
        #case kalo di parse_timeindex: weekly granularity bakal nambah index element karena ada Daylight Saving Time. Harus di kurangin 
        #sementara skip dulu
        pass
    return index

#parsing generation timeseries

positions = []
quantities = []

example_row = []
#contoh 1 row, equivalent of parse_generation_timeseries function
#bagian ini dirubah ke parse_generation alias semua yg row di collect dijadiin looping buat collect() [x][0] for x in range(len())
df_periods = df_ts.select(F.col("Period_point")).collect()
df_periods_1 = df_periods[0][0]


# for row in df_firstrow:
#     example_row.append(row)


+-------------------+
|           ts_index|
+-------------------+
|2020-12-31 23:00:00|
|2020-12-31 23:15:00|
|2020-12-31 23:30:00|
|2020-12-31 23:45:00|
|2021-01-01 00:00:00|
|2021-01-01 00:15:00|
|2021-01-01 00:30:00|
|2021-01-01 00:45:00|
|2021-01-01 01:00:00|
|2021-01-01 01:15:00|
|2021-01-01 01:30:00|
|2021-01-01 01:45:00|
|2021-01-01 02:00:00|
|2021-01-01 02:15:00|
|2021-01-01 02:30:00|
|2021-01-01 02:45:00|
|2021-01-01 03:00:00|
|2021-01-01 03:15:00|
|2021-01-01 03:30:00|
|2021-01-01 03:45:00|
+-------------------+
only showing top 20 rows



In [162]:
#upload to BQ
#tulis data ke bigquery via temporary gcs bucket
df_spark.write \
  .format("bigquery") \
  .option("project","rafzul-analytics-1009") \
  .option("temporaryGcsBucket","entsoe_temp_1009") \
  .mode("append") \
  .save("rafzul-analytics-1009.entsoe_playground.total_generation_staging")

23/01/03 23:38:16 WARN DefaultCredentialsProvider: Your application has authenticated using end user credentials from Google Cloud SDK. We recommend that most server applications use service accounts instead. If your application continues to use end user credentials from Cloud SDK, you might receive a "quota exceeded" or "API not enabled" error. For more information about service accounts, see https://cloud.google.com/docs/authentication/.


[Stage 99:>                                                         (0 + 1) / 1]

23/01/03 23:38:19 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , Biomass - Actual Aggregated, Fossil Brown coal/Lignite - Actual Aggregated, Fossil Gas - Actual Aggregated, Fossil Hard coal - Actual Aggregated, Fossil Oil - Actual Aggregated, Geothermal - Actual Aggregated, Hydro Pumped Storage - Actual Aggregated, Hydro Pumped Storage - Actual Consumption, Hydro Run-of-river and poundage - Actual Aggregated, Hydro Water Reservoir - Actual Aggregated, Nuclear - Actual Aggregated, Other - Actual Aggregated, Other renewable - Actual Aggregated, Solar - Actual Aggregated, Waste - Actual Aggregated, Wind Offshore - Actual Aggregated, Wind Onshore - Actual Aggregated
 Schema: _c0, Biomass - Actual Aggregated, Fossil Brown coal/Lignite - Actual Aggregated, Fossil Gas - Actual Aggregated, Fossil Hard coal - Actual Aggregated, Fossil Oil - Actual Aggregated, Geothermal - Actual Aggregated, Hydro Pumped Storage - Actual Aggregated, Hydro Pumped Storage - Actual Con

                                                                                

In [143]:
#-------------------------------------- old code ----------------------------------

In [52]:
# #create schema to be enforced in subsequent json load operation
# with open("schema_raw_j", "w") as schrawjson:
#     schrawjson.write(df_spark_orig.schema.json())

In [17]:
# with open("schema_raw2.json", "r") as schrawjson:
#     json_schema_data = schrawjson.read()
#     json_enforced_schema = StructType.fromJson(json.loads(json_schema_data))
    

In [18]:
# #create dataframe from gcs
# path = f"gs://{gcs_bucket}/{landing_filename}"
# print(path)
# df_spark = spark.read.format("json").schema(json_enforced_schema) \
#    .option("header","true") \
#    .option("multiLine","true") \
#    .load(path) \
#    .select("GL_MarketDocument.*")
   

gs://entsoe_analytics_1009/entsoe_data_DE_TENNET.json


In [144]:
#clean non timeseries column name. change dot to underscores
df_spark = df_spark.toDF(*(c_name.replace(".", "_") for c_name in df_spark.columns))

#clean timeseries column. cast TimeSeries to new scheme where 1.dot in names are replaced with underscores and 2. Strange characters such as '@' or '#' are removed
ts_schema = df_spark.select("TimeSeries").dtypes[0][1]
replacements = [('\.', '_'), ('[@#]', '')]
for old, new in replacements:
    ts_schema = re.sub(old, new, ts_schema)
df_spark = df_spark.withColumn("TimeSeries", (F.col("TimeSeries").cast(ts_schema)))

In [148]:
df_spark.select("TimeSeries").dtypes

[('TimeSeries',
  'array<struct<MktPSRType:struct<psrType:string>,Period:struct<Point:array<struct<position:string,quantity:string>>,resolution:string,timeInterval:struct<end:string,start:string>>,businessType:string,curveType:string,inBiddingZone_Domain_mRID:struct<text:string,codingScheme:string>,mRID:string,objectAggregation:string,outBiddingZone_Domain_mRID:struct<text:string,codingScheme:string>,quantity_Measure_Unit_name:string>>')]

In [10]:
#flatten column

# explode timeseries column into struct
df_spark = df_spark.withColumn("TimeSeries", F.explode("TimeSeries"))


In [11]:
df_spark.printSchema()

root
 |-- @xmlns: string (nullable = true)
 |-- mRID: string (nullable = true)
 |-- revisionNumber: string (nullable = true)
 |-- type: string (nullable = true)
 |-- process_processType: string (nullable = true)
 |-- sender_MarketParticipant_mRID: struct (nullable = true)
 |    |-- #text: string (nullable = true)
 |    |-- @codingScheme: string (nullable = true)
 |-- sender_MarketParticipant_marketRole_type: string (nullable = true)
 |-- receiver_MarketParticipant_mRID: struct (nullable = true)
 |    |-- #text: string (nullable = true)
 |    |-- @codingScheme: string (nullable = true)
 |-- receiver_MarketParticipant_marketRole_type: string (nullable = true)
 |-- createdDateTime: string (nullable = true)
 |-- time_Period_timeInterval: struct (nullable = true)
 |    |-- end: string (nullable = true)
 |    |-- start: string (nullable = true)
 |-- TimeSeries: struct (nullable = true)
 |    |-- MktPSRType: struct (nullable = true)
 |    |    |-- psrType: string (nullable = true)
 |    |-- P

In [97]:


# flatten TimeSeries if there is TimeSeries, flatten AttributeInstanceComponent if there is AttributeInstanceComponent
def unpack_df(nested_df):
    component_columns = ["TimeSeries", "AttributeInstanceComponent"]
    general_cols = [c for c in nested_df.columns if c not in component_columns]
    if "TimeSeries" in nested_df.columns:
        data_cols_name = "TimeSeries"
        data_cols = [c for c in nested_df.select("TimeSeries.*").columns]
    else:
        pass
    unpacked_df = nested_df.select(general_cols \
                                   + [F.col(data_cols_name+"."+c).alias(data_cols_name+"_"+c)\
                                      for c in data_cols])
    return unpacked_df
    

df_spark = unpack_df(df_spark)
df_spark.printSchema()

root
 |-- @xmlns: string (nullable = true)
 |-- mRID: string (nullable = true)
 |-- revisionNumber: string (nullable = true)
 |-- type: string (nullable = true)
 |-- process_processType: string (nullable = true)
 |-- sender_MarketParticipant_mRID: struct (nullable = true)
 |    |-- #text: string (nullable = true)
 |    |-- @codingScheme: string (nullable = true)
 |-- sender_MarketParticipant_marketRole_type: string (nullable = true)
 |-- receiver_MarketParticipant_mRID: struct (nullable = true)
 |    |-- #text: string (nullable = true)
 |    |-- @codingScheme: string (nullable = true)
 |-- receiver_MarketParticipant_marketRole_type: string (nullable = true)
 |-- createdDateTime: string (nullable = true)
 |-- time_Period_timeInterval: struct (nullable = true)
 |    |-- end: string (nullable = true)
 |    |-- start: string (nullable = true)
 |-- TimeSeries_MktPSRType: struct (nullable = true)
 |    |-- psrType: string (nullable = true)
 |-- TimeSeries_Period: struct (nullable = true)
 | 

In [102]:
#tulis data ke bigquery via temporary gcs bucket
df_spark.write \
  .format("bigquery") \
  .option("project","rafzul-analytics-1009") \
  .option("temporaryGcsBucket","entsoe_temp_1009") \
  .mode("append") \
  .save("rafzul-analytics-1009.entsoe_playground.fact_test")

                                                                                

In [None]:
# if df_spark2.select("TimeSeries"):
#     # rename all column (specifically replace dot with underscore) on every struct nested in TimeSeries array column. use transform
#     df_spark3 = df_spark3.withColumn("TimeSeries", F.transform \
#     ("TimeSeries", lambda el,ind: \
#     F.struct \
#     ("abc" for c_name in el.columns)))
    
# # # #get list of names of all columns in every struct inside TimeSeries array column
# # a = [x for x in df_spark2.select(("TimeSeries"))]
# # print(a)

# df_spark3.select("TimeSeries").show(10, truncate=False)
# a = F.struct(F.col("TimeSeries").getItem(c_name).alias(c_name.replace(".", "_")) for i, c_name in enumerate(df_spark2.schema["TimeSeries"].dataType.elementType.fieldNames()))
# # print(a


# F.struct(F.col("TimeSeries") for c_name in df_spark2.schema[x].dataType.names


# df_spark2.show()
# df_spark2.printSchema()

# # #get list of names of all columns in every struct inside TimeSeries array column
# a = [(c_name,i) for (c_name,i) in enumerate(df_spark2.schema["TimeSeries"].dataType.elementType.fieldNames())]
# print(a)


# df_spark2 = df_spark2.withColumn("TimeSeries", F.transform("TimeSeries", lambda x: F.struct(*[F.col("TimeSeries." + c_name.replace(".", "_")).alias(c_name) for c_name in x])))

In [16]:
# #for non timeseries column
# df_spark2 = df_spark2.toDF(*(c_name.replace(".", "_") for c_name in df_spark2.columns))
# b = df_spark2.select(F.col("TimeSeries"))
# #for timeseries one
# a = F.struct(F.col("TimeSeries").getItem(c_name).alias(c_name.replace(".", "_")) for i, c_name in enumerate(df_spark2.schema["TimeSeries"].dataType.elementType.fieldNames()))
# print(a)
# if df_spark2.select("TimeSeries"):
#     # rename all column (specifically replace dot with underscore) on every struct nested in TimeSeries array column. use transform, use Timeseries schema and its datatype.name o
#     df_spark2 = df_spark2.withColumn("TimeSeries", F.transform("TimeSeries", lambda x: (a)))
    
# # #get list of names of all columns in every struct inside TimeSeries array column
# # a = [c_name for c_name in enumerate(df_spark2.schema["TimeSeries"].dataType.elementType.fieldNames())]


# df_spark3.show()
# df_spark3.printSchema()



# # df_spark2 = df_spark2.withColumn("TimeSeries", F.transform("TimeSeries", lambda x: F.struct(*[F.col("TimeSeries." + c_name.replace(".", "_")).alias(c_name) for c_name in x])))

In [134]:
# #cast schema with the dot replaced with underscore, programatically


# ---- solution for  column names beside TimeSeries:
# changed_column_general = [(column_name, column_name.replace(".", "_")) for column_name in df_schema.fieldNames() if "." in x]
# for column_name, changed_column_name in changed_column_general:
#     df_schema = df_spark2.select([F.col(c).alias(mappingcolumn_name, changed_column_name)
# # UPDATE: there is even better solution

# #changed non timeseries column
# df_spark2 = df_spark2.toDF(*(c.replace(".", "_") for c in df_spark2.columns))

In [76]:
with open("example_destination.json", "w+") as output_file:
    output_file.write(df_spark2.toJSON())

TypeError: write() argument must be str, not RDD

In [None]:
# #inferring schema and get the data type of each column and turn it into spark dataframe
# datatype_infer = pd.DataFrame.from_dict(xml_file_dict[0], orient='index')

In [None]:
# #flatten nested df at every layer
# from pyspark.sql.types import *
# from pyspark.sql.import functions as f

# def flatten_structs(nested_df):
#     stack = [(), nested_df]
#     columns = []
    
#     while len(stack) > 0:
#         parents
        
    

In [None]:
# df_schema = df_spark.dtypes

# firstrow_orig = df_spark.collect()[0]
# new_header = [f"{x} - {firstrow_orig.__getitem__(x)}" for x in firstrow_orig.__fields__]

# df_spark = df_spark.where(F.col("_c0").isNotNull())
# df_spark.show()

# # #drop header and first column


In [None]:
#original code to read json file 

try: 
   df_spark = spark.read.format("json") \
      .option("inferSchema","true") \
      .option("multiLine","true") \
      .load(path) \
      .select("GL_MarketDocument.*")
except Exception as e:
   pass
   # df_spark = spark.read.format("csv") \
   #    .option("inferSchema","true") \
   #    .option("header","true") \
   #    .load(path)
   # schema_version = source_schema_version + 0.1
   # new_source_schema = { "version": float(schema_version),"schema": df_spark.schema.jsonValue()}
   # new_source_schema_json = json.dumps(new_source_schema)
   # new_source_schema_filename = f"schema_source/{label_data}__schema__{schema_version}.json"
   # upload_blob_to_gcs(bucket_name=gcs_bucket, contents=new_source_schema_json, destination_blob_name=new_source_schema_filename)