In [1]:
import os

from pyspark.sql import SparkSession

# Names of the environment variables this notebook cannot run without.
_REQUIRED_ENV_VARS = (
    "AZURE_STORAGE_ACCOUNT_NAME",
    "AZURE_CLIENT_ID",
    "AZURE_TENANT_ID",
    "AZURE_CLIENT_SECRET",
)

# Fail fast with a readable message. os.getenv() returns None for an unset
# variable, and that None would otherwise be formatted into configuration keys
# and storage URLs as the literal text "None" instead of being reported here.
_missing_env_vars = [name for name in _REQUIRED_ENV_VARS if not os.getenv(name)]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env_vars)
    )

# Storage account name and service-principal credentials, taken from the
# environment so that no secret is stored in the notebook itself.
account_name = os.environ["AZURE_STORAGE_ACCOUNT_NAME"]
client_id = os.environ["AZURE_CLIENT_ID"]
tenant_id = os.environ["AZURE_TENANT_ID"]
secret = os.environ["AZURE_CLIENT_SECRET"]

# Container and object path of the CSV file that the notebook reads.
container = "raw"
path = "security/EodHistoricalData/exchange=AS/year=2023/month=10/day=20/20231020-160221__security__EodHistoricalData__AS.csv"

# Maven coordinate handed to Spark through "spark.jars.packages".
# NOTE(review): hadoop-azure normally has to match the Hadoop version bundled
# with the installed PySpark -- confirm that 3.3.1 is right for this cluster.
hadoop_azure_package = "org.apache.hadoop:hadoop-azure:3.3.1"

# Build (or reuse) the session step by step: master URL, application name,
# then the extra package.
session_builder = SparkSession.builder.master("spark://spark-master:7077")
session_builder = session_builder.appName("Testing Azure")
session_builder = session_builder.config("spark.jars.packages", hadoop_azure_package)
spark = session_builder.getOrCreate()


# OAuth (client-credentials) settings for the storage account, keyed by the
# account-specific configuration name. Insertion order is kept, so the
# settings are applied in the order listed here.
abfs_oauth_settings = {
    f"fs.azure.account.auth.type.{account_name}.dfs.core.windows.net": "OAuth",
    f"fs.azure.account.oauth.provider.type.{account_name}.dfs.core.windows.net": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    f"fs.azure.account.oauth2.client.id.{account_name}.dfs.core.windows.net": client_id,
    f"fs.azure.account.oauth2.client.secret.{account_name}.dfs.core.windows.net": secret,
    f"fs.azure.account.oauth2.client.endpoint.{account_name}.dfs.core.windows.net": f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
}

# Apply every setting to the running session's configuration.
for setting_key, setting_value in abfs_oauth_settings.items():
    spark.conf.set(setting_key, setting_value)


# Location of the source CSV. The abfss:// (TLS) scheme is used so that this
# read agrees with the write-back and read-back cells of this notebook, which
# address the same storage account through abfss://.
source_csv_uri = f"abfss://{container}@{account_name}.dfs.core.windows.net/{path}"

# header=True takes the column names from the first line of the file.
test_data = spark.read.format("csv").load(source_csv_uri, header=True)

# Print a preview of the loaded rows as a quick check that authentication and
# the path are working.
test_data.show()

:: loading settings :: url = jar:file:/usr/local/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.hadoop#hadoop-azure added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-380ae62a-170c-4a37-867b-34dfdfe9062a;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-azure;3.3.1 in central
	found org.apache.httpcomponents#httpclient;4.5.13 in central
	found org.apache.httpcomponents#httpcore;4.4.13 in central
	found commons-logging#commons-logging;1.1.3 in central
	found commons-codec#commons-codec;1.11 in central
	found com.microsoft.azure#azure-storage;7.0.1 in central
	found com.fasterxml.jackson.core#jackson-core;2.10.5 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found com.microsoft.azure#azure-keyvault-core;1.0.0 in central
	found com.google.guava#guava;27.0-jre in central
	found com.google.guava#failureaccess;1.0 in central
	found com.google.guava#listenablefuture;9999.0-empty-to-avoid-conflict-with-guava in central
	found com.

+-----+--------------------+-----------+--------+--------+------------+------------+
| Code|                Name|    Country|Exchange|Currency|        Type|        Isin|
+-----+--------------------+-----------+--------+--------+------------+------------+
|1ARKG|LS ARK Genomic Re...|Netherlands|      AS|     EUR|         ETF|XS2399368062|
| 2MSF|Leverage Shares 2...|Netherlands|      AS|     EUR|         ETF|IE00BF03XY85|
| 3AMZ|Leverage Shares 3...|Netherlands|      AS|     EUR|         ETF|        NULL|
|3ARKK|   3X ARK INNOVATION|Netherlands|      AS|     EUR|         ETF|XS2399368658|
| 3NIO|         3x Long NIO|Netherlands|      AS|     EUR|         ETF|XS2399365472|
| 3PLT|Leverage Shares 3...|Netherlands|      AS|     EUR|         ETF|XS2337085851|
| AALB|Aalberts Industri...|Netherlands|      AS|     EUR|Common Stock|NL0000852564|
|  ABN|   ABN Amro Group NV|Netherlands|      AS|     EUR|Common Stock|NL0011540547|
|ACOMO|Amsterdam Commodi...|Netherlands|      AS|     EUR|Common 

In [None]:
# Destination directory for the parquet copy of the loaded data.
write_back_target = f"abfss://{container}@{account_name}.dfs.core.windows.net/write_back"

# Replace whatever is already at the destination and lay the files out in
# Country/Exchange partition directories.
partitioned_writer = test_data.write.mode("overwrite").partitionBy("Country", "Exchange")
partitioned_writer.parquet(write_back_target)

In [None]:
# Load the parquet data back from the write_back directory ...
parquet_reader = spark.read.format("parquet")
write_back_data = parquet_reader.load(
    f"abfss://{container}@{account_name}.dfs.core.windows.net/write_back",
)

# ... and expose it to SQL under the temporary view name "table".
write_back_data.createOrReplaceTempView("table")

In [None]:
# Load the sparksql_magic IPython extension (presumably the provider of the
# %%sparksql cell magic used by the query cell -- confirm it is installed).
%load_ext sparksql_magic

In [None]:
%%sparksql
-- Count the rows whose Isin is NULL, grouped by Type.
SELECT
    Type AS type,
    COUNT(*) AS count
FROM table
WHERE Isin IS NULL
GROUP BY Type

In [2]:
# Stop the Spark session once the notebook is finished with it.
spark.stop()