In [0]:
from pyspark.sql import SparkSession

# Obtain the session used by every later cell; getOrCreate() reuses an
# existing session when there is one instead of building a second.
spark = (
    SparkSession.builder
    .appName("Spark DataFrames")
    .getOrCreate()
)

In [0]:
import urllib.request

# Source: the 2015 flight-summary CSV, fetched over HTTPS.
url = (
    "https://raw.githubusercontent.com/databricks/Spark-The-Definitive-Guide/refs/heads/master/data/flight-data/csv/2015-summary.csv"
)
# Destination on the local filesystem (presumably the DBFS mount - confirm).
path = "/dbfs/tmp/2015-summary.csv"

# Kept as the cell's last expression so the (filename, headers) tuple is displayed.
urllib.request.urlretrieve(url, filename=path)

('/dbfs/tmp/2015-summary.csv', <http.client.HTTPMessage at 0x7668a527bc90>)

In [0]:
# Load the flight summary CSV into a DataFrame.
#   header=true   -> take column names from the first line of the file
#   mode=FAILFAST -> raise on a malformed record rather than tolerate it
# No schema is supplied or inferred here, so Spark's CSV default applies
# (every column is read as a string).
csv_file = spark.read.options(header="true", mode="FAILFAST").csv(
    "/tmp/2015-summary.csv"
)

In [0]:
# Export the DataFrame as tab-separated text, replacing any earlier output.
#
# The target is given in the same form as every other Spark read/write in this
# notebook ("/tmp/..."). The "/dbfs/..." form is only used here with the local
# file APIs (urllib, os); handing it to Spark would write into a separate
# "/dbfs/tmp" directory on the Spark filesystem instead of alongside the
# other outputs.
(
    csv_file.write.format("csv")
    .mode("overwrite")
    .option("sep", "\t")
    .save("/tmp/my-tsv-file.tsv")
)

In [0]:
# List the contents of `dbfs`, resolved relative to the shell's working directory.
# NOTE(review): this is a relative path, not the absolute `/dbfs/...` location
# the download cells write to - confirm which directory was meant.
!ls dbfs

by-day-raw


Reading JSON files

In [0]:
import urllib.request

# Source: the 2010 flight-summary JSON file, fetched over HTTPS.
url = (
    "https://raw.githubusercontent.com/databricks/Spark-The-Definitive-Guide/refs/heads/master/data/flight-data/json/2010-summary.json"
)
# Destination on the local filesystem (presumably the DBFS mount - confirm).
path = "/dbfs/tmp/2010-summary.json"

# Kept as the cell's last expression so the (filename, headers) tuple is displayed.
urllib.request.urlretrieve(url, filename=path)

('/dbfs/tmp/2010-summary.json', <http.client.HTTPMessage at 0x76689191bc90>)

In [0]:
# Read the JSON flight summary; FAILFAST makes the read raise on a malformed
# record.
#
# The DataFrame itself is bound to `json_data`. DataFrame.show() only prints
# and returns None, so chaining it onto the assignment would leave `json_data`
# set to None and unusable in later cells.
json_data = (
    spark.read.format("json")
    .option("mode", "FAILFAST")
    .option("inferSchema", "true")
    .load("/tmp/2010-summary.json")
)

# Preview the first five rows.
json_data.show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



In [0]:
# Save the CSV-sourced DataFrame as JSON, replacing any previous output.
csv_file.write.mode("overwrite").json("/tmp/my-json-file.json")

Reading Parquet files

In [0]:
import urllib.request

# Fetch the Parquet part file from raw.githubusercontent.com.
# A github.com ".../blob/..." address serves the HTML page that wraps the
# file, so downloading it would store a web page under a .parquet name.
url = "https://raw.githubusercontent.com/databricks/Spark-The-Definitive-Guide/master/data/flight-data/parquet/2010-summary.parquet/part-r-00000-1a9822ba-b8fb-4d8e-844a-ea30d0801b9e.gz.parquet"

# Destination: a single file (not a directory) on the local filesystem.
path = "/dbfs/tmp/2010-summary.parquet"

# Kept as the cell's last expression so the (filename, headers) tuple is displayed.
urllib.request.urlretrieve(url, path)

('/dbfs/tmp/2010-summary.parquet', <http.client.HTTPMessage at 0x766862472590>)

In [0]:
import os
import urllib.request

# Raw download address of the single Parquet part file.
raw_url = "https://raw.githubusercontent.com/databricks/Spark-The-Definitive-Guide/master/data/flight-data/parquet/2010-summary.parquet/part-r-00000-1a9822ba-b8fb-4d8e-844a-ea30d0801b9e.gz.parquet"

# The part file is stored inside its own directory so the directory can be
# handed to a reader as a whole.
local_path = "/dbfs/tmp/2010-summary/part-00000.parquet"

# Make sure the parent directory exists; a pre-existing one is not an error.
os.makedirs(os.path.dirname(local_path), exist_ok=True)

urllib.request.urlretrieve(raw_url, filename=local_path)
print(f"Downloaded to: {local_path}")


Downloaded to: /dbfs/tmp/2010-summary/part-00000.parquet


In [0]:
# Read the Parquet data stored under the directory and print a preview
# (show() with no argument uses its default row limit).
df = spark.read.format("parquet").load("dbfs:/tmp/2010-summary")
df.show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|    1|
|       United States|            Ireland|  264|
|       United States|              India|   69|
|               Egypt|      United States|   24|
|   Equatorial Guinea|      United States|    1|
|       United States|          Singapore|   25|
|       United States|            Grenada|   54|
|          Costa Rica|      United States|  477|
|             Senegal|      United States|   29|
|       United States|   Marshall Islands|   44|
|              Guyana|      United States|   17|
|       United States|       Sint Maarten|   53|
|               Malta|      United States|    1|
|             Bolivia|      United States|   46|
|            Anguilla|      United States|   21|
|Turks and Caicos ...|      United States|  136|
|       United States|        Afghanistan|    2|
|Saint Vincent and..

In [0]:
import os

# Sanity check: is the single-file Parquet download present at this path?
# Left as the last expression so the boolean is shown as the cell output.
os.path.exists("/dbfs/tmp/2010-summary.parquet")

True

In [0]:
# Save the CSV-sourced DataFrame as Parquet, replacing any previous output.
csv_file.write.mode("overwrite").parquet("/tmp/my-parquet-file.parquet")

Reading from ORC files

In [0]:
import os
import urllib.request

# Raw download address of the single ORC part file.
raw_url = "https://raw.githubusercontent.com/databricks/Spark-The-Definitive-Guide/master/data/flight-data/orc/2010-summary.orc/part-r-00000-2c4f7d96-e703-4de3-af1b-1441d172c80f.snappy.orc"

# Directory that will hold the part file, and the file's full path inside it.
local_dir = "/dbfs/tmp/2010-summary-orc"
local_path = local_dir + "/part-00000.orc"

# Make sure the directory exists; a pre-existing one is not an error.
os.makedirs(local_dir, exist_ok=True)

# Fetch the file and report where it was stored.
urllib.request.urlretrieve(raw_url, filename=local_path)
print(f"Downloaded ORC to: {local_path}")


Downloaded ORC to: /dbfs/tmp/2010-summary-orc/part-00000.orc


In [0]:
# Read the ORC data stored under the directory and preview the first five rows.
spark.read.orc("dbfs:/tmp/2010-summary-orc/").show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



In [0]:
# Save the CSV-sourced DataFrame as ORC, replacing any previous output.
# The output is named after its actual format: the earlier "my-json-file.orc"
# name was carried over from the JSON cell and mislabelled the ORC data.
(
    csv_file.write.format("orc")
    .mode("overwrite")
    .save("/tmp/my-orc-file.orc")
)