In [1]:
!pip install s3fs

Collecting s3fs
  Downloading s3fs-2023.12.2-py3-none-any.whl.metadata (1.6 kB)
Collecting aiobotocore<3.0.0,>=2.5.4 (from s3fs)
  Downloading aiobotocore-2.9.0-py3-none-any.whl.metadata (20 kB)
Collecting fsspec==2023.12.2 (from s3fs)
  Downloading fsspec-2023.12.2-py3-none-any.whl.metadata (6.8 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from s3fs)
  Downloading aiohttp-3.9.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.4 kB)
Collecting botocore<1.33.14,>=1.33.2 (from aiobotocore<3.0.0,>=2.5.4->s3fs)
  Downloading botocore-1.33.13-py3-none-any.whl.metadata (6.1 kB)
Collecting wrapt<2.0.0,>=1.10.10 (from aiobotocore<3.0.0,>=2.5.4->s3fs)
  Downloading wrapt-1.16.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting aioitertools<1.0.0,>=0.5.1 (from aiobotocore<3.0.0,>=2.5.4->s3fs)
  Downloading aioitertools-0.11.0-py3-none-any.whl (23 kB)
Collecting multidict<7.0,>=4.5 (from aiohttp!=4.0.

In [1]:
import os
import s3fs
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext
import pyspark.sql.functions as F
from pyspark.sql import Row
import pyspark.sql.types as T
import datetime
import time

In [2]:
# Connection settings for the MinIO object store used by the S3A connector.
# setdefault() only fills in a value when the variable is not already set, so
# credentials exported by the container / shell take precedence over these
# fallbacks. The literals are the throw-away defaults of the local
# docker-compose demo stack; never put real credentials here.
os.environ.setdefault("MINIO_KEY", "minio")
os.environ.setdefault("MINIO_SECRET", "minio123")
os.environ.setdefault("MINIO_ENDPOINT", "http://minio1:9000")

In [3]:
# Build (or reuse) the Spark session for this notebook.
#   * spark.jars.packages pulls the S3A connector, the Avro data source and
#     Delta Lake (Scala 2.12 builds) from Maven when the session starts.
#   * The fs.s3a.* options point the S3A filesystem at the MinIO endpoint using
#     the credentials taken from the environment; path-style access is enabled.
#   * The last two options register Delta's SQL extension and catalog.
# NOTE(review): hadoop-aws 3.3.4 appears to be built against
# aws-java-sdk-bundle 1.12.x - confirm that 1.11.1026 is intentional.
session_conf = {
    "spark.jars.packages": "org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.11.1026,org.apache.spark:spark-avro_2.12:3.5.0,io.delta:delta-spark_2.12:3.0.0",
    "spark.hadoop.fs.s3a.endpoint": os.environ["MINIO_ENDPOINT"],
    "spark.hadoop.fs.s3a.access.key": os.environ["MINIO_KEY"],
    "spark.hadoop.fs.s3a.secret.key": os.environ["MINIO_SECRET"],
    "spark.hadoop.fs.s3a.path.style.access": "true",
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

session_builder = SparkSession.builder.appName("country_data_analysis")
for conf_key, conf_value in session_conf.items():
    session_builder = session_builder.config(conf_key, conf_value)

spark = session_builder.enableHiveSupport().getOrCreate()

In [5]:
# Ask the driver JVM (through the py4j gateway) which Scala version Spark is
# running on; the *_2.12 artifacts in spark.jars.packages must match it.
scala_properties = spark.sparkContext._jvm.scala.util.Properties
scala_version = scala_properties.versionString()
print(scala_version)

version 2.12.18


In [4]:
# Three-row demo frame (name, age, city) used to smoke-test the writers.
people = [
    ("Alice", 25, "New York"),
    ("Bob", 30, "San Francisco"),
    ("Charlie", 35, "Los Angeles"),
]
data = [Row(name=name, age=age, city=city) for name, age, city in people]

df = spark.createDataFrame(data)

df.show()

+-------+---+-------------+
|   name|age|         city|
+-------+---+-------------+
|  Alice| 25|     New York|
|    Bob| 30|San Francisco|
|Charlie| 35|  Los Angeles|
+-------+---+-------------+



In [5]:
# Smoke test: write the current `df` to MinIO as Avro. Overwrite mode keeps
# the cell re-runnable; with Spark's default save mode the write raises once
# the target path already exists.
df.write.format("avro").mode("overwrite").save("s3a://mybucket/avro_test.avro")

In [6]:
# Smoke test: write the current `df` to MinIO as ORC. Overwrite mode keeps
# the cell re-runnable; with Spark's default save mode the write raises once
# the target path already exists.
df.write.mode("overwrite").orc("s3a://mybucket/orc_test.orc")

In [5]:
# Smoke test: write the current `df` to MinIO as a Delta table. Overwrite mode
# keeps the cell re-runnable; with Spark's default save mode the write raises
# once the target path already exists.
df.write.format("delta").mode("overwrite").save("s3a://mybucket/delta_test.delta")

In [9]:
# import pyspark.sql.types as T

In [4]:
# Number of rows in the synthetic benchmark data set
num_rows = 10_000_000

# Base seed for the random columns. Without an explicit seed Spark picks a new
# one every time the expressions are built, so the data (and the file sizes and
# query results derived from it) would change after each kernel restart.
SEED = 42

# Create DataFrame with a single `id` column holding 0 .. num_rows - 1
df = spark.range(0, num_rows)

# Add nine more columns (range starts at 1 because `id` is already there):
# even index -> integer column, odd index -> string column.
for i in range(1, 10):
    if i % 2 == 0:
        # Integer column: standard-normal sample scaled by 100, cast to int.
        # Each column gets its own seed so the columns are not identical.
        df = df.withColumn(f"int_col_{i}", (F.randn(seed=SEED + i) * 100).cast(T.IntegerType()))
    else:
        # String column: uniform sample in [0, num_rows), cast to int and
        # rendered as its decimal string.
        df = df.withColumn(f"str_col_{i}", (F.rand(seed=SEED + i) * num_rows).cast(T.IntegerType()).cast("string"))

# Sanity check: the row count should equal num_rows
df.count()

10000000

In [5]:
# Preview the first ten generated rows without truncating the cell values
df.show(n=10, truncate=False)

+---+---------+---------+---------+---------+---------+---------+---------+---------+---------+
|id |str_col_1|int_col_2|str_col_3|int_col_4|str_col_5|int_col_6|str_col_7|int_col_8|str_col_9|
+---+---------+---------+---------+---------+---------+---------+---------+---------+---------+
|0  |4830797  |93       |4007619  |-51      |6700735  |195      |9445928  |8        |129749   |
|1  |2322908  |-107     |1688929  |-9       |3442013  |-166     |9299275  |71       |616359   |
|2  |9361958  |77       |1420421  |-153     |6481478  |12       |5484855  |214      |8059242  |
|3  |7139320  |140      |4121314  |32       |2932949  |168      |4717922  |-88      |3108094  |
|4  |1137218  |63       |8184623  |108      |6391560  |-33      |3639632  |71       |7167665  |
|5  |7218166  |-45      |164020   |-50      |2179791  |-41      |2611901  |-71      |1909397  |
|6  |1421819  |-119     |1065251  |75       |931146   |-37      |1188891  |-99      |1614681  |
|7  |8235943  |217      |1356612  |81   

In [6]:
# Print the column names, types and nullability of the generated frame; the
# explicit CSV schema used later in the notebook lists the same columns.
df.printSchema()

root
 |-- id: long (nullable = false)
 |-- str_col_1: string (nullable = true)
 |-- int_col_2: integer (nullable = true)
 |-- str_col_3: string (nullable = true)
 |-- int_col_4: integer (nullable = true)
 |-- str_col_5: string (nullable = true)
 |-- int_col_6: integer (nullable = true)
 |-- str_col_7: string (nullable = true)
 |-- int_col_8: integer (nullable = true)
 |-- str_col_9: string (nullable = true)



In [7]:
# Stage four CSV copies of the generated data, one per target format, so each
# format benchmark reads its own source files.
#   * header=True writes a header line into every part file. The notebook reads
#     these files back with the "header" option enabled; without a header line
#     Spark would consume the first data row of each part file as the header
#     and silently drop it.
#   * mode("overwrite") keeps the cell re-runnable; the default save mode
#     raises once the target path already exists.
for target_format in ("parquet", "avro", "orc", "delta"):
    df.write.mode("overwrite").csv(f"s3a://mybucket/ten_million_{target_format}.csv", header=True)

In [8]:
# Explicit schema for the staged CSV files: a non-nullable long `id` followed
# by nine nullable columns that alternate string (odd index) / integer (even).
schema = T.StructType(
    [T.StructField("id", T.LongType(), nullable=False)]
    + [
        T.StructField(f"str_col_{idx}", T.StringType(), nullable=True)
        if idx % 2
        else T.StructField(f"int_col_{idx}", T.IntegerType(), nullable=True)
        for idx in range(1, 10)
    ]
)


def _load_staged_csv(target_format):
    """Return a DataFrame over the staged CSV copy for ``target_format``.

    The reader applies the explicit ``schema`` and enables the ``header``
    option, which makes Spark treat the first line of every file as a header
    row - the staged files are therefore expected to start with one.
    """
    reader = spark.read.format("csv").option("header", True).schema(schema)
    return reader.load(f"s3a://mybucket/ten_million_{target_format}.csv")


# One source frame per target format (nothing is read until an action runs).
df_csv_parquet = _load_staged_csv("parquet")
df_csv_avro = _load_staged_csv("avro")
df_csv_orc = _load_staged_csv("orc")
df_csv_delta = _load_staged_csv("delta")

In [21]:
# Show the schema Spark applies to the staged CSV data. The original cell
# referenced `df_csv`, which is never defined in this notebook (NameError on a
# fresh kernel); all four CSV frames share one schema, so any of them works.
df_csv_parquet.printSchema()

root
 |-- id: long (nullable = true)
 |-- str_col_1: string (nullable = true)
 |-- int_col_2: integer (nullable = true)
 |-- str_col_3: string (nullable = true)
 |-- int_col_4: integer (nullable = true)
 |-- str_col_5: string (nullable = true)
 |-- int_col_6: integer (nullable = true)
 |-- str_col_7: string (nullable = true)
 |-- int_col_8: integer (nullable = true)
 |-- str_col_9: string (nullable = true)



In [9]:
# Benchmark: CSV -> Parquet.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump when the system clock is adjusted.
# The source frame is lazy, so the interval also covers reading the CSV input.
# The default save mode is kept on purpose: the target must not exist yet, so
# the timing never includes deleting old output.
start_time = time.perf_counter()
df_csv_parquet.write.parquet("s3a://mybucket/ten_million_parquet2.parquet")
end_time = time.perf_counter()
print(f"Time taken to write as Parquet: {end_time - start_time} seconds")

Time taken to write as Parquet: 54.626307249069214 seconds


In [10]:
# Benchmark: CSV -> Avro.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump when the system clock is adjusted.
# The source frame is lazy, so the interval also covers reading the CSV input.
# The default save mode is kept on purpose: the target must not exist yet, so
# the timing never includes deleting old output.
start_time = time.perf_counter()
df_csv_avro.write.format("avro").save("s3a://mybucket/ten_million_avro2.avro")
end_time = time.perf_counter()
print(f"Time taken to write as Avro: {end_time - start_time} seconds")

Time taken to write as Avro: 46.201910972595215 seconds


In [11]:
# Benchmark: CSV -> ORC.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump when the system clock is adjusted.
# The source frame is lazy, so the interval also covers reading the CSV input.
# The default save mode is kept on purpose: the target must not exist yet, so
# the timing never includes deleting old output.
start_time = time.perf_counter()
df_csv_orc.write.orc("s3a://mybucket/ten_million_orc2.orc")
end_time = time.perf_counter()
print(f"Time taken to write as ORC: {end_time - start_time} seconds")

Time taken to write as ORC: 62.553457736968994 seconds


In [12]:
# Benchmark: CSV -> Delta Lake.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump when the system clock is adjusted.
# The source frame is lazy, so the interval also covers reading the CSV input.
# The default save mode is kept on purpose: the target must not exist yet, so
# the timing never includes handling old output.
start_time = time.perf_counter()
df_csv_delta.write.format("delta").save("s3a://mybucket/ten_million_delta2.delta")
end_time = time.perf_counter()
print(f"Time taken to write as Delta Lake: {end_time - start_time} seconds")

Time taken to write as Delta Lake: 61.09932518005371 seconds


In [None]:
# Re-open the Parquet output of the write benchmark and confirm its schema
parquet_path = "s3a://mybucket/ten_million_parquet2.parquet"
df_parquet = spark.read.format("parquet").load(parquet_path)
df_parquet.printSchema()

In [None]:
# Re-open the Avro output of the write benchmark and confirm its schema
avro_path = "s3a://mybucket/ten_million_avro2.avro"
df_avro = spark.read.load(avro_path, format="avro")
df_avro.printSchema()

In [None]:
# Re-open the ORC output of the write benchmark and confirm its schema
orc_path = "s3a://mybucket/ten_million_orc2.orc"
df_orc = spark.read.format("orc").load(orc_path)
df_orc.printSchema()

In [None]:
# Re-open the Delta table produced by the write benchmark and confirm its schema
delta_path = "s3a://mybucket/ten_million_delta2.delta"
df_delta = spark.read.load(delta_path, format="delta")
df_delta.printSchema()

In [None]:
# Benchmark query on the Parquet copy: count rows per (str_col_5, str_col_7)
# pair, sort ascending by that count and display the first group.
# perf_counter() is a monotonic clock meant for measuring intervals, unlike
# time.time(), which follows the (adjustable) system clock.
start_time = time.perf_counter()
(
    df_parquet
    .select("str_col_5", "str_col_7", "int_col_2")
    .groupBy("str_col_5", "str_col_7")
    .count()
    .orderBy("count")
    .limit(1)
    .show(truncate=False)
)
end_time = time.perf_counter()
print(f"Time taken for query: {end_time - start_time} seconds")

In [None]:
# Benchmark query on the Avro copy: count rows per (str_col_5, str_col_7)
# pair, sort ascending by that count and display the first group.
# perf_counter() is a monotonic clock meant for measuring intervals, unlike
# time.time(), which follows the (adjustable) system clock.
start_time = time.perf_counter()
(
    df_avro
    .select("str_col_5", "str_col_7", "int_col_2")
    .groupBy("str_col_5", "str_col_7")
    .count()
    .orderBy("count")
    .limit(1)
    .show(truncate=False)
)
end_time = time.perf_counter()
print(f"Time taken for query: {end_time - start_time} seconds")

In [None]:
# Benchmark query on the ORC copy: count rows per (str_col_5, str_col_7)
# pair, sort ascending by that count and display the first group.
# perf_counter() is a monotonic clock meant for measuring intervals, unlike
# time.time(), which follows the (adjustable) system clock.
start_time = time.perf_counter()
(
    df_orc
    .select("str_col_5", "str_col_7", "int_col_2")
    .groupBy("str_col_5", "str_col_7")
    .count()
    .orderBy("count")
    .limit(1)
    .show(truncate=False)
)
end_time = time.perf_counter()
print(f"Time taken for query: {end_time - start_time} seconds")

In [None]:
# Benchmark query on the Delta copy: count rows per (str_col_5, str_col_7)
# pair, sort ascending by that count and display the first group.
# perf_counter() is a monotonic clock meant for measuring intervals, unlike
# time.time(), which follows the (adjustable) system clock.
start_time = time.perf_counter()
(
    df_delta
    .select("str_col_5", "str_col_7", "int_col_2")
    .groupBy("str_col_5", "str_col_7")
    .count()
    .orderBy("count")
    .limit(1)
    .show(truncate=False)
)
end_time = time.perf_counter()
print(f"Time taken for query: {end_time - start_time} seconds")

In [18]:
"""
CSV: 74.4 MiB * 8 = 595.2 MiB
ORC: 44.7 MiB * 8 = 357.6 MiB
Parquet: 53.6 MiB * 8 = 428.8 MiB
Avro: 60.2 MiB * 8 = 481.6 MiB
Delta Lake: 53.6 MiB * 8 = 428.8 MiB
"""

In [None]:
"""
version: '3.8'

services:
  minio:
    image: minio/minio
    container_name: minio1
    ports:
      - "9000:9000"
      - "9001:9001"
    volumes:
      - /mnt/data:/data
    environment:
      MINIO_ROOT_USER: minio
      MINIO_ROOT_PASSWORD: minio123
    command: server /data --console-address ":9001"

  jupyter:
    image: quay.io/jupyter/pyspark-notebook
    ports:
      - "8888:8888"
      - "4040:4040"
      - "4041:4041"
      - "4042:4042"
"""