<a href="https://colab.research.google.com/github/ramayer/google-colab-examples/blob/main/Apache_Spark_with_Delta_Tables_on_Google_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Apache Spark with Delta Tables in Google Colab


#### Install Java

In [1]:
!apt-get install -y -qq openjdk-8-jdk-headless 

#### Install Spark

In [2]:
!(wget -q --show-progress -nc https://mirrors.ocf.berkeley.edu/apache/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz)
!tar xf spark-3.1.2-bin-hadoop3.2.tgz

## Install PySpark and related Python libraries



In [3]:
try:
  import pyspark, findspark, delta
except:
  %pip install -q --upgrade pyspark
  %pip install -q findspark
  %pip install -q delta

# Start a Spark Session


In [4]:
import findspark
import pyspark
import os

# Tell findspark where the JDK and the unpacked Spark distribution live.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"

# Memory setting passed to both the driver and executor options below.
MAX_MEMORY = "8g"

# findspark.init() must run before importing from pyspark so that the copy
# under SPARK_HOME is put on sys.path.
findspark.init()
from pyspark.sql import SparkSession

# Build (or reuse) a session with the Delta Lake extension and catalog enabled.
# The delta-core artifact must match the Spark line in use: Delta Lake 1.0.x is
# the release built for Spark 3.1.x, whereas 0.8.0 targets Spark 3.0.x.
spark = (
    SparkSession.builder.appName("MyApp")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .enableHiveSupport()
    .getOrCreate()
)

# Display the session summary as the cell output.
spark

## Create a delta table

In [5]:
import delta

# Build a one-row demo DataFrame with an explicit DDL schema. Inferring the
# schema from a list of dicts is deprecated in Spark 3.1 (it emits a warning),
# and the DDL-string form matches how the later join example builds its tables.
# The resulting columns are the same as before: hi (string), w (string).
df = spark.createDataFrame([('hello', 'world')], 'hi string, w string')

# Write it as a Delta table; 'overwrite' keeps the cell re-runnable.
(df.write.format('delta')
         .mode('overwrite')
         .option("mergeSchema", "true")
         .save('./delta_hello_world')
)


## Query the delta table

In [6]:
# Expose the Delta table written to ./delta_hello_world as a temporary view
# so that it can be queried by name from SQL.
(
    spark.read
    .format("delta")
    .load('./delta_hello_world')
    .createOrReplaceTempView("delta_hello_world")
)

# Read every row back through Spark SQL.
df2 = spark.sql("""
  select * from delta_hello_world
""")

# Convert to pandas for the notebook's rich table display.
df2.toPandas()

Unnamed: 0,hi,w
0,hello,world


# Save the DataFrame to Google Drive

In [9]:
# Optional step: persist the demo DataFrame as a Delta table on Google Drive.
# Set the flag to True to enable it.
save_on_google_drive = False
if save_on_google_drive:
  from google.colab import drive
  # Mount Drive only once; re-running the cell must not try to mount it again.
  if not os.path.ismount('/content/gdrive'):
    drive.mount('/content/gdrive')
  else:
    print("drive was already mounted")
  # Without an explicit mode the writer refuses to write to an existing path,
  # so a second run of this cell would fail. 'overwrite' matches the local
  # Delta write earlier in the notebook and keeps the cell re-runnable.
  (df.write.format("delta")
     .mode("overwrite")
     .save("/content/gdrive/MyDrive/Colab Datasets/test_delta_table"))

## Reddit questions

In [12]:
# Reproduces the scenario from:
# https://old.reddit.com/r/apachespark/comments/qrh5qn/join_3k_rows_with_4m_rows_to_create_3bn_row_delta/
import time

# Named switch (same pattern as save_on_google_drive) for skipping this cell's work.
run_join_example = True
if run_join_example:
  # Row counts of the two tables and the number of distinct join-key values.
  small_tbl_size = 3000
  large_tbl_size = 4_000_000
  join_col_cardinality = 4

  # Rows are (id, "row <id>", id modulo the join-key cardinality).
  small_data = [(i, f"row {i}", i % join_col_cardinality) for i in range(small_tbl_size)]
  large_data = [(i, f"row {i}", i % join_col_cardinality) for i in range(large_tbl_size)]

  small_df = spark.createDataFrame(small_data, 'id int, txt string, join_col int')
  large_df = spark.createDataFrame(large_data, 'id int, txt string, join_col int')

  # Register both DataFrames so the join can be written in SQL.
  small_df.createOrReplaceTempView('small_tbl')
  large_df.createOrReplaceTempView('large_tbl')

  # Time only the join query and the display of its first rows.
  t0 = time.time()
  joined = spark.sql("""
    SELECT * 
      FROM small_tbl AS s
      JOIN large_tbl AS l ON (s.join_col = l.join_col)
  """)
  joined.show()
  elapsed = time.time() - t0
  print(f"{elapsed} seconds")

+---+-----+--------+---+------+--------+
| id|  txt|join_col| id|   txt|join_col|
+---+-----+--------+---+------+--------+
|  1|row 1|       1|  1| row 1|       1|
|  1|row 1|       1|  5| row 5|       1|
|  1|row 1|       1|  9| row 9|       1|
|  1|row 1|       1| 13|row 13|       1|
|  1|row 1|       1| 17|row 17|       1|
|  1|row 1|       1| 21|row 21|       1|
|  1|row 1|       1| 25|row 25|       1|
|  1|row 1|       1| 29|row 29|       1|
|  1|row 1|       1| 33|row 33|       1|
|  1|row 1|       1| 37|row 37|       1|
|  1|row 1|       1| 41|row 41|       1|
|  1|row 1|       1| 45|row 45|       1|
|  1|row 1|       1| 49|row 49|       1|
|  1|row 1|       1| 53|row 53|       1|
|  1|row 1|       1| 57|row 57|       1|
|  1|row 1|       1| 61|row 61|       1|
|  1|row 1|       1| 65|row 65|       1|
|  1|row 1|       1| 69|row 69|       1|
|  1|row 1|       1| 73|row 73|       1|
|  1|row 1|       1| 77|row 77|       1|
+---+-----+--------+---+------+--------+
only showing top