<a href="https://colab.research.google.com/github/ramayer/google-colab-examples/blob/main/Apache_Spark_with_Delta_Tables_on_Google_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Apache Spark 3.2.1 with Delta.io 1.2.1 in Google Colab


#### Install Java (if needed)

* Some versions of Spark depend on a particular version of Java that may differ from what Google Colab pre-installs.  For example, spark-3.1.2 wants openjdk-8.   An explicit install is not needed for Spark 3.2.1, which seems to work with whatever Colab's default is.


In [2]:
# Refresh the apt package index quietly; its output is redirected to /tmp/apt.out
# so it does not clutter the notebook.
!apt-get -qq update > /tmp/apt.out
# Install the headless OpenJDK 11 JDK non-interactively (-y) and quietly (-qq).
# The Spark session cell sets JAVA_HOME to /usr/lib/jvm/java-11-openjdk-amd64.
!apt-get install -y -qq openjdk-11-jdk-headless

#### Install Spark

In [3]:
# Download the Spark 3.2.1 / Hadoop 3.2 distribution and unpack it in the
# current directory. -nc skips the download when the tarball already exists.
# The permanent Apache archive is used instead of a release mirror, because
# mirrors generally stop hosting a release once it has been superseded.
!wget -q --show-progress -nc https://archive.apache.org/dist/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!tar xf spark-3.2.1-bin-hadoop3.2.tgz



## Install pyspark and related python libraries



In [4]:
try:
  import pyspark, findspark, delta
except:
  %pip install -q --upgrade pyspark==3.2.1
  %pip install -q findspark
  %pip install -q delta

[K     |████████████████████████████████| 281.4 MB 37 kB/s 
[K     |████████████████████████████████| 198 kB 46.3 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Building wheel for delta (setup.py) ... [?25l[?25hdone


# Start a Spark Session


In [2]:
import findspark
import pyspark
import os

# Locations of the JDK and of the unpacked Spark distribution.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-11-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.2.1-bin-hadoop3.2",
})
findspark.init()

MAX_MEMORY="8g"

# Extra jars, given as Maven coordinates, passed through spark.jars.packages.
maven_coords = [
    "org.apache.spark:spark-avro_2.12:3.2.1",
    "io.delta:delta-core_2.12:1.2.1",
    "org.xerial:sqlite-jdbc:3.36.0.3",
    "graphframes:graphframes:0.8.2-spark3.2-s_2.12",
    "com.acervera.osm4scala:osm4scala-spark3-shaded_2.12:1.0.8",
]

# Session settings: extra packages, the Delta SQL extension and catalog, and
# the memory limits for executor and driver.
session_conf = {
    "spark.jars.packages": ",".join(maven_coords),
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    "spark.executor.memory": MAX_MEMORY,
    "spark.driver.memory": MAX_MEMORY,
}

builder = pyspark.sql.SparkSession.builder.appName("MyApp")
for conf_key, conf_value in session_conf.items():
    builder = builder.config(conf_key, conf_value)

spark = builder.enableHiveSupport().getOrCreate()

# Last expression of the cell: display the session summary.
spark

## Create a delta table

In [3]:
# (`from delta.tables import DeltaTable` is not needed for a plain write.)
import delta

# One-row DataFrame built from a single dict record.
df = spark.createDataFrame([dict(s='hello world', i=1234)])

# Write it out in Delta format, replacing any table already at that path.
delta_writer = df.write.format('delta').mode('overwrite').option("mergeSchema", "true")
delta_writer.save('./delta_hello_world')


## Query the delta table

In [4]:
# Load the Delta table from disk and register it as a temp view for SQL.
hello_world_tbl = spark.read.format("delta").load('./delta_hello_world')
hello_world_tbl.createOrReplaceTempView("delta_hello_world")

df2 = spark.sql("""
  select * from delta_hello_world
""")

# Last expression of the cell: show the query result as a pandas DataFrame.
df2.toPandas()

Unnamed: 0,i,s
0,1234,hello world


# Save DataFrame on Google Drive

In [5]:
# Flip to True to also write the DataFrame `df` to Google Drive as a Delta table.
save_on_google_drive = False
if save_on_google_drive:
  from google.colab import drive

  # Mount Drive only when it is not mounted yet.
  if os.path.ismount('/content/gdrive'):
    print("drive was already mounted")
  else:
    drive.mount('/content/gdrive')

  drive_writer = df.write.format("delta")
  drive_writer.save("/content/gdrive/MyDrive/Colab Datasets/test_delta_table")

## Reddit questions

In [6]:
# https://old.reddit.com/r/apachespark/comments/qrh5qn/join_3k_rows_with_4m_rows_to_create_3bn_row_delta/
import time

small_tbl_size = 3000
large_tbl_size = 4_000_000
join_col_cardinality = 4


def _make_rows(row_count):
  """Return (id, txt, join_col) tuples for ids 0 .. row_count - 1."""
  return [(x, f"row {x}", x % join_col_cardinality) for x in range(row_count)]


small_data = _make_rows(small_tbl_size)
large_data = _make_rows(large_tbl_size)

# Both tables share the same three-column schema.
tbl_schema = 'id int, txt string, join_col int'
small_df = spark.createDataFrame(small_data, tbl_schema)
large_df = spark.createDataFrame(large_data, tbl_schema)

for view_name, view_df in (('small_tbl', small_df), ('large_tbl', large_df)):
  view_df.createOrReplaceTempView(view_name)

# Time the join; show() prints only the first rows of the result.
t0 = time.time()
joined = spark.sql("""
    SELECT * 
      FROM small_tbl AS s
      JOIN large_tbl AS l ON (s.join_col = l.join_col)
  """)
joined.show()
elapsed = time.time() - t0
print(f"{elapsed} seconds")

+---+-----+--------+---+------+--------+
| id|  txt|join_col| id|   txt|join_col|
+---+-----+--------+---+------+--------+
|  1|row 1|       1|  1| row 1|       1|
|  1|row 1|       1|  5| row 5|       1|
|  1|row 1|       1|  9| row 9|       1|
|  1|row 1|       1| 13|row 13|       1|
|  1|row 1|       1| 17|row 17|       1|
|  1|row 1|       1| 21|row 21|       1|
|  1|row 1|       1| 25|row 25|       1|
|  1|row 1|       1| 29|row 29|       1|
|  1|row 1|       1| 33|row 33|       1|
|  1|row 1|       1| 37|row 37|       1|
|  1|row 1|       1| 41|row 41|       1|
|  1|row 1|       1| 45|row 45|       1|
|  1|row 1|       1| 49|row 49|       1|
|  1|row 1|       1| 53|row 53|       1|
|  1|row 1|       1| 57|row 57|       1|
|  1|row 1|       1| 61|row 61|       1|
|  1|row 1|       1| 65|row 65|       1|
|  1|row 1|       1| 69|row 69|       1|
|  1|row 1|       1| 73|row 73|       1|
|  1|row 1|       1| 77|row 77|       1|
+---+-----+--------+---+------+--------+
only showing top

In [7]:
# https://www.reddit.com/r/apachespark/comments/r0fwrx/merge_two_rdds/
d1 = [3,5,8]
d2 = [1,2,3,4]

# createOrReplaceTempView() returns None, so build the DataFrames first and
# register the views in a separate step; otherwise df1/df2 end up bound to None.
df1 = spark.createDataFrame(d1,'int')
df2 = spark.createDataFrame(d2,'int')
df1.createOrReplaceTempView('v1')
df2.createOrReplaceTempView('v2')

# Prepend each value of v2 to the full list of v1 values. The join has no
# condition, so every v2 row is paired with the single collected v1 row.
spark.sql("""
   select flatten(array(array(v2.value),v1s.values))
     from v2 
     join (select collect_list(value) as values from v1) as v1s
""").show()

+------------------------------------+
|flatten(array(array(value), values))|
+------------------------------------+
|                        [1, 3, 5, 8]|
|                        [2, 3, 5, 8]|
|                        [3, 3, 5, 8]|
|                        [4, 3, 5, 8]|
+------------------------------------+



In [8]:
# https://old.reddit.com/r/apachespark/comments/rmiksv/create_new_column_within_a_join_in_pyspark/

d1 = [{'id':1,'animal':'cat'},{'id':2,'animal':'hawk'}]
d2 = [{'id':1,'weapon':'claw'},{'id':2,'weapon':'talon'}]

# createOrReplaceTempView() returns None, so keep the DataFrames in df1/df2 and
# register the views separately instead of binding both names to None.
df1 = spark.createDataFrame(d1)
df2 = spark.createDataFrame(d2)
df1.createOrReplaceTempView('v1')
df2.createOrReplaceTempView('v2')

# Join on id and derive a new string column from one column of each side.
spark.sql("""
      select *, 
             v1.animal || 's have ' ||v2.weapon || 's' as my_new_column
        from v1
        join v2 using (id)
    """).show()

+---+------+------+-----------------+
| id|animal|weapon|    my_new_column|
+---+------+------+-----------------+
|  1|   cat|  claw|  cats have claws|
|  2|  hawk| talon|hawks have talons|
+---+------+------+-----------------+



In [10]:
%%time
# https://old.reddit.com/r/apachespark/comments/rme6zi/working_with_large_dataset/

df = spark.range(2_000_000_000).selectExpr("id%10 as id_mod_ten","*").createOrReplaceTempView("billions_of_rows")

df2 = spark.sql("select count(distinct id_mod_ten) from billions_of_rows")
df2.show()


+--------------------------+
|count(DISTINCT id_mod_ten)|
+--------------------------+
|                        10|
+--------------------------+

CPU times: user 479 ms, sys: 50.8 ms, total: 530 ms
Wall time: 1min 34s
