In [1]:
import os
import sys
import glob

# Point this kernel at the Spark installation and put the Python libraries
# bundled with it on sys.path, so that `pyspark` can be imported from there.
os.environ['SPARK_HOME'] = '/usr/lib/spark'
os.environ['PYLIB'] = os.environ['SPARK_HOME'] + '/python/lib'

# The py4j archive name embeds a version number, so look the archive up rather
# than hard-coding it. When nothing matches (e.g. a different layout), fall back
# to the previously hard-coded name so the resulting sys.path is unchanged.
_py4j_archives = sorted(glob.glob(os.environ['PYLIB'] + '/py4j-*-src.zip'))
if _py4j_archives:
    # Normally only one archive is present; take the last one if there are several.
    _py4j_zip = _py4j_archives[-1]
else:
    _py4j_zip = os.environ['PYLIB'] + '/py4j-0.10.7-src.zip'

# Order matters: py4j goes first, pyspark.zip right after it.
sys.path.insert(0, _py4j_zip)
sys.path.insert(1, os.environ['PYLIB'] + '/pyspark.zip')

In [2]:
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np

In [3]:
# Build (or reuse) a Hive-enabled SparkSession for this notebook.
# The warehouse location is configured through `spark.sql.warehouse.dir`;
# `spark.warehouse.dir` is not a Spark property and would be silently ignored.
spark = (
    SparkSession.builder
    .appName('TestHive')
    .config('spark.sql.warehouse.dir', '/apps/hive/warehouse')
    .enableHiveSupport()
    .getOrCreate()
)

In [4]:
# Sanity check: once the session has been created, `spark` is usable straight
# away; display the Spark version it reports.
spark.version

'2.3.0'

In [5]:
# Keep a handle on the underlying SparkContext; it is used to parallelize
# local Python collections into RDDs.
sc = spark.sparkContext

In [6]:
# List the tables visible through the Hive-enabled session.
tables_df = spark.sql('show tables')
tables_df.show()

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| default|          categories|      false|
| default|            cmfvwtbl|      false|
| default|           customers|      false|
| default|            demtable|      false|
| default|         departments|      false|
| default|                 hkt|      false|
| default|intermediate_acce...|      false|
| default|      my_first_table|      false|
| default|               nsecm|      false|
| default|    ntest_kudu_table|      false|
| default|          oitem_kudu|      false|
| default|         order_items|      false|
| default|              orders|      false|
| default|        product_kudu|      false|
| default|            products|      false|
| default|          spark2ttbl|      false|
| default|spark_kudu_mappin...|      false|
| default|                spkt|      false|
| default|    spkt_mapping_tbl|      false|
| default|             stmdtbl| 

In [37]:
# Two small per-player, per-year sample frames used by every join example.
# Presumably testDF holds Test-match figures and odiDF one-day figures.
# Note that the two frames order "fname" and "year" differently.
test_columns = ["id",  "fname", "year", "matches", "innings", "runs", "centuries"]
test_rows = [
    (1, "sachin", 2000, 6, 10, 575, 2),
    (1, "sachin", 2001, 10, 18, 1003, 3),
    (1, "sachin", 2002, 16, 26, 1392, 4),
    (1, "sachin", 2010, 14, 23, 1562, 7),
    (2, "virat", 2011, 5, 9, 202, 0),
    (2, "virat", 2012, 9, 16, 689, 3),
    (2, "virat", 2016, 12, 18, 1215, 4),
    (2, "virat", 2017, 10, 16, 1059, 5),
]
testDF = sc.parallelize(test_rows).toDF(test_columns)

odi_columns = ["id", "year", "fname", "matches", "innings", "runs", "centuries"]
odi_rows = [
    (1, 2000, "sachin", 34, 34, 1328, 3),
    (1, 2001, "sachin", 17, 16, 904, 4),
    # NOTE(review): runs=1 / centuries=204 look transposed in this row -- confirm.
    # Left as-is because the recorded outputs were produced from these values.
    (1, 2010, "sachin", 2, 2, 1, 204),
    (1, 2011, "sachin", 11, 11, 513, 2),
    (2, 2008, "virat", 5, 5, 159, 0),
    (2, 2009, "virat", 10, 8,  325, 1),
    (2, 2016, "virat", 10, 10, 739, 3),
    (2, 2017, "virat", 26, 26, 1460, 6),
]
odiDF = sc.parallelize(odi_rows).toDF(odi_columns)

In [38]:
# Cross (Cartesian) join.
# This setting is needed for a join without a condition to be accepted;
# the other condition-less joins in this notebook rely on it as well.
spark.conf.set("spark.sql.crossJoin.enabled", True)
cartesian_df = testDF.join(odiDF)
cartesian_df.show()

+---+------+----+-------+-------+----+---------+---+----+------+-------+-------+----+---------+
| id| fname|year|matches|innings|runs|centuries| id|year| fname|matches|innings|runs|centuries|
+---+------+----+-------+-------+----+---------+---+----+------+-------+-------+----+---------+
|  1|sachin|2000|      6|     10| 575|        2|  1|2000|sachin|     34|     34|1328|        3|
|  1|sachin|2000|      6|     10| 575|        2|  1|2001|sachin|     17|     16| 904|        4|
|  1|sachin|2000|      6|     10| 575|        2|  1|2010|sachin|      2|      2|   1|      204|
|  1|sachin|2000|      6|     10| 575|        2|  1|2011|sachin|     11|     11| 513|        2|
|  1|sachin|2001|     10|     18|1003|        3|  1|2000|sachin|     34|     34|1328|        3|
|  1|sachin|2001|     10|     18|1003|        3|  1|2001|sachin|     17|     16| 904|        4|
|  1|sachin|2001|     10|     18|1003|        3|  1|2010|sachin|      2|      2|   1|      204|
|  1|sachin|2001|     10|     18|1003|  

In [43]:
# Renaming every column of an existing DataFrame: DataFrame.toDF takes the new
# names as separate positional arguments, one per column (unpacked here from a
# tuple). By contrast, when calling toDF on a parallelized collection the
# column names are passed as a single list.
short_names = ('fc', 'sc', 'tc', 'foc', 'fic', 'sic', 'sec')
testDF.toDF(*short_names).show()

+---+------+----+---+---+----+---+
| fc|    sc|  tc|foc|fic| sic|sec|
+---+------+----+---+---+----+---+
|  1|sachin|2000|  6| 10| 575|  2|
|  1|sachin|2001| 10| 18|1003|  3|
|  1|sachin|2002| 16| 26|1392|  4|
|  1|sachin|2010| 14| 23|1562|  7|
|  2| virat|2011|  5|  9| 202|  0|
|  2| virat|2012|  9| 16| 689|  3|
|  2| virat|2016| 12| 18|1215|  4|
|  2| virat|2017| 10| 16|1059|  5|
+---+------+----+---+---+----+---+



In [39]:
# Cross join followed by a positional rename, so that every column of the
# combined frame gets a unique name.
# NOTE: this wildcard import provides col/when/isnull/sum/sumDistinct to the
# cells that use them, and it shadows Python built-ins such as `sum`.
from pyspark.sql.functions import *
# Alternative that keeps only two columns:
# testDF.join(odiDF).select(testDF['id'], testDF['fname']).toDF("a", "b").show()
unique_names = [
    "id", "fname", "year", "tmatches", "tinnings", "truns", "tcenturies",
    "sid", "syear", "sfname", "omatches", "oinnings", "oruns", "ocenturies",
]
testDF.join(odiDF).toDF(*unique_names).show()

+---+------+----+--------+--------+-----+----------+---+-----+------+--------+--------+-----+----------+
| id| fname|year|tmatches|tinnings|truns|tcenturies|sid|syear|sfname|omatches|oinnings|oruns|ocenturies|
+---+------+----+--------+--------+-----+----------+---+-----+------+--------+--------+-----+----------+
|  1|sachin|2000|       6|      10|  575|         2|  1| 2000|sachin|      34|      34| 1328|         3|
|  1|sachin|2000|       6|      10|  575|         2|  1| 2001|sachin|      17|      16|  904|         4|
|  1|sachin|2000|       6|      10|  575|         2|  1| 2010|sachin|       2|       2|    1|       204|
|  1|sachin|2000|       6|      10|  575|         2|  1| 2011|sachin|      11|      11|  513|         2|
|  1|sachin|2001|      10|      18| 1003|         3|  1| 2000|sachin|      34|      34| 1328|         3|
|  1|sachin|2001|      10|      18| 1003|         3|  1| 2001|sachin|      17|      16|  904|         4|
|  1|sachin|2001|      10|      18| 1003|         3|  1

In [45]:
# Inner join without naming the join type, keyed on the shared "id" column.
# Joining by column name keeps a single "id" column in the result.
joined_on_id = testDF.join(odiDF, on="id")
joined_on_id.show()

+---+------+----+-------+-------+----+---------+----+------+-------+-------+----+---------+
| id| fname|year|matches|innings|runs|centuries|year| fname|matches|innings|runs|centuries|
+---+------+----+-------+-------+----+---------+----+------+-------+-------+----+---------+
|  1|sachin|2000|      6|     10| 575|        2|2000|sachin|     34|     34|1328|        3|
|  1|sachin|2000|      6|     10| 575|        2|2001|sachin|     17|     16| 904|        4|
|  1|sachin|2000|      6|     10| 575|        2|2010|sachin|      2|      2|   1|      204|
|  1|sachin|2000|      6|     10| 575|        2|2011|sachin|     11|     11| 513|        2|
|  1|sachin|2001|     10|     18|1003|        3|2000|sachin|     34|     34|1328|        3|
|  1|sachin|2001|     10|     18|1003|        3|2001|sachin|     17|     16| 904|        4|
|  1|sachin|2001|     10|     18|1003|        3|2010|sachin|      2|      2|   1|      204|
|  1|sachin|2001|     10|     18|1003|        3|2011|sachin|     11|     11| 513

In [47]:
# Inner join without naming the join type, keyed on two shared columns.
join_keys = ["id", "year"]
testDF.join(odiDF, on=join_keys).show()

+---+----+------+-------+-------+----+---------+------+-------+-------+----+---------+
| id|year| fname|matches|innings|runs|centuries| fname|matches|innings|runs|centuries|
+---+----+------+-------+-------+----+---------+------+-------+-------+----+---------+
|  1|2010|sachin|     14|     23|1562|        7|sachin|      2|      2|   1|      204|
|  2|2016| virat|     12|     18|1215|        4| virat|     10|     10| 739|        3|
|  2|2017| virat|     10|     16|1059|        5| virat|     26|     26|1460|        6|
|  1|2001|sachin|     10|     18|1003|        3|sachin|     17|     16| 904|        4|
|  1|2000|sachin|      6|     10| 575|        2|sachin|     34|     34|1328|        3|
+---+----+------+-------+-------+----+---------+------+-------+-------+----+---------+



In [50]:
# Inner join with the join type spelled out and the condition given as a
# column expression; both frames' "id" columns are kept in the result.
same_player = testDF["id"] == odiDF["id"]
testDF.join(odiDF, on=same_player, how="inner").show()

+---+------+----+-------+-------+----+---------+---+----+------+-------+-------+----+---------+
| id| fname|year|matches|innings|runs|centuries| id|year| fname|matches|innings|runs|centuries|
+---+------+----+-------+-------+----+---------+---+----+------+-------+-------+----+---------+
|  1|sachin|2000|      6|     10| 575|        2|  1|2000|sachin|     34|     34|1328|        3|
|  1|sachin|2000|      6|     10| 575|        2|  1|2001|sachin|     17|     16| 904|        4|
|  1|sachin|2000|      6|     10| 575|        2|  1|2010|sachin|      2|      2|   1|      204|
|  1|sachin|2000|      6|     10| 575|        2|  1|2011|sachin|     11|     11| 513|        2|
|  1|sachin|2001|     10|     18|1003|        3|  1|2000|sachin|     34|     34|1328|        3|
|  1|sachin|2001|     10|     18|1003|        3|  1|2001|sachin|     17|     16| 904|        4|
|  1|sachin|2001|     10|     18|1003|        3|  1|2010|sachin|      2|      2|   1|      204|
|  1|sachin|2001|     10|     18|1003|  

In [58]:
# Explicit inner join on two keys, written two ways.
# 1) List of column names: the key columns appear once in the result.
join_keys = ["id", "year"]
testDF.join(odiDF, join_keys, "inner").show()

# 2) Boolean column expression: the id/year columns of both frames are kept.
id_matches = testDF["id"] == odiDF["id"]
year_matches = testDF["year"] == odiDF["year"]
testDF.join(odiDF, id_matches & year_matches, "inner").show()

+---+----+------+-------+-------+----+---------+------+-------+-------+----+---------+
| id|year| fname|matches|innings|runs|centuries| fname|matches|innings|runs|centuries|
+---+----+------+-------+-------+----+---------+------+-------+-------+----+---------+
|  1|2010|sachin|     14|     23|1562|        7|sachin|      2|      2|   1|      204|
|  2|2016| virat|     12|     18|1215|        4| virat|     10|     10| 739|        3|
|  2|2017| virat|     10|     16|1059|        5| virat|     26|     26|1460|        6|
|  1|2001|sachin|     10|     18|1003|        3|sachin|     17|     16| 904|        4|
|  1|2000|sachin|      6|     10| 575|        2|sachin|     34|     34|1328|        3|
+---+----+------+-------+-------+----+---------+------+-------+-------+----+---------+

+---+------+----+-------+-------+----+---------+---+----+------+-------+-------+----+---------+
| id| fname|year|matches|innings|runs|centuries| id|year| fname|matches|innings|runs|centuries|
+---+------+----+-------

In [84]:
# Outer joins: run the same "id" condition through each outer join type,
# in the order left, right, full.
same_player = testDF["id"] == odiDF["id"]
for join_type in ("left_outer", "right_outer", "full_outer"):
    testDF.join(odiDF, same_player, join_type).show()

+---+------+----+-------+-------+----+---------+---+----+------+-------+-------+----+---------+
| id| fname|year|matches|innings|runs|centuries| id|year| fname|matches|innings|runs|centuries|
+---+------+----+-------+-------+----+---------+---+----+------+-------+-------+----+---------+
|  1|sachin|2000|      6|     10| 575|        2|  1|2000|sachin|     34|     34|1328|        3|
|  1|sachin|2000|      6|     10| 575|        2|  1|2001|sachin|     17|     16| 904|        4|
|  1|sachin|2000|      6|     10| 575|        2|  1|2010|sachin|      2|      2|   1|      204|
|  1|sachin|2000|      6|     10| 575|        2|  1|2011|sachin|     11|     11| 513|        2|
|  1|sachin|2001|     10|     18|1003|        3|  1|2000|sachin|     34|     34|1328|        3|
|  1|sachin|2001|     10|     18|1003|        3|  1|2001|sachin|     17|     16| 904|        4|
|  1|sachin|2001|     10|     18|1003|        3|  1|2010|sachin|      2|      2|   1|      204|
|  1|sachin|2001|     10|     18|1003|  

In [64]:
# Left outer join on (id, year), renamed so the two sides are distinguishable,
# then show only the rows where "omatches" is null.
renamed_columns = [
    "id", "year", "fname", "tmatches", "tinnings", "truns", "tcenturies",
    "sfname", "omatches", "oinnings", "oruns", "ocenturies",
]
left_joined = testDF.join(odiDF, ["id", "year"], "leftOuter").toDF(*renamed_columns)
left_joined.where(col("omatches").isNull()).show()


+---+----+------+--------+--------+-----+----------+------+--------+--------+-----+----------+
| id|year| fname|tmatches|tinnings|truns|tcenturies|sfname|omatches|oinnings|oruns|ocenturies|
+---+----+------+--------+--------+-----+----------+------+--------+--------+-----+----------+
|  2|2012| virat|       9|      16|  689|         3|  null|    null|    null| null|      null|
|  1|2002|sachin|      16|      26| 1392|         4|  null|    null|    null| null|      null|
|  2|2011| virat|       5|       9|  202|         0|  null|    null|    null| null|      null|
+---+----+------+--------+--------+-----+----------+------+--------+--------+-----+----------+



In [65]:
# Same left outer join, then use na.drop() to remove the rows that contain nulls.
renamed_columns = [
    "id", "year", "fname", "tmatches", "tinnings", "truns", "tcenturies",
    "sfname", "omatches", "oinnings", "oruns", "ocenturies",
]
left_joined = testDF.join(odiDF, ["id", "year"], "leftOuter").toDF(*renamed_columns)
left_joined.na.drop().show()

+---+----+------+--------+--------+-----+----------+------+--------+--------+-----+----------+
| id|year| fname|tmatches|tinnings|truns|tcenturies|sfname|omatches|oinnings|oruns|ocenturies|
+---+----+------+--------+--------+-----+----------+------+--------+--------+-----+----------+
|  1|2010|sachin|      14|      23| 1562|         7|sachin|       2|       2|    1|       204|
|  2|2016| virat|      12|      18| 1215|         4| virat|      10|      10|  739|         3|
|  2|2017| virat|      10|      16| 1059|         5| virat|      26|      26| 1460|         6|
|  1|2001|sachin|      10|      18| 1003|         3|sachin|      17|      16|  904|         4|
|  1|2000|sachin|       6|      10|  575|         2|sachin|      34|      34| 1328|         3|
+---+----+------+--------+--------+-----+----------+------+--------+--------+-----+----------+



In [66]:
# Same left outer join, keeping only the rows whose "omatches" is not null.
renamed_columns = [
    "id", "year", "fname", "tmatches", "tinnings", "truns", "tcenturies",
    "sfname", "omatches", "oinnings", "oruns", "ocenturies",
]
left_joined = testDF.join(odiDF, ["id", "year"], "leftOuter").toDF(*renamed_columns)
has_odi_row = col('omatches').isNotNull()
left_joined.where(has_odi_row).show()

+---+----+------+--------+--------+-----+----------+------+--------+--------+-----+----------+
| id|year| fname|tmatches|tinnings|truns|tcenturies|sfname|omatches|oinnings|oruns|ocenturies|
+---+----+------+--------+--------+-----+----------+------+--------+--------+-----+----------+
|  1|2010|sachin|      14|      23| 1562|         7|sachin|       2|       2|    1|       204|
|  2|2016| virat|      12|      18| 1215|         4| virat|      10|      10|  739|         3|
|  2|2017| virat|      10|      16| 1059|         5| virat|      26|      26| 1460|         6|
|  1|2001|sachin|      10|      18| 1003|         3|sachin|      17|      16|  904|         4|
|  1|2000|sachin|       6|      10|  575|         2|sachin|      34|      34| 1328|         3|
+---+----+------+--------+--------+-----+----------+------+--------+--------+-----+----------+



In [81]:
# Replace nulls in one column with 0: "omtnull" is 0 where "omatches" is null
# and the original value otherwise.
renamed_columns = [
    "id", "year", "fname", "tmatches", "tinnings", "truns", "tcenturies",
    "sfname", "omatches", "oinnings", "oruns", "ocenturies",
]
left_joined = testDF.join(odiDF, ["id", "year"], "leftOuter").toDF(*renamed_columns)

# The column is referenced by name with col(); odiDF["omatches"] is not an
# option because "omatches" only exists after the rename above.
omatches_or_zero = when(isnull(col("omatches")), 0).otherwise(col("omatches")).alias("omtnull")
left_joined.select(
    "id", "year", "fname", "tmatches", "tinnings", "truns", "tcenturies",
    omatches_or_zero,
).show()

+---+----+------+--------+--------+-----+----------+-------+
| id|year| fname|tmatches|tinnings|truns|tcenturies|omtnull|
+---+----+------+--------+--------+-----+----------+-------+
|  1|2010|sachin|      14|      23| 1562|         7|      2|
|  2|2012| virat|       9|      16|  689|         3|      0|
|  2|2016| virat|      12|      18| 1215|         4|     10|
|  2|2017| virat|      10|      16| 1059|         5|     26|
|  1|2002|sachin|      16|      26| 1392|         4|      0|
|  1|2001|sachin|      10|      18| 1003|         3|     17|
|  2|2011| virat|       5|       9|  202|         0|      0|
|  1|2000|sachin|       6|      10|  575|         2|     34|
+---+----+------+--------+--------+-----+----------+-------+



In [83]:
# Add a computed column: "totmatches" = "tmatches" + "omtnull", where "omtnull"
# is "omatches" with nulls replaced by 0.
renamed_columns = [
    "id", "year", "fname", "tmatches", "tinnings", "truns", "tcenturies",
    "sfname", "omatches", "oinnings", "oruns", "ocenturies",
]
left_joined = testDF.join(odiDF, ["id", "year"], "leftOuter").toDF(*renamed_columns)

omatches_or_zero = when(isnull(col("omatches")), 0).otherwise(col("omatches")).alias("omtnull")
with_defaults = left_joined.select(
    "id", "year", "fname", "tmatches", "tinnings", "truns", "tcenturies",
    omatches_or_zero,
)
with_defaults.withColumn('totmatches', col('tmatches') + col('omtnull')).show()

+---+----+------+--------+--------+-----+----------+-------+----------+
| id|year| fname|tmatches|tinnings|truns|tcenturies|omtnull|totmatches|
+---+----+------+--------+--------+-----+----------+-------+----------+
|  1|2010|sachin|      14|      23| 1562|         7|      2|        16|
|  2|2012| virat|       9|      16|  689|         3|      0|         9|
|  2|2016| virat|      12|      18| 1215|         4|     10|        22|
|  2|2017| virat|      10|      16| 1059|         5|     26|        36|
|  1|2002|sachin|      16|      26| 1392|         4|      0|        16|
|  1|2001|sachin|      10|      18| 1003|         3|     17|        27|
|  2|2011| virat|       5|       9|  202|         0|      0|         5|
|  1|2000|sachin|       6|      10|  575|         2|     34|        40|
+---+----+------+--------+--------+-----+----------+-------+----------+



In [102]:
# Left semi join: the result carries only testDF's columns, for rows whose
# "id" also occurs in odiDF.
same_player = testDF["id"] == odiDF["id"]
testDF.join(odiDF, same_player, "left_semi").show()

# Self join: alias the frame twice so each side's columns can be addressed
# as 'a.<col>' / 'b.<col>'.
side_a = testDF.alias("a")
side_b = testDF.alias("b")
same_id = col('a.id') == col('b.id')
side_a.join(side_b).where(same_id).show()

# Self join narrowed to the same year and to rows with more than 10 innings.
same_year = col('a.year') == col('b.year')
many_innings = col('a.innings') > 10
side_a.join(side_b).where(same_id & same_year & many_innings).show()



+---+------+----+-------+-------+----+---------+
| id| fname|year|matches|innings|runs|centuries|
+---+------+----+-------+-------+----+---------+
|  1|sachin|2000|      6|     10| 575|        2|
|  1|sachin|2001|     10|     18|1003|        3|
|  1|sachin|2002|     16|     26|1392|        4|
|  1|sachin|2010|     14|     23|1562|        7|
|  2| virat|2011|      5|      9| 202|        0|
|  2| virat|2012|      9|     16| 689|        3|
|  2| virat|2016|     12|     18|1215|        4|
|  2| virat|2017|     10|     16|1059|        5|
+---+------+----+-------+-------+----+---------+

+---+------+----+-------+-------+----+---------+---+------+----+-------+-------+----+---------+
| id| fname|year|matches|innings|runs|centuries| id| fname|year|matches|innings|runs|centuries|
+---+------+----+-------+-------+----+---------+---+------+----+-------+-------+----+---------+
|  1|sachin|2000|      6|     10| 575|        2|  1|sachin|2000|      6|     10| 575|        2|
|  1|sachin|2000|      6|  

In [105]:
# Grouping and aggregation: per-year totals of runs and centuries, plus the
# sum of the distinct century values.
# `sum` here is the Spark column function from the wildcard import of
# pyspark.sql.functions, not the Python built-in.
yearly_aggregates = [
    sum("runs").alias("totruns"),
    sum("centuries").alias("totcents"),
    sumDistinct("centuries").alias("distcent"),
]
testDF.groupBy("year").agg(*yearly_aggregates).show()

+----+-------+--------+--------+
|year|totruns|totcents|distcent|
+----+-------+--------+--------+
|2012|    689|       3|       3|
|2016|   1215|       4|       4|
|2010|   1562|       7|       7|
|2017|   1059|       5|       5|
|2002|   1392|       4|       4|
|2011|    202|       0|       0|
|2000|    575|       2|       2|
|2001|   1003|       3|       3|
+----+-------+--------+--------+

