In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField,StringType,IntegerType

In [11]:
spark = SparkSession.builder.master("local[*]").appName("Demo").getOrCreate()
spark

In [23]:
empDf = spark.read.option("header",True).option("infrechema",True).csv("Spark-main/employees.csv")
empDf.show(1)

+-----------+----------+---------+--------+------------+---------+--------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|  JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+---------+--------+------+--------------+----------+-------------+
|        198|    Donald| OConnell|DOCONNEL|650.507.9833|21-JUN-07|SH_CLERK|  2600|            - |       124|           50|
+-----------+----------+---------+--------+------------+---------+--------+------+--------------+----------+-------------+
only showing top 1 row



In [24]:
deptDf = spark.read.option("header",True).option("infrechema",True).csv("Spark-main/departments.csv")
deptDf.show(1)

+-------------+---------------+----------+-----------+
|DEPARTMENT_ID|DEPARTMENT_NAME|MANAGER_ID|LOCATION_ID|
+-------------+---------------+----------+-----------+
|           10| Administration|       200|       1700|
+-------------+---------------+----------+-----------+
only showing top 1 row



In [25]:
from pyspark.sql.functions import *

In [26]:
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", 104857600)
#The Value is in Bytes

In [27]:
empDf.join(broadcast(deptDf), empDf.DEPARTMENT_ID == deptDf.DEPARTMENT_ID, "inner").select(empDf.EMPLOYEE_ID, empDf.DEPARTMENT_ID, deptDf.DEPARTMENT_NAME).show(100)

+-----------+-------------+----------------+
|EMPLOYEE_ID|DEPARTMENT_ID| DEPARTMENT_NAME|
+-----------+-------------+----------------+
|        198|           50|        Shipping|
|        199|           50|        Shipping|
|        200|           10|  Administration|
|        201|           20|       Marketing|
|        202|           20|       Marketing|
|        203|           40| Human Resources|
|        204|           70|Public Relations|
|        205|          110|      Accounting|
|        206|          110|      Accounting|
|        100|           90|       Executive|
|        101|           90|       Executive|
|        102|           90|       Executive|
|        103|           60|              IT|
|        104|           60|              IT|
|        105|           60|              IT|
|        106|           60|              IT|
|        107|           60|              IT|
|        108|          100|         Finance|
|        109|          100|         Finance|
|        1

In [29]:
resultDf=empDf.join(broadcast(deptDf), empDf.DEPARTMENT_ID == deptDf.DEPARTMENT_ID, "inner").select(empDf.EMPLOYEE_ID, empDf.DEPARTMENT_ID, deptDf.DEPARTMENT_NAME)
resultDf.show(1)

+-----------+-------------+---------------+
|EMPLOYEE_ID|DEPARTMENT_ID|DEPARTMENT_NAME|
+-----------+-------------+---------------+
|        198|           50|       Shipping|
+-----------+-------------+---------------+
only showing top 1 row



In [33]:
resultDf.write.option("header",True).csv("/tmp/result1")

In [34]:
resultDf.write.mode("overwrite").option("header",True).save("/tmp/result1")

In [35]:
resultDf.write.mode("overwrite").option("header",True).format("csv").save("/tmp/result1")

In [36]:
resultDf.write.mode("append").option("header",True).format("csv").save("/tmp/result1")

In [38]:
resultDf.write.mode("overwrite").partitionBy("DEPARTMENT_NAME").option("header",True).format("csv").save("/tmp/result1")

In [39]:
empDf.rdd.getNumPartitions()

1

In [40]:
deptDf.rdd.getNumPartitions()

1

In [41]:
resultDf.rdd.getNumPartitions()

1

In [42]:
resultDf.repartition(10)

DataFrame[EMPLOYEE_ID: string, DEPARTMENT_ID: string, DEPARTMENT_NAME: string]

In [43]:
resultDf.rdd.getNumPartitions()

1

In [44]:
newDf = resultDf.repartition(10)
newDf.rdd.getNumPartitions()

10

In [45]:
df1 = newDf.repartition(2)

In [46]:
df1.rdd.getNumPartitions()

2

In [47]:
newDf.rdd.getNumPartitions()

10

In [49]:
df2 = newDf.coalesce(20)
df2.rdd.getNumPartitions()

10

In [50]:
df3 = newDf.coalesce(5)
df3.rdd.getNumPartitions()

5

In [52]:
resultDf.coalesce(1).write.mode("overwrite").option("header",True).format("csv").save("/tmp/result")

In [54]:
jsonDf = spark.read.json("Spark-main/jsonexample.json")
jsonDf.show()

+------------+----+-----+-------+
|      Array1|Num1|Text1|  Text2|
+------------+----+-----+-------+
|   [7, 8, 9]| 5.0|Hello|GoodBye|
|[70, 88, 91]| 6.5| This|   That|
|   [1, 2, 3]| 2.0|  Yes|     No|
+------------+----+-----+-------+



In [55]:
jsonDf.printSchema()

root
 |-- Array1: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- Num1: double (nullable = true)
 |-- Text1: string (nullable = true)
 |-- Text2: string (nullable = true)



In [56]:
jsonDf.select(jsonDf.Text1, jsonDf.Array1).show()

+-----+------------+
|Text1|      Array1|
+-----+------------+
|Hello|   [7, 8, 9]|
| This|[70, 88, 91]|
|  Yes|   [1, 2, 3]|
+-----+------------+



In [57]:
jsonDf.select(jsonDf.Text1, jsonDf.Array1[2]).show()

+-----+---------+
|Text1|Array1[2]|
+-----+---------+
|Hello|        9|
| This|       91|
|  Yes|        3|
+-----+---------+



In [58]:
jsonDf.select(jsonDf.Text1, explode(jsonDf.Array1)).show()

+-----+---+
|Text1|col|
+-----+---+
|Hello|  7|
|Hello|  8|
|Hello|  9|
| This| 70|
| This| 88|
| This| 91|
|  Yes|  1|
|  Yes|  2|
|  Yes|  3|
+-----+---+

