In [1]:
import os

In [2]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

In [3]:
import os

# Publish the JDK and Spark install locations as environment variables.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop3.2",
    }
)

In [5]:
import findspark

In [6]:
# Initialise findspark before pyspark is imported in the next cell.
findspark.init()

In [7]:
import pyspark

In [8]:
from pyspark.sql import SparkSession

In [9]:
# Create (or reuse) the notebook's SparkSession. The session is built
# unconditionally: an `if __name__ == "__main__"` guard adds nothing inside a
# notebook and would leave `spark` undefined whenever this code is executed
# under any other module name, breaking every later cell.
spark = (
    SparkSession.builder
    .appName("myapplication")
    .master("local[*]")  # run locally, using all available cores
    .getOrCreate()
)

In [10]:
spark

In [11]:
from pyspark.sql import SparkSession

In [12]:
# Schema building blocks used below (StringType was previously listed twice).
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

In [62]:
from pyspark.sql.functions import col,lit

In [69]:
# Employee rows, each laid out as (name, location, dept_id, gender, salary).
emp = [
    ("pavan", "vijayawada", 10, "M", 30000),
    ("anusha", "hyderabad", 20, "F", 75000),
    ("dwaraka", "hyderabad", 40, "M", 80000),
    ("ramsai", "nellore", 50, "M", 50000),
    ("venkatesh", "bimavaram", 60, "M", 45000),
    ("swapna", "sydney", 80, "F", 200000),
    ("chung", "china", 110, "F", 3000000),
]

In [70]:
# Explicit schema for the employee rows: string columns for name, location
# and gender; integer columns for dept_id and salary.
emp_schema = StructType(
    [
        StructField("name", StringType()),
        StructField("location", StringType()),
        StructField("dept_id", IntegerType()),
        StructField("gender", StringType()),
        StructField("salary", IntegerType()),
    ]
)

In [71]:
# Build the employee DataFrame from the in-memory rows and the explicit schema.
employee_df = spark.createDataFrame(emp, schema=emp_schema)

In [72]:
employee_df.show()

+---------+----------+-------+------+-------+
|     name|  location|dept_id|gender| salary|
+---------+----------+-------+------+-------+
|    pavan|vijayawada|     10|     M|  30000|
|   anusha| hyderabad|     20|     F|  75000|
|  dwaraka| hyderabad|     40|     M|  80000|
|   ramsai|   nellore|     50|     M|  50000|
|venkatesh| bimavaram|     60|     M|  45000|
|   swapna|    sydney|     80|     F| 200000|
|    chung|     china|    110|     F|3000000|
+---------+----------+-------+------+-------+



In [73]:
# Department rows, each laid out as (dept_name, dept_id).
department = [
    ("bigdata", 10),
    ("mainframes", 20),
    ("datascience", 30),
    ("performance_testing", 40),
    ("banking", 50),
    ("animation", 60),
    ("digital_marketing", 70),
    ("manager", 80),
    ("delivery_manager", 90),
    ("product_owner", 100),
]

In [74]:
# Explicit schema for the department rows: a string name and an integer id.
dept_schema = StructType(
    [
        StructField("dept_name", StringType()),
        StructField("dept_id", IntegerType()),
    ]
)

In [75]:
# Build the department DataFrame from the in-memory rows and the explicit schema.
department_df = spark.createDataFrame(department, schema=dept_schema)

In [76]:
department_df.show()

+-------------------+-------+
|          dept_name|dept_id|
+-------------------+-------+
|            bigdata|     10|
|         mainframes|     20|
|        datascience|     30|
|performance_testing|     40|
|            banking|     50|
|          animation|     60|
|  digital_marketing|     70|
|            manager|     80|
|   delivery_manager|     90|
|      product_owner|    100|
+-------------------+-------+



#inner join
#df1.join(df2,identical_column,"inner")

In [77]:
# Inner join on an explicit equality expression between the two dept_id
# columns. With this form the result carries both dept_id columns, one from
# each input DataFrame.
innerjoin_df = employee_df.join(
    department_df,
    on=employee_df.dept_id == department_df.dept_id,
    how="inner",
)

#####ordering by any column

In [80]:
# Order the joined rows by name in descending order using a Column expression.
sorted_innerjoin_df = innerjoin_df.orderBy(col("name").desc())

In [81]:
sorted_innerjoin_df.show()

+---------+----------+-------+------+------+-------------------+-------+
|     name|  location|dept_id|gender|salary|          dept_name|dept_id|
+---------+----------+-------+------+------+-------------------+-------+
|venkatesh| bimavaram|     60|     M| 45000|          animation|     60|
|   swapna|    sydney|     80|     F|200000|            manager|     80|
|   ramsai|   nellore|     50|     M| 50000|            banking|     50|
|    pavan|vijayawada|     10|     M| 30000|            bigdata|     10|
|  dwaraka| hyderabad|     40|     M| 80000|performance_testing|     40|
|   anusha| hyderabad|     20|     F| 75000|         mainframes|     20|
+---------+----------+-------+------+------+-------------------+-------+



In [89]:
# Order the joined rows by salary, lowest first, using the column-name form.
sorted_innerjoin_df2 = innerjoin_df.orderBy("salary", ascending=True)

In [90]:
sorted_innerjoin_df2.show()

+---------+----------+-------+------+------+-------------------+-------+
|     name|  location|dept_id|gender|salary|          dept_name|dept_id|
+---------+----------+-------+------+------+-------------------+-------+
|    pavan|vijayawada|     10|     M| 30000|            bigdata|     10|
|venkatesh| bimavaram|     60|     M| 45000|          animation|     60|
|   ramsai|   nellore|     50|     M| 50000|            banking|     50|
|   anusha| hyderabad|     20|     F| 75000|         mainframes|     20|
|  dwaraka| hyderabad|     40|     M| 80000|performance_testing|     40|
|   swapna|    sydney|     80|     F|200000|            manager|     80|
+---------+----------+-------+------+------+-------------------+-------+



##### Ordering by the common column, here dept_id

In [92]:
# Inner join on the shared column *name*; the result then contains a single
# dept_id column instead of one per input DataFrame.
innerjoinsort_df = employee_df.join(department_df, on="dept_id", how="inner")

In [104]:
# Give the joined DataFrame an alias so its columns can be referenced with the
# "empdept." qualifier.
alias_innerjoin_join = innerjoinsort_df.alias("empdept")

In [105]:
# Sort by the alias-qualified dept_id column, smallest id first.
orderby_deptid_innerjoin_df = alias_innerjoin_join.orderBy(
    "empdept.dept_id", ascending=True
)

In [106]:
orderby_deptid_innerjoin_df.show()

+-------+---------+----------+------+------+-------------------+
|dept_id|     name|  location|gender|salary|          dept_name|
+-------+---------+----------+------+------+-------------------+
|     10|    pavan|vijayawada|     M| 30000|            bigdata|
|     20|   anusha| hyderabad|     F| 75000|         mainframes|
|     40|  dwaraka| hyderabad|     M| 80000|performance_testing|
|     50|   ramsai|   nellore|     M| 50000|            banking|
|     60|venkatesh| bimavaram|     M| 45000|          animation|
|     80|   swapna|    sydney|     F|200000|            manager|
+-------+---------+----------+------+------+-------------------+



####left outer join
######for left outer join left can be used or leftOuter also can be used

In [107]:
# Left outer join: every employee row is kept; dept_name is null where the
# department table has no matching dept_id.
leftouterjoin_df = employee_df.join(department_df, on="dept_id", how="leftOuter")

In [108]:
# Alias the left-outer-join result so columns can be qualified as "lefttable.<col>".
alias_leftouterjoin = leftouterjoin_df.alias("lefttable")

In [113]:
# Sort by the alias-qualified dept_id column in ascending order.
orderby_lefttable_deptid_df = alias_leftouterjoin.orderBy(
    col("lefttable.dept_id").asc()
)

In [114]:
orderby_lefttable_deptid_df.show()

+-------+---------+----------+------+-------+-------------------+
|dept_id|     name|  location|gender| salary|          dept_name|
+-------+---------+----------+------+-------+-------------------+
|     10|    pavan|vijayawada|     M|  30000|            bigdata|
|     20|   anusha| hyderabad|     F|  75000|         mainframes|
|     40|  dwaraka| hyderabad|     M|  80000|performance_testing|
|     50|   ramsai|   nellore|     M|  50000|            banking|
|     60|venkatesh| bimavaram|     M|  45000|          animation|
|     80|   swapna|    sydney|     F| 200000|            manager|
|    110|    chung|     china|     F|3000000|               null|
+-------+---------+----------+------+-------+-------------------+



####rightouter join
######for right outer join right can be used or rightOuter also can be used

In [116]:
# Right outer join: every department row is kept; employee columns are null
# where no employee has a matching dept_id.
rightouterjoin_df = employee_df.join(department_df, on="dept_id", how="right")

In [117]:
# Alias the right-outer-join result so columns can be qualified as "righttable.<col>".
alias_rightouterjoin = rightouterjoin_df.alias("righttable")

In [118]:
# Sort by the alias-qualified dept_id column, smallest id first.
orderby_righttable_deptid_df = alias_rightouterjoin.orderBy(
    "righttable.dept_id", ascending=True
)

In [119]:
orderby_righttable_deptid_df.show()

+-------+---------+----------+------+------+-------------------+
|dept_id|     name|  location|gender|salary|          dept_name|
+-------+---------+----------+------+------+-------------------+
|     10|    pavan|vijayawada|     M| 30000|            bigdata|
|     20|   anusha| hyderabad|     F| 75000|         mainframes|
|     30|     null|      null|  null|  null|        datascience|
|     40|  dwaraka| hyderabad|     M| 80000|performance_testing|
|     50|   ramsai|   nellore|     M| 50000|            banking|
|     60|venkatesh| bimavaram|     M| 45000|          animation|
|     70|     null|      null|  null|  null|  digital_marketing|
|     80|   swapna|    sydney|     F|200000|            manager|
|     90|     null|      null|  null|  null|   delivery_manager|
|    100|     null|      null|  null|  null|      product_owner|
+-------+---------+----------+------+------+-------------------+



###Fullouterjoin
##### we can use either full or fullOuter or outer

```
# This is formatted as code
```



In [122]:
# Full outer join: rows from both tables are kept, with nulls filling the
# columns of whichever side has no matching dept_id.
fullouterjoindf = employee_df.join(department_df, on="dept_id", how="full")

##### Because the join is on the column name "dept_id", the result has a single dept_id column, so there is no ambiguity about whether the common column comes from the employee table or the department table

In [125]:
# Sort by dept_id in ascending order; no alias is used for this column reference.
sorted_fullouterjoindf = fullouterjoindf.orderBy(col("dept_id").asc())

In [126]:
sorted_fullouterjoindf.show()

+-------+---------+----------+------+-------+-------------------+
|dept_id|     name|  location|gender| salary|          dept_name|
+-------+---------+----------+------+-------+-------------------+
|     10|    pavan|vijayawada|     M|  30000|            bigdata|
|     20|   anusha| hyderabad|     F|  75000|         mainframes|
|     30|     null|      null|  null|   null|        datascience|
|     40|  dwaraka| hyderabad|     M|  80000|performance_testing|
|     50|   ramsai|   nellore|     M|  50000|            banking|
|     60|venkatesh| bimavaram|     M|  45000|          animation|
|     70|     null|      null|  null|   null|  digital_marketing|
|     80|   swapna|    sydney|     F| 200000|            manager|
|     90|     null|      null|  null|   null|   delivery_manager|
|    100|     null|      null|  null|   null|      product_owner|
|    110|    chung|     china|     F|3000000|               null|
+-------+---------+----------+------+-------+-------------------+



#### When the join columns hold matching values but have different names, use withColumnRenamed to give them a common name so that the two tables can be joined easily

In [129]:
# Copy of the employee DataFrame with dept_id renamed to "desig".
employeedf_newdf = employee_df.withColumnRenamed("dept_id", "desig")

In [130]:
employeedf_newdf.show()

+---------+----------+-----+------+-------+
|     name|  location|desig|gender| salary|
+---------+----------+-----+------+-------+
|    pavan|vijayawada|   10|     M|  30000|
|   anusha| hyderabad|   20|     F|  75000|
|  dwaraka| hyderabad|   40|     M|  80000|
|   ramsai|   nellore|   50|     M|  50000|
|venkatesh| bimavaram|   60|     M|  45000|
|   swapna|    sydney|   80|     F| 200000|
|    chung|     china|  110|     F|3000000|
+---------+----------+-----+------+-------+



In [131]:
# Copy of the department DataFrame with dept_id renamed to "desig".
departmentdf_newdf = department_df.withColumnRenamed("dept_id", "desig")

In [132]:
departmentdf_newdf.show()

+-------------------+-----+
|          dept_name|desig|
+-------------------+-----+
|            bigdata|   10|
|         mainframes|   20|
|        datascience|   30|
|performance_testing|   40|
|            banking|   50|
|          animation|   60|
|  digital_marketing|   70|
|            manager|   80|
|   delivery_manager|   90|
|      product_owner|  100|
+-------------------+-----+



In [133]:
# Inner join of the two renamed DataFrames on their now-shared "desig" column.
innerjoined_newdf = employeedf_newdf.join(
    departmentdf_newdf, on="desig", how="inner"
)

In [134]:
innerjoined_newdf.show()

+-----+---------+----------+------+------+-------------------+
|desig|     name|  location|gender|salary|          dept_name|
+-----+---------+----------+------+------+-------------------+
|   20|   anusha| hyderabad|     F| 75000|         mainframes|
|   40|  dwaraka| hyderabad|     M| 80000|performance_testing|
|   10|    pavan|vijayawada|     M| 30000|            bigdata|
|   50|   ramsai|   nellore|     M| 50000|            banking|
|   80|   swapna|    sydney|     F|200000|            manager|
|   60|venkatesh| bimavaram|     M| 45000|          animation|
+-----+---------+----------+------+------+-------------------+



## A left semi join returns only the records from the left table that have a matching record in the right table; the right table's columns are not shown

In [135]:
# Inner join on dept_id; the result includes dept_name from the department table.
innerjoined_df = employee_df.join(department_df, on="dept_id", how="inner")

In [136]:
innerjoined_df.show()

+-------+---------+----------+------+------+-------------------+
|dept_id|     name|  location|gender|salary|          dept_name|
+-------+---------+----------+------+------+-------------------+
|     20|   anusha| hyderabad|     F| 75000|         mainframes|
|     40|  dwaraka| hyderabad|     M| 80000|performance_testing|
|     10|    pavan|vijayawada|     M| 30000|            bigdata|
|     50|   ramsai|   nellore|     M| 50000|            banking|
|     80|   swapna|    sydney|     F|200000|            manager|
|     60|venkatesh| bimavaram|     M| 45000|          animation|
+-------+---------+----------+------+------+-------------------+



### The department column (dept_name) appears in the inner join result but not in the left semi join result; that is the difference between an inner join and a left semi join. The right-table columns that an inner join returns for matching rows are not returned by a left semi join.

In [146]:
# Left semi join: employee rows that have a matching dept_id in the department
# table, with only the employee columns in the result.
leftsemijoined_df = employee_df.join(department_df, on="dept_id", how="left_semi")

In [147]:
leftsemijoined_df.show()

+-------+---------+----------+------+------+
|dept_id|     name|  location|gender|salary|
+-------+---------+----------+------+------+
|     20|   anusha| hyderabad|     F| 75000|
|     40|  dwaraka| hyderabad|     M| 80000|
|     10|    pavan|vijayawada|     M| 30000|
|     50|   ramsai|   nellore|     M| 50000|
|     80|   swapna|    sydney|     F|200000|
|     60|venkatesh| bimavaram|     M| 45000|
+-------+---------+----------+------+------+



In [151]:
# Same join requested with the "semi" join-type string instead of "left_semi".
semijoined_df = employee_df.join(department_df, on="dept_id", how="semi")

In [152]:
semijoined_df.show()

+-------+---------+----------+------+------+
|dept_id|     name|  location|gender|salary|
+-------+---------+----------+------+------+
|     20|   anusha| hyderabad|     F| 75000|
|     40|  dwaraka| hyderabad|     M| 80000|
|     10|    pavan|vijayawada|     M| 30000|
|     50|   ramsai|   nellore|     M| 50000|
|     80|   swapna|    sydney|     F|200000|
|     60|venkatesh| bimavaram|     M| 45000|
+-------+---------+----------+------+------+



##leftAnti join brings records from the left table which are not matching with records in the right table

In [153]:
# Left anti join: employee rows whose dept_id has no match in the department table.
leftantijoined_df = employee_df.join(department_df, on="dept_id", how="leftAnti")

In [154]:
leftantijoined_df.show()

+-------+-----+--------+------+-------+
|dept_id| name|location|gender| salary|
+-------+-----+--------+------+-------+
|    110|chung|   china|     F|3000000|
+-------+-----+--------+------+-------+

