In [1]:
from pyspark.sql import SparkSession

In [2]:
# Reuse the active Spark session if one exists, otherwise start a new one
# registered under this application name.
spark = (
    SparkSession.builder
    .appName("pySpWordCountCSV")
    .getOrCreate()
)

In [3]:
# Switch date/time parsing to the legacy (pre-Spark-3.0) parser.
# The key must be spelled "spark.sql.legacy.timeParserPolicy"; the earlier
# "spark.sqllegacy.timeParserPolicy" spelling only set an unrelated, unused key,
# so the parser policy was never actually changed.
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

In [7]:
# Load the employee CSV: the first line is the header and column types are
# inferred from the data.
df1 = (
    spark.read
    .option("inferSchema", True)
    .option("header", True)
    .csv(".././dataSet//ddFtMx.csv")
)

In [8]:
# Bare expression: the notebook displays the DataFrame's repr (column names and types).
df1

DataFrame[empno: int, ename: string, job: string, mgr: int, hiredate: string, sal: int, comm: int, deptno: int]

In [9]:
# Print the inferred schema as a tree; note that hiredate was inferred as a string.
df1.printSchema()

root
 |-- empno: integer (nullable = true)
 |-- ename: string (nullable = true)
 |-- job: string (nullable = true)
 |-- mgr: integer (nullable = true)
 |-- hiredate: string (nullable = true)
 |-- sal: integer (nullable = true)
 |-- comm: integer (nullable = true)
 |-- deptno: integer (nullable = true)



In [10]:
# Column names as a plain Python list of strings.
df1.columns

['empno', 'ename', 'job', 'mgr', 'hiredate', 'sal', 'comm', 'deptno']

In [11]:
# Fetch the first two rows to the driver as a list of Row objects.
df1.take(2)

[Row(empno=7839, ename='KING', job='PRESIDENT', mgr=None, hiredate='11/17/2001', sal=5000, comm=None, deptno=10),
 Row(empno=7698, ename='BLAKE', job='MANAGER', mgr=7839, hiredate='05/01/2001', sal=2850, comm=None, deptno=30)]

In [16]:
# head(2) returns the first rows as an indexable list of Row objects.
# NOTE(review): the recorded output below is a single Row, which is what head()
# without an argument would display — the output looks stale; re-run to confirm.
df1.head(2)

Row(empno=7839, ename='KING', job='PRESIDENT', mgr=None, hiredate='11/17/2001', sal=5000, comm=None, deptno=10)

In [28]:
# Take the first of the two fetched rows and read its hiredate field by name.
df1.head(2)[0]["hiredate"]

'11/17/2001'

In [13]:
# First row of the DataFrame as a single Row (not wrapped in a list).
df1.first()

Row(empno=7839, ename='KING', job='PRESIDENT', mgr=None, hiredate='11/17/2001', sal=5000, comm=None, deptno=10)

In [17]:
# Project only the hiredate column and print it as a text table.
df1.select("hiredate").show()

+----------+
|  hiredate|
+----------+
|11/17/2001|
|05/01/2001|
|06/09/2001|
|04/02/2001|
|09/28/2001|
|02/20/2001|
|09/08/2001|
|12/03/2001|
|02/22/2001|
|02/03/2001|
|12/17/2000|
|04/19/2007|
|05/23/2007|
|01/23/2002|
+----------+



In [15]:
# select() yields another DataFrame, even for a single column.
type(df1.select("hiredate"))

pyspark.sql.dataframe.DataFrame

In [19]:
from pyspark.sql.functions import *
# Add (or overwrite) an `Annualsal` column holding `sal` multiplied by 12,
# rebinding df1 to the widened DataFrame, then preview two rows.
df1 = df1.withColumn("Annualsal", df1["sal"] * 12)
df1.show(2)

+-----+-----+---------+----+----------+----+----+------+---------+
|empno|ename|      job| mgr|  hiredate| sal|comm|deptno|Annualsal|
+-----+-----+---------+----+----------+----+----+------+---------+
| 7839| KING|PRESIDENT|null|11/17/2001|5000|null|    10|    60000|
| 7698|BLAKE|  MANAGER|7839|05/01/2001|2850|null|    30|    34200|
+-----+-----+---------+----+----------+----+----+------+---------+
only showing top 2 rows



In [22]:
# Preview only: copies hiredate unchanged into a new column `newDtCol` (still a
# string, no date conversion). The result is not assigned, so df1 is unaffected.
df1.withColumn('newDtCol', df1['hiredate']).show(3)

+-----+-----+---------+----+----------+----+----+------+---------+----------+
|empno|ename|      job| mgr|  hiredate| sal|comm|deptno|Annualsal|  newDtCol|
+-----+-----+---------+----+----------+----+----+------+---------+----------+
| 7839| KING|PRESIDENT|null|11/17/2001|5000|null|    10|    60000|11/17/2001|
| 7698|BLAKE|  MANAGER|7839|05/01/2001|2850|null|    30|    34200|05/01/2001|
| 7782|CLARK|  MANAGER|7839|06/09/2001|2450|null|    10|    29400|06/09/2001|
+-----+-----+---------+----+----------+----+----+------+---------+----------+
only showing top 3 rows



In [23]:
# Preview only: shows `sal` renamed to `salary`. The result is not assigned,
# so df1 keeps the original column name.
df1.withColumnRenamed('sal','salary').show(2)

+-----+-----+---------+----+----------+------+----+------+---------+
|empno|ename|      job| mgr|  hiredate|salary|comm|deptno|Annualsal|
+-----+-----+---------+----+----------+------+----+------+---------+
| 7839| KING|PRESIDENT|null|11/17/2001|  5000|null|    10|    60000|
| 7698|BLAKE|  MANAGER|7839|05/01/2001|  2850|null|    30|    34200|
+-----+-----+---------+----+----------+------+----+------+---------+
only showing top 2 rows



In [24]:
# df02 is df1 without the derived `Annualsal` column; preview five rows.
df02 = df1.drop("Annualsal")
df02.show(5)

+-----+------+---------+----+----------+----+----+------+
|empno| ename|      job| mgr|  hiredate| sal|comm|deptno|
+-----+------+---------+----+----------+----+----+------+
| 7839|  KING|PRESIDENT|null|11/17/2001|5000|null|    10|
| 7698| BLAKE|  MANAGER|7839|05/01/2001|2850|null|    30|
| 7782| CLARK|  MANAGER|7839|06/09/2001|2450|null|    10|
| 7566| JONES|  MANAGER|7839|04/02/2001|2975|null|    20|
| 7654|MARTIN| SALESMAN|7698|09/28/2001|1250|1400|    30|
+-----+------+---------+----+----------+----+----+------+
only showing top 5 rows



In [26]:
from pyspark.sql.functions import *
# Keep only hiredate and sal, converting sal from integer to double.
df2 = df1.select(
    col("hiredate"),
    col("sal").cast("double"),
)
df2.show(2)

+----------+------+
|  hiredate|   sal|
+----------+------+
|11/17/2001|5000.0|
|05/01/2001|2850.0|
+----------+------+
only showing top 2 rows



In [27]:
# Confirm the cast: sal is now double, hiredate is still a string.
df2.printSchema()

root
 |-- hiredate: string (nullable = true)
 |-- sal: double (nullable = true)



In [30]:
from pyspark.sql.types import StructField, IntegerType, DoubleType, StringType, StructType, DateType

In [35]:
# Field definitions for an explicit schema: four nullable columns, with `sal`
# declared as double.
dataSchema = [
    StructField('empno', IntegerType(), nullable=True),
    StructField('ename', StringType(), nullable=True),
    StructField('job', StringType(), nullable=True),
    StructField('sal', DoubleType(), nullable=True),
]

In [36]:
# Wrap the field list in a StructType so it can be passed as a reader schema.
finalStruc = StructType(fields=dataSchema)

In [39]:
# Read the employee JSON with the explicit schema instead of inferring types;
# only the fields named in the schema appear in the result.
df3 = spark.read.schema(finalStruc).json('./emp.json')
df3.show()

+-----+------+---------+------+
|empno| ename|      job|   sal|
+-----+------+---------+------+
| 7839|  KING|PRESIDENT|5000.0|
| 7698| BLAKE|  MANAGER|2850.0|
| 7782| CLARK|  MANAGER|2450.0|
| 7566| JONES|  MANAGER|2975.0|
| 7654|MARTIN| SALESMAN|1250.0|
| 7499| ALLEN| SALESMAN|1600.0|
| 7844|TURNER| SALESMAN|1500.0|
| 7900| JAMES|    CLERK| 950.0|
| 7521|  WARD| SALESMAN|1250.0|
| 7902|  FORD|  ANALYST|3000.0|
| 7369| SMITH|    CLERK| 800.0|
| 7788| SCOTT|  ANALYST|3000.0|
| 7876| ADAMS|    CLERK|1100.0|
| 7934|MILLER|    CLERK|1300.0|
+-----+------+---------+------+



In [40]:
# The repr shows sal as double, as declared in the explicit schema.
df3.select("sal")

DataFrame[sal: double]

In [41]:
# Cast `sal` to double and `ename` to string, replacing the columns in place.
# df3 already reports sal as double, so the displayed data does not change.
df4 = (
    df3
    .withColumn("sal", df3["sal"].cast(DoubleType()))
    .withColumn("ename", df3["ename"].cast(StringType()))
)

In [42]:
# Check the type of sal after the cast (double).
df4.select("sal")

DataFrame[sal: double]

In [43]:
# Check the type of ename after the cast (string).
df4.select("ename")

DataFrame[ename: string]

In [44]:
# Print df4; the rows match df3 because the casts did not change any values.
df4.show()

+-----+------+---------+------+
|empno| ename|      job|   sal|
+-----+------+---------+------+
| 7839|  KING|PRESIDENT|5000.0|
| 7698| BLAKE|  MANAGER|2850.0|
| 7782| CLARK|  MANAGER|2450.0|
| 7566| JONES|  MANAGER|2975.0|
| 7654|MARTIN| SALESMAN|1250.0|
| 7499| ALLEN| SALESMAN|1600.0|
| 7844|TURNER| SALESMAN|1500.0|
| 7900| JAMES|    CLERK| 950.0|
| 7521|  WARD| SALESMAN|1250.0|
| 7902|  FORD|  ANALYST|3000.0|
| 7369| SMITH|    CLERK| 800.0|
| 7788| SCOTT|  ANALYST|3000.0|
| 7876| ADAMS|    CLERK|1100.0|
| 7934|MILLER|    CLERK|1300.0|
+-----+------+---------+------+



In [46]:
# Cheat sheet of join-type strings accepted by DataFrame.join(how=...).
# Fixed the "fill_outer" typo: the accepted spelling is "full_outer".
print('inner, outer, full,fullouter, full_outer, leftouter,left, left_outer, rightouter, right, right_outer, leftsemi, left_semi, leftanti, left_anti, cross ')

inner, outer, full,fullouter, fill_outer, leftouter,left, left_outer, rightouter, right, right_outer, leftsemi, left_semi, leftanti, left_anti, cross 


In [49]:
# Load the department and employee tables; both CSVs have a header row and
# their column types are inferred.
dept = (
    spark.read
    .option("inferSchema", True)
    .option("header", True)
    .csv("./dept.csv")
)
emp = (
    spark.read
    .option("inferSchema", True)
    .option("header", True)
    .csv("./emp.csv")
)

In [52]:
# Left join with dept on the left side, matched on deptno. Because the join
# condition is an expression, the result carries both deptno columns.
res04 = dept.join(emp, on=dept["deptno"] == emp["deptno"], how='left')
res04.show(5)

+------+----------+--------+-----+------+---------+----+----------+----+----+------+
|deptno|     dname|     loc|empno| ename|      job| mgr|  hiredate| sal|comm|deptno|
+------+----------+--------+-----+------+---------+----+----------+----+----+------+
|    10|ACCOUNTING|NEW YORK| 7934|MILLER|    CLERK|7782|01/23/2002|1300|null|    10|
|    10|ACCOUNTING|NEW YORK| 7782| CLARK|  MANAGER|7839|06/09/2001|2450|null|    10|
|    10|ACCOUNTING|NEW YORK| 7839|  KING|PRESIDENT|null|11/17/2001|5000|null|    10|
|    20|  RESEARCH|  DALLAS| 7876| ADAMS|    CLERK|7788|05/23/2007|1100|null|    20|
|    20|  RESEARCH|  DALLAS| 7788| SCOTT|  ANALYST|7566|04/19/2007|3000|null|    20|
+------+----------+--------+-----+------+---------+----+----------+----+----+------+
only showing top 5 rows



In [54]:
# Cartesian product: every emp row is paired with every dept row, regardless of
# deptno (the output shows each employee repeated once per department).
res04= emp.crossJoin(dept)
res04.show()

+-----+------+---------+----+----------+----+----+------+------+----------+--------+
|empno| ename|      job| mgr|  hiredate| sal|comm|deptno|deptno|     dname|     loc|
+-----+------+---------+----+----------+----+----+------+------+----------+--------+
| 7839|  KING|PRESIDENT|null|11/17/2001|5000|null|    10|    10|ACCOUNTING|NEW YORK|
| 7839|  KING|PRESIDENT|null|11/17/2001|5000|null|    10|    20|  RESEARCH|  DALLAS|
| 7839|  KING|PRESIDENT|null|11/17/2001|5000|null|    10|    30|     SALES| CHICAGO|
| 7839|  KING|PRESIDENT|null|11/17/2001|5000|null|    10|    40|OPERATIONS|  BOSTON|
| 7698| BLAKE|  MANAGER|7839|05/01/2001|2850|null|    30|    10|ACCOUNTING|NEW YORK|
| 7698| BLAKE|  MANAGER|7839|05/01/2001|2850|null|    30|    20|  RESEARCH|  DALLAS|
| 7698| BLAKE|  MANAGER|7839|05/01/2001|2850|null|    30|    30|     SALES| CHICAGO|
| 7698| BLAKE|  MANAGER|7839|05/01/2001|2850|null|    30|    40|OPERATIONS|  BOSTON|
| 7782| CLARK|  MANAGER|7839|06/09/2001|2450|null|    10|    10|A

In [55]:
# Inner join of dept and emp on matching deptno values.
res04 = dept.join(emp, on=dept["deptno"] == emp["deptno"], how='inner')
res04.show()

+------+----------+--------+-----+------+---------+----+----------+----+----+------+
|deptno|     dname|     loc|empno| ename|      job| mgr|  hiredate| sal|comm|deptno|
+------+----------+--------+-----+------+---------+----+----------+----+----+------+
|    10|ACCOUNTING|NEW YORK| 7839|  KING|PRESIDENT|null|11/17/2001|5000|null|    10|
|    30|     SALES| CHICAGO| 7698| BLAKE|  MANAGER|7839|05/01/2001|2850|null|    30|
|    10|ACCOUNTING|NEW YORK| 7782| CLARK|  MANAGER|7839|06/09/2001|2450|null|    10|
|    20|  RESEARCH|  DALLAS| 7566| JONES|  MANAGER|7839|04/02/2001|2975|null|    20|
|    30|     SALES| CHICAGO| 7654|MARTIN| SALESMAN|7698|09/28/2001|1250|1400|    30|
|    30|     SALES| CHICAGO| 7499| ALLEN| SALESMAN|7698|02/20/2001|1600| 300|    30|
|    30|     SALES| CHICAGO| 7844|TURNER| SALESMAN|7698|09/08/2001|1500|   0|    30|
|    30|     SALES| CHICAGO| 7900| JAMES|    CLERK|7698|12/03/2001| 950|null|    30|
|    30|     SALES| CHICAGO| 7521|  WARD| SALESMAN|7698|02/22/200

In [56]:
# Full outer join of dept and emp on deptno.
res04 = dept.join(emp, on=dept["deptno"] == emp["deptno"], how='outer')
res04.show()

+------+----------+--------+-----+------+---------+----+----------+----+----+------+
|deptno|     dname|     loc|empno| ename|      job| mgr|  hiredate| sal|comm|deptno|
+------+----------+--------+-----+------+---------+----+----------+----+----+------+
|    10|ACCOUNTING|NEW YORK| 7839|  KING|PRESIDENT|null|11/17/2001|5000|null|    10|
|    10|ACCOUNTING|NEW YORK| 7782| CLARK|  MANAGER|7839|06/09/2001|2450|null|    10|
|    10|ACCOUNTING|NEW YORK| 7934|MILLER|    CLERK|7782|01/23/2002|1300|null|    10|
|    20|  RESEARCH|  DALLAS| 7566| JONES|  MANAGER|7839|04/02/2001|2975|null|    20|
|    20|  RESEARCH|  DALLAS| 7902|  FORD|  ANALYST|7566|02/03/2001|3000|null|    20|
|    20|  RESEARCH|  DALLAS| 7369| SMITH|    CLERK|7902|12/17/2000| 800|null|    20|
|    20|  RESEARCH|  DALLAS| 7788| SCOTT|  ANALYST|7566|04/19/2007|3000|null|    20|
|    20|  RESEARCH|  DALLAS| 7876| ADAMS|    CLERK|7788|05/23/2007|1100|null|    20|
|    30|     SALES| CHICAGO| 7698| BLAKE|  MANAGER|7839|05/01/200

In [None]:
# Same inner join as the earlier cell (this cell has no recorded execution);
# running it leaves res04 bound to the inner-join result.
res04 = dept.join(emp, on=dept["deptno"] == emp["deptno"], how='inner')
res04.show()

In [58]:
# Distinct job titles among employees of department 10.
df03 = (
    emp
    .where("deptno==10")
    .select("job")
    .distinct()
)
df03.show()

+---------+
|      job|
+---------+
|    CLERK|
|  MANAGER|
|PRESIDENT|
+---------+



In [66]:
# Distinct job titles among employees of department 20.
df04 = (
    emp
    .where("deptno==20")
    .select("job")
    .distinct()
)
df04.show()

+-------+
|    job|
+-------+
|ANALYST|
|  CLERK|
|MANAGER|
+-------+



In [68]:
# Append df04's rows to df03's; jobs present in both (CLERK, MANAGER) appear twice.
df03.unionAll(df04).show()

+---------+
|      job|
+---------+
|    CLERK|
|  MANAGER|
|PRESIDENT|
|  ANALYST|
|    CLERK|
|  MANAGER|
+---------+



In [71]:
df03.union(df04).show()   # same result as unionAll: duplicate rows are kept

+---------+
|      job|
+---------+
|    CLERK|
|  MANAGER|
|PRESIDENT|
|  ANALYST|
|    CLERK|
|  MANAGER|
+---------+



In [70]:
df03.union(df04).distinct().show()   # distinct() removes the duplicate rows that union keeps

+---------+
|      job|
+---------+
|    CLERK|
|  MANAGER|
|PRESIDENT|
|  ANALYST|
+---------+



In [72]:
# Jobs that occur in both department 10 and department 20.
df03.intersect(df04).show()

+-------+
|    job|
+-------+
|  CLERK|
|MANAGER|
+-------+



In [73]:
# Jobs found in department 10 but not in department 20.
df03.subtract(df04).show()

+---------+
|      job|
+---------+
|PRESIDENT|
+---------+



In [74]:
# Jobs found in department 20 but not in department 10 (subtract is not symmetric).
df04.subtract(df03).show()

+-------+
|    job|
+-------+
|ANALYST|
+-------+



In [76]:
import pyspark.sql.functions as f