In [2]:
from  pyspark.sql  import SparkSession
from pyspark import SQLContext,SparkContext
from datetime import datetime
import time
import pyspark.sql.types as types
import pyspark.sql.functions as f
from pyspark.sql.window import Window as window
from pyspark.sql.functions import current_date,datediff,to_date,months_between,length,year, month, dayofmonth

In [3]:
spark = SparkSession.builder \
     .master('local[1]') \
     .appName("Pyspark emp dept tasks") \
     .getOrCreate()

In [4]:
sqlContext = SQLContext(spark)

In [5]:
emp_schema = types.StructType([
    types.StructField("empno", types.StringType(), True),
    types.StructField("ename", types.StringType(), True),
    types.StructField("job", types.StringType(), True),
    types.StructField("mgr", types.StringType(), True),
    types.StructField("hiredate", types.StringType(), True),
    types.StructField("sal", types.StringType(), True),
    types.StructField("comm", types.StringType(), True),
    types.StructField("deptno", types.StringType(), True)])

In [6]:
dept_schema =  types.StructType([
    types.StructField("deptno", types.StringType(), True),
    types.StructField("dname", types.StringType(), True),
    types.StructField("loc", types.StringType(), True)])

In [7]:
emp = sqlContext.read.csv('data/emp.csv',schema=emp_schema)
emp[[emp.empno.cast('int'),
     emp.ename,
     emp.job,
     emp.mgr.cast('int'),
     emp.hiredate.cast('date'),
    emp.sal.cast("double"),
    emp.comm.cast("double"),
    emp.deptno.cast('int')]].show()

+-----+------+---------+----+----------+------+------+------+
|empno| ename|      job| mgr|  hiredate|   sal|  comm|deptno|
+-----+------+---------+----+----------+------+------+------+
| 7369| SMITH|    CLERK|7902|1980-12-17| 800.0|  null|    20|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600.0| 300.0|    30|
| 7521|  WARD| SALESMAN|7698|1981-02-22|1250.0| 500.0|    30|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975.0|  null|    20|
| 7654|MARTIN| SALESMAN|7698|1981-09-28|1250.0|1400.0|    30|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850.0|  null|    30|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450.0|  null|    10|
| 7788| SCOTT|  ANALYST|7566|1982-12-09|3000.0|  null|    20|
| 7839|  KING|PRESIDENT|null|1981-11-17|5000.0|  null|    10|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500.0|   0.0|    30|
| 7876| ADAMS|    CLERK|7788|1983-01-12|1100.0|  null|    20|
| 7900| JAMES|    CLERK|7698|1981-12-03| 950.0|  null|    30|
| 7902|  FORD|  ANALYST|7566|1981-12-03|3000.0|  null|    20|
| 7934|M

In [8]:
dept = sqlContext.read.csv('data/dept.csv',schema=dept_schema)
dept[[dept.deptno.cast('int'),dept.dname,dept.loc]].show()

+------+----------+--------+
|deptno|     dname|     loc|
+------+----------+--------+
|    10|ACCOUNTING|NEW YORK|
|    20|  RESEARCH|  DALLAS|
|    30|     SALES| CHICAGO|
|    40|OPERATIONS|  BOSTON|
+------+----------+--------+



In [8]:
emp_dept = emp.join(dept,on='deptno',how='inner').show()

+------+-----+------+---------+----+----------+-------+-------+----------+--------+
|deptno|empno| ename|      job| mgr|  hiredate|    sal|   comm|     dname|     loc|
+------+-----+------+---------+----+----------+-------+-------+----------+--------+
|    20| 7369| SMITH|    CLERK|7902|1980-12-17| 800.00|   null|  RESEARCH|  DALLAS|
|    30| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600.00| 300.00|     SALES| CHICAGO|
|    30| 7521|  WARD| SALESMAN|7698|1981-02-22|1250.00| 500.00|     SALES| CHICAGO|
|    20| 7566| JONES|  MANAGER|7839|1981-04-02|2975.00|   null|  RESEARCH|  DALLAS|
|    30| 7654|MARTIN| SALESMAN|7698|1981-09-28|1250.00|1400.00|     SALES| CHICAGO|
|    30| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850.00|   null|     SALES| CHICAGO|
|    10| 7782| CLARK|  MANAGER|7839|1981-06-09|2450.00|   null|ACCOUNTING|NEW YORK|
|    20| 7788| SCOTT|  ANALYST|7566|1982-12-09|3000.00|   null|  RESEARCH|  DALLAS|
|    10| 7839|  KING|PRESIDENT|null|1981-11-17|5000.00|   null|ACCOUNTING|NE

In [9]:
#Display unique Jobs from EMP table? 
emp[["job"]].distinct().show()

+---------+
|      job|
+---------+
|  ANALYST|
| SALESMAN|
|    CLERK|
|  MANAGER|
|PRESIDENT|
+---------+



In [10]:
#List the emps in the asc order of their Salaries? 
emp.sort(emp.sal.cast('double'),ascending=True).show()

+-----+------+---------+----+----------+-------+-------+------+
|empno| ename|      job| mgr|  hiredate|    sal|   comm|deptno|
+-----+------+---------+----+----------+-------+-------+------+
| 7369| SMITH|    CLERK|7902|1980-12-17| 800.00|   null|    20|
| 7900| JAMES|    CLERK|7698|1981-12-03| 950.00|   null|    30|
| 7876| ADAMS|    CLERK|7788|1983-01-12|1100.00|   null|    20|
| 7654|MARTIN| SALESMAN|7698|1981-09-28|1250.00|1400.00|    30|
| 7521|  WARD| SALESMAN|7698|1981-02-22|1250.00| 500.00|    30|
| 7934|MILLER|    CLERK|7782|1982-01-23|1300.00|   null|    10|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500.00|   0.00|    30|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600.00| 300.00|    30|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450.00|   null|    10|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850.00|   null|    30|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975.00|   null|    20|
| 7788| SCOTT|  ANALYST|7566|1982-12-09|3000.00|   null|    20|
| 7902|  FORD|  ANALYST|7566|1981-12-03|

In [11]:
#List the details of the emps in asc order of the Dptnos and desc of Jobs?
emp.sort(emp.deptno.cast('int'),emp.job,ascending=[True,False]).show()

+-----+------+---------+----+----------+-------+-------+------+
|empno| ename|      job| mgr|  hiredate|    sal|   comm|deptno|
+-----+------+---------+----+----------+-------+-------+------+
| 7839|  KING|PRESIDENT|null|1981-11-17|5000.00|   null|    10|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450.00|   null|    10|
| 7934|MILLER|    CLERK|7782|1982-01-23|1300.00|   null|    10|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975.00|   null|    20|
| 7369| SMITH|    CLERK|7902|1980-12-17| 800.00|   null|    20|
| 7876| ADAMS|    CLERK|7788|1983-01-12|1100.00|   null|    20|
| 7902|  FORD|  ANALYST|7566|1981-12-03|3000.00|   null|    20|
| 7788| SCOTT|  ANALYST|7566|1982-12-09|3000.00|   null|    20|
| 7521|  WARD| SALESMAN|7698|1981-02-22|1250.00| 500.00|    30|
| 7654|MARTIN| SALESMAN|7698|1981-09-28|1250.00|1400.00|    30|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500.00|   0.00|    30|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600.00| 300.00|    30|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|

In [12]:
#Display all the unique job groups in the descending order? 
emp.select("job").distinct().sort('job',ascending=False).show()

+---------+
|      job|
+---------+
| SALESMAN|
|PRESIDENT|
|  MANAGER|
|    CLERK|
|  ANALYST|
+---------+



In [13]:
#Display all the details of all ‘Mgrs’ 
mgr_df  = emp.filter(emp.mgr.cast('int') != 0)[[emp.mgr.cast('int')]].distinct()
emp.join(mgr_df,emp.empno.cast('int') == mgr_df.mgr,how='inner').show()

+-----+-----+---------+----+----------+-------+----+------+----+
|empno|ename|      job| mgr|  hiredate|    sal|comm|deptno| mgr|
+-----+-----+---------+----+----------+-------+----+------+----+
| 7566|JONES|  MANAGER|7839|1981-04-02|2975.00|null|    20|7566|
| 7698|BLAKE|  MANAGER|7839|1981-05-01|2850.00|null|    30|7698|
| 7782|CLARK|  MANAGER|7839|1981-06-09|2450.00|null|    10|7782|
| 7788|SCOTT|  ANALYST|7566|1982-12-09|3000.00|null|    20|7788|
| 7839| KING|PRESIDENT|null|1981-11-17|5000.00|null|    10|7839|
| 7902| FORD|  ANALYST|7566|1981-12-03|3000.00|null|    20|7902|
+-----+-----+---------+----+----------+-------+----+------+----+



In [14]:
#List the emps who joined before 1981. 
emp.filter(emp.hiredate[0:4].cast('int')<1981).show()

+-----+-----+-----+----+----------+------+----+------+
|empno|ename|  job| mgr|  hiredate|   sal|comm|deptno|
+-----+-----+-----+----+----------+------+----+------+
| 7369|SMITH|CLERK|7902|1980-12-17|800.00|null|    20|
+-----+-----+-----+----+----------+------+----+------+



In [15]:
#Display the Empno, Ename, job, Hiredate, Exp of all Mgrs 
mgr_df  = emp.filter(emp.mgr.cast('int') != 0)[[emp.mgr.cast('int')]].distinct()
mgr_filter = emp.join(mgr_df,emp.empno.cast('int') == mgr_df.mgr,how='inner')[['empno','ename','job','hiredate']]

In [16]:
mgr_filter[['empno','ename','job','hiredate',(months_between(current_date(),
                           mgr_filter.hiredate)/12).alias("years of exp")]].show()

+-----+-----+---------+----------+------------------+
|empno|ename|      job|  hiredate|      years of exp|
+-----+-----+---------+----------+------------------+
| 7566|JONES|  MANAGER|1981-04-02| 39.12903225833333|
| 7698|BLAKE|  MANAGER|1981-05-01|39.048387096666666|
| 7782|CLARK|  MANAGER|1981-06-09|     38.9435483875|
| 7788|SCOTT|  ANALYST|1982-12-09|     37.4435483875|
| 7839| KING|PRESIDENT|1981-11-17| 38.50537634416667|
| 7902| FORD|  ANALYST|1981-12-03| 38.45967741916667|
+-----+-----+---------+----------+------------------+



In [17]:
#List the emps in the asc order of Designations of those joined after the second halfof 1981. 
emp.where((emp.hiredate[6:2].cast('int') > 6) & 
          (emp.hiredate[0:4].cast('int') == 1981)).sort('job',ascending=True).show()

+-----+------+---------+----+----------+-------+-------+------+
|empno| ename|      job| mgr|  hiredate|    sal|   comm|deptno|
+-----+------+---------+----+----------+-------+-------+------+
| 7902|  FORD|  ANALYST|7566|1981-12-03|3000.00|   null|    20|
| 7900| JAMES|    CLERK|7698|1981-12-03| 950.00|   null|    30|
| 7839|  KING|PRESIDENT|null|1981-11-17|5000.00|   null|    10|
| 7654|MARTIN| SALESMAN|7698|1981-09-28|1250.00|1400.00|    30|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500.00|   0.00|    30|
+-----+------+---------+----+----------+-------+-------+------+



In [18]:
#. List the emps who are either ‘CLERK’ or ‘ANALYST’ in the Desc order. 
emp.filter(emp.job.isin('CLERK' ,'ANALYST')).sort('job').show()
emp.filter((emp.job=='CLERK') | (emp.job=='ANALYST')).sort(emp.job).show()

+-----+------+-------+----+----------+-------+----+------+
|empno| ename|    job| mgr|  hiredate|    sal|comm|deptno|
+-----+------+-------+----+----------+-------+----+------+
| 7902|  FORD|ANALYST|7566|1981-12-03|3000.00|null|    20|
| 7788| SCOTT|ANALYST|7566|1982-12-09|3000.00|null|    20|
| 7369| SMITH|  CLERK|7902|1980-12-17| 800.00|null|    20|
| 7876| ADAMS|  CLERK|7788|1983-01-12|1100.00|null|    20|
| 7934|MILLER|  CLERK|7782|1982-01-23|1300.00|null|    10|
| 7900| JAMES|  CLERK|7698|1981-12-03| 950.00|null|    30|
+-----+------+-------+----+----------+-------+----+------+

+-----+------+-------+----+----------+-------+----+------+
|empno| ename|    job| mgr|  hiredate|    sal|comm|deptno|
+-----+------+-------+----+----------+-------+----+------+
| 7902|  FORD|ANALYST|7566|1981-12-03|3000.00|null|    20|
| 7788| SCOTT|ANALYST|7566|1982-12-09|3000.00|null|    20|
| 7369| SMITH|  CLERK|7902|1980-12-17| 800.00|null|    20|
| 7876| ADAMS|  CLERK|7788|1983-01-12|1100.00|null|    

In [19]:
#List the Enames those are having five characters in their Names. 
emp.filter(length(emp.ename)==5).show()

+-----+-----+--------+----+----------+-------+------+------+
|empno|ename|     job| mgr|  hiredate|    sal|  comm|deptno|
+-----+-----+--------+----+----------+-------+------+------+
| 7369|SMITH|   CLERK|7902|1980-12-17| 800.00|  null|    20|
| 7499|ALLEN|SALESMAN|7698|1981-02-20|1600.00|300.00|    30|
| 7566|JONES| MANAGER|7839|1981-04-02|2975.00|  null|    20|
| 7698|BLAKE| MANAGER|7839|1981-05-01|2850.00|  null|    30|
| 7782|CLARK| MANAGER|7839|1981-06-09|2450.00|  null|    10|
| 7788|SCOTT| ANALYST|7566|1982-12-09|3000.00|  null|    20|
| 7876|ADAMS|   CLERK|7788|1983-01-12|1100.00|  null|    20|
| 7900|JAMES|   CLERK|7698|1981-12-03| 950.00|  null|    30|
+-----+-----+--------+----+----------+-------+------+------+



In [20]:
#List the Enames those are starting with ‘S’ and with five characters
emp.filter((emp.ename.startswith('S')) & (length(emp.ename)==5)).show()

+-----+-----+-------+----+----------+-------+----+------+
|empno|ename|    job| mgr|  hiredate|    sal|comm|deptno|
+-----+-----+-------+----+----------+-------+----+------+
| 7369|SMITH|  CLERK|7902|1980-12-17| 800.00|null|    20|
| 7788|SCOTT|ANALYST|7566|1982-12-09|3000.00|null|    20|
+-----+-----+-------+----+----------+-------+----+------+



In [21]:
#List the emps those are having four chars and third character must be ‘r’. 
emp.filter((emp.ename[3:1].startswith('R')) & (length(emp.ename)==4)).show()

+-----+-----+--------+----+----------+-------+------+------+
|empno|ename|     job| mgr|  hiredate|    sal|  comm|deptno|
+-----+-----+--------+----+----------+-------+------+------+
| 7521| WARD|SALESMAN|7698|1981-02-22|1250.00|500.00|    30|
| 7902| FORD| ANALYST|7566|1981-12-03|3000.00|  null|    20|
+-----+-----+--------+----+----------+-------+------+------+



In [22]:
#List the Five character names starting with ‘S’ and ending with ‘H’
emp.filter((emp.ename.startswith('S')) & (emp.ename.endswith('H'))).show()

+-----+-----+-----+----+----------+------+----+------+
|empno|ename|  job| mgr|  hiredate|   sal|comm|deptno|
+-----+-----+-----+----+----------+------+----+------+
| 7369|SMITH|CLERK|7902|1980-12-17|800.00|null|    20|
+-----+-----+-----+----+----------+------+----+------+



In [25]:
# List the emps who joined in January. 
#emp.select(date_format(emp.hiredate.cast('date'),'MMMMM')).show()
emp.filter(f.date_format(to_date(emp.hiredate,'yyyy-MM-dd'),'MMMMM')=='January').show()

+-----+------+-----+----+----------+-------+----+------+
|empno| ename|  job| mgr|  hiredate|    sal|comm|deptno|
+-----+------+-----+----+----------+-------+----+------+
| 7876| ADAMS|CLERK|7788|1983-01-12|1100.00|null|    20|
| 7934|MILLER|CLERK|7782|1982-01-23|1300.00|null|    10|
+-----+------+-----+----+----------+-------+----+------+



In [27]:
#List all the emps who joined before or after 1981. 
emp.filter(emp.hiredate[0:4].cast('int') != 1981).show()

+-----+------+-------+----+----------+-------+----+------+
|empno| ename|    job| mgr|  hiredate|    sal|comm|deptno|
+-----+------+-------+----+----------+-------+----+------+
| 7369| SMITH|  CLERK|7902|1980-12-17| 800.00|null|    20|
| 7788| SCOTT|ANALYST|7566|1982-12-09|3000.00|null|    20|
| 7876| ADAMS|  CLERK|7788|1983-01-12|1100.00|null|    20|
| 7934|MILLER|  CLERK|7782|1982-01-23|1300.00|null|    10|
+-----+------+-------+----+----------+-------+----+------+



In [28]:
#All mgr's total salary
emp.join(mgr_df,emp.empno == mgr_df.mgr).agg({"sal":"sum","comm":"sum"}).show()

+---------+--------+
|sum(comm)|sum(sal)|
+---------+--------+
|     null| 19275.0|
+---------+--------+



In [29]:
#List the department,details where at least three emps are working 
dept_empl_cnt = emp.groupby("deptno").agg({"empno":"count"}).filter("count(empno)>3")[[emp.deptno.cast('int')]]
emp.join(dept_empl_cnt,emp.deptno == dept_empl_cnt.deptno).show()


+-----+------+--------+----+----------+-------+-------+------+------+
|empno| ename|     job| mgr|  hiredate|    sal|   comm|deptno|deptno|
+-----+------+--------+----+----------+-------+-------+------+------+
| 7369| SMITH|   CLERK|7902|1980-12-17| 800.00|   null|    20|    20|
| 7499| ALLEN|SALESMAN|7698|1981-02-20|1600.00| 300.00|    30|    30|
| 7521|  WARD|SALESMAN|7698|1981-02-22|1250.00| 500.00|    30|    30|
| 7566| JONES| MANAGER|7839|1981-04-02|2975.00|   null|    20|    20|
| 7654|MARTIN|SALESMAN|7698|1981-09-28|1250.00|1400.00|    30|    30|
| 7698| BLAKE| MANAGER|7839|1981-05-01|2850.00|   null|    30|    30|
| 7788| SCOTT| ANALYST|7566|1982-12-09|3000.00|   null|    20|    20|
| 7844|TURNER|SALESMAN|7698|1981-09-08|1500.00|   0.00|    30|    30|
| 7876| ADAMS|   CLERK|7788|1983-01-12|1100.00|   null|    20|    20|
| 7900| JAMES|   CLERK|7698|1981-12-03| 950.00|   null|    30|    30|
| 7902|  FORD| ANALYST|7566|1981-12-03|3000.00|   null|    20|    20|
+-----+------+------

In [40]:
# List the details of the department where maximum number of emps are working. 
max_tot_emp = emp.groupby("deptno"). \
    agg(f.count(emp.empno). \
    alias("tot_employees")). \
    agg(f.max('tot_employees'). \
    alias('max_tot_emp'))

In [49]:
dept_agg = emp.groupby("deptno").agg(f.count(emp.empno).alias("tot_emp"))
dept_agg.join(max_tot_emp,dept_agg.tot_emp==max_tot_emp.max_tot_emp)[[dept_agg.deptno]].show()

+------+
|deptno|
+------+
|    30|
+------+



In [40]:
#window functions
#creating df using partition by
emp_partition = window.partitionBy(emp.deptno)

In [49]:
#Add extra column to dataframe using withColumn
# Equals SQL query
# select emp.*,rank() over(partition By emp.deptno order by emp.empno) as dept_rnk
# from emp
emp_win = emp.withColumn("dept_rnk",f.rank().over(emp_partition.orderBy(emp.empno.cast('int'))))
emp_win.show()

+-----+------+---------+----+----------+-------+-------+------+--------+
|empno| ename|      job| mgr|  hiredate|    sal|   comm|deptno|dept_rnk|
+-----+------+---------+----+----------+-------+-------+------+--------+
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600.00| 300.00|    30|       1|
| 7521|  WARD| SALESMAN|7698|1981-02-22|1250.00| 500.00|    30|       2|
| 7654|MARTIN| SALESMAN|7698|1981-09-28|1250.00|1400.00|    30|       3|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850.00|   null|    30|       4|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500.00|   0.00|    30|       5|
| 7900| JAMES|    CLERK|7698|1981-12-03| 950.00|   null|    30|       6|
| 7369| SMITH|    CLERK|7902|1980-12-17| 800.00|   null|    20|       1|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975.00|   null|    20|       2|
| 7788| SCOTT|  ANALYST|7566|1982-12-09|3000.00|   null|    20|       3|
| 7876| ADAMS|    CLERK|7788|1983-01-12|1100.00|   null|    20|       4|
| 7902|  FORD|  ANALYST|7566|1981-12-03|3000.00|   

In [48]:
#Add multiple group by columns to dataframe
#Equals SQL query
# select emp.*,count(emp.empno) over(partition By emp.deptno) as dept_emp_cnt,
# max(emp.sal) over(partition By emp.deptno) as dept_max_sal
# from emp
emp_cnt = emp.withColumn("dept_emp_cnt",f.count(emp.empno).over(emp_partition)). \
    withColumn("dept_max_sal",f.max(emp.sal.cast('double')).over(emp_partition))
emp_cnt.show()

+-----+------+---------+----+----------+-------+-------+------+------------+------------+
|empno| ename|      job| mgr|  hiredate|    sal|   comm|deptno|dept_emp_cnt|dept_max_sal|
+-----+------+---------+----+----------+-------+-------+------+------------+------------+
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600.00| 300.00|    30|           6|      2850.0|
| 7521|  WARD| SALESMAN|7698|1981-02-22|1250.00| 500.00|    30|           6|      2850.0|
| 7654|MARTIN| SALESMAN|7698|1981-09-28|1250.00|1400.00|    30|           6|      2850.0|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850.00|   null|    30|           6|      2850.0|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500.00|   0.00|    30|           6|      2850.0|
| 7900| JAMES|    CLERK|7698|1981-12-03| 950.00|   null|    30|           6|      2850.0|
| 7369| SMITH|    CLERK|7902|1980-12-17| 800.00|   null|    20|           5|      3000.0|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975.00|   null|    20|           5|      3000.0|
| 7788| SC

In [53]:
#Equals SQL query
# select emp.*,lag(emp.sal) over(partition by emp.deptno order by emp.empno) as prev_sal
# ,lead(emp.sal) over(partition by emp.deptno order by emp.empno) as next_sal
# from emp
emp_lag_lead = emp.withColumn("prev_sal",f.lag(emp.sal.cast('double')).over(emp_partition.orderBy(emp.empno))). \
    withColumn("next_sal",f.lead(emp.sal.cast('double')).over(emp_partition.orderBy(emp.empno)))
emp_lag_lead.show()

+-----+------+---------+----+----------+-------+-------+------+--------+--------+
|empno| ename|      job| mgr|  hiredate|    sal|   comm|deptno|prev_sal|next_sal|
+-----+------+---------+----+----------+-------+-------+------+--------+--------+
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600.00| 300.00|    30|    null|  1250.0|
| 7521|  WARD| SALESMAN|7698|1981-02-22|1250.00| 500.00|    30|  1600.0|  1250.0|
| 7654|MARTIN| SALESMAN|7698|1981-09-28|1250.00|1400.00|    30|  1250.0|  2850.0|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850.00|   null|    30|  1250.0|  1500.0|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500.00|   0.00|    30|  2850.0|   950.0|
| 7900| JAMES|    CLERK|7698|1981-12-03| 950.00|   null|    30|  1500.0|    null|
| 7369| SMITH|    CLERK|7902|1980-12-17| 800.00|   null|    20|    null|  2975.0|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975.00|   null|    20|   800.0|  3000.0|
| 7788| SCOTT|  ANALYST|7566|1982-12-09|3000.00|   null|    20|  2975.0|  1100.0|
| 7876| ADAMS|  

In [57]:
#get each dept max salary employee details
# Equals SQL Query
#select * from ( select emp.*,max(emp.sal) over(partition by emp.deptno) as max_sal
# from emp) tab
# where sal = max_sal;
emp.withColumn( \
    "dept_emp_cnt",f.count(emp.empno).over(emp_partition)). \
    withColumn( \
    "dept_max_sal",f.max(emp.sal.cast('double')).over(emp_partition)). \
    filter('sal == dept_max_sal').show()

+-----+-----+---------+----+----------+-------+----+------+------------+------------+
|empno|ename|      job| mgr|  hiredate|    sal|comm|deptno|dept_emp_cnt|dept_max_sal|
+-----+-----+---------+----+----------+-------+----+------+------------+------------+
| 7698|BLAKE|  MANAGER|7839|1981-05-01|2850.00|null|    30|           6|      2850.0|
| 7788|SCOTT|  ANALYST|7566|1982-12-09|3000.00|null|    20|           5|      3000.0|
| 7902| FORD|  ANALYST|7566|1981-12-03|3000.00|null|    20|           5|      3000.0|
| 7839| KING|PRESIDENT|null|1981-11-17|5000.00|null|    10|           3|      5000.0|
+-----+-----+---------+----+----------+-------+----+------+------------+------------+



In [14]:
#Dataframe if else conditions(equalent to case when condition then value else value end case)
emp.withColumn("commission",
               f.when(emp.comm == 'null',0.0).
               otherwise(emp.comm.cast('float'))).show()

+-----+------+---------+----+----------+-------+-------+------+----------+
|empno| ename|      job| mgr|  hiredate|    sal|   comm|deptno|commission|
+-----+------+---------+----+----------+-------+-------+------+----------+
| 7369| SMITH|    CLERK|7902|1980-12-17| 800.00|   null|    20|       0.0|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600.00| 300.00|    30|     300.0|
| 7521|  WARD| SALESMAN|7698|1981-02-22|1250.00| 500.00|    30|     500.0|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975.00|   null|    20|       0.0|
| 7654|MARTIN| SALESMAN|7698|1981-09-28|1250.00|1400.00|    30|    1400.0|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850.00|   null|    30|       0.0|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450.00|   null|    10|       0.0|
| 7788| SCOTT|  ANALYST|7566|1982-12-09|3000.00|   null|    20|       0.0|
| 7839|  KING|PRESIDENT|null|1981-11-17|5000.00|   null|    10|       0.0|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500.00|   0.00|    30|       0.0|
| 7876| ADAMS|    CLERK|7

In [16]:
data = [("James ","","Smith","36636","M",60000),
        ("Michael ","Rose","","40288","M",70000),
        ("Robert ","","Williams","42114","",400000),
        ("Maria ","Anne","Jones","39192","F",500000),
        ("Jen","Mary","Brown","","F",0)]

In [24]:
cols = ("first_name","middle_name","last_name","dob","gender","salary")
df = spark.createDataFrame(data,cols)
df.printSchema()

root
 |-- first_name: string (nullable = true)
 |-- middle_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



In [32]:
df.withColumn("new_gender",f.expr("case when gender = 'M' then 'Male' " +
                   "when gender = 'F' then 'Female' " +
                   "else 'Unknown' end")).show()

+----------+-----------+---------+-----+------+------+----------+
|first_name|middle_name|last_name|  dob|gender|salary|new_gender|
+----------+-----------+---------+-----+------+------+----------+
|    James |           |    Smith|36636|     M| 60000|      Male|
|  Michael |       Rose|         |40288|     M| 70000|      Male|
|   Robert |           | Williams|42114|      |400000|   Unknown|
|    Maria |       Anne|    Jones|39192|     F|500000|    Female|
|       Jen|       Mary|    Brown|     |     F|     0|    Female|
+----------+-----------+---------+-----+------+------+----------+



In [39]:
cond ="""case when gender = 'M' then 'Male'  
                    when gender = 'F' then 'Female' 
                    else 'Unknown' end as new_gender"""
df.selectExpr("*", cond).show()

+----------+-----------+---------+-----+------+------+----------+
|first_name|middle_name|last_name|  dob|gender|salary|new_gender|
+----------+-----------+---------+-----+------+------+----------+
|    James |           |    Smith|36636|     M| 60000|      Male|
|  Michael |       Rose|         |40288|     M| 70000|      Male|
|   Robert |           | Williams|42114|      |400000|   Unknown|
|    Maria |       Anne|    Jones|39192|     F|500000|    Female|
|       Jen|       Mary|    Brown|     |     F|     0|    Female|
+----------+-----------+---------+-----+------+------+----------+



In [40]:
df.add

[0;31mSignature:[0m [0mdf[0m[0;34m.[0m[0mdrop[0m[0;34m([0m[0;34m*[0m[0mcols[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Returns a new :class:`DataFrame` that drops the specified column.
This is a no-op if schema doesn't contain the given column name(s).

:param cols: a string name of the column to drop, or a
    :class:`Column` to drop, or a list of string name of the columns to drop.

>>> df.drop('age').collect()
[Row(name='Alice'), Row(name='Bob')]

>>> df.drop(df.age).collect()
[Row(name='Alice'), Row(name='Bob')]

>>> df.join(df2, df.name == df2.name, 'inner').drop(df.name).collect()
[Row(age=5, height=85, name='Bob')]

>>> df.join(df2, df.name == df2.name, 'inner').drop(df2.name).collect()
[Row(age=5, name='Bob', height=85)]

>>> df.join(df2, 'name', 'inner').drop('age', 'height').collect()
[Row(name='Bob')]

.. versionadded:: 1.4
[0;31mFile:[0m      /usr/hdp/current/spark2-client/python/pyspark/sql/dataframe.py
[0;31mType:[0m      method


In [9]:
#Explain plan of DataFrame
emp.explain(True)

== Parsed Logical Plan ==
Relation[empno#0,ename#1,job#2,mgr#3,hiredate#4,sal#5,comm#6,deptno#7] csv

== Analyzed Logical Plan ==
empno: string, ename: string, job: string, mgr: string, hiredate: string, sal: string, comm: string, deptno: string
Relation[empno#0,ename#1,job#2,mgr#3,hiredate#4,sal#5,comm#6,deptno#7] csv

== Optimized Logical Plan ==
Relation[empno#0,ename#1,job#2,mgr#3,hiredate#4,sal#5,comm#6,deptno#7] csv

== Physical Plan ==
*(1) FileScan csv [empno#0,ename#1,job#2,mgr#3,hiredate#4,sal#5,comm#6,deptno#7] Batched: false, Format: CSV, Location: InMemoryFileIndex[hdfs://nn01.itversity.com:8020/user/rposam2020/data/emp.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<empno:string,ename:string,job:string,mgr:string,hiredate:string,sal:string,comm:string,dep...


In [10]:
print(emp._jdf.queryExecution().stringWithStats())

== Optimized Logical Plan ==
Relation[empno#0,ename#1,job#2,mgr#3,hiredate#4,sal#5,comm#6,deptno#7] csv, Statistics(sizeInBytes=717.0 B, hints=none)

== Physical Plan ==
*(1) FileScan csv [empno#0,ename#1,job#2,mgr#3,hiredate#4,sal#5,comm#6,deptno#7] Batched: false, Format: CSV, Location: InMemoryFileIndex[hdfs://nn01.itversity.com:8020/user/rposam2020/data/emp.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<empno:string,ename:string,job:string,mgr:string,hiredate:string,sal:string,comm:string,dep...


In [41]:
spark.stop()