In [1]:
# Tricky question numbers good to refer below: 

# 101, 102, 103, 104, 105, 106, 107, 108, 120, 121, 123, 124, 125, 130, 131, 139, 141, 148, 149, 150
# 151, 154, 159, 160, 161, 164, 165, 178, 179, 180, 194, 197, 206(*), 216, 218, 219, 224, 226

In [2]:
from pyspark import SparkContext, SparkConf
from pyspark import sql
from pyspark.sql import Row                       # To use Row method for column
#from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql import Window
from pyspark.sql.types import *

In [3]:
conf = SparkConf()
sc = SparkContext(conf=conf)
sqlContext = sql.SQLContext(sc)

In [4]:
empRDD = sc.textFile("emp.txt",4)
depRDD = sc.textFile("dept.txt")
salRDD = sc.textFile("salgrade.txt")

In [5]:
emp_col = Row('EMPNO', 'ENAME', 'JOB', 'MGR', 'HIREDATE', 'SAL', 'COMM', 'DEPTNO')
dep_col = Row('DEPTNO', 'DNAME', 'DLOC')
sal_col = Row('GRADE', 'LOSAL', 'HISAL')

In [6]:
empROW = empRDD.map(lambda x: x.split(',')).map(lambda r: emp_col(*r))
depRow = depRDD.map(lambda x: x.split(',')).map(lambda r: dep_col(*r))
salROW = salRDD.map(lambda x: x.split(',')).map(lambda r: sal_col(*r))

In [7]:
empDF = sqlContext.createDataFrame(empROW)
depDF = sqlContext.createDataFrame(depRow)
salDF = sqlContext.createDataFrame(salROW)

In [8]:
def toInt(i):
    return i.cast("integer")

def toDouble(d):
    return d.cast("double")

emp = empDF.withColumn("EMPNO", toInt(empDF.EMPNO)).withColumn("MGR", toInt(empDF.MGR)).withColumn("HIREDATE",to_date('HIREDATE')).withColumn("SAL", toDouble(empDF.SAL)).withColumn('COMM', when(empDF.COMM == 'NULL', lit(None)).otherwise(empDF.COMM)).withColumn("DEPTNO", toInt(empDF.DEPTNO)).filter('ENAME != "ENAME"')
dept = depDF.withColumn("DEPTNO", toInt(depDF.DEPTNO))
sal = salDF.withColumn("GRADE", toInt(salDF.GRADE)).withColumn("LOSAL", toDouble(salDF.LOSAL)).withColumn("HISAL", toDouble(salDF.HISAL)).na.drop()

sqlContext.registerDataFrameAsTable(emp, "emptab")
sqlContext.registerDataFrameAsTable(dept, "depttab")
sqlContext.registerDataFrameAsTable(sal, "saltab")

In [9]:
# 101 List the emp whose sal<his manager but more than any other manager.

e = emp.alias("e")
m = emp.alias("m")

cond = [col("e.MGR") == col("m.EMPNO"), col("e.SAL") > col("m.SAL"), col("e.SAL") < col("m.SAL")]

e.join(m, cond).select("e.*").show()

+-----+-----+---+---+--------+---+----+------+
|EMPNO|ENAME|JOB|MGR|HIREDATE|SAL|COMM|DEPTNO|
+-----+-----+---+---+--------+---+----+------+
+-----+-----+---+---+--------+---+----+------+



In [10]:
# 102 List the employee names and his average salary department wise.

empAvgSalDept = emp.join((emp.groupBy("DEPTNO").agg(avg("SAL").alias("avgSAL"))), "DEPTNO")

empAvgSalDept.select("ENAME", "DEPTNO", "avgSAL").show()

+------+------+------------------+
| ENAME|DEPTNO|            avgSAL|
+------+------+------------------+
| SMITH|    20|            2175.0|
| JONES|    20|            2175.0|
| SCOTT|    20|            2175.0|
| ADAMS|    20|            2175.0|
|  FORD|    20|            2175.0|
| CLARK|    10|2916.6666666666665|
|  KING|    10|2916.6666666666665|
|MILLER|    10|2916.6666666666665|
| ALLEN|    30|1566.6666666666667|
|  WARD|    30|1566.6666666666667|
|MARTIN|    30|1566.6666666666667|
| BLAKE|    30|1566.6666666666667|
|TURNER|    30|1566.6666666666667|
| JAMES|    30|1566.6666666666667|
+------+------+------------------+



In [11]:
# 103. Find out least 5 earners of the company

emp.withColumn("rank", dense_rank().over(Window.orderBy(asc("SAL")))).filter("rank < 6").show()

+-----+------+--------+----+----------+------+-------+------+----+
|EMPNO| ENAME|     JOB| MGR|  HIREDATE|   SAL|   COMM|DEPTNO|rank|
+-----+------+--------+----+----------+------+-------+------+----+
| 7369| SMITH|   CLERK|7902|1980-12-17| 800.0|   null|    20|   1|
| 7900| JAMES|   CLERK|7698|1981-12-03| 950.0|   null|    30|   2|
| 7876| ADAMS|   CLERK|7788|1983-01-12|1100.0|   null|    20|   3|
| 7521|  WARD|SALESMAN|7698|1981-02-22|1250.0| 500.00|    30|   4|
| 7654|MARTIN|SALESMAN|7698|1981-09-28|1250.0|1400.00|    30|   4|
| 7934|MILLER|   CLERK|7782|1982-01-23|1300.0|   null|    10|   5|
+-----+------+--------+----+----------+------+-------+------+----+



In [12]:
# 104. Find out emps whose salaries greater than salaries of their managers.

emp.alias("e").join(emp.alias("m"), [col("e.MGR") == col("m.EMPNO"), col("e.SAL") > col("m.SAL")]).show()

+-----+-----+-------+----+----------+------+----+------+-----+-----+-------+----+----------+------+----+------+
|EMPNO|ENAME|    JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|EMPNO|ENAME|    JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+-----+-------+----+----------+------+----+------+-----+-----+-------+----+----------+------+----+------+
| 7788|SCOTT|ANALYST|7566|1982-12-09|3000.0|null|    20| 7566|JONES|MANAGER|7839|1981-04-02|2975.0|null|    20|
| 7902| FORD|ANALYST|7566|1981-12-03|3000.0|null|    20| 7566|JONES|MANAGER|7839|1981-04-02|2975.0|null|    20|
+-----+-----+-------+----+----------+------+----+------+-----+-----+-------+----+----------+------+----+------+



In [13]:
# 105. List the managers who are not working under the president

emp.alias("e").join(emp.alias("m"), col("e.MGR") == col("m.EMPNO"))\
.select("m.*").filter(col("e.MGR") != 7839).distinct().show()

+-----+-----+-------+----+----------+------+----+------+
|EMPNO|ENAME|    JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+-----+-------+----+----------+------+----+------+
| 7788|SCOTT|ANALYST|7566|1982-12-09|3000.0|null|    20|
| 7902| FORD|ANALYST|7566|1981-12-03|3000.0|null|    20|
| 7566|JONES|MANAGER|7839|1981-04-02|2975.0|null|    20|
| 7698|BLAKE|MANAGER|7839|1981-05-01|2850.0|null|    30|
| 7782|CLARK|MANAGER|7839|1981-06-09|2450.0|null|    10|
+-----+-----+-------+----+----------+------+----+------+



In [14]:
# 106. List the records from emp whose deptno isnot in dept.

deptno = dept.select("DEPTNO").rdd.map(lambda x: x[0]).collect()

emp.filter(~(emp.DEPTNO.isin(deptno))).show()

+-----+-----+---+---+--------+---+----+------+
|EMPNO|ENAME|JOB|MGR|HIREDATE|SAL|COMM|DEPTNO|
+-----+-----+---+---+--------+---+----+------+
+-----+-----+---+---+--------+---+----+------+



In [15]:
# 107. List the Name , Salary, Comm and Net Pay is more than any other employee.

emp.alias("e").join(emp.alias("a"), [col("e.SAL") > col("a.SAL")]).select("e.*", "a.EMPNO", "a.SAL").show()

+-----+------+--------+----+----------+------+-------+------+-----+------+
|EMPNO| ENAME|     JOB| MGR|  HIREDATE|   SAL|   COMM|DEPTNO|EMPNO|   SAL|
+-----+------+--------+----+----------+------+-------+------+-----+------+
| 7499| ALLEN|SALESMAN|7698|1981-02-20|1600.0| 300.00|    30| 7369| 800.0|
| 7499| ALLEN|SALESMAN|7698|1981-02-20|1600.0| 300.00|    30| 7521|1250.0|
| 7521|  WARD|SALESMAN|7698|1981-02-22|1250.0| 500.00|    30| 7369| 800.0|
| 7499| ALLEN|SALESMAN|7698|1981-02-20|1600.0| 300.00|    30| 7654|1250.0|
| 7499| ALLEN|SALESMAN|7698|1981-02-20|1600.0| 300.00|    30| 7844|1500.0|
| 7499| ALLEN|SALESMAN|7698|1981-02-20|1600.0| 300.00|    30| 7876|1100.0|
| 7521|  WARD|SALESMAN|7698|1981-02-22|1250.0| 500.00|    30| 7876|1100.0|
| 7499| ALLEN|SALESMAN|7698|1981-02-20|1600.0| 300.00|    30| 7900| 950.0|
| 7499| ALLEN|SALESMAN|7698|1981-02-20|1600.0| 300.00|    30| 7934|1300.0|
| 7521|  WARD|SALESMAN|7698|1981-02-22|1250.0| 500.00|    30| 7900| 950.0|
| 7566| JONES| MANAGER|78

In [16]:
# 108.List the Enames who are retiring after 31-Dec-2000 the max Job period is 20Y.

emp.filter(add_months("HIREDATE", 240) > "2000-12-31").show()

+-----+------+---------+----+----------+------+-------+------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE|   SAL|   COMM|DEPTNO|
+-----+------+---------+----+----------+------+-------+------+
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600.0| 300.00|    30|
| 7521|  WARD| SALESMAN|7698|1981-02-22|1250.0| 500.00|    30|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975.0|   null|    20|
| 7654|MARTIN| SALESMAN|7698|1981-09-28|1250.0|1400.00|    30|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850.0|   null|    30|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450.0|   null|    10|
| 7788| SCOTT|  ANALYST|7566|1982-12-09|3000.0|   null|    20|
| 7839|  KING|PRESIDENT|null|1981-11-17|5000.0|   null|    10|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500.0|   0.00|    30|
| 7876| ADAMS|    CLERK|7788|1983-01-12|1100.0|   null|    20|
| 7900| JAMES|    CLERK|7698|1981-12-03| 950.0|   null|    30|
| 7902|  FORD|  ANALYST|7566|1981-12-03|3000.0|   null|    20|
| 7934|MILLER|    CLERK|7782|1982-01-23|1300.0|   null|

In [17]:
#109. List those Emps whose Salary is odd value

emp.filter(emp.SAL%2 == 1).show()

+-----+-----+-------+----+----------+------+----+------+
|EMPNO|ENAME|    JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+-----+-------+----+----------+------+----+------+
| 7566|JONES|MANAGER|7839|1981-04-02|2975.0|null|    20|
+-----+-----+-------+----+----------+------+----+------+



In [18]:
# 110. List the emp’s whose Salary contain 3 digits.

emp.filter(length(emp.SAL) == 5).show() #due double value added 3+2 to contain(. and 0)

+-----+-----+-----+----+----------+-----+----+------+
|EMPNO|ENAME|  JOB| MGR|  HIREDATE|  SAL|COMM|DEPTNO|
+-----+-----+-----+----+----------+-----+----+------+
| 7369|SMITH|CLERK|7902|1980-12-17|800.0|null|    20|
| 7900|JAMES|CLERK|7698|1981-12-03|950.0|null|    30|
+-----+-----+-----+----+----------+-----+----+------+



In [19]:
# 111. List the emps who joined in the month of DEC.

emp.filter(month("HIREDATE") == 12).show()

+-----+-----+-------+----+----------+------+----+------+
|EMPNO|ENAME|    JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+-----+-------+----+----------+------+----+------+
| 7369|SMITH|  CLERK|7902|1980-12-17| 800.0|null|    20|
| 7788|SCOTT|ANALYST|7566|1982-12-09|3000.0|null|    20|
| 7900|JAMES|  CLERK|7698|1981-12-03| 950.0|null|    30|
| 7902| FORD|ANALYST|7566|1981-12-03|3000.0|null|    20|
+-----+-----+-------+----+----------+------+----+------+



In [20]:
# 112. List the emps whose names contains ‘A’.

emp.filter(emp.ENAME.contains('A')).show()

+-----+------+--------+----+----------+------+-------+------+
|EMPNO| ENAME|     JOB| MGR|  HIREDATE|   SAL|   COMM|DEPTNO|
+-----+------+--------+----+----------+------+-------+------+
| 7499| ALLEN|SALESMAN|7698|1981-02-20|1600.0| 300.00|    30|
| 7521|  WARD|SALESMAN|7698|1981-02-22|1250.0| 500.00|    30|
| 7654|MARTIN|SALESMAN|7698|1981-09-28|1250.0|1400.00|    30|
| 7698| BLAKE| MANAGER|7839|1981-05-01|2850.0|   null|    30|
| 7782| CLARK| MANAGER|7839|1981-06-09|2450.0|   null|    10|
| 7876| ADAMS|   CLERK|7788|1983-01-12|1100.0|   null|    20|
| 7900| JAMES|   CLERK|7698|1981-12-03| 950.0|   null|    30|
+-----+------+--------+----+----------+------+-------+------+



In [21]:
# 113. List the emps whose Deptno is available in his Salary.

emp.filter(instr("SAL", "DEPTNO") > 0).show()

+-----+-----+---+---+--------+---+----+------+
|EMPNO|ENAME|JOB|MGR|HIREDATE|SAL|COMM|DEPTNO|
+-----+-----+---+---+--------+---+----+------+
+-----+-----+---+---+--------+---+----+------+



In [22]:
# 115. List the emps Whose 10% of Salary is equal to year of joining.

emp.filter(emp.SAL*10/100 == year("HIREDATE")).show()

+-----+-----+---+---+--------+---+----+------+
|EMPNO|ENAME|JOB|MGR|HIREDATE|SAL|COMM|DEPTNO|
+-----+-----+---+---+--------+---+----+------+
+-----+-----+---+---+--------+---+----+------+



In [23]:
# 120. List the emps who are working as Managers.

# case 1:
print('case 1 result')
mgrID = emp.select("MGR").distinct().rdd.map(lambda x:x[0]).collect()

emp.filter(emp.EMPNO.isin(mgrID)).show()

# case 2:
print('case 2 result')
emp.alias("e").join(emp.alias("m"), col("e.MGR") == col("m.EMPNO")).select("m.*").distinct().show()

case 1 result
+-----+-----+---------+----+----------+------+----+------+
|EMPNO|ENAME|      JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+-----+---------+----+----------+------+----+------+
| 7566|JONES|  MANAGER|7839|1981-04-02|2975.0|null|    20|
| 7698|BLAKE|  MANAGER|7839|1981-05-01|2850.0|null|    30|
| 7782|CLARK|  MANAGER|7839|1981-06-09|2450.0|null|    10|
| 7788|SCOTT|  ANALYST|7566|1982-12-09|3000.0|null|    20|
| 7839| KING|PRESIDENT|null|1981-11-17|5000.0|null|    10|
| 7902| FORD|  ANALYST|7566|1981-12-03|3000.0|null|    20|
+-----+-----+---------+----+----------+------+----+------+

case 2 result
+-----+-----+---------+----+----------+------+----+------+
|EMPNO|ENAME|      JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+-----+---------+----+----------+------+----+------+
| 7788|SCOTT|  ANALYST|7566|1982-12-09|3000.0|null|    20|
| 7902| FORD|  ANALYST|7566|1981-12-03|3000.0|null|    20|
| 7566|JONES|  MANAGER|7839|1981-04-02|2975.0|null|    20|
| 7698|BLAKE|  MANAGER|7839

In [24]:
# 121. List THE Name of dept where highest no.of emps are working.

emp.groupBy("DEPTNO").agg(count("EMPNO").alias("count"))\
    .withColumn("rank", dense_rank().over(Window.orderBy(desc("count"))))\
    .filter("rank = 1").show()

+------+-----+----+
|DEPTNO|count|rank|
+------+-----+----+
|    30|    6|   1|
+------+-----+----+



In [25]:
# 122. Count the No.of emps who are working as ‘Managers’(using set option).

mgrID = emp.select("MGR").distinct().rdd.map(lambda x:x[0]).collect()

emp.filter(emp.EMPNO.isin(mgrID)).count()

6

In [26]:
# 123. List the emps who joined in the company on the same date.

cond = [col("e.HIREDATE") == col("h.HIREDATE"), col("e.EMPNO") != col("e.EMPNO")]

emp.alias("e").join(emp.alias("h"), cond)\
.show()

+-----+-----+-------+----+----------+------+----+------+-----+-----+-------+----+----------+------+----+------+
|EMPNO|ENAME|    JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|EMPNO|ENAME|    JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+-----+-------+----+----------+------+----+------+-----+-----+-------+----+----------+------+----+------+
| 7900|JAMES|  CLERK|7698|1981-12-03| 950.0|null|    30| 7902| FORD|ANALYST|7566|1981-12-03|3000.0|null|    20|
| 7902| FORD|ANALYST|7566|1981-12-03|3000.0|null|    20| 7900|JAMES|  CLERK|7698|1981-12-03| 950.0|null|    30|
+-----+-----+-------+----+----------+------+----+------+-----+-----+-------+----+----------+------+----+------+



In [27]:
# 124. List the details of the emps whose Grade is equal to one tenth of Sales Dept.

emp.join(sal, emp.SAL.between(sal.LOSAL, sal.HISAL))\
    .join(dept, "DEPTNO").filter('GRADE = 0.1*30').show() #30 means sales dept value

+------+-----+------+--------+----+----------+------+------+-----+------+------+-----+-------+
|DEPTNO|EMPNO| ENAME|     JOB| MGR|  HIREDATE|   SAL|  COMM|GRADE| LOSAL| HISAL|DNAME|   DLOC|
+------+-----+------+--------+----+----------+------+------+-----+------+------+-----+-------+
|    30| 7499| ALLEN|SALESMAN|7698|1981-02-20|1600.0|300.00|    3|1401.0|2000.0|SALES|CHICAGO|
|    30| 7844|TURNER|SALESMAN|7698|1981-09-08|1500.0|  0.00|    3|1401.0|2000.0|SALES|CHICAGO|
+------+-----+------+--------+----+----------+------+------+-----+------+------+-----+-------+



In [28]:
# 125. List the name of the dept where more than average no. of emps are working.

empCount = emp.groupBy("DEPTNO").agg(count("EMPNO").alias("count"))

avgEmpCount = empCount.agg(avg("count")).head()[0]

empCount.filter(col("count") > avgEmpCount).show()


+------+-----+
|DEPTNO|count|
+------+-----+
|    20|    5|
|    30|    6|
+------+-----+



In [29]:
# 126. List the Managers name who is having max no.of emps working under him.

mgrEmplys = emp.groupBy("MGR").agg(count("EMPNO").alias("count"))
maxCount = mgrEmplys.agg(max("count")).head()[0]
mgrEmplys.filter(col("count") == maxCount).join(emp, mgrEmplys.MGR == emp.EMPNO).show()

+----+-----+-----+-----+-------+----+----------+------+----+------+
| MGR|count|EMPNO|ENAME|    JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+----+-----+-----+-----+-------+----+----------+------+----+------+
|7698|    5| 7698|BLAKE|MANAGER|7839|1981-05-01|2850.0|null|    30|
+----+-----+-----+-----+-------+----+----------+------+----+------+



In [30]:
# 127. List the Ename and Sal is increased by 15% and expressed as no.of Dollars.

emp.select("ENAME", concat(lit("$"), emp.SAL + emp.SAL*15/100).alias("inc.15%")).show()

+------+--------+
| ENAME| inc.15%|
+------+--------+
| SMITH|  $920.0|
| ALLEN| $1840.0|
|  WARD| $1437.5|
| JONES|$3421.25|
|MARTIN| $1437.5|
| BLAKE| $3277.5|
| CLARK| $2817.5|
| SCOTT| $3450.0|
|  KING| $5750.0|
|TURNER| $1725.0|
| ADAMS| $1265.0|
| JAMES| $1092.5|
|  FORD| $3450.0|
|MILLER| $1495.0|
+------+--------+



In [31]:
# 128. Produce the output of EMP table ‘EMP_AND_JOB’ for Ename and Job.

emp.select(concat(emp.ENAME, lit("-"),emp.JOB).alias("EMP_AND_JOB")).show()

+---------------+
|    EMP_AND_JOB|
+---------------+
|    SMITH-CLERK|
| ALLEN-SALESMAN|
|  WARD-SALESMAN|
|  JONES-MANAGER|
|MARTIN-SALESMAN|
|  BLAKE-MANAGER|
|  CLARK-MANAGER|
|  SCOTT-ANALYST|
| KING-PRESIDENT|
|TURNER-SALESMAN|
|    ADAMS-CLERK|
|    JAMES-CLERK|
|   FORD-ANALYST|
|   MILLER-CLERK|
+---------------+



In [32]:
# 129. Produce the following output from EMP.
# EMPLOYEE
# SMITH (clerk)
# ALLEN (Salesman)

emp.select(concat(emp.ENAME, lit("("), lower(emp.JOB), lit(")"))).show()

+-------------------------------+
|concat(ENAME, (, lower(JOB), ))|
+-------------------------------+
|                   SMITH(clerk)|
|                ALLEN(salesman)|
|                 WARD(salesman)|
|                 JONES(manager)|
|               MARTIN(salesman)|
|                 BLAKE(manager)|
|                 CLARK(manager)|
|                 SCOTT(analyst)|
|                KING(president)|
|               TURNER(salesman)|
|                   ADAMS(clerk)|
|                   JAMES(clerk)|
|                  FORD(analyst)|
|                  MILLER(clerk)|
+-------------------------------+



In [33]:
# 130) List the emps with Hire date in format June 4, 1988.

emp.select(date_format("HIREDATE","MMMM d, y")).show()

+--------------------------------+
|date_format(HIREDATE, MMMM d, y)|
+--------------------------------+
|               December 17, 1980|
|               February 20, 1981|
|               February 22, 1981|
|                   April 2, 1981|
|              September 28, 1981|
|                     May 1, 1981|
|                    June 9, 1981|
|                December 9, 1982|
|               November 17, 1981|
|               September 8, 1981|
|                January 12, 1983|
|                December 3, 1981|
|                December 3, 1981|
|                January 23, 1982|
+--------------------------------+



In [34]:
# 131) Print a list of emp’s Listing ‘just salary’ if Salary is more than 1500, on target if
# Salary is 1500 and ‘Below 1500’ if Salary is less than 1500

emp.withColumn("SAL", when(emp.SAL > 1500, concat(emp.SAL, lit(" JUST SALARY")))\
               .when(emp.SAL > 1500, concat(emp.SAL, lit(" on TARGET")))\
               .otherwise(concat(emp.SAL, lit(" BELOW 1500")))).show()

+-----+------+---------+----+----------+------------------+-------+------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE|               SAL|   COMM|DEPTNO|
+-----+------+---------+----+----------+------------------+-------+------+
| 7369| SMITH|    CLERK|7902|1980-12-17|  800.0 BELOW 1500|   null|    20|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600.0 JUST SALARY| 300.00|    30|
| 7521|  WARD| SALESMAN|7698|1981-02-22| 1250.0 BELOW 1500| 500.00|    30|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975.0 JUST SALARY|   null|    20|
| 7654|MARTIN| SALESMAN|7698|1981-09-28| 1250.0 BELOW 1500|1400.00|    30|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850.0 JUST SALARY|   null|    30|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450.0 JUST SALARY|   null|    10|
| 7788| SCOTT|  ANALYST|7566|1982-12-09|3000.0 JUST SALARY|   null|    20|
| 7839|  KING|PRESIDENT|null|1981-11-17|5000.0 JUST SALARY|   null|    10|
| 7844|TURNER| SALESMAN|7698|1981-09-08| 1500.0 BELOW 1500|   0.00|    30|
| 7876| ADAMS|    CLERK|7

In [35]:
# 132) Write a query which return the day of the week for any date entered in format ‘DD-MM-YY’.


In [36]:
# 133) Write a query to calculate the length of service of any employee.

emp.select((year(current_date()) - year("HIREDATE")).alias("service/ EXP")).show()


+------------+
|service/ EXP|
+------------+
|          38|
|          37|
|          37|
|          37|
|          37|
|          37|
|          37|
|          36|
|          37|
|          37|
|          35|
|          37|
|          37|
|          36|
+------------+



In [37]:
# 135) Emps hired on or before 15 th of any month are paid on the last Friday of that
# month those hired after 15 th are paid on the first Friday of the following month.
# Print a list of emps their hire date and the first pay date. Sort on hire date.



In [38]:
# 136) Count the no. of characters with out considering spaces for each DLOC.

dept.select(length(regexp_replace("DLOC"," ", ""))).show()

+---------------------------------+
|length(regexp_replace(DLOC,  , ))|
+---------------------------------+
|                                4|
|                                7|
|                                6|
|                                7|
|                                6|
+---------------------------------+



In [39]:
#like example

emp.filter(emp.ENAME.like("%TH")).show()

+-----+-----+-----+----+----------+-----+----+------+
|EMPNO|ENAME|  JOB| MGR|  HIREDATE|  SAL|COMM|DEPTNO|
+-----+-----+-----+----+----------+-----+----+------+
| 7369|SMITH|CLERK|7902|1980-12-17|800.0|null|    20|
+-----+-----+-----+----+----------+-----+----+------+



In [40]:
# 139) List those Managers who are getting less than his emps Salary.

emp.alias("e").join(emp.alias("m"), [col("e.MGR") == col("m.EMPNO"), col("e.SAL") > col("m.SAL")], "inner").show()

+-----+-----+-------+----+----------+------+----+------+-----+-----+-------+----+----------+------+----+------+
|EMPNO|ENAME|    JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|EMPNO|ENAME|    JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+-----+-------+----+----------+------+----+------+-----+-----+-------+----+----------+------+----+------+
| 7788|SCOTT|ANALYST|7566|1982-12-09|3000.0|null|    20| 7566|JONES|MANAGER|7839|1981-04-02|2975.0|null|    20|
| 7902| FORD|ANALYST|7566|1981-12-03|3000.0|null|    20| 7566|JONES|MANAGER|7839|1981-04-02|2975.0|null|    20|
+-----+-----+-------+----+----------+------+----+------+-----+-----+-------+----+----------+------+----+------+



In [41]:
# 140) Print the details of all the emps who are sub-ordinates to Blake.

blakeID = emp.filter('ENAME = "BLAKE"').select("EMPNO").head()[0]

emp.filter(emp.MGR == blakeID).show()

+-----+------+--------+----+----------+------+-------+------+
|EMPNO| ENAME|     JOB| MGR|  HIREDATE|   SAL|   COMM|DEPTNO|
+-----+------+--------+----+----------+------+-------+------+
| 7499| ALLEN|SALESMAN|7698|1981-02-20|1600.0| 300.00|    30|
| 7521|  WARD|SALESMAN|7698|1981-02-22|1250.0| 500.00|    30|
| 7654|MARTIN|SALESMAN|7698|1981-09-28|1250.0|1400.00|    30|
| 7844|TURNER|SALESMAN|7698|1981-09-08|1500.0|   0.00|    30|
| 7900| JAMES|   CLERK|7698|1981-12-03| 950.0|   null|    30|
+-----+------+--------+----+----------+------+-------+------+



In [42]:
# 141) List the emps who are working as Managers using co-related sub-query

emp.filter(emp.EMPNO.isin(emp.select("MGR").distinct().rdd.map(lambda x: x[0]).collect())).show()

+-----+-----+---------+----+----------+------+----+------+
|EMPNO|ENAME|      JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+-----+---------+----+----------+------+----+------+
| 7566|JONES|  MANAGER|7839|1981-04-02|2975.0|null|    20|
| 7698|BLAKE|  MANAGER|7839|1981-05-01|2850.0|null|    30|
| 7782|CLARK|  MANAGER|7839|1981-06-09|2450.0|null|    10|
| 7788|SCOTT|  ANALYST|7566|1982-12-09|3000.0|null|    20|
| 7839| KING|PRESIDENT|null|1981-11-17|5000.0|null|    10|
| 7902| FORD|  ANALYST|7566|1981-12-03|3000.0|null|    20|
+-----+-----+---------+----+----------+------+----+------+



In [43]:
# 142) List the emps whose Mgr name is ‘Jones’ and also with his Manager name.


joneId = emp.filter(emp.ENAME == "JONES").select("EMPNO").head()[0]
emp.filter(emp.MGR == joneId).union(emp.filter(emp.ENAME == "JONES")).show()


+-----+-----+-------+----+----------+------+----+------+
|EMPNO|ENAME|    JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+-----+-------+----+----------+------+----+------+
| 7788|SCOTT|ANALYST|7566|1982-12-09|3000.0|null|    20|
| 7902| FORD|ANALYST|7566|1981-12-03|3000.0|null|    20|
| 7566|JONES|MANAGER|7839|1981-04-02|2975.0|null|    20|
+-----+-----+-------+----+----------+------+----+------+



In [44]:
# 143) Define a variable representing the expression used to calculate on emps total
# annual remuneration use the variable in a statement, which finds all emps who
# can earn 30000 a year or more.



In [45]:
# 144) Find out how many Managers are their in the company.

emp.filter(~(emp.MGR.isNull())).select("MGR").distinct().count()



6

In [46]:
# # 145) Find Average salary and Average total remuneration for each Job type.
# Remember Salesman earn commission.secommm

i = emp.withColumn("COMM", when(emp.COMM.isNull(), 0).otherwise(emp.COMM))
i.select(avg("SAL").alias("avgSal"), avg(i.SAL + i.COMM).alias("avgSalWithComm")).show()

+-----------------+------------------+
|           avgSal|    avgSalWithComm|
+-----------------+------------------+
|2073.214285714286|2230.3571428571427|
+-----------------+------------------+



In [47]:
# 146) Check whether all the emps numbers are indeed unique.

emp.groupBy("EMPNO").agg(count("EMPNO")).filter("count(EMPNO) > 1").show()

+-----+------------+
|EMPNO|count(EMPNO)|
+-----+------------+
+-----+------------+



In [48]:
# 147) List the emps who are drawing less than 1000 Sort the output by Salary.

emp.filter(emp.SAL < 1000).show()

+-----+-----+-----+----+----------+-----+----+------+
|EMPNO|ENAME|  JOB| MGR|  HIREDATE|  SAL|COMM|DEPTNO|
+-----+-----+-----+----+----------+-----+----+------+
| 7369|SMITH|CLERK|7902|1980-12-17|800.0|null|    20|
| 7900|JAMES|CLERK|7698|1981-12-03|950.0|null|    30|
+-----+-----+-----+----+----------+-----+----+------+



In [49]:
# 148) List the employee Name, Job, Annual Salary, deptno, Dept name and grade who
# earn 36000 a year or who are not CLERKS.

emp.join(dept, "DEPTNO").join(sal, emp.SAL.between(sal.LOSAL, sal.HISAL))\
    .select("ENAME", "JOB", (emp.SAL*12).alias("ANNSAL"), "DEPTNO", "DNAME", "GRADE")\
    .filter('JOB != "CLERK"' and col("ANNSAL") > 36000).show()


+-----+---------+-------+------+----------+-----+
|ENAME|      JOB| ANNSAL|DEPTNO|     DNAME|GRADE|
+-----+---------+-------+------+----------+-----+
| KING|PRESIDENT|60000.0|    10|ACCOUNTING|    5|
+-----+---------+-------+------+----------+-----+



In [50]:
# # 149) Find out the Job that was filled in the first half of 1981 and same job that was
# filled during the second half of 1981.

jobOf81 = emp.filter(col("HIREDATE").between("1981-01-01", "1981-06-30")).select("JOB").distinct().rdd.map(lambda x: x[0]).collect()

emp.filter(emp.JOB.isin(jobOf81) & emp.HIREDATE.between("1981-07-31", "1981-12-31")).show()

+-----+------+--------+----+----------+------+-------+------+
|EMPNO| ENAME|     JOB| MGR|  HIREDATE|   SAL|   COMM|DEPTNO|
+-----+------+--------+----+----------+------+-------+------+
| 7654|MARTIN|SALESMAN|7698|1981-09-28|1250.0|1400.00|    30|
| 7844|TURNER|SALESMAN|7698|1981-09-08|1500.0|   0.00|    30|
+-----+------+--------+----+----------+------+-------+------+



In [51]:
# 150) Find out the emps who joined in the company before their Managers.

emp.alias("e").join(emp.alias("m"), [col("e.MGR") == col("m.EMPNO"), col("e.HIREDATE") < col("m.HIREDATE")]).select("e.*").show()

+-----+-----+--------+----+----------+------+------+------+
|EMPNO|ENAME|     JOB| MGR|  HIREDATE|   SAL|  COMM|DEPTNO|
+-----+-----+--------+----+----------+------+------+------+
| 7369|SMITH|   CLERK|7902|1980-12-17| 800.0|  null|    20|
| 7499|ALLEN|SALESMAN|7698|1981-02-20|1600.0|300.00|    30|
| 7521| WARD|SALESMAN|7698|1981-02-22|1250.0|500.00|    30|
| 7566|JONES| MANAGER|7839|1981-04-02|2975.0|  null|    20|
| 7698|BLAKE| MANAGER|7839|1981-05-01|2850.0|  null|    30|
| 7782|CLARK| MANAGER|7839|1981-06-09|2450.0|  null|    10|
+-----+-----+--------+----+----------+------+------+------+



In [52]:
emp.filter(emp.EMPNO == 7902).show() #test above result true or not

+-----+-----+-------+----+----------+------+----+------+
|EMPNO|ENAME|    JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+-----+-------+----+----------+------+----+------+
| 7902| FORD|ANALYST|7566|1981-12-03|3000.0|null|    20|
+-----+-----+-------+----+----------+------+----+------+



In [53]:
# 151) List all the emps by name and number along with their Manager’s name and
# number. Also List KING who has no ‘Manager’.

emp.alias("e").join(emp.alias("m"), [col("e.MGR") == col("m.EMPNO")], "left")\
.select(col("e.EMPNO").alias("e.ename"),col("e.ENAME").alias("e.empno"),\
        col("m.EMPNO").alias("m.empno"), col("m.ENAME").alias("m.ename")).show()

+-------+-------+-------+-------+
|e.ename|e.empno|m.empno|m.ename|
+-------+-------+-------+-------+
|   7876|  ADAMS|   7788|  SCOTT|
|   7369|  SMITH|   7902|   FORD|
|   7839|   KING|   null|   null|
|   7788|  SCOTT|   7566|  JONES|
|   7902|   FORD|   7566|  JONES|
|   7499|  ALLEN|   7698|  BLAKE|
|   7521|   WARD|   7698|  BLAKE|
|   7654| MARTIN|   7698|  BLAKE|
|   7844| TURNER|   7698|  BLAKE|
|   7900|  JAMES|   7698|  BLAKE|
|   7566|  JONES|   7839|   KING|
|   7698|  BLAKE|   7839|   KING|
|   7782|  CLARK|   7839|   KING|
|   7934| MILLER|   7782|  CLARK|
+-------+-------+-------+-------+



In [54]:
#  152) Find all the emps who earn the minimum Salary for each job wise in ascending
# order.

minSal = emp.groupBy("JOB").agg(min("SAL")).rdd.map(lambda x: x[1]).collect()

emp.filter(emp.SAL.isin(minSal)).sort(asc("SAL")).show()

+-----+------+---------+----+----------+------+-------+------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE|   SAL|   COMM|DEPTNO|
+-----+------+---------+----+----------+------+-------+------+
| 7369| SMITH|    CLERK|7902|1980-12-17| 800.0|   null|    20|
| 7521|  WARD| SALESMAN|7698|1981-02-22|1250.0| 500.00|    30|
| 7654|MARTIN| SALESMAN|7698|1981-09-28|1250.0|1400.00|    30|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450.0|   null|    10|
| 7902|  FORD|  ANALYST|7566|1981-12-03|3000.0|   null|    20|
| 7788| SCOTT|  ANALYST|7566|1982-12-09|3000.0|   null|    20|
| 7839|  KING|PRESIDENT|null|1981-11-17|5000.0|   null|    10|
+-----+------+---------+----+----------+------+-------+------+



In [55]:
# 153) Find out all the emps who earn highest salary in each job type. Sort in
# descending salary order.

maxSal = emp.groupBy("JOB").agg(max("SAL")).rdd.map(lambda x: x[1]).collect()

emp.filter(emp.SAL.isin(maxSal)).sort(desc("SAL")).show()


+-----+------+---------+----+----------+------+------+------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE|   SAL|  COMM|DEPTNO|
+-----+------+---------+----+----------+------+------+------+
| 7839|  KING|PRESIDENT|null|1981-11-17|5000.0|  null|    10|
| 7902|  FORD|  ANALYST|7566|1981-12-03|3000.0|  null|    20|
| 7788| SCOTT|  ANALYST|7566|1982-12-09|3000.0|  null|    20|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975.0|  null|    20|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600.0|300.00|    30|
| 7934|MILLER|    CLERK|7782|1982-01-23|1300.0|  null|    10|
+-----+------+---------+----+----------+------+------+------+



In [56]:
#154) Find out the most recently hired emps in each Dept order by Hiredate.

print("using max")
maxHire = emp.groupBy("DEPTNO").agg(max("HIREDATE")).rdd.map(lambda x:x[1]).collect()

emp.filter(emp.HIREDATE.isin(maxHire)).sort(asc("HIREDATE")).distinct().show()
#In the above format the result is not accurate. We are getting twice the time of dept 20. So better use 
#rank to get the right result

print("using rank")
emp.withColumn("rank", dense_rank().over(Window.partitionBy("DEPTNO").orderBy(desc("HIREDATE")))).\
filter("rank = 1").show()

using max
+-----+------+-------+----+----------+------+----+------+
|EMPNO| ENAME|    JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+------+-------+----+----------+------+----+------+
| 7900| JAMES|  CLERK|7698|1981-12-03| 950.0|null|    30|
| 7902|  FORD|ANALYST|7566|1981-12-03|3000.0|null|    20|
| 7934|MILLER|  CLERK|7782|1982-01-23|1300.0|null|    10|
| 7876| ADAMS|  CLERK|7788|1983-01-12|1100.0|null|    20|
+-----+------+-------+----+----------+------+----+------+

using rank
+-----+------+-----+----+----------+------+----+------+----+
|EMPNO| ENAME|  JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|rank|
+-----+------+-----+----+----------+------+----+------+----+
| 7876| ADAMS|CLERK|7788|1983-01-12|1100.0|null|    20|   1|
| 7934|MILLER|CLERK|7782|1982-01-23|1300.0|null|    10|   1|
| 7900| JAMES|CLERK|7698|1981-12-03| 950.0|null|    30|   1|
+-----+------+-----+----+----------+------+----+------+----+



In [57]:
# 155) List the employee name,Salary and Deptno for each employee who earns a
# salary greater than the average for their department order by Deptno

avgSal = emp.groupBy("DEPTNO").agg(avg("SAL").alias("avgSal"))

emp.join(avgSal, "DEPTNO").filter("SAL > avgSal").sort("DEPTNO")

DataFrame[DEPTNO: int, EMPNO: int, ENAME: string, JOB: string, MGR: int, HIREDATE: date, SAL: double, COMM: string, avgSal: double]

In [58]:
#156) List the Deptno where there are no emps.
# No such cases, but answer below

emp.groupBy("DEPTNO").agg(count("EMPNO")).filter("count(EMPNO) = 0").show()

+------+------------+
|DEPTNO|count(EMPNO)|
+------+------------+
+------+------------+



In [59]:
# 157) List the No.of emp’s and Avg salary within each department for each job.

emp.groupBy("DEPTNO", "JOB").agg(avg("SAL"),count("EMPNO")).show()

+------+---------+--------+------------+
|DEPTNO|      JOB|avg(SAL)|count(EMPNO)|
+------+---------+--------+------------+
|    20|  ANALYST|  3000.0|           2|
|    20|  MANAGER|  2975.0|           1|
|    30|  MANAGER|  2850.0|           1|
|    30| SALESMAN|  1400.0|           4|
|    30|    CLERK|   950.0|           1|
|    20|    CLERK|   950.0|           2|
|    10|PRESIDENT|  5000.0|           1|
|    10|    CLERK|  1300.0|           1|
|    10|  MANAGER|  2450.0|           1|
+------+---------+--------+------------+



In [60]:
# 158) Find the maximum average salary drawn for each job except for ‘President’.

emp.filter('JOB <> "PRESIDENT"').groupBy("JOB").agg(avg("SAL")).show()

+--------+------------------+
|     JOB|          avg(SAL)|
+--------+------------------+
| ANALYST|            3000.0|
|SALESMAN|            1400.0|
|   CLERK|            1037.5|
| MANAGER|2758.3333333333335|
+--------+------------------+



In [61]:
# 159) Find the name and Job of the emps who earn Max salary and Commission.

def netpay(sal, comm):
    return when(comm.isNull(), sal).otherwise(comm+sal)

netpay_udf = netpay

emp.withColumn("NETPAY", netpay_udf(emp.SAL, emp.COMM)).groupby("ENAME","JOB").agg(max("SAL")).show()


+------+---------+--------+
| ENAME|      JOB|max(SAL)|
+------+---------+--------+
| SMITH|    CLERK|   800.0|
| SCOTT|  ANALYST|  3000.0|
|  FORD|  ANALYST|  3000.0|
|MARTIN| SALESMAN|  1250.0|
|  KING|PRESIDENT|  5000.0|
| ALLEN| SALESMAN|  1600.0|
|  WARD| SALESMAN|  1250.0|
| ADAMS|    CLERK|  1100.0|
| JAMES|    CLERK|   950.0|
| CLARK|  MANAGER|  2450.0|
| BLAKE|  MANAGER|  2850.0|
| JONES|  MANAGER|  2975.0|
|MILLER|    CLERK|  1300.0|
|TURNER| SALESMAN|  1500.0|
+------+---------+--------+



In [62]:
# 160) List the Name, Job and Salary of the emps who are not belonging to the
# department 10 but who have the same job and Salary as the emps of dept 10.

d10JobSal = emp.filter("DEPTNO = 10").select("JOB","SAL").rdd

d10Job = d10JobSal.map(lambda x:x[0]).collect()
d10Sal = d10JobSal.map(lambda x:x[1]).collect()

emp.filter((emp.DEPTNO != 10) & (emp.JOB.isin(d10Job)) | (emp.SAL.isin(d10Sal))).show()



+-----+------+---------+----+----------+------+----+------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+------+---------+----+----------+------+----+------+
| 7369| SMITH|    CLERK|7902|1980-12-17| 800.0|null|    20|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975.0|null|    20|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850.0|null|    30|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450.0|null|    10|
| 7839|  KING|PRESIDENT|null|1981-11-17|5000.0|null|    10|
| 7876| ADAMS|    CLERK|7788|1983-01-12|1100.0|null|    20|
| 7900| JAMES|    CLERK|7698|1981-12-03| 950.0|null|    30|
| 7934|MILLER|    CLERK|7782|1982-01-23|1300.0|null|    10|
+-----+------+---------+----+----------+------+----+------+



In [63]:
# 161) List the Deptno, Name, Job, Salary and Sal+Comm of the SALESMAN who are
# earning maximum salary and commission in descending order.

emp.filter('JOB = "SALESMAN"').withColumn("NETPAY", when(emp.COMM.isNull(), emp.SAL).otherwise(emp.SAL+emp.COMM))\
    .withColumn("rank", dense_rank().over(Window.orderBy(desc("NETPAY")))).filter("rank = 1").sort(desc("NETPAY")).show()

+-----+------+--------+----+----------+------+-------+------+------+----+
|EMPNO| ENAME|     JOB| MGR|  HIREDATE|   SAL|   COMM|DEPTNO|NETPAY|rank|
+-----+------+--------+----+----------+------+-------+------+------+----+
| 7654|MARTIN|SALESMAN|7698|1981-09-28|1250.0|1400.00|    30|2650.0|   1|
+-----+------+--------+----+----------+------+-------+------+------+----+



In [64]:
# 162) List the Deptno, Name, Job, Salary and Sal+Comm of the emps who earn the
# second highest earnings (sal + comm.).

emp.withColumn("NETPAY", when(emp.COMM.isNull(), emp.SAL).otherwise(emp.SAL+emp.COMM))\
    .withColumn("rank", dense_rank().over(Window.orderBy(desc("NETPAY")))).filter("rank = 2").sort(desc("NETPAY")).show()

+-----+-----+-------+----+----------+------+----+------+------+----+
|EMPNO|ENAME|    JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|NETPAY|rank|
+-----+-----+-------+----+----------+------+----+------+------+----+
| 7788|SCOTT|ANALYST|7566|1982-12-09|3000.0|null|    20|3000.0|   2|
| 7902| FORD|ANALYST|7566|1981-12-03|3000.0|null|    20|3000.0|   2|
+-----+-----+-------+----+----------+------+----+------+------+----+



In [65]:
# 163) List the Deptno and their average salaries for dept with the average salary less
# than the averages for all department

avgSal = emp.agg(avg("SAL")).head()[0]

emp.groupBy("DEPTNO").agg(avg("SAL").alias("avgDSal")).filter(col("avgDSal") < avgSal).show()



+------+------------------+
|DEPTNO|           avgDSal|
+------+------------------+
|    30|1566.6666666666667|
+------+------------------+



In [66]:
# 164) List out the Names and Salaries of the emps along with their manager names
# and salaries for those emps who earn more salary than their Manager.

emp.alias("e").join(emp.alias("m"), [col("e.MGR") == col("m.EMPNO"), col("e.SAL") > col("m.SAL")]).select("e.ENAME", "e.SAL", "m.ENAME", "m.SAL").show()

+-----+------+-----+------+
|ENAME|   SAL|ENAME|   SAL|
+-----+------+-----+------+
|SCOTT|3000.0|JONES|2975.0|
| FORD|3000.0|JONES|2975.0|
+-----+------+-----+------+



In [67]:
# 165) List out the Name, Job, Salary of the emps in the department with the highest
# average salary.

avgDSal = emp.groupBy("DEPTNO").agg(avg("SAL"))

maxAvgDSal = avgDSal.agg(max("avg(SAL)")).head()[0]

maxDAvgSal = avgDSal.filter(col("avg(SAL)") == maxAvgDSal)

emp.join(maxDAvgSal, "DEPTNO").show()



+------+-----+------+---------+----+----------+------+----+------------------+
|DEPTNO|EMPNO| ENAME|      JOB| MGR|  HIREDATE|   SAL|COMM|          avg(SAL)|
+------+-----+------+---------+----+----------+------+----+------------------+
|    10| 7782| CLARK|  MANAGER|7839|1981-06-09|2450.0|null|2916.6666666666665|
|    10| 7839|  KING|PRESIDENT|null|1981-11-17|5000.0|null|2916.6666666666665|
|    10| 7934|MILLER|    CLERK|7782|1982-01-23|1300.0|null|2916.6666666666665|
+------+-----+------+---------+----+----------+------+----+------------------+



In [68]:
# 166) List the empno,sal,comm. Of emps.

emp.select("EMPNO", "SAL", emp.COMM).show()

+-----+------+-------+
|EMPNO|   SAL|   COMM|
+-----+------+-------+
| 7369| 800.0|   null|
| 7499|1600.0| 300.00|
| 7521|1250.0| 500.00|
| 7566|2975.0|   null|
| 7654|1250.0|1400.00|
| 7698|2850.0|   null|
| 7782|2450.0|   null|
| 7788|3000.0|   null|
| 7839|5000.0|   null|
| 7844|1500.0|   0.00|
| 7876|1100.0|   null|
| 7900| 950.0|   null|
| 7902|3000.0|   null|
| 7934|1300.0|   null|
+-----+------+-------+



In [69]:
# 167) List the details of the emps in the ascending order of the sal.

emp.sort(asc("SAL")).show()

+-----+------+---------+----+----------+------+-------+------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE|   SAL|   COMM|DEPTNO|
+-----+------+---------+----+----------+------+-------+------+
| 7369| SMITH|    CLERK|7902|1980-12-17| 800.0|   null|    20|
| 7900| JAMES|    CLERK|7698|1981-12-03| 950.0|   null|    30|
| 7876| ADAMS|    CLERK|7788|1983-01-12|1100.0|   null|    20|
| 7521|  WARD| SALESMAN|7698|1981-02-22|1250.0| 500.00|    30|
| 7654|MARTIN| SALESMAN|7698|1981-09-28|1250.0|1400.00|    30|
| 7934|MILLER|    CLERK|7782|1982-01-23|1300.0|   null|    10|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500.0|   0.00|    30|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600.0| 300.00|    30|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450.0|   null|    10|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850.0|   null|    30|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975.0|   null|    20|
| 7788| SCOTT|  ANALYST|7566|1982-12-09|3000.0|   null|    20|
| 7902|  FORD|  ANALYST|7566|1981-12-03|3000.0|   null|

In [70]:
# 168) List the dept in the ascending order of the job and the desc order of the emps
# print empno, ename.

emp.sort(asc("SAL"),desc("EMPNO")).show()

+-----+------+---------+----+----------+------+-------+------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE|   SAL|   COMM|DEPTNO|
+-----+------+---------+----+----------+------+-------+------+
| 7369| SMITH|    CLERK|7902|1980-12-17| 800.0|   null|    20|
| 7900| JAMES|    CLERK|7698|1981-12-03| 950.0|   null|    30|
| 7876| ADAMS|    CLERK|7788|1983-01-12|1100.0|   null|    20|
| 7654|MARTIN| SALESMAN|7698|1981-09-28|1250.0|1400.00|    30|
| 7521|  WARD| SALESMAN|7698|1981-02-22|1250.0| 500.00|    30|
| 7934|MILLER|    CLERK|7782|1982-01-23|1300.0|   null|    10|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500.0|   0.00|    30|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600.0| 300.00|    30|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450.0|   null|    10|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850.0|   null|    30|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975.0|   null|    20|
| 7902|  FORD|  ANALYST|7566|1981-12-03|3000.0|   null|    20|
| 7788| SCOTT|  ANALYST|7566|1982-12-09|3000.0|   null|

In [71]:
# 169) Display the unique dept of the emps.


In [72]:
# 170) Display the unique dept with jobs.

In [73]:
# 171) Display the details of the blake.

emp.filter('ENAME == "BLAKE"').show()

emp.filter((col("ENAME") == "BLAKE") & (col("EMPNO") == 7698)).show() #Just an try

+-----+-----+-------+----+----------+------+----+------+
|EMPNO|ENAME|    JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+-----+-------+----+----------+------+----+------+
| 7698|BLAKE|MANAGER|7839|1981-05-01|2850.0|null|    30|
+-----+-----+-------+----+----------+------+----+------+

+-----+-----+-------+----+----------+------+----+------+
|EMPNO|ENAME|    JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+-----+-------+----+----------+------+----+------+
| 7698|BLAKE|MANAGER|7839|1981-05-01|2850.0|null|    30|
+-----+-----+-------+----+----------+------+----+------+



In [74]:
# 172) List all the clerks.

#Differnet Syntaxs

emp.filter('JOB == "CLERK"').show()

emp.filter(emp.JOB == "CLERK").show()

emp.filter(col("JOB") == "CLERK").show()

+-----+------+-----+----+----------+------+----+------+
|EMPNO| ENAME|  JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+------+-----+----+----------+------+----+------+
| 7369| SMITH|CLERK|7902|1980-12-17| 800.0|null|    20|
| 7876| ADAMS|CLERK|7788|1983-01-12|1100.0|null|    20|
| 7900| JAMES|CLERK|7698|1981-12-03| 950.0|null|    30|
| 7934|MILLER|CLERK|7782|1982-01-23|1300.0|null|    10|
+-----+------+-----+----+----------+------+----+------+

+-----+------+-----+----+----------+------+----+------+
|EMPNO| ENAME|  JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+------+-----+----+----------+------+----+------+
| 7369| SMITH|CLERK|7902|1980-12-17| 800.0|null|    20|
| 7876| ADAMS|CLERK|7788|1983-01-12|1100.0|null|    20|
| 7900| JAMES|CLERK|7698|1981-12-03| 950.0|null|    30|
| 7934|MILLER|CLERK|7782|1982-01-23|1300.0|null|    10|
+-----+------+-----+----+----------+------+----+------+

+-----+------+-----+----+----------+------+----+------+
|EMPNO| ENAME|  JOB| MGR|  HIREDATE|   SAL|COM

In [75]:
# 173) list all the employees joined on 1 st may 81.

In [76]:
# 174) List the empno,ename,sal,deptno of the dept 10 emps in the ascending order of
# salary.

emp.filter("DEPTNO == 10").sort(asc("SAL")).show()

+-----+------+---------+----+----------+------+----+------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+------+---------+----+----------+------+----+------+
| 7934|MILLER|    CLERK|7782|1982-01-23|1300.0|null|    10|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450.0|null|    10|
| 7839|  KING|PRESIDENT|null|1981-11-17|5000.0|null|    10|
+-----+------+---------+----+----------+------+----+------+



In [77]:
# 175) List the emps whose salaries are less than 3500.

emp.filter("SAL < 3500").show()


+-----+------+--------+----+----------+------+-------+------+
|EMPNO| ENAME|     JOB| MGR|  HIREDATE|   SAL|   COMM|DEPTNO|
+-----+------+--------+----+----------+------+-------+------+
| 7369| SMITH|   CLERK|7902|1980-12-17| 800.0|   null|    20|
| 7499| ALLEN|SALESMAN|7698|1981-02-20|1600.0| 300.00|    30|
| 7521|  WARD|SALESMAN|7698|1981-02-22|1250.0| 500.00|    30|
| 7566| JONES| MANAGER|7839|1981-04-02|2975.0|   null|    20|
| 7654|MARTIN|SALESMAN|7698|1981-09-28|1250.0|1400.00|    30|
| 7698| BLAKE| MANAGER|7839|1981-05-01|2850.0|   null|    30|
| 7782| CLARK| MANAGER|7839|1981-06-09|2450.0|   null|    10|
| 7788| SCOTT| ANALYST|7566|1982-12-09|3000.0|   null|    20|
| 7844|TURNER|SALESMAN|7698|1981-09-08|1500.0|   0.00|    30|
| 7876| ADAMS|   CLERK|7788|1983-01-12|1100.0|   null|    20|
| 7900| JAMES|   CLERK|7698|1981-12-03| 950.0|   null|    30|
| 7902|  FORD| ANALYST|7566|1981-12-03|3000.0|   null|    20|
| 7934|MILLER|   CLERK|7782|1982-01-23|1300.0|   null|    10|
+-----+-

In [78]:
# 176) List the empno,ename,sal of all the emp joined before 1 apr 81.

emp.filter('HIREDATE < "1981-04-01"').show()

+-----+-----+--------+----+----------+------+------+------+
|EMPNO|ENAME|     JOB| MGR|  HIREDATE|   SAL|  COMM|DEPTNO|
+-----+-----+--------+----+----------+------+------+------+
| 7369|SMITH|   CLERK|7902|1980-12-17| 800.0|  null|    20|
| 7499|ALLEN|SALESMAN|7698|1981-02-20|1600.0|300.00|    30|
| 7521| WARD|SALESMAN|7698|1981-02-22|1250.0|500.00|    30|
+-----+-----+--------+----+----------+------+------+------+



In [79]:
# 177) List the emp whose annual sal is <25000 in the asc order of the salaries.

emp.filter(emp.SAL*12 < 25000).show()

+-----+------+--------+----+----------+------+-------+------+
|EMPNO| ENAME|     JOB| MGR|  HIREDATE|   SAL|   COMM|DEPTNO|
+-----+------+--------+----+----------+------+-------+------+
| 7369| SMITH|   CLERK|7902|1980-12-17| 800.0|   null|    20|
| 7499| ALLEN|SALESMAN|7698|1981-02-20|1600.0| 300.00|    30|
| 7521|  WARD|SALESMAN|7698|1981-02-22|1250.0| 500.00|    30|
| 7654|MARTIN|SALESMAN|7698|1981-09-28|1250.0|1400.00|    30|
| 7844|TURNER|SALESMAN|7698|1981-09-08|1500.0|   0.00|    30|
| 7876| ADAMS|   CLERK|7788|1983-01-12|1100.0|   null|    20|
| 7900| JAMES|   CLERK|7698|1981-12-03| 950.0|   null|    30|
| 7934|MILLER|   CLERK|7782|1982-01-23|1300.0|   null|    10|
+-----+------+--------+----+----------+------+-------+------+



In [80]:
# 178) List the empno,ename,annsal,dailysal of all the salesmen in the asc ann sal

emp.select("EMPNO", "ENAME", (emp.SAL*12).alias("ANNSAL"), round((emp.SAL/30)).alias("DAILYSAL")).sort(asc("ANNSAL")).show()

+-----+------+-------+--------+
|EMPNO| ENAME| ANNSAL|DAILYSAL|
+-----+------+-------+--------+
| 7369| SMITH| 9600.0|    27.0|
| 7900| JAMES|11400.0|    32.0|
| 7876| ADAMS|13200.0|    37.0|
| 7654|MARTIN|15000.0|    42.0|
| 7521|  WARD|15000.0|    42.0|
| 7934|MILLER|15600.0|    43.0|
| 7844|TURNER|18000.0|    50.0|
| 7499| ALLEN|19200.0|    53.0|
| 7782| CLARK|29400.0|    82.0|
| 7698| BLAKE|34200.0|    95.0|
| 7566| JONES|35700.0|    99.0|
| 7902|  FORD|36000.0|   100.0|
| 7788| SCOTT|36000.0|   100.0|
| 7839|  KING|60000.0|   167.0|
+-----+------+-------+--------+



In [81]:
# 179) List the empno,ename,hiredate,current date & exp in the ascending order of the
# exp.

emp.select("EMPNO", "ENAME", "HIREDATE", current_date(), (datediff(current_date(), "HIREDATE")/365).alias("EXP")).show()


+-----+------+----------+--------------+------------------+
|EMPNO| ENAME|  HIREDATE|current_date()|               EXP|
+-----+------+----------+--------------+------------------+
| 7369| SMITH|1980-12-17|    2018-11-27| 37.96986301369863|
| 7499| ALLEN|1981-02-20|    2018-11-27| 37.79178082191781|
| 7521|  WARD|1981-02-22|    2018-11-27| 37.78630136986301|
| 7566| JONES|1981-04-02|    2018-11-27|37.679452054794524|
| 7654|MARTIN|1981-09-28|    2018-11-27| 37.18904109589041|
| 7698| BLAKE|1981-05-01|    2018-11-27|              37.6|
| 7782| CLARK|1981-06-09|    2018-11-27| 37.49315068493151|
| 7788| SCOTT|1982-12-09|    2018-11-27| 35.99178082191781|
| 7839|  KING|1981-11-17|    2018-11-27| 37.05205479452055|
| 7844|TURNER|1981-09-08|    2018-11-27| 37.24383561643835|
| 7876| ADAMS|1983-01-12|    2018-11-27|  35.8986301369863|
| 7900| JAMES|1981-12-03|    2018-11-27| 37.00821917808219|
| 7902|  FORD|1981-12-03|    2018-11-27| 37.00821917808219|
| 7934|MILLER|1982-01-23|    2018-11-27|

In [82]:
#180 List the emps whose exp is more than 37 years.

emp.filter(datediff(current_date(), "HIREDATE")/365 > 36).show()

+-----+------+---------+----+----------+------+-------+------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE|   SAL|   COMM|DEPTNO|
+-----+------+---------+----+----------+------+-------+------+
| 7369| SMITH|    CLERK|7902|1980-12-17| 800.0|   null|    20|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600.0| 300.00|    30|
| 7521|  WARD| SALESMAN|7698|1981-02-22|1250.0| 500.00|    30|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975.0|   null|    20|
| 7654|MARTIN| SALESMAN|7698|1981-09-28|1250.0|1400.00|    30|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850.0|   null|    30|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450.0|   null|    10|
| 7839|  KING|PRESIDENT|null|1981-11-17|5000.0|   null|    10|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500.0|   0.00|    30|
| 7900| JAMES|    CLERK|7698|1981-12-03| 950.0|   null|    30|
| 7902|  FORD|  ANALYST|7566|1981-12-03|3000.0|   null|    20|
| 7934|MILLER|    CLERK|7782|1982-01-23|1300.0|   null|    10|
+-----+------+---------+----+----------+------+-------+

In [83]:
# 181) List
# the
# empno,ename,sal,TA30%,DA
# 40%,HRA
# 50%,GROSS,LIC,PF,net,deduction,net allow and net sal in the ascending order
# of the net salary.


emp.select("EMPNO", "ENAME", "SAL", (emp.SAL*0.3).alias("TA"), (emp.SAL*0.4).alias("DA"), (emp.SAL*0.5).alias("HRA")).show()

+-----+------+------+------+------+------+
|EMPNO| ENAME|   SAL|    TA|    DA|   HRA|
+-----+------+------+------+------+------+
| 7369| SMITH| 800.0| 240.0| 320.0| 400.0|
| 7499| ALLEN|1600.0| 480.0| 640.0| 800.0|
| 7521|  WARD|1250.0| 375.0| 500.0| 625.0|
| 7566| JONES|2975.0| 892.5|1190.0|1487.5|
| 7654|MARTIN|1250.0| 375.0| 500.0| 625.0|
| 7698| BLAKE|2850.0| 855.0|1140.0|1425.0|
| 7782| CLARK|2450.0| 735.0| 980.0|1225.0|
| 7788| SCOTT|3000.0| 900.0|1200.0|1500.0|
| 7839|  KING|5000.0|1500.0|2000.0|2500.0|
| 7844|TURNER|1500.0| 450.0| 600.0| 750.0|
| 7876| ADAMS|1100.0| 330.0| 440.0| 550.0|
| 7900| JAMES| 950.0| 285.0| 380.0| 475.0|
| 7902|  FORD|3000.0| 900.0|1200.0|1500.0|
| 7934|MILLER|1300.0| 390.0| 520.0| 650.0|
+-----+------+------+------+------+------+



In [84]:
# 182) List the emps who are working as managers.

emp.filter(emp.EMPNO.isin(emp.select("MGR").distinct().rdd.map(lambda x: x[0]).collect())).show()

+-----+-----+---------+----+----------+------+----+------+
|EMPNO|ENAME|      JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+-----+---------+----+----------+------+----+------+
| 7566|JONES|  MANAGER|7839|1981-04-02|2975.0|null|    20|
| 7698|BLAKE|  MANAGER|7839|1981-05-01|2850.0|null|    30|
| 7782|CLARK|  MANAGER|7839|1981-06-09|2450.0|null|    10|
| 7788|SCOTT|  ANALYST|7566|1982-12-09|3000.0|null|    20|
| 7839| KING|PRESIDENT|null|1981-11-17|5000.0|null|    10|
| 7902| FORD|  ANALYST|7566|1981-12-03|3000.0|null|    20|
+-----+-----+---------+----+----------+------+----+------+



In [85]:
# 183) List the emps who are either clerks or managers.

emp.filter((emp.EMPNO.isin(emp.select("MGR").distinct().rdd.map(lambda x: x[0]).collect())) | (emp.JOB == "CLERK")).show()

+-----+------+---------+----+----------+------+----+------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+------+---------+----+----------+------+----+------+
| 7369| SMITH|    CLERK|7902|1980-12-17| 800.0|null|    20|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975.0|null|    20|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850.0|null|    30|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450.0|null|    10|
| 7788| SCOTT|  ANALYST|7566|1982-12-09|3000.0|null|    20|
| 7839|  KING|PRESIDENT|null|1981-11-17|5000.0|null|    10|
| 7876| ADAMS|    CLERK|7788|1983-01-12|1100.0|null|    20|
| 7900| JAMES|    CLERK|7698|1981-12-03| 950.0|null|    30|
| 7902|  FORD|  ANALYST|7566|1981-12-03|3000.0|null|    20|
| 7934|MILLER|    CLERK|7782|1982-01-23|1300.0|null|    10|
+-----+------+---------+----+----------+------+----+------+



In [86]:
# 184) List the emps who have joined on the following dates 1 may 81,17 nov 81,30
# dec 81

emp.filter(col("HIREDATE").isin("1981-05-01", "1981-11-17", "1981-12-30")).show()

+-----+-----+---------+----+----------+------+----+------+
|EMPNO|ENAME|      JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+-----+---------+----+----------+------+----+------+
| 7698|BLAKE|  MANAGER|7839|1981-05-01|2850.0|null|    30|
| 7839| KING|PRESIDENT|null|1981-11-17|5000.0|null|    10|
+-----+-----+---------+----+----------+------+----+------+



In [87]:
# 185) List the emps who have joined in the year 1981.

emp.filter(year("HIREDATE") == 1981).show()

+-----+------+---------+----+----------+------+-------+------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE|   SAL|   COMM|DEPTNO|
+-----+------+---------+----+----------+------+-------+------+
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600.0| 300.00|    30|
| 7521|  WARD| SALESMAN|7698|1981-02-22|1250.0| 500.00|    30|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975.0|   null|    20|
| 7654|MARTIN| SALESMAN|7698|1981-09-28|1250.0|1400.00|    30|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850.0|   null|    30|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450.0|   null|    10|
| 7839|  KING|PRESIDENT|null|1981-11-17|5000.0|   null|    10|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500.0|   0.00|    30|
| 7900| JAMES|    CLERK|7698|1981-12-03| 950.0|   null|    30|
| 7902|  FORD|  ANALYST|7566|1981-12-03|3000.0|   null|    20|
+-----+------+---------+----+----------+------+-------+------+



In [88]:
# 186) List the emps whose annual sal ranging from 23000 to 40000.

emp.filter((emp.SAL*12).between(23000, 40000)).select("*",emp.SAL*12).show()

+-----+-----+-------+----+----------+------+----+------+----------+
|EMPNO|ENAME|    JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|(SAL * 12)|
+-----+-----+-------+----+----------+------+----+------+----------+
| 7566|JONES|MANAGER|7839|1981-04-02|2975.0|null|    20|   35700.0|
| 7698|BLAKE|MANAGER|7839|1981-05-01|2850.0|null|    30|   34200.0|
| 7782|CLARK|MANAGER|7839|1981-06-09|2450.0|null|    10|   29400.0|
| 7788|SCOTT|ANALYST|7566|1982-12-09|3000.0|null|    20|   36000.0|
| 7902| FORD|ANALYST|7566|1981-12-03|3000.0|null|    20|   36000.0|
+-----+-----+-------+----+----------+------+----+------+----------+



In [89]:
# 187) List the emps working under the mgrs 7369,7890,7654,7900.

emp.filter(emp.MGR.isin([7839,7890,7654,7900])).show()

+-----+-----+-------+----+----------+------+----+------+
|EMPNO|ENAME|    JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+-----+-------+----+----------+------+----+------+
| 7566|JONES|MANAGER|7839|1981-04-02|2975.0|null|    20|
| 7698|BLAKE|MANAGER|7839|1981-05-01|2850.0|null|    30|
| 7782|CLARK|MANAGER|7839|1981-06-09|2450.0|null|    10|
+-----+-----+-------+----+----------+------+----+------+



In [90]:
# 188) List the emps who joined in the second half of 82.

emp.filter(emp.HIREDATE.between("1982-06-01", "1982-12-31")).show()

+-----+-----+-------+----+----------+------+----+------+
|EMPNO|ENAME|    JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+-----+-------+----+----------+------+----+------+
| 7788|SCOTT|ANALYST|7566|1982-12-09|3000.0|null|    20|
+-----+-----+-------+----+----------+------+----+------+



In [91]:
# 189) List all the 4char emps.

emp.filter(length(emp.ENAME) == 4).show()

+-----+-----+---------+----+----------+------+------+------+
|EMPNO|ENAME|      JOB| MGR|  HIREDATE|   SAL|  COMM|DEPTNO|
+-----+-----+---------+----+----------+------+------+------+
| 7521| WARD| SALESMAN|7698|1981-02-22|1250.0|500.00|    30|
| 7839| KING|PRESIDENT|null|1981-11-17|5000.0|  null|    10|
| 7902| FORD|  ANALYST|7566|1981-12-03|3000.0|  null|    20|
+-----+-----+---------+----+----------+------+------+------+



In [92]:
# 190) List the emp names starting with ‘M’ with 6 chars.

emp.filter((emp.ENAME.like("M%")) & (length(emp.ENAME) == 6)).show()

+-----+------+--------+----+----------+------+-------+------+
|EMPNO| ENAME|     JOB| MGR|  HIREDATE|   SAL|   COMM|DEPTNO|
+-----+------+--------+----+----------+------+-------+------+
| 7654|MARTIN|SALESMAN|7698|1981-09-28|1250.0|1400.00|    30|
| 7934|MILLER|   CLERK|7782|1982-01-23|1300.0|   null|    10|
+-----+------+--------+----+----------+------+-------+------+



In [93]:
# 191) List the emps end with ‘H’ all together 5 chars.

emp.filter((emp.ENAME.like("%H")) & (length(emp.ENAME) == 5)).show()

+-----+-----+-----+----+----------+-----+----+------+
|EMPNO|ENAME|  JOB| MGR|  HIREDATE|  SAL|COMM|DEPTNO|
+-----+-----+-----+----+----------+-----+----+------+
| 7369|SMITH|CLERK|7902|1980-12-17|800.0|null|    20|
+-----+-----+-----+----+----------+-----+----+------+



In [94]:
# 192) List names start with ‘M’.

emp.filter((emp.ENAME.like("M%"))).show()

+-----+------+--------+----+----------+------+-------+------+
|EMPNO| ENAME|     JOB| MGR|  HIREDATE|   SAL|   COMM|DEPTNO|
+-----+------+--------+----+----------+------+-------+------+
| 7654|MARTIN|SALESMAN|7698|1981-09-28|1250.0|1400.00|    30|
| 7934|MILLER|   CLERK|7782|1982-01-23|1300.0|   null|    10|
+-----+------+--------+----+----------+------+-------+------+



In [95]:
# 193) List the emps who joined in the year 81.

emp.filter(year(emp.HIREDATE) == 1981).show()

+-----+------+---------+----+----------+------+-------+------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE|   SAL|   COMM|DEPTNO|
+-----+------+---------+----+----------+------+-------+------+
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600.0| 300.00|    30|
| 7521|  WARD| SALESMAN|7698|1981-02-22|1250.0| 500.00|    30|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975.0|   null|    20|
| 7654|MARTIN| SALESMAN|7698|1981-09-28|1250.0|1400.00|    30|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850.0|   null|    30|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450.0|   null|    10|
| 7839|  KING|PRESIDENT|null|1981-11-17|5000.0|   null|    10|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500.0|   0.00|    30|
| 7900| JAMES|    CLERK|7698|1981-12-03| 950.0|   null|    30|
| 7902|  FORD|  ANALYST|7566|1981-12-03|3000.0|   null|    20|
+-----+------+---------+----+----------+------+-------+------+



In [96]:
# 194) List the emps whose sal is ending with 00.

emp.filter(emp.SAL.cast("integer").like("%00")).show()

+-----+------+---------+----+----------+------+------+------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE|   SAL|  COMM|DEPTNO|
+-----+------+---------+----+----------+------+------+------+
| 7369| SMITH|    CLERK|7902|1980-12-17| 800.0|  null|    20|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600.0|300.00|    30|
| 7788| SCOTT|  ANALYST|7566|1982-12-09|3000.0|  null|    20|
| 7839|  KING|PRESIDENT|null|1981-11-17|5000.0|  null|    10|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500.0|  0.00|    30|
| 7876| ADAMS|    CLERK|7788|1983-01-12|1100.0|  null|    20|
| 7902|  FORD|  ANALYST|7566|1981-12-03|3000.0|  null|    20|
| 7934|MILLER|    CLERK|7782|1982-01-23|1300.0|  null|    10|
+-----+------+---------+----+----------+------+------+------+



In [97]:
# 195) List the emp who joined in the month of JAN.

emp.filter(month("HIREDATE") == 01).show()

+-----+------+-----+----+----------+------+----+------+
|EMPNO| ENAME|  JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+------+-----+----+----------+------+----+------+
| 7876| ADAMS|CLERK|7788|1983-01-12|1100.0|null|    20|
| 7934|MILLER|CLERK|7782|1982-01-23|1300.0|null|    10|
+-----+------+-----+----+----------+------+----+------+



In [98]:
# 196) Who joined in the month having char ‘a’.


emp.filter(date_format("HIREDATE","MMM").like("_a%")).show()


+-----+------+-------+----+----------+------+----+------+
|EMPNO| ENAME|    JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+------+-------+----+----------+------+----+------+
| 7698| BLAKE|MANAGER|7839|1981-05-01|2850.0|null|    30|
| 7876| ADAMS|  CLERK|7788|1983-01-12|1100.0|null|    20|
| 7934|MILLER|  CLERK|7782|1982-01-23|1300.0|null|    10|
+-----+------+-------+----+----------+------+----+------+



In [99]:
# 197) Who joined in the month having second char ‘a’

emp.filter(date_format("HIREDATE","MMM").like("_a%")).show()


+-----+------+-------+----+----------+------+----+------+
|EMPNO| ENAME|    JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+------+-------+----+----------+------+----+------+
| 7698| BLAKE|MANAGER|7839|1981-05-01|2850.0|null|    30|
| 7876| ADAMS|  CLERK|7788|1983-01-12|1100.0|null|    20|
| 7934|MILLER|  CLERK|7782|1982-01-23|1300.0|null|    10|
+-----+------+-------+----+----------+------+----+------+



In [100]:
# 198) List the emps whose salary is 4 digit number.

emp.filter(length(emp.SAL.cast("integer")) == 4).show()

+-----+------+---------+----+----------+------+-------+------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE|   SAL|   COMM|DEPTNO|
+-----+------+---------+----+----------+------+-------+------+
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600.0| 300.00|    30|
| 7521|  WARD| SALESMAN|7698|1981-02-22|1250.0| 500.00|    30|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975.0|   null|    20|
| 7654|MARTIN| SALESMAN|7698|1981-09-28|1250.0|1400.00|    30|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850.0|   null|    30|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450.0|   null|    10|
| 7788| SCOTT|  ANALYST|7566|1982-12-09|3000.0|   null|    20|
| 7839|  KING|PRESIDENT|null|1981-11-17|5000.0|   null|    10|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500.0|   0.00|    30|
| 7876| ADAMS|    CLERK|7788|1983-01-12|1100.0|   null|    20|
| 7902|  FORD|  ANALYST|7566|1981-12-03|3000.0|   null|    20|
| 7934|MILLER|    CLERK|7782|1982-01-23|1300.0|   null|    10|
+-----+------+---------+----+----------+------+-------+

In [101]:
# 199) List the emp who joined in 80’s.

emp.filter(year("HIREDATE") == 1980).show()

+-----+-----+-----+----+----------+-----+----+------+
|EMPNO|ENAME|  JOB| MGR|  HIREDATE|  SAL|COMM|DEPTNO|
+-----+-----+-----+----+----------+-----+----+------+
| 7369|SMITH|CLERK|7902|1980-12-17|800.0|null|    20|
+-----+-----+-----+----+----------+-----+----+------+



In [102]:
# 200) List the emp who are clerks who have exp more than 8ys.

emp.filter((emp.JOB == "CLERK") & (datediff(current_date(), emp.HIREDATE) > 365*8)).show()

+-----+------+-----+----+----------+------+----+------+
|EMPNO| ENAME|  JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+------+-----+----+----------+------+----+------+
| 7369| SMITH|CLERK|7902|1980-12-17| 800.0|null|    20|
| 7876| ADAMS|CLERK|7788|1983-01-12|1100.0|null|    20|
| 7900| JAMES|CLERK|7698|1981-12-03| 950.0|null|    30|
| 7934|MILLER|CLERK|7782|1982-01-23|1300.0|null|    10|
+-----+------+-----+----+----------+------+----+------+



In [103]:
# 201) List the mgrs of dept 10 or 20.

emp.alias("e").join(emp.alias("m"), [col("e.MGR") == col("m.EMPNO")]).select("m.*").distinct().filter(col("DEPTNO").isin([10,20])).show()

+-----+-----+---------+----+----------+------+----+------+
|EMPNO|ENAME|      JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+-----+---------+----+----------+------+----+------+
| 7788|SCOTT|  ANALYST|7566|1982-12-09|3000.0|null|    20|
| 7902| FORD|  ANALYST|7566|1981-12-03|3000.0|null|    20|
| 7566|JONES|  MANAGER|7839|1981-04-02|2975.0|null|    20|
| 7839| KING|PRESIDENT|null|1981-11-17|5000.0|null|    10|
| 7782|CLARK|  MANAGER|7839|1981-06-09|2450.0|null|    10|
+-----+-----+---------+----+----------+------+----+------+



In [104]:
# 202) List the emps joined in jan with salary ranging from 1500 to 4000.

emp.filter((month("HIREDATE") == 02) & (emp.SAL.between(1500, 4000))).show()

+-----+-----+--------+----+----------+------+------+------+
|EMPNO|ENAME|     JOB| MGR|  HIREDATE|   SAL|  COMM|DEPTNO|
+-----+-----+--------+----+----------+------+------+------+
| 7499|ALLEN|SALESMAN|7698|1981-02-20|1600.0|300.00|    30|
+-----+-----+--------+----+----------+------+------+------+



In [105]:
# 203) List the unique jobs of dept 20 and 30 in desc order.

emp.filter(emp.DEPTNO.isin([20,30])).select("JOB").distinct().show()

+--------+
|     JOB|
+--------+
| ANALYST|
|SALESMAN|
|   CLERK|
| MANAGER|
+--------+



In [106]:
# 204) List the emps along with exp of those working under the mgr whose number is
# starting with 7 but should not have a 9 joined before 1983.



In [107]:
# 205) List the emps who are working as either mgr or analyst with the salary ranging
# from 2000 to 5000 and with out comm.

emp.alias("e").join(emp.alias("m"), [col("e.MGR") == col("m.EMPNO")]).select("m.*").distinct()\
.filter((col("SAL").between(2000,5000))).show()

+-----+-----+---------+----+----------+------+----+------+
|EMPNO|ENAME|      JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+-----+---------+----+----------+------+----+------+
| 7788|SCOTT|  ANALYST|7566|1982-12-09|3000.0|null|    20|
| 7902| FORD|  ANALYST|7566|1981-12-03|3000.0|null|    20|
| 7566|JONES|  MANAGER|7839|1981-04-02|2975.0|null|    20|
| 7698|BLAKE|  MANAGER|7839|1981-05-01|2850.0|null|    30|
| 7839| KING|PRESIDENT|null|1981-11-17|5000.0|null|    10|
| 7782|CLARK|  MANAGER|7839|1981-06-09|2450.0|null|    10|
+-----+-----+---------+----+----------+------+----+------+



In [108]:
# 206) List the empno,ename,sal,job of the emps with /ann sal <34000 but receiving
# some comm. Which should not be>sal and desg should be sales man working
# for dept 30.

emp.filter((emp.JOB == "SALESMAN") & ((emp.DEPTNO) == 30)).filter((12 * when(emp.COMM.isNull(), emp.SAL)\
    .otherwise(emp.SAL + emp.COMM) < 34000)).filter(emp.SAL>emp.COMM).show()

+-----+------+--------+----+----------+------+------+------+
|EMPNO| ENAME|     JOB| MGR|  HIREDATE|   SAL|  COMM|DEPTNO|
+-----+------+--------+----+----------+------+------+------+
| 7499| ALLEN|SALESMAN|7698|1981-02-20|1600.0|300.00|    30|
| 7521|  WARD|SALESMAN|7698|1981-02-22|1250.0|500.00|    30|
| 7844|TURNER|SALESMAN|7698|1981-09-08|1500.0|  0.00|    30|
+-----+------+--------+----+----------+------+------+------+



In [109]:
# 207) List the emps who are working for dept 10 or 20 with desgs as clerk or analyst
# with a sal is either 3 or 4 digits with an exp>8ys but does not belong to mons of
# mar,apr,sep and working for mgrs &no is not ending with 88 and 56.



In [110]:
# 208) List the empno,ename,sal,job,deptno&exp of all the emps belongs to dept 10 or
# 20 with an exp 6 to 10 y working under the same mgr with out comm. With a
# job not ending irrespective of the position with comm.>200 with exp>=7y and
# sal<2500 but not belongs to the month sep or nov working under the mgr whose
# no is not having digits either 9 or 0 in the asc dept& desc dept

In [111]:
# 209) List the details of the emps working at Chicago.

emp.join(dept, "DEPTNO").filter('DLOC == "CHICAGO"').show()

+------+-----+------+--------+----+----------+------+-------+-----+-------+
|DEPTNO|EMPNO| ENAME|     JOB| MGR|  HIREDATE|   SAL|   COMM|DNAME|   DLOC|
+------+-----+------+--------+----+----------+------+-------+-----+-------+
|    30| 7499| ALLEN|SALESMAN|7698|1981-02-20|1600.0| 300.00|SALES|CHICAGO|
|    30| 7521|  WARD|SALESMAN|7698|1981-02-22|1250.0| 500.00|SALES|CHICAGO|
|    30| 7654|MARTIN|SALESMAN|7698|1981-09-28|1250.0|1400.00|SALES|CHICAGO|
|    30| 7698| BLAKE| MANAGER|7839|1981-05-01|2850.0|   null|SALES|CHICAGO|
|    30| 7844|TURNER|SALESMAN|7698|1981-09-08|1500.0|   0.00|SALES|CHICAGO|
|    30| 7900| JAMES|   CLERK|7698|1981-12-03| 950.0|   null|SALES|CHICAGO|
+------+-----+------+--------+----+----------+------+-------+-----+-------+



In [112]:
# 210) List the empno,ename,deptno,loc of all the emps.

emp.join(dept, "DEPTNO").select("EMPNO", "ENAME", "DEPTNO", "DLOC").show()

+-----+------+------+--------+
|EMPNO| ENAME|DEPTNO|    DLOC|
+-----+------+------+--------+
| 7369| SMITH|    20|  DALLAS|
| 7566| JONES|    20|  DALLAS|
| 7788| SCOTT|    20|  DALLAS|
| 7876| ADAMS|    20|  DALLAS|
| 7902|  FORD|    20|  DALLAS|
| 7782| CLARK|    10|NEW YORK|
| 7839|  KING|    10|NEW YORK|
| 7934|MILLER|    10|NEW YORK|
| 7499| ALLEN|    30| CHICAGO|
| 7521|  WARD|    30| CHICAGO|
| 7654|MARTIN|    30| CHICAGO|
| 7698| BLAKE|    30| CHICAGO|
| 7844|TURNER|    30| CHICAGO|
| 7900| JAMES|    30| CHICAGO|
+-----+------+------+--------+



In [113]:
# Some break to practice some miscellaneous functions

emp.withColumnRenamed("EMPNO", "ENO").show()

+----+------+---------+----+----------+------+-------+------+
| ENO| ENAME|      JOB| MGR|  HIREDATE|   SAL|   COMM|DEPTNO|
+----+------+---------+----+----------+------+-------+------+
|7369| SMITH|    CLERK|7902|1980-12-17| 800.0|   null|    20|
|7499| ALLEN| SALESMAN|7698|1981-02-20|1600.0| 300.00|    30|
|7521|  WARD| SALESMAN|7698|1981-02-22|1250.0| 500.00|    30|
|7566| JONES|  MANAGER|7839|1981-04-02|2975.0|   null|    20|
|7654|MARTIN| SALESMAN|7698|1981-09-28|1250.0|1400.00|    30|
|7698| BLAKE|  MANAGER|7839|1981-05-01|2850.0|   null|    30|
|7782| CLARK|  MANAGER|7839|1981-06-09|2450.0|   null|    10|
|7788| SCOTT|  ANALYST|7566|1982-12-09|3000.0|   null|    20|
|7839|  KING|PRESIDENT|null|1981-11-17|5000.0|   null|    10|
|7844|TURNER| SALESMAN|7698|1981-09-08|1500.0|   0.00|    30|
|7876| ADAMS|    CLERK|7788|1983-01-12|1100.0|   null|    20|
|7900| JAMES|    CLERK|7698|1981-12-03| 950.0|   null|    30|
|7902|  FORD|  ANALYST|7566|1981-12-03|3000.0|   null|    20|
|7934|MI

In [114]:
emp.schema.names

['EMPNO', 'ENAME', 'JOB', 'MGR', 'HIREDATE', 'SAL', 'COMM', 'DEPTNO']

In [115]:
#Find the sum of SAL+COMM department wise

emp.groupBy("DEPTNO").agg(sum(when(emp.COMM.isNull(), emp.COMM).otherwise(emp.SAL+emp.COMM))).show()

+------+-------------------------------------------------------------+
|DEPTNO|sum(CASE WHEN (COMM IS NULL) THEN COMM ELSE (SAL + COMM) END)|
+------+-------------------------------------------------------------+
|    20|                                                         null|
|    10|                                                         null|
|    30|                                                       7800.0|
+------+-------------------------------------------------------------+



In [116]:
emp.groupBy("DEPTNO").agg(sum(when(col("COMM").isNull(), col("SAL")).otherwise(col("SAL")+col("COMM")))).show()

+------+------------------------------------------------------------+
|DEPTNO|sum(CASE WHEN (COMM IS NULL) THEN SAL ELSE (SAL + COMM) END)|
+------+------------------------------------------------------------+
|    20|                                                     10875.0|
|    10|                                                      8750.0|
|    30|                                                     11600.0|
+------+------------------------------------------------------------+



In [117]:
emp.filter(col("COMM").isNull()).count()

10

In [118]:
emp.na.fill({'COMM': (0.0)}).show()

+-----+------+---------+----+----------+------+-------+------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE|   SAL|   COMM|DEPTNO|
+-----+------+---------+----+----------+------+-------+------+
| 7369| SMITH|    CLERK|7902|1980-12-17| 800.0|    0.0|    20|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600.0| 300.00|    30|
| 7521|  WARD| SALESMAN|7698|1981-02-22|1250.0| 500.00|    30|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975.0|    0.0|    20|
| 7654|MARTIN| SALESMAN|7698|1981-09-28|1250.0|1400.00|    30|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850.0|    0.0|    30|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450.0|    0.0|    10|
| 7788| SCOTT|  ANALYST|7566|1982-12-09|3000.0|    0.0|    20|
| 7839|  KING|PRESIDENT|null|1981-11-17|5000.0|    0.0|    10|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500.0|   0.00|    30|
| 7876| ADAMS|    CLERK|7788|1983-01-12|1100.0|    0.0|    20|
| 7900| JAMES|    CLERK|7698|1981-12-03| 950.0|    0.0|    30|
| 7902|  FORD|  ANALYST|7566|1981-12-03|3000.0|    0.0|

In [119]:
emp.withColumn("COMM", col("COMM").cast("double")).printSchema()

root
 |-- EMPNO: integer (nullable = true)
 |-- ENAME: string (nullable = true)
 |-- JOB: string (nullable = true)
 |-- MGR: integer (nullable = true)
 |-- HIREDATE: date (nullable = true)
 |-- SAL: double (nullable = true)
 |-- COMM: double (nullable = true)
 |-- DEPTNO: integer (nullable = true)



In [120]:
allenJobSAL = emp.filter('ENAME = "ALLEN"').select("SAL", "JOB")

allenSAL = allenJobSAL.head()[0]

allenJOB = allenJobSAL.head()[1]

emp.filter((col("SAL") == allenSAL) | (col("JOB") == allenJOB)).show()

+-----+------+--------+----+----------+------+-------+------+
|EMPNO| ENAME|     JOB| MGR|  HIREDATE|   SAL|   COMM|DEPTNO|
+-----+------+--------+----+----------+------+-------+------+
| 7499| ALLEN|SALESMAN|7698|1981-02-20|1600.0| 300.00|    30|
| 7521|  WARD|SALESMAN|7698|1981-02-22|1250.0| 500.00|    30|
| 7654|MARTIN|SALESMAN|7698|1981-09-28|1250.0|1400.00|    30|
| 7844|TURNER|SALESMAN|7698|1981-09-08|1500.0|   0.00|    30|
+-----+------+--------+----+----------+------+-------+------+



In [121]:
emp.join(sal, emp.SAL.between(sal.LOSAL, sal.HISAL), 'inner')\
    .join(dept, "DEPTNO", 'inner').filter('DNAME = "SALES"').show()

+------+-----+------+--------+----+----------+------+-------+-----+------+------+-----+-------+
|DEPTNO|EMPNO| ENAME|     JOB| MGR|  HIREDATE|   SAL|   COMM|GRADE| LOSAL| HISAL|DNAME|   DLOC|
+------+-----+------+--------+----+----------+------+-------+-----+------+------+-----+-------+
|    30| 7521|  WARD|SALESMAN|7698|1981-02-22|1250.0| 500.00|    2|1201.0|1400.0|SALES|CHICAGO|
|    30| 7499| ALLEN|SALESMAN|7698|1981-02-20|1600.0| 300.00|    3|1401.0|2000.0|SALES|CHICAGO|
|    30| 7654|MARTIN|SALESMAN|7698|1981-09-28|1250.0|1400.00|    2|1201.0|1400.0|SALES|CHICAGO|
|    30| 7698| BLAKE| MANAGER|7839|1981-05-01|2850.0|   null|    4|2001.0|3000.0|SALES|CHICAGO|
|    30| 7844|TURNER|SALESMAN|7698|1981-09-08|1500.0|   0.00|    3|1401.0|2000.0|SALES|CHICAGO|
|    30| 7900| JAMES|   CLERK|7698|1981-12-03| 950.0|   null|    1| 700.0|1200.0|SALES|CHICAGO|
+------+-----+------+--------+----+----------+------+-------+-----+------+------+-----+-------+



In [122]:
emp.join(sal, emp.SAL.between(sal.LOSAL, sal.HISAL))\
    .join(dept, "DEPTNO").filter('DNAME = "SALES"')

DataFrame[DEPTNO: int, EMPNO: int, ENAME: string, JOB: string, MGR: int, HIREDATE: date, SAL: double, COMM: string, GRADE: int, LOSAL: double, HISAL: double, DNAME: string, DLOC: string]

In [123]:
emp.rdd.saveAsTextFile("test")

Py4JJavaError: An error occurred while calling o1435.saveAsTextFile.
: org.apache.hadoop.mapred.FileAlreadyExistsException: Output directory file:/home/nineleaps/spark/test already exists
	at org.apache.hadoop.mapred.FileOutputFormat.checkOutputSpecs(FileOutputFormat.java:131)
	at org.apache.spark.internal.io.HadoopMapRedWriteConfigUtil.assertConf(SparkHadoopWriter.scala:283)
	at org.apache.spark.internal.io.SparkHadoopWriter$.write(SparkHadoopWriter.scala:71)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1096)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1094)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1094)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1094)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:1067)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:1032)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:1032)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1032)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply$mcV$sp(PairRDDFunctions.scala:958)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:958)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:958)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:957)
	at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply$mcV$sp(RDD.scala:1493)
	at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1472)
	at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1472)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1472)
	at org.apache.spark.api.java.JavaRDDLike$class.saveAsTextFile(JavaRDDLike.scala:550)
	at org.apache.spark.api.java.AbstractJavaRDDLike.saveAsTextFile(JavaRDDLike.scala:45)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [None]:
e_c = emp.rdd.coalesce(1)

In [None]:
e_c.toDF().write.format("ORC").save("test2")

In [None]:
pairs = sc.parallelize([1, 2, 3, 4, 2, 4, 1]).map(lambda x: (x, x))
sets = pairs.partitionBy(4).collect()
set(sets[0])

In [None]:
set(sets[0])

In [None]:
print("Testing: union(otherDataset)")

u01_rdd = sc.parallelize(range(1,9))
u02_rdd = sc.parallelize(range(5,15))
union_rdd = u01_rdd.union(u02_rdd).collect()

union_rdd

In [None]:
print("Testing: intersection(otherDataset)")

i01_rdd = sc.parallelize(range(1,9))
i02_rdd = sc.parallelize(range(1,15))
intersection_rdd = i01_rdd.intersection(i02_rdd).collect()

intersection_rdd

In [None]:
print("Testing: intersection(otherDataset)")

s01_rdd = sc.parallelize(['A', 'B'])
s02_rdd = sc.parallelize(['B', 'C'])
subtract_rdd = s01_rdd.subtract(s02_rdd).collect()

subtract_rdd

In [None]:
# Good to understand the below scenerio
# Say I have the data set one A,B and the dataset B,C here I want the result A,C Because I want to subtract b from both the sets.

set1 = sc.parallelize(['A', 'B'])
set2 = sc.parallelize(['B', 'C'])
u_set = set1.union(set2) #result A,B,B,C
i_set = set1.intersection(set2) #result B
u_set.subtract(i_set).collect()

In [None]:
a = sc.parallelize([(1,"sun"),(2,"moon"),(3,"mercury")])

a.collectAsMap()[2]

In [None]:
type(a)

In [None]:
# Back to Dataframe questions

In [None]:
# 211) List the empno,ename,loc,dname of all the depts.,10 and 20.

emp.join(dept, "DEPTNO").filter(col("DEPTNO").isin([10,20])).show() #filter after join

emp.alias("e").join(dept.alias("d"), [col("e.DEPTNO") == col("d.DEPTNO"), col("e.DEPTNO").isin([10,20])]).show()

# made condition during join itself

In [None]:
# 212) List the empno, ename, sal, loc of the emps working at Chicago dallas with an
# exp>38ys

emp.join(dept, "DEPTNO").filter((year(current_date()) - (year("HIREDATE")) > 37) & (dept.DLOC.isin(["DALLAS", "CHICAGO"]))).show()

In [None]:
# 213) List the emps along with loc of those who belongs to dallas ,newyork with sal
# ranging from 2000 to 5000 joined in 81.

emp.join(dept, "DEPTNO").filter((col("DLOC").isin(["DALLAS", "NEWYORK"])) & (col("SAL").between(2000, 5000)) & (year("HIREDATE") == 1981)).show()

In [None]:
# 214) List the empno,ename,sal,grade of all emps.

emp.alias("e").join(sal.alias("s"), col("e.SAL").between(col("s.LOSAL"), col("s.HISAL"))).show()



In [None]:
# 215) List the grade 2 and 3 emp of Chicago.

emp.alias("e").join(sal.alias("s"), col("e.SAL").between(col("s.LOSAL"), col("s.HISAL")))\
.join(dept.alias("d"), col("e.DEPTNO") == col("d.DEPTNO"))\
.filter((col("s.grade").isin([2,3])) & (col("d.DLOC") == "CHICAGO")).show()

In [None]:
# 216) List the emps with loc and grade of accounting dept or the locs dallas or
# Chicago with the grades 3 to 5 &exp >6y


emp.alias("e").join(sal.alias("s"), col("e.SAL").between(col("s.LOSAL"), col("s.HISAL"))).join(dept.alias("d"), "DEPTNO")\
.filter((col("d.DNAME") == "ACCOUNTING") | (col("d.DLOC").isin(["DALLAS", "CHICAGO"])) & (col("s.GRADE").isin(3,4,5)) & ((year(current_date()) - year("HIREDATE")) > 7)).show()

In [None]:
# 217) List the grades 3 emps of research and operations depts.. joined after 1987 and
# whose names should not be either miller or allen.

emp.alias("e").join(sal.alias("s"), col("e.SAL").between(col("s.LOSAL"), col("s.HISAL"))).join(dept.alias("d"), "DEPTNO")\
.filter((year("HIREDATE") > 1987) & (~col("e.ENAME").isin(["MILLER", "ALLEN"]))).show()

In [None]:
# 218) List the emps whose job is same as smith.

smithJOb = emp.filter(col("ENAME") == "SMITH").select("JOB").head()[0]
emp.filter(col("JOB") == emp.filter(col("ENAME") == "SMITH").select("JOB").head()[0]).show() # wriiten like subquery in sql

In [None]:
# 219) List the emps who are senior to miller.

yearOfMiller = emp.filter(col("ENAME") == "MILLER").select("HIREDATE").head()[0]

emp.filter(col("HIREDATE") > yearOfMiller).show()

In [None]:
# 220) List the emps whose job is same as either allen or sal>allen.


allenJob = emp.filter(col("ENAME") == "ALLEN").select("JOB", "SAL").head()[0]
allenSal = emp.filter(col("ENAME") == "ALLEN").select("JOB", "SAL").head()[1]

emp.filter((col("JOB") == allenJob) | (col("SAL") > allenSal)).show()

In [None]:
# 221) List the emps who are senior to their own manager.

emp.alias("e").join(emp.alias("m"), [col("e.MGR") == col("m.EMPNO"), col("e.HIREDATE") < col("m.HIREDATE")]).select("e.*").show()

In [None]:
# 222) List the emps whose sal greater than blakes sal.

blakSal = emp.filter(col("ENAME") == "BLAKE").select("SAL").head()[0]

emp.filter(col("SAL") > blakSal).show()

In [None]:
# 223) List the dept 10 emps whose sal>allen sal.

allenSal = emp.filter(col("ENAME") == "ALLEN").select("SAL").head()[0]

emp.filter((col("SAL") > allenSal) & (col("DEPTNO") == 10)).show()

In [None]:
# 224) List the mgrs who are senior to king and who are junior to smith.

emp.alias("e").join(emp.alias("m"), col("e.MGR") == col("m.EMPNO")).\
filter((col("m.HIREDATE") < emp.filter(col("ENAME") == "KING").select("HIREDATE").head()[0]) & \
       (col("m.HIREDATE") > emp.filter(col("ENAME") == "SMITH").select("HIREDATE").head()[0])).select("m.*").distinct().show()

In [None]:
# 225) List the empno,ename,loc,sal,dname,loc of the all the emps belonging to king
# dept.

kingDep = emp.filter(col("ENAME") == "KING").head()[0]

emp.filter(col("DEPTNO") == 10).join(dept, emp.DEPTNO == dept.DEPTNO).show()

In [None]:
# 226) List the emps whose salgrade are greater than the grade of miller.

gradeMill = emp.filter(col("ENAME") == "MILLER").alias("e")\
    .join(sal.alias("s"), col("e.SAL").between(col("s.LOSAL"), col("s.HISAL"))).select("s.GRADE").head()[0]


emp.alias("e")\
    .join(sal.alias("s"), col("e.SAL").between(col("s.LOSAL"), col("s.HISAL"))).filter(col("S.GRADE") > gradeMill).show()


In [None]:
# 227) List the emps who are belonging dallas or Chicago with the grade same as
# adams or exp more than smith.

all3 = emp.alias("e").join(sal.alias("s"), e.SAL.between(col("s.LOSAL"), col("s.HISAL"))).join(dept.alias("d"), "DEPTNO")

all3\
.filter( (col("DLOC").isin(["DALLAS", "CHICAGO"])) \
        & (col("GRADE") == all3.filter((col("ENAME") == "ADAMS")).select("GRADE").head()[0]) \
        | (datediff(current_date(), "HIREDATE") > datediff(current_date(), lit(emp.filter(col("ENAME") == "SMITH").select("HIREDATE").head()[0])))).show()

In [None]:
# 228) List the emps whose sal is same as ford or blake.

fordSal = emp.filter(col("ENAME") == "FORD").select("SAL").head()[0]
blackSal = emp.filter(col("ENAME") == "BLAKE").select("SAL").head()[0]

emp.filter(col("SAL").isin([fordSal, blackSal])).show()

In [None]:
# 229) List the emps whose sal is same as any one of the following.



In [None]:
# 230) Sal of any clerk of emp1 table.

emp.filter(col("JOB") == "CLERK").show()

In [None]:
# 231) Any emp of emp2 joined before 82.

emp.filter(year("HIREDATE") < 1982).show()