In [1]:
from pyspark import SparkContext, SparkConf
from pyspark import sql
from pyspark.sql import Row                       # To use Row method for column
#from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql import Window

In [2]:
conf = SparkConf().set("spark.executor.memory", "4g")
sc = SparkContext(conf=conf)
sqlContext = sql.SQLContext(sc)

In [3]:
sc._conf.set("spark.driver.memory","3")
sc._conf.set("spark.sql.crossJoin.enabled", "true")

<pyspark.conf.SparkConf at 0x7efea5c34ad0>

In [4]:
sc._conf.getAll()

[(u'spark.sql.crossJoin.enabled', u'true'),
 (u'spark.driver.port', u'44325'),
 (u'spark.app.id', u'local-1543318839362'),
 (u'spark.rdd.compress', u'True'),
 (u'spark.executor.memory', u'4g'),
 (u'spark.serializer.objectStreamReset', u'100'),
 (u'spark.master', u'local[*]'),
 (u'spark.driver.memory', u'3'),
 (u'spark.executor.id', u'driver'),
 (u'spark.submit.deployMode', u'client'),
 (u'spark.ui.showConsoleProgress', u'true'),
 (u'spark.app.name', u'pyspark-shell'),
 (u'spark.driver.host', u'192.168.1.75')]

In [5]:
sc._conf.get("spark.submit.deployMode")

u'client'

In [6]:
m = sc.parallelize([(1, 2), (3, 4)]).collectAsMap()

m

{1: 2, 3: 4}

In [7]:
#Tricky RDD Example with dict type

sam = sc.parallelize((['A',1,1],['B',1,2],['C',1,5],['D',2,3],['A',2,1],['B',2,5],['C',2,3],['D',2,1]))

df_sample = sqlContext.createDataFrame(sam, ['C1','C2','C3'])

sa = df_sample.rdd

sa1 = sa.map(lambda x: (x[0],(x[1],x[2])))

sa1.collect()

sa2 = sa1.groupByKey().map(lambda x : (x[0], dict(list(x[1]))))

sa3 = sqlContext.createDataFrame(sa2, ['C1', 'C2C3'])

sa3.show()

+---+----------------+
| C1|            C2C3|
+---+----------------+
|  A|[1 -> 1, 2 -> 1]|
|  C|[1 -> 5, 2 -> 3]|
|  B|[1 -> 2, 2 -> 5]|
|  D|        [2 -> 1]|
+---+----------------+



In [8]:
sam = sc.parallelize((['A',1,1],['B',1,2],['C',1,5],['D',2,3],['A',2,1],['B',2,5],['C',2,3],['D',2,1]))

df_sample = sqlContext.createDataFrame(sam, ['C1','C2','C3'])

sa = df_sample.rdd

a = sa.filter(lambda x: 'B' in x[0]).map(lambda x: ((x[1]), (x[2])))

print(a.collectAsMap()[2])

a.collect()

5


[(1, 2), (2, 5)]

In [9]:
empRDD = sc.textFile("emp.txt")
depRDD = sc.textFile("dept.txt")
salRDD = sc.textFile("salgrade.txt")

In [10]:
emp_col = Row('EMPNO', 'ENAME', 'JOB', 'MGR', 'HIREDATE', 'SAL', 'COMM', 'DEPTNO')
dep_col = Row('DEPTNO', 'DNAME', 'DLOC')
sal_col = Row('GRADE', 'LOSAL', 'HISAL')

In [11]:
empROW = empRDD.map(lambda x: x.split(',')).map(lambda r: emp_col(*r))
depRow = depRDD.map(lambda x: x.split(',')).map(lambda r: dep_col(*r))
salROW = salRDD.map(lambda x: x.split(',')).map(lambda r: sal_col(*r))

In [12]:
empDF = sqlContext.createDataFrame(empROW)
depDF = sqlContext.createDataFrame(depRow)
salDF = sqlContext.createDataFrame(salROW)

In [13]:
# Converting Dataframe values to it types

def toInt(i):
    return i.cast("integer")

def toDouble(d):
    return d.cast("double")

emp = empDF.withColumn("EMPNO", toInt(empDF.EMPNO)).withColumn("MGR", toInt(empDF.MGR)).withColumn("HIREDATE",to_date('HIREDATE')).withColumn("SAL", toDouble(empDF.SAL)).withColumn('COMM', when(empDF.COMM == 'NULL', lit(None)).otherwise(empDF.COMM)).withColumn("DEPTNO", toInt(empDF.DEPTNO)).filter('ENAME != "ENAME"')
dept = depDF.withColumn("DEPTNO", toInt(depDF.DEPTNO)).na.drop()
sal = salDF.withColumn("GRADE", toInt(salDF.GRADE)).withColumn("LOSAL", toDouble(salDF.LOSAL)).withColumn("HISAL", toDouble(salDF.HISAL)).na.drop()

sqlContext.registerDataFrameAsTable(emp, "emptab")
sqlContext.registerDataFrameAsTable(dept, "depttab")
sqlContext.registerDataFrameAsTable(sal, "saltab")


In [14]:
#Assignment to Practice dataframe

In [15]:
# 1) Display all the information of the emp,dept,sal table?

emp.select('*').show()
dept.select('*').show()
sal.select('*').show()

+-----+------+---------+----+----------+------+-------+------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE|   SAL|   COMM|DEPTNO|
+-----+------+---------+----+----------+------+-------+------+
| 7369| SMITH|    CLERK|7902|1980-12-17| 800.0|   null|    20|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600.0| 300.00|    30|
| 7521|  WARD| SALESMAN|7698|1981-02-22|1250.0| 500.00|    30|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975.0|   null|    20|
| 7654|MARTIN| SALESMAN|7698|1981-09-28|1250.0|1400.00|    30|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850.0|   null|    30|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450.0|   null|    10|
| 7788| SCOTT|  ANALYST|7566|1982-12-09|3000.0|   null|    20|
| 7839|  KING|PRESIDENT|null|1981-11-17|5000.0|   null|    10|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500.0|   0.00|    30|
| 7876| ADAMS|    CLERK|7788|1983-01-12|1100.0|   null|    20|
| 7900| JAMES|    CLERK|7698|1981-12-03| 950.0|   null|    30|
| 7902|  FORD|  ANALYST|7566|1981-12-03|3000.0|   null|

In [16]:
# 2. Display unique Jobs from EMP table?

emp.select("JOB").distinct().show()

+---------+
|      JOB|
+---------+
|  ANALYST|
| SALESMAN|
|    CLERK|
|  MANAGER|
|PRESIDENT|
+---------+



In [17]:
# 3. List the emps in the asc order of their Salaries?

emp.sort(asc('SAL')).show()

+-----+------+---------+----+----------+------+-------+------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE|   SAL|   COMM|DEPTNO|
+-----+------+---------+----+----------+------+-------+------+
| 7369| SMITH|    CLERK|7902|1980-12-17| 800.0|   null|    20|
| 7900| JAMES|    CLERK|7698|1981-12-03| 950.0|   null|    30|
| 7876| ADAMS|    CLERK|7788|1983-01-12|1100.0|   null|    20|
| 7521|  WARD| SALESMAN|7698|1981-02-22|1250.0| 500.00|    30|
| 7654|MARTIN| SALESMAN|7698|1981-09-28|1250.0|1400.00|    30|
| 7934|MILLER|    CLERK|7782|1982-01-23|1300.0|   null|    10|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500.0|   0.00|    30|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600.0| 300.00|    30|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450.0|   null|    10|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850.0|   null|    30|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975.0|   null|    20|
| 7788| SCOTT|  ANALYST|7566|1982-12-09|3000.0|   null|    20|
| 7902|  FORD|  ANALYST|7566|1981-12-03|3000.0|   null|

In [18]:
# 4. List the details of the emps in asc order of the Dptnos and desc of Jobs?

emp.sort(asc('DEPTNO'),desc('JOB')).show()

+-----+------+---------+----+----------+------+-------+------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE|   SAL|   COMM|DEPTNO|
+-----+------+---------+----+----------+------+-------+------+
| 7839|  KING|PRESIDENT|null|1981-11-17|5000.0|   null|    10|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450.0|   null|    10|
| 7934|MILLER|    CLERK|7782|1982-01-23|1300.0|   null|    10|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975.0|   null|    20|
| 7369| SMITH|    CLERK|7902|1980-12-17| 800.0|   null|    20|
| 7876| ADAMS|    CLERK|7788|1983-01-12|1100.0|   null|    20|
| 7902|  FORD|  ANALYST|7566|1981-12-03|3000.0|   null|    20|
| 7788| SCOTT|  ANALYST|7566|1982-12-09|3000.0|   null|    20|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500.0|   0.00|    30|
| 7521|  WARD| SALESMAN|7698|1981-02-22|1250.0| 500.00|    30|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600.0| 300.00|    30|
| 7654|MARTIN| SALESMAN|7698|1981-09-28|1250.0|1400.00|    30|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850.0|   null|

In [19]:
# 5. Display all the unique job groups in the descending order?

emp.select("JOB").sort(desc("JOB")).distinct().show()


+---------+
|      JOB|
+---------+
| SALESMAN|
|PRESIDENT|
|  MANAGER|
|    CLERK|
|  ANALYST|
+---------+



In [20]:
sqlContext.registerDataFrameAsTable(emp, "empTab")

sqlContext.sql("Select * from emptab where EMPNO in (select MGR from emptab)").show()

+-----+-----+---------+----+----------+------+----+------+
|EMPNO|ENAME|      JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+-----+---------+----+----------+------+----+------+
| 7788|SCOTT|  ANALYST|7566|1982-12-09|3000.0|null|    20|
| 7902| FORD|  ANALYST|7566|1981-12-03|3000.0|null|    20|
| 7566|JONES|  MANAGER|7839|1981-04-02|2975.0|null|    20|
| 7698|BLAKE|  MANAGER|7839|1981-05-01|2850.0|null|    30|
| 7839| KING|PRESIDENT|null|1981-11-17|5000.0|null|    10|
| 7782|CLARK|  MANAGER|7839|1981-06-09|2450.0|null|    10|
+-----+-----+---------+----+----------+------+----+------+



In [21]:
# 6. Display all the details of all ‘Mgrs’

mgrId = emp.select("MGR").distinct()

mgrIDL = mgrId.rdd.map(lambda x: x[0]).collect()  # x[0]: first column, collect will return []

mgrDet = emp.filter(emp.EMPNO.isin(mgrIDL)) #passing mgrIDL-list inside to filter

mgrDet.show()

+-----+-----+---------+----+----------+------+----+------+
|EMPNO|ENAME|      JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+-----+---------+----+----------+------+----+------+
| 7566|JONES|  MANAGER|7839|1981-04-02|2975.0|null|    20|
| 7698|BLAKE|  MANAGER|7839|1981-05-01|2850.0|null|    30|
| 7782|CLARK|  MANAGER|7839|1981-06-09|2450.0|null|    10|
| 7788|SCOTT|  ANALYST|7566|1982-12-09|3000.0|null|    20|
| 7839| KING|PRESIDENT|null|1981-11-17|5000.0|null|    10|
| 7902| FORD|  ANALYST|7566|1981-12-03|3000.0|null|    20|
+-----+-----+---------+----+----------+------+----+------+



In [22]:
# 7. List the emps who joined before 1981.

emp.filter(year('HIREDATE') < '1981').show()

+-----+-----+-----+----+----------+-----+----+------+
|EMPNO|ENAME|  JOB| MGR|  HIREDATE|  SAL|COMM|DEPTNO|
+-----+-----+-----+----+----------+-----+----+------+
| 7369|SMITH|CLERK|7902|1980-12-17|800.0|null|    20|
+-----+-----+-----+----+----------+-----+----+------+



In [23]:
#Find the name of the highest paied employee

a = emp.withColumn("rank", dense_rank().over(Window.orderBy(desc("SAL"))))

b = a.filter('rank = 1').select('ENAME').show()

+-----+
|ENAME|
+-----+
| KING|
+-----+



In [24]:
#8. List the Empno, Ename, Sal, Daily sal of all emps in the asc order of Annsal.

wid_dS = emp.withColumn("dSAL", emp.SAL/30).withColumn("aSal", emp.SAL*12) #with daily salary

wid_dS.sort(asc('aSal')).show() #wid_asc



+-----+------+---------+----+----------+------+-------+------+------------------+-------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE|   SAL|   COMM|DEPTNO|              dSAL|   aSal|
+-----+------+---------+----+----------+------+-------+------+------------------+-------+
| 7369| SMITH|    CLERK|7902|1980-12-17| 800.0|   null|    20|26.666666666666668| 9600.0|
| 7900| JAMES|    CLERK|7698|1981-12-03| 950.0|   null|    30|31.666666666666668|11400.0|
| 7876| ADAMS|    CLERK|7788|1983-01-12|1100.0|   null|    20|36.666666666666664|13200.0|
| 7521|  WARD| SALESMAN|7698|1981-02-22|1250.0| 500.00|    30|41.666666666666664|15000.0|
| 7654|MARTIN| SALESMAN|7698|1981-09-28|1250.0|1400.00|    30|41.666666666666664|15000.0|
| 7934|MILLER|    CLERK|7782|1982-01-23|1300.0|   null|    10|43.333333333333336|15600.0|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500.0|   0.00|    30|              50.0|18000.0|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600.0| 300.00|    30|53.333333333333336|19200.0|
| 7782| CL

In [25]:
# 9. Display the Empno, Ename, job, Hiredate, Exp of all Mgrs

mgrDet.withColumn("EXP", year(current_date()) - year("HIREDATE")).show()

+-----+-----+---------+----+----------+------+----+------+---+
|EMPNO|ENAME|      JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|EXP|
+-----+-----+---------+----+----------+------+----+------+---+
| 7566|JONES|  MANAGER|7839|1981-04-02|2975.0|null|    20| 37|
| 7698|BLAKE|  MANAGER|7839|1981-05-01|2850.0|null|    30| 37|
| 7782|CLARK|  MANAGER|7839|1981-06-09|2450.0|null|    10| 37|
| 7788|SCOTT|  ANALYST|7566|1982-12-09|3000.0|null|    20| 36|
| 7839| KING|PRESIDENT|null|1981-11-17|5000.0|null|    10| 37|
| 7902| FORD|  ANALYST|7566|1981-12-03|3000.0|null|    20| 37|
+-----+-----+---------+----+----------+------+----+------+---+



In [26]:
# 10.List the Empno, Ename, Sal, Exp of all emps working for Mgr 7839.

emp.filter('MGR = 7839').withColumn("EXP", year(current_date()) - year("HIREDATE")).show()


+-----+-----+-------+----+----------+------+----+------+---+
|EMPNO|ENAME|    JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|EXP|
+-----+-----+-------+----+----------+------+----+------+---+
| 7566|JONES|MANAGER|7839|1981-04-02|2975.0|null|    20| 37|
| 7698|BLAKE|MANAGER|7839|1981-05-01|2850.0|null|    30| 37|
| 7782|CLARK|MANAGER|7839|1981-06-09|2450.0|null|    10| 37|
+-----+-----+-------+----+----------+------+----+------+---+



In [27]:
# 11.Display all the details of the emps whose Comm. Is more than their Sal.

emp.filter('COMM > SAL').show()

+-----+------+--------+----+----------+------+-------+------+
|EMPNO| ENAME|     JOB| MGR|  HIREDATE|   SAL|   COMM|DEPTNO|
+-----+------+--------+----+----------+------+-------+------+
| 7654|MARTIN|SALESMAN|7698|1981-09-28|1250.0|1400.00|    30|
+-----+------+--------+----+----------+------+-------+------+



In [28]:
# 12. List the emps in the asc order of Designations of those joined after the second half of 1981.

# emp.filter(col("HIREDATE") > '1981-06-01').filter(year(col("HIREDATE")) == 1981).sort(asc('JOB')).show()


emp.filter(col("HIREDATE") > '1981-06-01').filter(year(col("HIREDATE")) == 1981).sort(asc('JOB')).show()

+-----+------+---------+----+----------+------+-------+------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE|   SAL|   COMM|DEPTNO|
+-----+------+---------+----+----------+------+-------+------+
| 7902|  FORD|  ANALYST|7566|1981-12-03|3000.0|   null|    20|
| 7900| JAMES|    CLERK|7698|1981-12-03| 950.0|   null|    30|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450.0|   null|    10|
| 7839|  KING|PRESIDENT|null|1981-11-17|5000.0|   null|    10|
| 7654|MARTIN| SALESMAN|7698|1981-09-28|1250.0|1400.00|    30|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500.0|   0.00|    30|
+-----+------+---------+----+----------+------+-------+------+



In [29]:
# 13. List the emps along with their Exp and Daily Sal is more than Rs.100.

emp.select('*', (year(current_date()) - year(emp.HIREDATE)).alias("EXP"), col('SAL')/30).show(truncate=True)


+-----+------+---------+----+----------+------+-------+------+---+------------------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE|   SAL|   COMM|DEPTNO|EXP|        (SAL / 30)|
+-----+------+---------+----+----------+------+-------+------+---+------------------+
| 7369| SMITH|    CLERK|7902|1980-12-17| 800.0|   null|    20| 38|26.666666666666668|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600.0| 300.00|    30| 37|53.333333333333336|
| 7521|  WARD| SALESMAN|7698|1981-02-22|1250.0| 500.00|    30| 37|41.666666666666664|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975.0|   null|    20| 37| 99.16666666666667|
| 7654|MARTIN| SALESMAN|7698|1981-09-28|1250.0|1400.00|    30| 37|41.666666666666664|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850.0|   null|    30| 37|              95.0|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450.0|   null|    10| 37| 81.66666666666667|
| 7788| SCOTT|  ANALYST|7566|1982-12-09|3000.0|   null|    20| 36|             100.0|
| 7839|  KING|PRESIDENT|null|1981-11-17|5000.0|   null

In [30]:
# 14. List the emps who are either ‘CLERK’ or ‘ANALYST’ in the Desc order.

emp.filter(emp.JOB.isin('CLERK', 'ANALYST')).orderBy(emp.JOB, ascending = False).show()

emp.filter(emp.JOB.isin('CLERK', 'ANALYST')).sort(desc('JOB')).show() #orderBy(emp.JOB, ascending = False).show()

+-----+------+-------+----+----------+------+----+------+
|EMPNO| ENAME|    JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+------+-------+----+----------+------+----+------+
| 7369| SMITH|  CLERK|7902|1980-12-17| 800.0|null|    20|
| 7934|MILLER|  CLERK|7782|1982-01-23|1300.0|null|    10|
| 7900| JAMES|  CLERK|7698|1981-12-03| 950.0|null|    30|
| 7876| ADAMS|  CLERK|7788|1983-01-12|1100.0|null|    20|
| 7788| SCOTT|ANALYST|7566|1982-12-09|3000.0|null|    20|
| 7902|  FORD|ANALYST|7566|1981-12-03|3000.0|null|    20|
+-----+------+-------+----+----------+------+----+------+

+-----+------+-------+----+----------+------+----+------+
|EMPNO| ENAME|    JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+------+-------+----+----------+------+----+------+
| 7369| SMITH|  CLERK|7902|1980-12-17| 800.0|null|    20|
| 7934|MILLER|  CLERK|7782|1982-01-23|1300.0|null|    10|
| 7900| JAMES|  CLERK|7698|1981-12-03| 950.0|null|    30|
| 7876| ADAMS|  CLERK|7788|1983-01-12|1100.0|null|    20|
| 7788| SCOTT

In [31]:
# 15. List the emps who joined on 1-MAY-81,3-DEC-81,17-DEC-80,19-JAN-80 in asc order of seniority.

emp.filter(emp.HIREDATE.isin('1981-05-01','1981-12-03','1980-12-17','1980-01-19')).show()

+-----+-----+-------+----+----------+------+----+------+
|EMPNO|ENAME|    JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+-----+-------+----+----------+------+----+------+
| 7369|SMITH|  CLERK|7902|1980-12-17| 800.0|null|    20|
| 7698|BLAKE|MANAGER|7839|1981-05-01|2850.0|null|    30|
| 7900|JAMES|  CLERK|7698|1981-12-03| 950.0|null|    30|
| 7902| FORD|ANALYST|7566|1981-12-03|3000.0|null|    20|
+-----+-----+-------+----+----------+------+----+------+



In [32]:
# 16. List the emp who are working for the Deptno 10 or20.

emp.filter(emp.DEPTNO.isin(10,20)).show()


+-----+------+---------+----+----------+------+----+------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+------+---------+----+----------+------+----+------+
| 7369| SMITH|    CLERK|7902|1980-12-17| 800.0|null|    20|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975.0|null|    20|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450.0|null|    10|
| 7788| SCOTT|  ANALYST|7566|1982-12-09|3000.0|null|    20|
| 7839|  KING|PRESIDENT|null|1981-11-17|5000.0|null|    10|
| 7876| ADAMS|    CLERK|7788|1983-01-12|1100.0|null|    20|
| 7902|  FORD|  ANALYST|7566|1981-12-03|3000.0|null|    20|
| 7934|MILLER|    CLERK|7782|1982-01-23|1300.0|null|    10|
+-----+------+---------+----+----------+------+----+------+



In [33]:
#17. List the emps who are joined in the year 80.

emp.filter(emp.HIREDATE.between('1980-01-01','1980-12-31')).show()

emp.filter(year("HIREDATE") == 1980).show()

+-----+-----+-----+----+----------+-----+----+------+
|EMPNO|ENAME|  JOB| MGR|  HIREDATE|  SAL|COMM|DEPTNO|
+-----+-----+-----+----+----------+-----+----+------+
| 7369|SMITH|CLERK|7902|1980-12-17|800.0|null|    20|
+-----+-----+-----+----+----------+-----+----+------+

+-----+-----+-----+----+----------+-----+----+------+
|EMPNO|ENAME|  JOB| MGR|  HIREDATE|  SAL|COMM|DEPTNO|
+-----+-----+-----+----+----------+-----+----+------+
| 7369|SMITH|CLERK|7902|1980-12-17|800.0|null|    20|
+-----+-----+-----+----+----------+-----+----+------+



In [34]:
#18. List the emps who are joined in the month of Aug 1980.


emp.filter(year('HIREDATE') == 1980).filter(month('HIREDATE') == 12).show()

+-----+-----+-----+----+----------+-----+----+------+
|EMPNO|ENAME|  JOB| MGR|  HIREDATE|  SAL|COMM|DEPTNO|
+-----+-----+-----+----+----------+-----+----+------+
| 7369|SMITH|CLERK|7902|1980-12-17|800.0|null|    20|
+-----+-----+-----+----+----------+-----+----+------+



In [35]:
#19. List the emps Who Annual sal ranging from 22000 and 45000.

emp.filter((emp.SAL*12).between(22000, 45000)).show()

+-----+-----+-------+----+----------+------+----+------+
|EMPNO|ENAME|    JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+-----+-------+----+----------+------+----+------+
| 7566|JONES|MANAGER|7839|1981-04-02|2975.0|null|    20|
| 7698|BLAKE|MANAGER|7839|1981-05-01|2850.0|null|    30|
| 7782|CLARK|MANAGER|7839|1981-06-09|2450.0|null|    10|
| 7788|SCOTT|ANALYST|7566|1982-12-09|3000.0|null|    20|
| 7902| FORD|ANALYST|7566|1981-12-03|3000.0|null|    20|
+-----+-----+-------+----+----------+------+----+------+



In [36]:
#20. List the Enames those are having five characters in their Names.

emp.filter(length(emp.ENAME) == 5).show()

+-----+-----+--------+----+----------+------+------+------+
|EMPNO|ENAME|     JOB| MGR|  HIREDATE|   SAL|  COMM|DEPTNO|
+-----+-----+--------+----+----------+------+------+------+
| 7369|SMITH|   CLERK|7902|1980-12-17| 800.0|  null|    20|
| 7499|ALLEN|SALESMAN|7698|1981-02-20|1600.0|300.00|    30|
| 7566|JONES| MANAGER|7839|1981-04-02|2975.0|  null|    20|
| 7698|BLAKE| MANAGER|7839|1981-05-01|2850.0|  null|    30|
| 7782|CLARK| MANAGER|7839|1981-06-09|2450.0|  null|    10|
| 7788|SCOTT| ANALYST|7566|1982-12-09|3000.0|  null|    20|
| 7876|ADAMS|   CLERK|7788|1983-01-12|1100.0|  null|    20|
| 7900|JAMES|   CLERK|7698|1981-12-03| 950.0|  null|    30|
+-----+-----+--------+----+----------+------+------+------+



In [37]:
#21. List the Enames those are starting with ‘S’ and with five characters.

emp.filter(length(emp.ENAME) == 5).filter(substring(col('ENAME'),0,1) == 'S').show()

+-----+-----+-------+----+----------+------+----+------+
|EMPNO|ENAME|    JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+-----+-------+----+----------+------+----+------+
| 7369|SMITH|  CLERK|7902|1980-12-17| 800.0|null|    20|
| 7788|SCOTT|ANALYST|7566|1982-12-09|3000.0|null|    20|
+-----+-----+-------+----+----------+------+----+------+



In [38]:
#22. List the emps those are having four chars and third character must be ‘r’.

emp.filter(length('ENAME') == 4).filter(substring(col('ENAME'),3,1) == 'R').show()

+-----+-----+--------+----+----------+------+------+------+
|EMPNO|ENAME|     JOB| MGR|  HIREDATE|   SAL|  COMM|DEPTNO|
+-----+-----+--------+----+----------+------+------+------+
| 7521| WARD|SALESMAN|7698|1981-02-22|1250.0|500.00|    30|
| 7902| FORD| ANALYST|7566|1981-12-03|3000.0|  null|    20|
+-----+-----+--------+----+----------+------+------+------+



In [39]:
# 28. List the emps those who joined in 80’s.

emp.filter(year("HIREDATE") == 1980).show()

+-----+-----+-----+----+----------+-----+----+------+
|EMPNO|ENAME|  JOB| MGR|  HIREDATE|  SAL|COMM|DEPTNO|
+-----+-----+-----+----+----------+-----+----+------+
| 7369|SMITH|CLERK|7902|1980-12-17|800.0|null|    20|
+-----+-----+-----+----+----------+-----+----+------+



In [40]:
# 29. List the emps who does not belong to Deptno 20.

emp.filter("DEPTNO != 20").show()

+-----+------+---------+----+----------+------+-------+------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE|   SAL|   COMM|DEPTNO|
+-----+------+---------+----+----------+------+-------+------+
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600.0| 300.00|    30|
| 7521|  WARD| SALESMAN|7698|1981-02-22|1250.0| 500.00|    30|
| 7654|MARTIN| SALESMAN|7698|1981-09-28|1250.0|1400.00|    30|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850.0|   null|    30|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450.0|   null|    10|
| 7839|  KING|PRESIDENT|null|1981-11-17|5000.0|   null|    10|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500.0|   0.00|    30|
| 7900| JAMES|    CLERK|7698|1981-12-03| 950.0|   null|    30|
| 7934|MILLER|    CLERK|7782|1982-01-23|1300.0|   null|    10|
+-----+------+---------+----+----------+------+-------+------+



In [41]:
# 30. List all the emps except ‘PRESIDENT’ & ‘MGR” in asc order of Salaries.

emp.filter(~emp.JOB.isin('PRESIDENT', 'MGR')).show()

+-----+------+--------+----+----------+------+-------+------+
|EMPNO| ENAME|     JOB| MGR|  HIREDATE|   SAL|   COMM|DEPTNO|
+-----+------+--------+----+----------+------+-------+------+
| 7369| SMITH|   CLERK|7902|1980-12-17| 800.0|   null|    20|
| 7499| ALLEN|SALESMAN|7698|1981-02-20|1600.0| 300.00|    30|
| 7521|  WARD|SALESMAN|7698|1981-02-22|1250.0| 500.00|    30|
| 7566| JONES| MANAGER|7839|1981-04-02|2975.0|   null|    20|
| 7654|MARTIN|SALESMAN|7698|1981-09-28|1250.0|1400.00|    30|
| 7698| BLAKE| MANAGER|7839|1981-05-01|2850.0|   null|    30|
| 7782| CLARK| MANAGER|7839|1981-06-09|2450.0|   null|    10|
| 7788| SCOTT| ANALYST|7566|1982-12-09|3000.0|   null|    20|
| 7844|TURNER|SALESMAN|7698|1981-09-08|1500.0|   0.00|    30|
| 7876| ADAMS|   CLERK|7788|1983-01-12|1100.0|   null|    20|
| 7900| JAMES|   CLERK|7698|1981-12-03| 950.0|   null|    30|
| 7902|  FORD| ANALYST|7566|1981-12-03|3000.0|   null|    20|
| 7934|MILLER|   CLERK|7782|1982-01-23|1300.0|   null|    10|
+-----+-

In [42]:
# 31. List all the emps who joined before or after 1981.

emp.filter(year("HIREDATE") != 1981).show()

+-----+------+-------+----+----------+------+----+------+
|EMPNO| ENAME|    JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+------+-------+----+----------+------+----+------+
| 7369| SMITH|  CLERK|7902|1980-12-17| 800.0|null|    20|
| 7788| SCOTT|ANALYST|7566|1982-12-09|3000.0|null|    20|
| 7876| ADAMS|  CLERK|7788|1983-01-12|1100.0|null|    20|
| 7934|MILLER|  CLERK|7782|1982-01-23|1300.0|null|    10|
+-----+------+-------+----+----------+------+----+------+



In [43]:
# 32. List the emps whose Empno not starting with digit78.

emp.filter(~emp.EMPNO.like('78%')).show()

+-----+------+--------+----+----------+------+-------+------+
|EMPNO| ENAME|     JOB| MGR|  HIREDATE|   SAL|   COMM|DEPTNO|
+-----+------+--------+----+----------+------+-------+------+
| 7369| SMITH|   CLERK|7902|1980-12-17| 800.0|   null|    20|
| 7499| ALLEN|SALESMAN|7698|1981-02-20|1600.0| 300.00|    30|
| 7521|  WARD|SALESMAN|7698|1981-02-22|1250.0| 500.00|    30|
| 7566| JONES| MANAGER|7839|1981-04-02|2975.0|   null|    20|
| 7654|MARTIN|SALESMAN|7698|1981-09-28|1250.0|1400.00|    30|
| 7698| BLAKE| MANAGER|7839|1981-05-01|2850.0|   null|    30|
| 7782| CLARK| MANAGER|7839|1981-06-09|2450.0|   null|    10|
| 7788| SCOTT| ANALYST|7566|1982-12-09|3000.0|   null|    20|
| 7900| JAMES|   CLERK|7698|1981-12-03| 950.0|   null|    30|
| 7902|  FORD| ANALYST|7566|1981-12-03|3000.0|   null|    20|
| 7934|MILLER|   CLERK|7782|1982-01-23|1300.0|   null|    10|
+-----+------+--------+----+----------+------+-------+------+



In [44]:
# 33. List the emps who are working under ‘MGR’.

# select e.ename || ‘ works for ‘ || m.ename from emp e ,emp m where e.mgr =
# m.empno ;

empE = emp
empM = emp

empE.join(empM, empE.EMPNO == empM.MGR) #.select('*').show()

#emp.alias("a").join(b.alias("b"), joinExprs)

#e.mgr = m.empno



DataFrame[EMPNO: int, ENAME: string, JOB: string, MGR: int, HIREDATE: date, SAL: double, COMM: string, DEPTNO: int, EMPNO: int, ENAME: string, JOB: string, MGR: int, HIREDATE: date, SAL: double, COMM: string, DEPTNO: int]

In [45]:
# 34. List the emps who joined in any year but not belongs to the month of March.

emp.filter(month(emp.HIREDATE) != 03).show()


+-----+------+---------+----+----------+------+-------+------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE|   SAL|   COMM|DEPTNO|
+-----+------+---------+----+----------+------+-------+------+
| 7369| SMITH|    CLERK|7902|1980-12-17| 800.0|   null|    20|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600.0| 300.00|    30|
| 7521|  WARD| SALESMAN|7698|1981-02-22|1250.0| 500.00|    30|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975.0|   null|    20|
| 7654|MARTIN| SALESMAN|7698|1981-09-28|1250.0|1400.00|    30|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850.0|   null|    30|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450.0|   null|    10|
| 7788| SCOTT|  ANALYST|7566|1982-12-09|3000.0|   null|    20|
| 7839|  KING|PRESIDENT|null|1981-11-17|5000.0|   null|    10|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500.0|   0.00|    30|
| 7876| ADAMS|    CLERK|7788|1983-01-12|1100.0|   null|    20|
| 7900| JAMES|    CLERK|7698|1981-12-03| 950.0|   null|    30|
| 7902|  FORD|  ANALYST|7566|1981-12-03|3000.0|   null|

In [46]:
# 35. List all the Clerks of Deptno 20

emp.filter("JOB = 'CLERK'").filter(col("DEPTNO") == 20).show()


+-----+-----+-----+----+----------+------+----+------+
|EMPNO|ENAME|  JOB| MGR|  HIREDATE|   SAL|COMM|DEPTNO|
+-----+-----+-----+----+----------+------+----+------+
| 7369|SMITH|CLERK|7902|1980-12-17| 800.0|null|    20|
| 7876|ADAMS|CLERK|7788|1983-01-12|1100.0|null|    20|
+-----+-----+-----+----+----------+------+----+------+

