In [1]:
from pyspark.sql import SparkSession

## 1.	Load data from local files

In [2]:
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("pySp1")
    .getOrCreate()
)

#### In case of an error when casting to datetime, use the following code

In [3]:
# Switch Spark SQL's time parser policy to LEGACY. Per the heading above, this
# is only needed if converting the date strings further down raises an error.
spark.conf.set("spark.sql.legacy.timeParserPolicy","LEGACY")

In [4]:
# Read the employee CSV, treating its first line as the column names. No schema
# is supplied, so every column is loaded as a string and cast explicitly later.
df = spark.read.option("header", True).csv('D:/DataSets/emp.csv')

## 2.	Display the schema of the DataFrame 

In [5]:
# Capture the DataFrame's column names and display them.
cols_check = df.columns
cols_check

['empno', 'ename', 'job', 'mgr', 'hiredate', 'sal', 'comm', 'deptno']

In [6]:
# List (column, type) pairs; straight after the CSV load every column is a string.
df.dtypes

[('empno', 'string'),
 ('ename', 'string'),
 ('job', 'string'),
 ('mgr', 'string'),
 ('hiredate', 'string'),
 ('sal', 'string'),
 ('comm', 'string'),
 ('deptno', 'string')]

In [7]:
# printSchema() writes the schema tree to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
df.printSchema()

root
 |-- empno: string (nullable = true)
 |-- ename: string (nullable = true)
 |-- job: string (nullable = true)
 |-- mgr: string (nullable = true)
 |-- hiredate: string (nullable = true)
 |-- sal: string (nullable = true)
 |-- comm: string (nullable = true)
 |-- deptno: string (nullable = true)

None


In [8]:
# Preview the first five rows of the raw (all-string) DataFrame.
df.show(n=5)

+-----+------+---------+----+----------+----+----+------+
|empno| ename|      job| mgr|  hiredate| sal|comm|deptno|
+-----+------+---------+----+----------+----+----+------+
| 7839|  KING|PRESIDENT|null|11/17/2001|5000|null|    10|
| 7698| BLAKE|  MANAGER|7839|05/01/2001|2850|null|    30|
| 7782| CLARK|  MANAGER|7839|06/09/2001|2450|null|    10|
| 7566| JONES|  MANAGER|7839|04/02/2001|2975|null|    20|
| 7654|MARTIN| SALESMAN|7698|09/28/2001|1250|1400|    30|
+-----+------+---------+----+----------+----+----+------+
only showing top 5 rows



## 3. Data Type Conversion

In [9]:
from pyspark.sql.types import *
from pyspark.sql.functions import to_date, unix_timestamp, from_unixtime

In [10]:
# Convert the numeric columns from string to their proper types:
# identifiers become integers, monetary amounts become floats.
df = (
    df.withColumn("empno", df["empno"].cast("integer"))
      .withColumn("mgr", df["mgr"].cast("integer"))
      .withColumn("sal", df["sal"].cast("float"))
      .withColumn("comm", df["comm"].cast("float"))
      .withColumn("deptno", df["deptno"].cast("integer"))
)

In [11]:
# Parse the 'MM/dd/yyyy' strings straight into a DateType column. to_date()
# accepts the format directly, so there is no need to go through
# unix_timestamp() and a timestamp cast first.
df = df.withColumn('hiredate', to_date(df.hiredate, 'MM/dd/yyyy'))

In [12]:
# Confirm the conversions: the numeric columns and hiredate are no longer strings.
df.dtypes

[('empno', 'int'),
 ('ename', 'string'),
 ('job', 'string'),
 ('mgr', 'int'),
 ('hiredate', 'date'),
 ('sal', 'float'),
 ('comm', 'float'),
 ('deptno', 'int')]

## 4. Show the head of the DataFrame 

In [13]:
# Print the first five rows of the typed DataFrame.
df.show(n=5)

+-----+------+---------+----+----------+------+------+------+
|empno| ename|      job| mgr|  hiredate|   sal|  comm|deptno|
+-----+------+---------+----+----------+------+------+------+
| 7839|  KING|PRESIDENT|null|2001-11-17|5000.0|  null|    10|
| 7698| BLAKE|  MANAGER|7839|2001-05-01|2850.0|  null|    30|
| 7782| CLARK|  MANAGER|7839|2001-06-09|2450.0|  null|    10|
| 7566| JONES|  MANAGER|7839|2001-04-02|2975.0|  null|    20|
| 7654|MARTIN| SALESMAN|7698|2001-09-28|1250.0|1400.0|    30|
+-----+------+---------+----+----------+------+------+------+
only showing top 5 rows



In [14]:
# take(5) hands back the first five rows as a list of Row objects rather than
# printing a formatted table.
df.take(5)

[Row(empno=7839, ename='KING', job='PRESIDENT', mgr=None, hiredate=datetime.date(2001, 11, 17), sal=5000.0, comm=None, deptno=10),
 Row(empno=7698, ename='BLAKE', job='MANAGER', mgr=7839, hiredate=datetime.date(2001, 5, 1), sal=2850.0, comm=None, deptno=30),
 Row(empno=7782, ename='CLARK', job='MANAGER', mgr=7839, hiredate=datetime.date(2001, 6, 9), sal=2450.0, comm=None, deptno=10),
 Row(empno=7566, ename='JONES', job='MANAGER', mgr=7839, hiredate=datetime.date(2001, 4, 2), sal=2975.0, comm=None, deptno=20),
 Row(empno=7654, ename='MARTIN', job='SALESMAN', mgr=7698, hiredate=datetime.date(2001, 9, 28), sal=1250.0, comm=1400.0, deptno=30)]

In PySpark, take() and show() are different: show() prints the rows, whereas take() returns them as a list of Row objects, which can be used to create a new DataFrame. Both are actions.

## 5.	Select Columns from the DataFrame

In [15]:
# Project just the name and salary columns and preview five rows.
df.select("ename", "sal").show(n=5)

+------+------+
| ename|   sal|
+------+------+
|  KING|5000.0|
| BLAKE|2850.0|
| CLARK|2450.0|
| JONES|2975.0|
|MARTIN|1250.0|
+------+------+
only showing top 5 rows



#### Alternative

```python
df1 = df.select('ename','sal')
df1.show(5)
```

## 6.	Show the Statistics of the DataFrame

In [16]:
# Summary statistics (count, mean, stddev, min, max) for ename and sal only.
df.describe('ename', 'sal').show()

+-------+-----+------------------+
|summary|ename|               sal|
+-------+-----+------------------+
|  count|   14|                14|
|   mean| null| 2073.214285714286|
| stddev| null|1182.5032235162716|
|    min|ADAMS|             800.0|
|    max| WARD|            5000.0|
+-------+-----+------------------+



In [17]:
# For a string column such as job, mean and stddev come back as null;
# min and max are the alphabetically first and last values.
df.describe('job').show() # show the stats for a specific column

+-------+--------+
|summary|     job|
+-------+--------+
|  count|      14|
|   mean|    null|
| stddev|    null|
|    min| ANALYST|
|    max|SALESMAN|
+-------+--------+



## 7. Drop Duplicates 

In [18]:
# Show the job column as-is, including its repeated values.
df.select('job').show()

+---------+
|      job|
+---------+
|PRESIDENT|
|  MANAGER|
|  MANAGER|
|  MANAGER|
| SALESMAN|
| SALESMAN|
| SALESMAN|
|    CLERK|
| SALESMAN|
|  ANALYST|
|    CLERK|
|  ANALYST|
|    CLERK|
|    CLERK|
+---------+



In [19]:
# distinct() keeps one row per unique job title.
df.select('job').distinct().show()

+---------+
|      job|
+---------+
|  ANALYST|
| SALESMAN|
|    CLERK|
|  MANAGER|
|PRESIDENT|
+---------+



In [20]:
# dropDuplicates() on the single-column projection lists the same job titles
# as the distinct() call in the previous cell.
df.select('job').dropDuplicates().show(5)

+---------+
|      job|
+---------+
|  ANALYST|
| SALESMAN|
|    CLERK|
|  MANAGER|
|PRESIDENT|
+---------+



In [21]:
# Keep the de-duplicated job titles as their own DataFrame.
df_job = df.select('job').dropDuplicates()

In [22]:
# Display the unique job titles.
df_job.show()

+---------+
|      job|
+---------+
|  ANALYST|
| SALESMAN|
|    CLERK|
|  MANAGER|
|PRESIDENT|
+---------+



## 8. Missing Values 

### 8.1 Check NA

In [23]:
from pyspark.sql.functions import isnull, when, count, col

In [24]:
# Count the missing values per column. when() produces a non-null marker (the
# column name as a literal) only for null cells and null otherwise, and count()
# skips nulls, so each result is that column's number of nulls.
df.select(
    [count(when(col(name).isNull(), name)).alias(name) for name in df.columns]
).show()

+-----+-----+---+---+--------+---+----+------+
|empno|ename|job|mgr|hiredate|sal|comm|deptno|
+-----+-----+---+---+--------+---+----+------+
|    0|    0|  0|  1|       0|  0|  10|     0|
+-----+-----+---+---+--------+---+----+------+



`pyspark.sql.functions.isnan(col)`: an expression that returns true iff the column value is NaN (not-a-number).
<br>
`isNull()`: true if the current expression is null. NaN and null are different things; the check above counts nulls only.

### 8.2 Drop NA

In [25]:
# how='any': discard a row when either comm or mgr is null.
df2 = df.na.drop(how='any', subset=['comm', 'mgr'])
# Row count after the drop, then the original row count for comparison.
print(df2.count())
print(df.count())

4
14


In [26]:
# how='all': discard a row only when both comm and mgr are null.
df02 = df.na.drop(how='all', subset=['comm', 'mgr'])
# Row count after the drop, then the original row count for comparison.
print(df02.count())
print(df.count())

13
14


In [27]:
# The source DataFrame is unchanged: dropna() returned new DataFrames.
df.show()

+-----+------+---------+----+----------+------+------+------+
|empno| ename|      job| mgr|  hiredate|   sal|  comm|deptno|
+-----+------+---------+----+----------+------+------+------+
| 7839|  KING|PRESIDENT|null|2001-11-17|5000.0|  null|    10|
| 7698| BLAKE|  MANAGER|7839|2001-05-01|2850.0|  null|    30|
| 7782| CLARK|  MANAGER|7839|2001-06-09|2450.0|  null|    10|
| 7566| JONES|  MANAGER|7839|2001-04-02|2975.0|  null|    20|
| 7654|MARTIN| SALESMAN|7698|2001-09-28|1250.0|1400.0|    30|
| 7499| ALLEN| SALESMAN|7698|2001-02-20|1600.0| 300.0|    30|
| 7844|TURNER| SALESMAN|7698|2001-09-08|1500.0|   0.0|    30|
| 7900| JAMES|    CLERK|7698|2001-12-03| 950.0|  null|    30|
| 7521|  WARD| SALESMAN|7698|2001-02-22|1250.0| 500.0|    30|
| 7902|  FORD|  ANALYST|7566|2001-02-03|3000.0|  null|    20|
| 7369| SMITH|    CLERK|7902|2000-12-17| 800.0|  null|    20|
| 7788| SCOTT|  ANALYST|7566|2007-04-19|3000.0|  null|    20|
| 7876| ADAMS|    CLERK|7788|2007-05-23|1100.0|  null|    20|
| 7934|M

In [28]:
# Result of the how='all' drop: the one row with both comm and mgr null is gone.
df02.show()

+-----+------+--------+----+----------+------+------+------+
|empno| ename|     job| mgr|  hiredate|   sal|  comm|deptno|
+-----+------+--------+----+----------+------+------+------+
| 7698| BLAKE| MANAGER|7839|2001-05-01|2850.0|  null|    30|
| 7782| CLARK| MANAGER|7839|2001-06-09|2450.0|  null|    10|
| 7566| JONES| MANAGER|7839|2001-04-02|2975.0|  null|    20|
| 7654|MARTIN|SALESMAN|7698|2001-09-28|1250.0|1400.0|    30|
| 7499| ALLEN|SALESMAN|7698|2001-02-20|1600.0| 300.0|    30|
| 7844|TURNER|SALESMAN|7698|2001-09-08|1500.0|   0.0|    30|
| 7900| JAMES|   CLERK|7698|2001-12-03| 950.0|  null|    30|
| 7521|  WARD|SALESMAN|7698|2001-02-22|1250.0| 500.0|    30|
| 7902|  FORD| ANALYST|7566|2001-02-03|3000.0|  null|    20|
| 7369| SMITH|   CLERK|7902|2000-12-17| 800.0|  null|    20|
| 7788| SCOTT| ANALYST|7566|2007-04-19|3000.0|  null|    20|
| 7876| ADAMS|   CLERK|7788|2007-05-23|1100.0|  null|    20|
| 7934|MILLER|   CLERK|7782|2002-01-23|1300.0|  null|    10|
+-----+------+--------+-

### 8.3 Replace NA

In [29]:
# Replace missing commissions and manager ids with 0 (per-column fill values).
df02 = df.na.fill({'comm': 0, 'mgr': 0})

In [30]:
# Display the zero-filled DataFrame.
df02.show()

+-----+------+---------+----+----------+------+------+------+
|empno| ename|      job| mgr|  hiredate|   sal|  comm|deptno|
+-----+------+---------+----+----------+------+------+------+
| 7839|  KING|PRESIDENT|   0|2001-11-17|5000.0|   0.0|    10|
| 7698| BLAKE|  MANAGER|7839|2001-05-01|2850.0|   0.0|    30|
| 7782| CLARK|  MANAGER|7839|2001-06-09|2450.0|   0.0|    10|
| 7566| JONES|  MANAGER|7839|2001-04-02|2975.0|   0.0|    20|
| 7654|MARTIN| SALESMAN|7698|2001-09-28|1250.0|1400.0|    30|
| 7499| ALLEN| SALESMAN|7698|2001-02-20|1600.0| 300.0|    30|
| 7844|TURNER| SALESMAN|7698|2001-09-08|1500.0|   0.0|    30|
| 7900| JAMES|    CLERK|7698|2001-12-03| 950.0|   0.0|    30|
| 7521|  WARD| SALESMAN|7698|2001-02-22|1250.0| 500.0|    30|
| 7902|  FORD|  ANALYST|7566|2001-02-03|3000.0|   0.0|    20|
| 7369| SMITH|    CLERK|7902|2000-12-17| 800.0|   0.0|    20|
| 7788| SCOTT|  ANALYST|7566|2007-04-19|3000.0|   0.0|    20|
| 7876| ADAMS|    CLERK|7788|2007-05-23|1100.0|   0.0|    20|
| 7934|M

In [31]:
# NOTE(review): this repeats the how='all' dropna from section 8.2 and
# overwrites the zero-filled df02 created just above.
df02 = df.dropna(how='all', subset =['comm', 'mgr'])

In [32]:
# Display df02 after the repeated how='all' drop.
df02.show()

+-----+------+--------+----+----------+------+------+------+
|empno| ename|     job| mgr|  hiredate|   sal|  comm|deptno|
+-----+------+--------+----+----------+------+------+------+
| 7698| BLAKE| MANAGER|7839|2001-05-01|2850.0|  null|    30|
| 7782| CLARK| MANAGER|7839|2001-06-09|2450.0|  null|    10|
| 7566| JONES| MANAGER|7839|2001-04-02|2975.0|  null|    20|
| 7654|MARTIN|SALESMAN|7698|2001-09-28|1250.0|1400.0|    30|
| 7499| ALLEN|SALESMAN|7698|2001-02-20|1600.0| 300.0|    30|
| 7844|TURNER|SALESMAN|7698|2001-09-08|1500.0|   0.0|    30|
| 7900| JAMES|   CLERK|7698|2001-12-03| 950.0|  null|    30|
| 7521|  WARD|SALESMAN|7698|2001-02-22|1250.0| 500.0|    30|
| 7902|  FORD| ANALYST|7566|2001-02-03|3000.0|  null|    20|
| 7369| SMITH|   CLERK|7902|2000-12-17| 800.0|  null|    20|
| 7788| SCOTT| ANALYST|7566|2007-04-19|3000.0|  null|    20|
| 7876| ADAMS|   CLERK|7788|2007-05-23|1100.0|  null|    20|
| 7934|MILLER|   CLERK|7782|2002-01-23|1300.0|  null|    10|
+-----+------+--------+-

In [33]:
from pyspark.sql.functions import mean

In [34]:
# Mean commission (null commissions are ignored by the aggregate), brought back
# to the driver as a plain Python number.
avg = df.agg(mean(df.comm)).first()[0]
print(avg)

550.0


In [35]:
# Impute missing commissions with the mean computed in the previous cell.
df02 = df.na.fill({'comm': avg})
df02.show()

+-----+------+---------+----+----------+------+------+------+
|empno| ename|      job| mgr|  hiredate|   sal|  comm|deptno|
+-----+------+---------+----+----------+------+------+------+
| 7839|  KING|PRESIDENT|null|2001-11-17|5000.0| 550.0|    10|
| 7698| BLAKE|  MANAGER|7839|2001-05-01|2850.0| 550.0|    30|
| 7782| CLARK|  MANAGER|7839|2001-06-09|2450.0| 550.0|    10|
| 7566| JONES|  MANAGER|7839|2001-04-02|2975.0| 550.0|    20|
| 7654|MARTIN| SALESMAN|7698|2001-09-28|1250.0|1400.0|    30|
| 7499| ALLEN| SALESMAN|7698|2001-02-20|1600.0| 300.0|    30|
| 7844|TURNER| SALESMAN|7698|2001-09-08|1500.0|   0.0|    30|
| 7900| JAMES|    CLERK|7698|2001-12-03| 950.0| 550.0|    30|
| 7521|  WARD| SALESMAN|7698|2001-02-22|1250.0| 500.0|    30|
| 7902|  FORD|  ANALYST|7566|2001-02-03|3000.0| 550.0|    20|
| 7369| SMITH|    CLERK|7902|2000-12-17| 800.0| 550.0|    20|
| 7788| SCOTT|  ANALYST|7566|2007-04-19|3000.0| 550.0|    20|
| 7876| ADAMS|    CLERK|7788|2007-05-23|1100.0| 550.0|    20|
| 7934|M

## 9. Datetime Manipulations

In [36]:
from pyspark.sql.functions import year, month, dayofmonth, dayofyear, weekofyear, hour, minute                

In [37]:
# Break hiredate into its calendar parts. hiredate is a date (no time of day),
# so the hour and minute components come out as 0.
df5 = df.select(
    'hiredate',
    year('hiredate').alias('dt_year'),
    month('hiredate').alias('dt_month'),
    dayofmonth('hiredate').alias('dt_day'),
    dayofyear('hiredate').alias('dt_dayofy'),
    hour('hiredate').alias('dt_hour'),
    minute('hiredate').alias('dt_min'),
    weekofyear('hiredate').alias('dt_week_no'),
    unix_timestamp('hiredate').alias('dt_int'),  # seconds since the Unix epoch
)
df5.show()

+----------+-------+--------+------+---------+-------+------+----------+----------+
|  hiredate|dt_year|dt_month|dt_day|dt_dayofy|dt_hour|dt_min|dt_week_no|    dt_int|
+----------+-------+--------+------+---------+-------+------+----------+----------+
|2001-11-17|   2001|      11|    17|      321|      0|     0|        46|1005937200|
|2001-05-01|   2001|       5|     1|      121|      0|     0|        18| 988657200|
|2001-06-09|   2001|       6|     9|      160|      0|     0|        23| 992026800|
|2001-04-02|   2001|       4|     2|       92|      0|     0|        14| 986151600|
|2001-09-28|   2001|       9|    28|      271|      0|     0|        39|1001617200|
|2001-02-20|   2001|       2|    20|       51|      0|     0|         8| 982609200|
|2001-09-08|   2001|       9|     8|      251|      0|     0|        36| 999889200|
|2001-12-03|   2001|      12|     3|      337|      0|     0|        49|1007319600|
|2001-02-22|   2001|       2|    22|       53|      0|     0|         8| 982

## 10. 	Filter Data Based on Conditions 

In [38]:
# Keep employees in department 10 or 20 (boolean OR of two column predicates).
df.filter((df["deptno"] == 10) | (df["deptno"] == 20)).show(n=5)

+-----+-----+---------+----+----------+------+----+------+
|empno|ename|      job| mgr|  hiredate|   sal|comm|deptno|
+-----+-----+---------+----+----------+------+----+------+
| 7839| KING|PRESIDENT|null|2001-11-17|5000.0|null|    10|
| 7782|CLARK|  MANAGER|7839|2001-06-09|2450.0|null|    10|
| 7566|JONES|  MANAGER|7839|2001-04-02|2975.0|null|    20|
| 7902| FORD|  ANALYST|7566|2001-02-03|3000.0|null|    20|
| 7369|SMITH|    CLERK|7902|2000-12-17| 800.0|null|    20|
+-----+-----+---------+----+----------+------+----+------+
only showing top 5 rows



In [39]:
# Same filter expressed as a membership test.
df.filter(df.deptno.isin(10, 20)).show(n=5)

+-----+-----+---------+----+----------+------+----+------+
|empno|ename|      job| mgr|  hiredate|   sal|comm|deptno|
+-----+-----+---------+----+----------+------+----+------+
| 7839| KING|PRESIDENT|null|2001-11-17|5000.0|null|    10|
| 7782|CLARK|  MANAGER|7839|2001-06-09|2450.0|null|    10|
| 7566|JONES|  MANAGER|7839|2001-04-02|2975.0|null|    20|
| 7902| FORD|  ANALYST|7566|2001-02-03|3000.0|null|    20|
| 7369|SMITH|    CLERK|7902|2000-12-17| 800.0|null|    20|
+-----+-----+---------+----+----------+------+----+------+
only showing top 5 rows



In [40]:
# Project three columns, then keep only departments 10 and 20.
(
    df.select('empno', 'ename', 'deptno')
      .where((df.deptno == 10) | (df.deptno == 20))
      .show()
)

+-----+------+------+
|empno| ename|deptno|
+-----+------+------+
| 7839|  KING|    10|
| 7782| CLARK|    10|
| 7566| JONES|    20|
| 7902|  FORD|    20|
| 7369| SMITH|    20|
| 7788| SCOTT|    20|
| 7876| ADAMS|    20|
| 7934|MILLER|    10|
+-----+------+------+



In [41]:
# Employees earning more than 2000 who are not analysts (boolean AND).
(
    df.select('empno', 'ename', 'job', 'sal', 'deptno')
      .where((df.sal > 2000) & (df.job != 'ANALYST'))
      .show()
)

+-----+-----+---------+------+------+
|empno|ename|      job|   sal|deptno|
+-----+-----+---------+------+------+
| 7839| KING|PRESIDENT|5000.0|    10|
| 7698|BLAKE|  MANAGER|2850.0|    30|
| 7782|CLARK|  MANAGER|2450.0|    10|
| 7566|JONES|  MANAGER|2975.0|    20|
+-----+-----+---------+------+------+



## 11.	Group By with Aggregation Functions

Common aggregation functions for both pandas and PySpark include: sum(), count(), mean(), min(), max().

In [42]:
import pyspark.sql.functions as F 

In [43]:
# Total salary per department. SQL equivalent:
#   SELECT deptno, SUM(sal) FROM emp GROUP BY deptno
# Other aggregates: F.mean(), F.max(), F.countDistinct(), F.min(), F.count()
df.groupBy('deptno').agg(F.sum('sal')).show()

+------+--------+
|deptno|sum(sal)|
+------+--------+
|    20| 10875.0|
|    10|  8750.0|
|    30|  9400.0|
+------+--------+



In [44]:
# Number of different job titles within each department.
df.groupBy('deptno').agg(F.countDistinct('job')).show()

+------+----------+
|deptno|count(job)|
+------+----------+
|    20|         3|
|    10|         3|
|    30|         3|
+------+----------+



In [45]:
# Number of (non-null) job entries within each department.
df.groupBy('deptno').agg(F.count('job')).show()

+------+----------+
|deptno|count(job)|
+------+----------+
|    20|         5|
|    10|         3|
|    30|         6|
+------+----------+



In [46]:
# Group on two keys and request one aggregate per column via a
# {column: function} mapping.
df.groupBy('deptno', 'job').agg({'sal': 'sum', 'empno': 'count'}).show()

+------+---------+------------+--------+
|deptno|      job|count(empno)|sum(sal)|
+------+---------+------------+--------+
|    20|  ANALYST|           2|  6000.0|
|    20|  MANAGER|           1|  2975.0|
|    30|  MANAGER|           1|  2850.0|
|    30| SALESMAN|           4|  5600.0|
|    30|    CLERK|           1|   950.0|
|    10|PRESIDENT|           1|  5000.0|
|    20|    CLERK|           2|  1900.0|
|    10|    CLERK|           1|  1300.0|
|    10|  MANAGER|           1|  2450.0|
+------+---------+------------+--------+



It is hard to compare the aggregation results because the pandas DataFrame and the PySpark DataFrame come back in different row orders. The following shows how we can sort a DataFrame by specific columns.

## 12. Sort Data

In pandas, we use sort_values(), while we use sort() in pyspark to sort the data frame based on specific columns. The default sorting order is ascending.  

In [47]:
# Same aggregation as before, now ordered by department and then job
# (ascending, which is the default) so the rows come out in a stable order.
df_agg = (
    df.groupBy('deptno', 'job')
      .agg({'sal': 'sum', 'empno': 'count'})
      .orderBy('deptno', 'job')
)
df_agg.show()

+------+---------+------------+--------+
|deptno|      job|count(empno)|sum(sal)|
+------+---------+------------+--------+
|    10|    CLERK|           1|  1300.0|
|    10|  MANAGER|           1|  2450.0|
|    10|PRESIDENT|           1|  5000.0|
|    20|  ANALYST|           2|  6000.0|
|    20|    CLERK|           2|  1900.0|
|    20|  MANAGER|           1|  2975.0|
|    30|    CLERK|           1|   950.0|
|    30|  MANAGER|           1|  2850.0|
|    30| SALESMAN|           4|  5600.0|
+------+---------+------------+--------+



## 13. Rename Columns 

After aggregation, the generated column names (for example `count(empno)` and `sum(sal)`) are awkward to work with. We need to rename these columns to avoid confusion. The following shows how we can rename columns in pandas and PySpark DataFrames.

In [48]:
# Give the auto-generated aggregate columns readable names.
df_agg = (
    df_agg
    .withColumnRenamed("count(empno)", "empCount")
    .withColumnRenamed("sum(sal)", "salSum")
)
df_agg.show()

+------+---------+--------+------+
|deptno|      job|empCount|salSum|
+------+---------+--------+------+
|    10|    CLERK|       1|1300.0|
|    10|  MANAGER|       1|2450.0|
|    10|PRESIDENT|       1|5000.0|
|    20|  ANALYST|       2|6000.0|
|    20|    CLERK|       2|1900.0|
|    20|  MANAGER|       1|2975.0|
|    30|    CLERK|       1| 950.0|
|    30|  MANAGER|       1|2850.0|
|    30| SALESMAN|       4|5600.0|
+------+---------+--------+------+



## 14. Create a New Column 

In [49]:
# Derive the yearly salary from the monthly figure as a new column.
df3 = df.withColumn('AnnualSal', df['sal'] * 12)
df3.show()

+-----+------+---------+----+----------+------+------+------+---------+
|empno| ename|      job| mgr|  hiredate|   sal|  comm|deptno|AnnualSal|
+-----+------+---------+----+----------+------+------+------+---------+
| 7839|  KING|PRESIDENT|null|2001-11-17|5000.0|  null|    10|  60000.0|
| 7698| BLAKE|  MANAGER|7839|2001-05-01|2850.0|  null|    30|  34200.0|
| 7782| CLARK|  MANAGER|7839|2001-06-09|2450.0|  null|    10|  29400.0|
| 7566| JONES|  MANAGER|7839|2001-04-02|2975.0|  null|    20|  35700.0|
| 7654|MARTIN| SALESMAN|7698|2001-09-28|1250.0|1400.0|    30|  15000.0|
| 7499| ALLEN| SALESMAN|7698|2001-02-20|1600.0| 300.0|    30|  19200.0|
| 7844|TURNER| SALESMAN|7698|2001-09-08|1500.0|   0.0|    30|  18000.0|
| 7900| JAMES|    CLERK|7698|2001-12-03| 950.0|  null|    30|  11400.0|
| 7521|  WARD| SALESMAN|7698|2001-02-22|1250.0| 500.0|    30|  15000.0|
| 7902|  FORD|  ANALYST|7566|2001-02-03|3000.0|  null|    20|  36000.0|
| 7369| SMITH|    CLERK|7902|2000-12-17| 800.0|  null|    20|   

## 15. Join Tables

In [50]:
# Left side: the typed employee DataFrame. Right side: the department lookup
# table, read with a header row and no explicit schema.
TableA = df
TableB = spark.read.option("header", True).csv('D:/DataSets/dept.csv')

In [51]:
# Rename the department key so it does not clash with the employees' deptno
# column after a join, and cast it to integer. The CSV is read without a schema,
# so the key arrives as a string while the employees' deptno was cast to
# integer; matching the types keeps the join condition from depending on an
# implicit string-to-integer cast.
TableB = TableB.withColumnRenamed("deptno", "dno")
TableB = TableB.withColumn("dno", TableB["dno"].cast("integer"))

In [52]:
# Aliased views of the two tables ('e' for employees, 'd' for departments)
# used by the join cells below.
ta = TableA.alias('e') 
tb = TableB.alias('d')

In [53]:
# Inner join (the default join type) of departments with their employees.
# Other `how` values include 'left', 'right', 'right_outer' and 'full'.
tb.join(ta, on=ta.deptno == tb.dno, how='inner').show()

+---+----------+--------+-----+------+---------+----+----------+------+------+------+
|dno|     dname|     loc|empno| ename|      job| mgr|  hiredate|   sal|  comm|deptno|
+---+----------+--------+-----+------+---------+----+----------+------+------+------+
| 10|ACCOUNTING|NEW YORK| 7839|  KING|PRESIDENT|null|2001-11-17|5000.0|  null|    10|
| 30|     SALES| CHICAGO| 7698| BLAKE|  MANAGER|7839|2001-05-01|2850.0|  null|    30|
| 10|ACCOUNTING|NEW YORK| 7782| CLARK|  MANAGER|7839|2001-06-09|2450.0|  null|    10|
| 20|  RESEARCH|  DALLAS| 7566| JONES|  MANAGER|7839|2001-04-02|2975.0|  null|    20|
| 30|     SALES| CHICAGO| 7654|MARTIN| SALESMAN|7698|2001-09-28|1250.0|1400.0|    30|
| 30|     SALES| CHICAGO| 7499| ALLEN| SALESMAN|7698|2001-02-20|1600.0| 300.0|    30|
| 30|     SALES| CHICAGO| 7844|TURNER| SALESMAN|7698|2001-09-08|1500.0|   0.0|    30|
| 30|     SALES| CHICAGO| 7900| JAMES|    CLERK|7698|2001-12-03| 950.0|  null|    30|
| 30|     SALES| CHICAGO| 7521|  WARD| SALESMAN|7698|2

In [54]:
# Department 40 appears here but has no matching employees, which is what makes
# the outer joins below differ from the inner join.
TableB.show()

+---+----------+--------+
|dno|     dname|     loc|
+---+----------+--------+
| 10|ACCOUNTING|NEW YORK|
| 20|  RESEARCH|  DALLAS|
| 30|     SALES| CHICAGO|
| 40|OPERATIONS|  BOSTON|
+---+----------+--------+



In [55]:
# Left join: every department is kept, even one without any employees.
tb.join(ta, on=ta.deptno == tb.dno, how='left').show()

+---+----------+--------+-----+------+---------+----+----------+------+------+------+
|dno|     dname|     loc|empno| ename|      job| mgr|  hiredate|   sal|  comm|deptno|
+---+----------+--------+-----+------+---------+----+----------+------+------+------+
| 10|ACCOUNTING|NEW YORK| 7934|MILLER|    CLERK|7782|2002-01-23|1300.0|  null|    10|
| 10|ACCOUNTING|NEW YORK| 7782| CLARK|  MANAGER|7839|2001-06-09|2450.0|  null|    10|
| 10|ACCOUNTING|NEW YORK| 7839|  KING|PRESIDENT|null|2001-11-17|5000.0|  null|    10|
| 20|  RESEARCH|  DALLAS| 7876| ADAMS|    CLERK|7788|2007-05-23|1100.0|  null|    20|
| 20|  RESEARCH|  DALLAS| 7788| SCOTT|  ANALYST|7566|2007-04-19|3000.0|  null|    20|
| 20|  RESEARCH|  DALLAS| 7369| SMITH|    CLERK|7902|2000-12-17| 800.0|  null|    20|
| 20|  RESEARCH|  DALLAS| 7902|  FORD|  ANALYST|7566|2001-02-03|3000.0|  null|    20|
| 20|  RESEARCH|  DALLAS| 7566| JONES|  MANAGER|7839|2001-04-02|2975.0|  null|    20|
| 30|     SALES| CHICAGO| 7521|  WARD| SALESMAN|7698|2

In [56]:
# Right join with the tables swapped: still keeps every department, but the
# employee columns now come first.
ta.join(tb, on=ta.deptno == tb.dno, how='right').show()

+-----+------+---------+----+----------+------+------+------+---+----------+--------+
|empno| ename|      job| mgr|  hiredate|   sal|  comm|deptno|dno|     dname|     loc|
+-----+------+---------+----+----------+------+------+------+---+----------+--------+
| 7934|MILLER|    CLERK|7782|2002-01-23|1300.0|  null|    10| 10|ACCOUNTING|NEW YORK|
| 7782| CLARK|  MANAGER|7839|2001-06-09|2450.0|  null|    10| 10|ACCOUNTING|NEW YORK|
| 7839|  KING|PRESIDENT|null|2001-11-17|5000.0|  null|    10| 10|ACCOUNTING|NEW YORK|
| 7876| ADAMS|    CLERK|7788|2007-05-23|1100.0|  null|    20| 20|  RESEARCH|  DALLAS|
| 7788| SCOTT|  ANALYST|7566|2007-04-19|3000.0|  null|    20| 20|  RESEARCH|  DALLAS|
| 7369| SMITH|    CLERK|7902|2000-12-17| 800.0|  null|    20| 20|  RESEARCH|  DALLAS|
| 7902|  FORD|  ANALYST|7566|2001-02-03|3000.0|  null|    20| 20|  RESEARCH|  DALLAS|
| 7566| JONES|  MANAGER|7839|2001-04-02|2975.0|  null|    20| 20|  RESEARCH|  DALLAS|
| 7521|  WARD| SALESMAN|7698|2001-02-22|1250.0| 500.0|

In [57]:
# Full outer join: unmatched rows from either side are kept, with nulls filling
# the columns of the missing side.
ta.join(tb, on=ta.deptno == tb.dno, how='full').show()

+-----+------+---------+----+----------+------+------+------+---+----------+--------+
|empno| ename|      job| mgr|  hiredate|   sal|  comm|deptno|dno|     dname|     loc|
+-----+------+---------+----+----------+------+------+------+---+----------+--------+
| 7566| JONES|  MANAGER|7839|2001-04-02|2975.0|  null|    20| 20|  RESEARCH|  DALLAS|
| 7902|  FORD|  ANALYST|7566|2001-02-03|3000.0|  null|    20| 20|  RESEARCH|  DALLAS|
| 7369| SMITH|    CLERK|7902|2000-12-17| 800.0|  null|    20| 20|  RESEARCH|  DALLAS|
| 7788| SCOTT|  ANALYST|7566|2007-04-19|3000.0|  null|    20| 20|  RESEARCH|  DALLAS|
| 7876| ADAMS|    CLERK|7788|2007-05-23|1100.0|  null|    20| 20|  RESEARCH|  DALLAS|
| null|  null|     null|null|      null|  null|  null|  null| 40|OPERATIONS|  BOSTON|
| 7839|  KING|PRESIDENT|null|2001-11-17|5000.0|  null|    10| 10|ACCOUNTING|NEW YORK|
| 7782| CLARK|  MANAGER|7839|2001-06-09|2450.0|  null|    10| 10|ACCOUNTING|NEW YORK|
| 7934|MILLER|    CLERK|7782|2002-01-23|1300.0|  null|