## Spark code Practice

In [2]:
df = spark.read.format("csv").option('header', 'true').option('inferSchema', True).load('./data/countries.csv')
df.show()

+----------+-------------------+--------------------+------------+----------+----------------+----------+---------+---------+-------------+----------------------+----------------------+
|COUNTRY_ID|               NAME|         NATIONALITY|COUNTRY_CODE|ISO_ALPHA2|         CAPITAL|POPULATION| AREA_KM2|REGION_ID|SUB_REGION_ID|INTERMEDIATE_REGION_ID|ORGANIZATION_REGION_ID|
+----------+-------------------+--------------------+------------+----------+----------------+----------+---------+---------+-------------+----------------------+----------------------+
|         1|        Afghanistan|              Afghan|         AFG|        AF|           Kabul|  38041754| 652230.0|       30|           30|                  NULL|                    30|
|         2|            Albania|            Albanian|         ALB|        AL|          Tirana|   2880917|  28748.0|       20|           70|                  NULL|                    20|
|         3|            Algeria|            Algerian|         DZA|    

## Read Json file

### Read single line json file

In [3]:

df_single_json  = spark.read.format('json').load('./data/countries_single_line.json')
df_single_json.show(10, False)

+---------+----------------+------------+----------+----------------------+----------+-------------------+--------------------+----------------------+----------+---------+-------------+
|AREA_KM2 |CAPITAL         |COUNTRY_CODE|COUNTRY_ID|INTERMEDIATE_REGION_ID|ISO_ALPHA2|NAME               |NATIONALITY         |ORGANIZATION_REGION_ID|POPULATION|REGION_ID|SUB_REGION_ID|
+---------+----------------+------------+----------+----------------------+----------+-------------------+--------------------+----------------------+----------+---------+-------------+
|652230.0 |Kabul           |AFG         |1         |NULL                  |AF        |Afghanistan        |Afghan              |30                    |38041754  |30       |30           |
|28748.0  |Tirana          |ALB         |2         |NULL                  |AL        |Albania            |Albanian            |20                    |2880917   |20       |70           |
|2381741.0|Algiers         |DZA         |3         |NULL              

### Read multiline json

In [4]:
df_multiline_json = spark.read.format('json').option('multiline', 'true').load('./data/countries_multi_line.json')
df_multiline_json.show(10, False)


+---------+----------------+------------+----------+----------------------+----------+-------------------+--------------------+----------------------+----------+---------+-------------+
|AREA_KM2 |CAPITAL         |COUNTRY_CODE|COUNTRY_ID|INTERMEDIATE_REGION_ID|ISO_ALPHA2|NAME               |NATIONALITY         |ORGANIZATION_REGION_ID|POPULATION|REGION_ID|SUB_REGION_ID|
+---------+----------------+------------+----------+----------------------+----------+-------------------+--------------------+----------------------+----------+---------+-------------+
|652230.0 |Kabul           |AFG         |1         |NULL                  |AF        |Afghanistan        |Afghan              |30                    |38041754  |30       |30           |
|28748.0  |Tirana          |ALB         |2         |NULL                  |AL        |Albania            |Albanian            |20                    |2880917   |20       |70           |
|2381741.0|Algiers         |DZA         |3         |NULL              

### Read employee data 

In [5]:
emp_df = spark.read.format('json').option('multiline', 'true').load('./data/employee.json')
emp_df.show(10, False)

+------+--------+----------+------+
|emp_id|emp_name|manager_id|salary|
+------+--------+----------+------+
|1     |Alice   |NULL      |90000 |
|2     |Bob     |1         |75000 |
|3     |Charlie |7         |72000 |
|4     |David   |7         |60000 |
|5     |Eve     |2         |58000 |
|6     |Frank   |3         |62000 |
|7     |Grace   |3         |58000 |
|8     |Hank    |7         |50000 |
|9     |Ivy     |5         |53000 |
|10    |Jack    |5         |51000 |
+------+--------+----------+------+
only showing top 10 rows


### CSV Without Column Names/Schema

In [6]:
ew_data = spark.read.format('csv').load('./data/emp-without_header.csv')
emp_df2 = ew_data.toDF('emp_id', 'emp_name','salary','manager_id')
emp_df2.show()


+------+--------+------+----------+
|emp_id|emp_name|salary|manager_id|
+------+--------+------+----------+
|     1|   Alice| 90000|      NULL|
|     2|     Bob| 75000|         1|
|     3| Charlie| 72000|         1|
|     4|   David| 60000|         2|
|     5|     Eve| 58000|         2|
|     6|   Frank| 62000|         3|
|     7|   Grace| 61000|         3|
|     8|    Hank| 50000|         4|
|     9|     Ivy| 53000|         5|
|    10|    Jack| 51000|         5|
+------+--------+------+----------+



### Find 3rd Highest Salary

In [7]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window

windowSpec = Window.orderBy(col('salary').desc())
emp_df = emp_df.withColumn('sal_rank', rank().over(windowSpec))
emp_df.show()
thrid_highest_salary = emp_df.filter(col('sal_rank') ==3)
thrid_highest_salary.show()







+------+--------+----------+------+--------+
|emp_id|emp_name|manager_id|salary|sal_rank|
+------+--------+----------+------+--------+
|    12|     Leo|         7| 91000|       1|
|     1|   Alice|      NULL| 90000|       2|
|     2|     Bob|         1| 75000|       3|
|     3| Charlie|         7| 72000|       4|
|     6|   Frank|         3| 62000|       5|
|     4|   David|         7| 60000|       6|
|     5|     Eve|         2| 58000|       7|
|     7|   Grace|         3| 58000|       7|
|     9|     Ivy|         5| 53000|       9|
|    10|    Jack|         5| 51000|      10|
|     8|    Hank|         7| 50000|      11|
|    11|   Kathy|         7| 48000|      12|
|    13|     Mia|         7| 45000|      13|
|    14|    Nina|         7| 44000|      14|
+------+--------+----------+------+--------+

+------+--------+----------+------+--------+
|emp_id|emp_name|manager_id|salary|sal_rank|
+------+--------+----------+------+--------+
|     2|     Bob|         1| 75000|       3|
+------+-

### Filter Rows Where Employee Salary > Manager Salary


In [8]:
get_emp_df = emp_df.alias('emp') 
get_mgr_df = emp_df.alias('mgr')

join_df = get_emp_df.join(get_mgr_df, get_mgr_df.emp_id == get_emp_df.manager_id, 'inner')
result_df = join_df.filter( get_mgr_df.salary > get_emp_df.salary )
result_df.show()

+------+--------+----------+------+--------+------+--------+----------+------+--------+
|emp_id|emp_name|manager_id|salary|sal_rank|emp_id|emp_name|manager_id|salary|sal_rank|
+------+--------+----------+------+--------+------+--------+----------+------+--------+
|     2|     Bob|         1| 75000|       3|     1|   Alice|      NULL| 90000|       2|
|     6|   Frank|         3| 62000|       5|     3| Charlie|         7| 72000|       4|
|     5|     Eve|         2| 58000|       7|     2|     Bob|         1| 75000|       3|
|     7|   Grace|         3| 58000|       7|     3| Charlie|         7| 72000|       4|
|     9|     Ivy|         5| 53000|       9|     5|     Eve|         2| 58000|       7|
|    10|    Jack|         5| 51000|      10|     5|     Eve|         2| 58000|       7|
|     8|    Hank|         7| 50000|      11|     7|   Grace|         3| 58000|       7|
|    11|   Kathy|         7| 48000|      12|     7|   Grace|         3| 58000|       7|
|    13|     Mia|         7| 450