In [1]:
from pyspark.sql import SparkSession

import getpass
# The current OS user name is used for both the warehouse path and the app name.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled session that runs on YARN.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .appName(f'{username} | Python - Basic Transformations')
    .master('yarn')
    .getOrCreate()
)

In [2]:
from pyspark.sql.functions import *
from pyspark.sql import Window

___How to get the second-highest salary per department without using window functions and without using native SQL (spark.sql(...)). Get the result through the DataFrame API in Spark.___

In [3]:
# Sample rows as (DeptID, EmpName, Salary) tuples: two employees per department.
data = [
    ("Engg", "Sam", 1000),
    ("Engg", "Smith", 2000),
    ("HR", "Denis", 1500),
    ("HR", "Danny", 3000),
    ("IT", "David", 2000),
    ("IT", "John", 3000),
]

In [4]:
# Turn the sample tuples into a DataFrame with named columns.
sal_df = spark.createDataFrame(
    data,
    schema=["DeptID", "EmpName", "Salary"],
)

In [5]:
# Display the sample rows to confirm the DataFrame was built as expected.
sal_df.show()

+------+-------+------+
|DeptID|EmpName|Salary|
+------+-------+------+
|  Engg|    Sam|  1000|
|  Engg|  Smith|  2000|
|    HR|  Denis|  1500|
|    HR|  Danny|  3000|
|    IT|  David|  2000|
|    IT|   John|  3000|
+------+-------+------+



___Using window function___

In [14]:
# One window per department, highest salary first.
win_spec = (
    Window
    .partitionBy("DeptID")
    .orderBy(desc("Salary"))
)

In [15]:
# Add each employee's dense rank by salary within their department
# (rank 1 is the highest salary in that department).
second_highest_sal = (
    sal_df
    .withColumn("rank", dense_rank().over(win_spec))
)

In [16]:
# Inspect the per-department rank column before filtering on it.
second_highest_sal.show()

+------+-------+------+----+
|DeptID|EmpName|Salary|rank|
+------+-------+------+----+
|    HR|  Danny|  3000|   1|
|    HR|  Denis|  1500|   2|
|  Engg|  Smith|  2000|   1|
|  Engg|    Sam|  1000|   2|
|    IT|   John|  3000|   1|
|    IT|  David|  2000|   2|
+------+-------+------+----+



In [17]:
# Keep only the second-highest salary row(s) of each department.
#
# The result is built from sal_df rather than from the previous value of
# second_highest_sal, so re-running this cell does not fail on the
# already-dropped "rank" column.
second_highest_sal = (
    sal_df
    .withColumn("rank", dense_rank().over(win_spec))
    .filter(col("rank") == 2)  # dense rank 2 = second-highest distinct salary
    .drop("rank")              # helper column is not part of the result
)

In [18]:
# Show the rank-2 (second-highest salary) row of each department.
second_highest_sal.show()

+------+-------+------+
|DeptID|EmpName|Salary|
+------+-------+------+
|    HR|  Denis|  1500|
|  Engg|    Sam|  1000|
|    IT|  David|  2000|
+------+-------+------+



___Without window function___

In [24]:
# Highest salary within each department (result column is "max(Salary)").
sal2 = sal_df.groupBy("DeptID").max("Salary")

In [20]:
# Show the per-department maximum salary.
# NOTE(review): this is only the highest salary; the second-highest is not
# derived anywhere in the visible cells of this section.
sal2.show()

+------+-----------+
|DeptID|max(Salary)|
+------+-----------+
|    HR|       3000|
|  Engg|       2000|
|    IT|       3000|
+------+-----------+



In [26]:
# Sample input: a list of lists of (key, count) tuples.
data = [
    [('A', 1)],
    [('A', 1), ('A', 1)],
]

# Expected output: [[('A', 1)], [('A', 2)]]
# (presumably the counts of equal keys are summed within each inner list; confirm)

1. Read a student CSV file.
2. Add a new column named `Remarks` with the value pass/fail, decided based on the marks.
3. Save this data into a Hive table.

```
# Read the student file. "students.csv" is a placeholder path; header=True is
# needed so the "marks" column can be referenced by name.
# NOTE(review): assumes the file has a header row with a numeric "marks" column.
df = spark.read.csv("students.csv", header=True, inferSchema=True)

# Remarks is "pass" when marks are above 40, otherwise "fail".
df2 = df.withColumn("Remarks", when(col("marks") > 40, "pass").otherwise("fail"))

# Persist the result as the table "students", replacing any existing data.
df2.write \
    .format("csv") \
    .mode("Overwrite") \
    .saveAsTable("students")
```
=====================================================

1) If the data has duplicate column names, will we be able to read it using Spark?

2) If yes, is there any way to rename such columns, like

```
a b a c a b
1 2 1 2 1 2
```
to
```
a b a_2 c a_3 b_2
1 2 1   2 1   2
```

In [28]:
%%sh
# Print the raw file so the duplicated header names (a, b) are visible.

cat dup_data.txt

a b a c a b
1 2 1 2 1 2


In [31]:
# Read the space-delimited file, taking column names from its first line.
df = (
    spark.read
    .options(header=True, delimiter=" ")
    .csv("dup_data.txt")
)

In [32]:
# Show the column names Spark assigned to the duplicated header entries.
df.show()

+---+---+---+---+---+---+
| a0| b1| a2|  c| a4| b5|
+---+---+---+---+---+---+
|  1|  2|  1|  2|  1|  2|
+---+---+---+---+---+---+

