In [0]:
csv_file="dbfs:/FileStore/tables/Book1.csv"
df2 = spark.read.option("header",True).option("inferschema",True).csv(csv_file)

In [0]:
df2.display()

Name of Employee,Sales,Quarter,State
Mohak,1000,1,Rajasthan
Vijay,300,1,Panjab
Tapasi,400,1,Gujarat
Mansi,500,1,Goa
Bipin,800,1,Rajasthan
Mohak,1000,2,Gujarat
Vijay,500,2,Panjab
Tapasi,700,2,Gujarat
Mansi,50,2,Rajasthan
Bipin,60,2,Rajasthan


In [0]:
df2.printSchema()

root
 |-- Name of Employee: string (nullable = true)
 |-- Sales: integer (nullable = true)
 |-- Quarter: integer (nullable = true)
 |-- State: string (nullable = true)



#Find total sales per employee in above DataFrame

In [0]:
df3=df2.groupBy("Name of Employee").sum("Sales").sort("Name of Employee", ascending=[True])
df3.display()

Name of Employee,sum(Sales)
Bipin,1210
Mansi,1500
Mohak,4000
Tapasi,2100
Vijay,2600


#Find total sales by state in above DataFrame

In [0]:
df4=df2.groupBy("State").sum("Sales").sort("State", ascending=[True])
df4.display()

State,sum(Sales)
Goa,1450
Gujarat,3400
Panjab,3600
Rajasthan,2960


#Find total sales by both employee & state in above DataFrame 

In [0]:
df5=df2.groupBy("Name of Employee","State").sum("Sales").sort("Name of Employee","sum(Sales)", ascending=[True,True])
df5.display()

Name of Employee,State,sum(Sales)
Bipin,Gujarat,300
Bipin,Rajasthan,910
Mansi,Rajasthan,50
Mansi,Goa,1450
Mohak,Panjab,1000
Mohak,Gujarat,1000
Mohak,Rajasthan,2000
Tapasi,Gujarat,2100
Vijay,Panjab,2600


#Find Max individual sale by State in above DataFrame

In [0]:
df6=df2.groupBy("State").max("Sales").sort("State", ascending=[True])
df6.display()

State,max(Sales)
Goa,750
Gujarat,1000
Panjab,1000
Rajasthan,1000


#Find Mean, median and min sales by State in above DataFrame

In [0]:
from pyspark.sql.functions import median,avg,min
import pyspark.sql.functions as F
df6=df2.groupBy("State").agg(avg("Sales").alias("avg_salary"), \
         median("Sales").alias("sum_bonus"), \
         min("Sales").alias("max_bonus") \
     ).sort("State",ascending=[True])
df6.show(truncate=False)

+---------+-----------------+---------+---------+
|State    |avg_salary       |sum_bonus|max_bonus|
+---------+-----------------+---------+---------+
|Goa      |483.3333333333333|500.0    |200      |
|Gujarat  |566.6666666666666|550.0    |250      |
|Panjab   |720.0            |900.0    |300      |
|Rajasthan|493.3333333333333|430.0    |50       |
+---------+-----------------+---------+---------+



In [0]:
data = (['Mohak', 'Rajesh', 'Freya', 'Aditya', 'Anika'], [2012, 2012, 2013, 2014, 2014],
[10, 22, 11, 32, 23],
[2, 2, 3, 3, 3])
df7=spark.createDataFrame(data, ["name","year","score","catches"])
df7.display()

name,year,score,catches,_5
Mohak,Rajesh,Freya,Aditya,Anika
2012,2012,2013,2014,2014
10,22,11,32,23
2,2,3,3,3


#Pivot the table and Sort the DataFrames rows by score, in descending order

In [0]:
from pyspark.sql import Row
rows = [Row(name=name, year=year, score=score, catches=catches) for name, year, score, catches in zip(*data)]
df8 = spark.createDataFrame(rows)
df8.sort("name").show()

+------+----+-----+-------+
|  name|year|score|catches|
+------+----+-----+-------+
|Aditya|2014|   32|      3|
| Anika|2014|   23|      3|
| Freya|2013|   11|      3|
| Mohak|2012|   10|      2|
|Rajesh|2012|   22|      2|
+------+----+-----+-------+



#Sort the DataFrames rows by catches and then by score, in ascending order/sort by multiple columns

In [0]:
df9=df8.sort('catches', 'score').show()

+------+----+-----+-------+
|  name|year|score|catches|
+------+----+-----+-------+
| Mohak|2012|   10|      2|
|Rajesh|2012|   22|      2|
| Freya|2013|   11|      3|
| Anika|2014|   23|      3|
|Aditya|2014|   32|      3|
+------+----+-----+-------+

