In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc
## create the Spark session for this notebook, or reuse one that is already running
spark = (
    SparkSession.builder
    .appName('Practice')
    .getOrCreate()
)

In [4]:
## load test1.csv: the first row is the header and column types are inferred
df = spark.read.csv(
    "test1.csv",
    header=True,
    inferSchema=True,
)
df.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



## Filter Operations

In [6]:
## salary of people <= 20000 (condition written as a SQL expression string)

(
    df
    .filter('Salary<=20000')
    .show()
)

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
|  Sunny| 29|         4| 20000|
|   Paul| 24|         3| 20000|
| Harsha| 21|         1| 15000|
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



In [7]:
## same salary filter, keeping only the Name and age columns
(
    df
    .filter('Salary<=20000')
    .select(['Name', 'age'])
    .show()
)

+-------+---+
|   Name|age|
+-------+---+
|  Sunny| 29|
|   Paul| 24|
| Harsha| 21|
|Shubham| 23|
+-------+---+



In [8]:
## the same filter expressed as a Column comparison instead of a SQL string
(
    df
    .filter(df['Salary'] <= 20000)
    .show()
)

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
|  Sunny| 29|         4| 20000|
|   Paul| 24|         3| 20000|
| Harsha| 21|         1| 15000|
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



In [9]:
## combine two conditions with `&`; each comparison is wrapped in its own
## parentheses because `&` binds tighter than `<=` / `>=` in Python
(
    df
    .filter(
        (df['Salary'] <= 20000)
        & (df['age'] >= 25)
    )
    .show()
)

+-----+---+----------+------+
| Name|age|Experience|Salary|
+-----+---+----------+------+
|Sunny| 29|         4| 20000|
+-----+---+----------+------+



## Not Operation: '~'

In [10]:
## negate the condition with `~`: keep rows whose salary is NOT <= 20000
(
    df
    .filter(~(df['Salary'] <= 20000))
    .show()
)

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
+---------+---+----------+------+



## Groupby and Aggregate

In [13]:
## load test3.csv (Name / Departments / salary); note that this rebinds `df`,
## so the cells below work on this table rather than test1.csv
df = spark.read.csv(
    'test3.csv',
    header=True,
    inferSchema=True,
)
df.show()

+---------+------------+------+
|     Name| Departments|salary|
+---------+------------+------+
|    Krish|Data Science| 10000|
|    Krish|         IOT|  5000|
|   Mahesh|    Big Data|  4000|
|    Krish|    Big Data|  4000|
|   Mahesh|Data Science|  3000|
|Sudhanshu|Data Science| 20000|
|Sudhanshu|         IOT| 10000|
|Sudhanshu|    Big Data|  5000|
|    Sunny|Data Science| 10000|
|    Sunny|    Big Data|  2000|
+---------+------------+------+



In [14]:
## print the column names and inferred types of the current `df`
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Departments: string (nullable = true)
 |-- salary: integer (nullable = true)



In [37]:
## total salary of each person (summed over all their department rows),
## highest total first; `desc` comes from the import cell at the top of the notebook

df.groupBy('Name').sum('salary').sort(desc('sum(salary)')).show()


+---------+-----------+
|     Name|sum(salary)|
+---------+-----------+
|Sudhanshu|      35000|
|    Krish|      19000|
|    Sunny|      12000|
|   Mahesh|       7000|
+---------+-----------+



In [41]:
## number of employees in each department (one row per person/department pair)

(
    df
    .groupBy('Departments')
    .count()
    .show()
)

+------------+-----+
| Departments|count|
+------------+-----+
|         IOT|    2|
|    Big Data|    4|
|Data Science|    4|
+------------+-----+



## Aggregates

In [44]:
## whole-table aggregates via a {column: function} dict:
## the sum of Salary and the count of Name values
(
    df
    .agg({'Salary': 'sum', 'Name': 'count'})
    .show()
)

+-----------+-----------+
|sum(Salary)|count(Name)|
+-----------+-----------+
|      73000|         10|
+-----------+-----------+



In [58]:
## minimum salary each person is getting across their rows, largest minimum first
## NOTE(review): this comment used to say "maximum", but the code calls min() and the
## recorded output is min(salary) -- confirm whether max() was actually intended

df.groupBy('Name').min().sort(desc('min(salary)')).show()

+---------+-----------+
|     Name|min(salary)|
+---------+-----------+
|Sudhanshu|       5000|
|    Krish|       4000|
|   Mahesh|       3000|
|    Sunny|       2000|
+---------+-----------+

