# PySpark DataFrames
- Filter operation
- Condition operators: `&` (and), `|` (or), `==` (equals)
- `~` (not) to negate a condition

- GroupBy and aggregate functions

In [1]:
from pyspark.sql import SparkSession
# Obtain the Spark session used by every cell in this notebook
# (getOrCreate reuses an already-running session if there is one).
spark = (
    SparkSession.builder
    .appName('Practise')
    .getOrCreate()
)

In [3]:
import pandas as pd
# Small employee table for the filter examples; it is written to CSV so it
# can be loaded through Spark's CSV reader in the next cell.
data = pd.DataFrame(
    {
        'Name': ['Krish', 'Sudhanshu', 'Sunny', 'Paul', 'Harsha', 'Shubham'],
        'Age': [31, 30, 29, 24, 21, 23],
        'Experience': [10, 8, 4, 3, 1, 2],
        'Salary': [30000, 25000, 20000, 20000, 15000, 18000],
    }
)
# index=False keeps the pandas row index out of the file.
data.to_csv('test3_pyspark.csv', index=False)

In [4]:
# Load the CSV through Spark: the first row supplies the column names and
# inferSchema asks Spark to work out each column's type from the data.
df_pyspark = spark.read.csv(
    'test3_pyspark.csv',
    header=True,
    inferSchema=True,
)
df_pyspark.show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



# Filter Operation

In [6]:
# The filter condition can be given as a SQL expression string.
(
    df_pyspark
    .filter("Salary<=20000")
    .show()
)

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|  Sunny| 29|         4| 20000|
|   Paul| 24|         3| 20000|
| Harsha| 21|         1| 15000|
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



In [8]:
# The same filter written as a comparison on the Salary column object
# instead of a SQL string.
(
    df_pyspark
    .filter(df_pyspark["Salary"] <= 20000)
    .show()
)

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|  Sunny| 29|         4| 20000|
|   Paul| 24|         3| 20000|
| Harsha| 21|         1| 15000|
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



In [7]:
# Filter the rows first, then keep only the Name and Age columns.
(
    df_pyspark
    .filter("Salary<=20000")
    .select('Name', 'Age')
    .show()
)

+-------+---+
|   Name|Age|
+-------+---+
|  Sunny| 29|
|   Paul| 24|
| Harsha| 21|
|Shubham| 23|
+-------+---+



In [10]:
# Combine two conditions with `&`: 15000 <= Salary < 20000.
# Each comparison needs its own parentheses because Python's `&` binds more
# tightly than `<` and `>=`.
(
    df_pyspark
    .filter(
        (df_pyspark["Salary"] < 20000)
        & (df_pyspark["Salary"] >= 15000)
    )
    .show()
)

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
| Harsha| 21|         1| 15000|
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



In [13]:
# Equality test on a string column: keep only the row(s) named 'Harsha'.
(
    df_pyspark
    .filter(df_pyspark["Name"] == 'Harsha')
    .show()
)

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
|Harsha| 21|         1| 15000|
+------+---+----------+------+



In [14]:
# `~` inverts a condition: here it leaves the rows whose Salary is NOT
# <= 20000, i.e. the salaries above 20000.
(
    df_pyspark
    .filter(~(df_pyspark["Salary"] <= 20000))
    .show()
)

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
+---------+---+----------+------+



# Groupby and Aggregate functions

In [17]:
# Salary records, one row per (Name, Departments) pair, for the
# groupBy/aggregate examples. pandas is already imported as `pd` in an
# earlier cell, so the import is not repeated here.
data = pd.DataFrame(
    {
        'Name': ['Krish', 'Krish', 'Mahesh', 'Krish', 'Mahesh',
                 'Sudhanshu', 'Sudhanshu', 'Sudhanshu', 'Sunny', 'Sunny'],
        'Departments': ['Data Science', 'IOT', 'Big Data', 'Big Data', 'Data Science',
                        'Data Science', 'IOT', 'Big Data', 'Data Science', 'Big Data'],
        'Salary': [10000, 5000, 4000, 4000, 3000, 20000, 10000, 5000, 10000, 2000],
    }
)
# index=False keeps the pandas row index out of the file.
data.to_csv('test4_pyspark.csv', index=False)

In [18]:
# Rebind df_pyspark to the department/salary table; every cell from here on
# works on this data rather than the employee table used for filtering.
df_pyspark = spark.read.csv(
    'test4_pyspark.csv',
    header=True,
    inferSchema=True,
)
df_pyspark.show()

+---------+------------+------+
|     Name| Departments|Salary|
+---------+------------+------+
|    Krish|Data Science| 10000|
|    Krish|         IOT|  5000|
|   Mahesh|    Big Data|  4000|
|    Krish|    Big Data|  4000|
|   Mahesh|Data Science|  3000|
|Sudhanshu|Data Science| 20000|
|Sudhanshu|         IOT| 10000|
|Sudhanshu|    Big Data|  5000|
|    Sunny|Data Science| 10000|
|    Sunny|    Big Data|  2000|
+---------+------------+------+



In [19]:
# Check the column types Spark inferred while reading the CSV.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Departments: string (nullable = true)
 |-- Salary: integer (nullable = true)



**Groupby**

In [20]:
# groupBy on its own yields a GroupedData object, not a DataFrame; an
# aggregation (sum, mean, count, ...) has to be applied to get rows back.
df_pyspark.groupBy('Name')

<pyspark.sql.group.GroupedData at 0x20828392310>

In [21]:
# Applying an aggregation gives back a DataFrame; without .show() the cell
# only displays its column summary, not the rows.
df_pyspark.groupBy('Name').sum()

DataFrame[Name: string, sum(Salary): bigint]

In [22]:
# Total salary per person. sum() with no arguments aggregates the numeric
# columns, which here is just Salary.
(
    df_pyspark
    .groupBy('Name')
    .sum()
    .show()
)

+---------+-----------+
|     Name|sum(Salary)|
+---------+-----------+
|Sudhanshu|      35000|
|    Sunny|      12000|
|    Krish|      19000|
|   Mahesh|       7000|
+---------+-----------+



In [32]:
# Average salary per person.
(
    df_pyspark
    .groupBy('Name')
    .mean()
    .show()
)

+---------+------------------+
|     Name|       avg(Salary)|
+---------+------------------+
|Sudhanshu|11666.666666666666|
|    Sunny|            6000.0|
|    Krish| 6333.333333333333|
|   Mahesh|            3500.0|
+---------+------------------+



In [31]:
# avg() gives the same per-person averages as mean().
(
    df_pyspark
    .groupBy('Name')
    .avg()
    .show()
)

+---------+------------------+
|     Name|       avg(Salary)|
+---------+------------------+
|Sudhanshu|11666.666666666666|
|    Sunny|            6000.0|
|    Krish| 6333.333333333333|
|   Mahesh|            3500.0|
+---------+------------------+



In [24]:
# Total salary paid out per department.
(
    df_pyspark
    .groupBy('Departments')
    .sum()
    .show()
)

+------------+-----------+
| Departments|sum(Salary)|
+------------+-----------+
|         IOT|      15000|
|    Big Data|      15000|
|Data Science|      43000|
+------------+-----------+



In [25]:
# Average salary per department.
(
    df_pyspark
    .groupBy('Departments')
    .mean()
    .show()
)

+------------+-----------+
| Departments|avg(Salary)|
+------------+-----------+
|         IOT|     7500.0|
|    Big Data|     3750.0|
|Data Science|    10750.0|
+------------+-----------+



In [26]:
# Number of rows (salary records) in each department.
(
    df_pyspark
    .groupBy('Departments')
    .count()
    .show()
)

+------------+-----+
| Departments|count|
+------------+-----+
|         IOT|    2|
|    Big Data|    4|
|Data Science|    4|
+------------+-----+



In [28]:
# Highest single salary recorded for each person.
(
    df_pyspark
    .groupBy('Name')
    .max()
    .show()
)

+---------+-----------+
|     Name|max(Salary)|
+---------+-----------+
|Sudhanshu|      20000|
|    Sunny|      10000|
|    Krish|      10000|
|   Mahesh|       4000|
+---------+-----------+



In [29]:
# Lowest single salary recorded for each person.
(
    df_pyspark
    .groupBy('Name')
    .min()
    .show()
)

+---------+-----------+
|     Name|min(Salary)|
+---------+-----------+
|Sudhanshu|       5000|
|    Sunny|       2000|
|    Krish|       4000|
|   Mahesh|       3000|
+---------+-----------+



**Aggregate**

In [27]:
# agg() called directly on the DataFrame aggregates over all rows with no
# grouping; the dict maps a column name to the aggregate to apply to it.
(
    df_pyspark
    .agg({'Salary': 'sum'})
    .show()
)

+-----------+
|sum(Salary)|
+-----------+
|      73000|
+-----------+

