In [1]:
# Spark entry point plus the type classes (StructType, IntegerType, ...) used for schemas
from pyspark.sql import SparkSession
from pyspark.sql.types import *
# NOTE: this wildcard import rebinds the Python builtins sum/max/min (among others)
# to the Spark column functions for the rest of the notebook.
from pyspark.sql.functions import *
# Reuse the running Spark session if there is one, otherwise start a new one
spark = SparkSession.builder.getOrCreate()
from pyspark.sql.functions import regexp_replace, col
# Colab helper; only referenced by the commented-out Drive mount below
from google.colab import drive

# Mount Google Drive with a longer timeout
# drive.mount('/content/drive', force_remount=True, timeout_ms=300000)

# df_employee_data = "/content/drive/MyDrive/Colab Notebooks/dataSet/employee_data.csv"
# employeeSechema = StructType([
#     StructField("ID",IntegerType() ,True),
#     StructField("Name",StringType() ,True),
#     StructField("Age",IntegerType() ,True),
#     StructField("Salary",FloatType() ,True),
#     StructField("Joining_Date",DateType() ,True),
#     StructField("Department",StringType() ,True),
#     StructField("Performance_Rating",IntegerType() ,True),
#     StructField("Email",StringType() ,True),
#     StructField("Address",StringType() ,True),
#     StructField("Phone",StringType() ,True)

# ])
# # Load the DataFrame with the defined schema
# #df = spark.read.csv(path=df_employee_data, header=True, schema=employeeSechema)
# df = spark.read.load(path="/content/drive/MyDrive/Colab Notebooks/dataSet/employee_data.csv", format="csv", header = True, schema=employeeSechema)
# df.printSchema()
# df.show(50)

## Aggregate function in Dataframe – Part 1

In [2]:
# Sample (id, value) pairs; id 4 deliberately has no value so the
# aggregate functions can be observed on a column containing a null.
id_value_pairs = [(1, 10), (2, 20), (3, 30), (4, None), (5, 40), (6, 20)]
data = [Row(id=row_id, value=row_value) for row_id, row_value in id_value_pairs]

# Build the DataFrame from the Row objects (no explicit schema is passed)
df = spark.createDataFrame(data)

# Print the DataFrame contents
df.show()


+---+-----+
| id|value|
+---+-----+
|  1|   10|
|  2|   20|
|  3|   30|
|  4| NULL|
|  5|   40|
|  6|   20|
+---+-----+



### 1.Summation (sum): Sums up the values in a specified column.

In [3]:
# Keep the one-row aggregate DataFrame in `total_sum`, then print it.
# DataFrame.show() only prints and returns None, so chaining it into the
# assignment would leave the variable bound to None.
total_sum = df.select(sum("value"))
total_sum.show()

+----------+
|sum(value)|
+----------+
|       120|
+----------+



### 2. Average (avg): Calculates the average of the values in a specified column.

In [4]:
# Keep the one-row aggregate DataFrame in the variable, then print it.
# DataFrame.show() only prints and returns None, so chaining it into the
# assignment would leave the variable bound to None.
avarage_value = df.select(avg("value"))
avarage_value.show()

+----------+
|avg(value)|
+----------+
|      24.0|
+----------+



### 3.Count (count): Counts the number of non-null values in a specified column.

In [5]:
# Keep the one-row aggregate DataFrame in the variable, then print it.
# DataFrame.show() only prints and returns None, so chaining it into the
# assignment would leave the variable bound to None.
non_null_count = df.select(count("value"))
non_null_count.show()

+------------+
|count(value)|
+------------+
|           5|
+------------+



### 4.Maximum (max) and Minimum (min):
* Finds the maximum and minimum values in a specified column

In [6]:
# Compute both extremes in a single select and keep the resulting one-row
# DataFrame. DataFrame.show() only prints and returns None, so it is called
# separately instead of being chained into the assignment.
max_min_value = df.select(max("value"), min("value"))
max_min_value.show()

+----------+----------+
|max(value)|min(value)|
+----------+----------+
|        40|        10|
+----------+----------+



### 5. Distinct Values Count (count_distinct):
  * Counts the number of distinct non-null values in a specified column.

In [7]:
# Keep the one-row aggregate DataFrame in the variable, then print it.
# DataFrame.show() only prints and returns None, so chaining it into the
# assignment would leave the variable bound to None.
distinct_count = df.select(count_distinct("value"))
distinct_count.show()

+---------------------+
|count(DISTINCT value)|
+---------------------+
|                    4|
+---------------------+



### Notes:
* Handling Nulls: The count function will count only non-null values, while sum, avg, max, and min will ignore null values in their calculations.
* Performance: Aggregate functions can be resource-intensive, especially on large datasets. Using the appropriate partitioning can improve performance.
* Use Cases:
  * **Summation:** Useful for calculating total sales, total revenue, etc.
  * **Average:** Helpful for finding average metrics like average sales per day.
  * **Count:** Useful for counting occurrences, such as the number of transactions.
  * **Max/Min:** Helps to determine the highest and lowest values, such as maximum  sales on a specific day.
  * **Distinct Count:** Useful for finding unique items, like unique customers or products.


  

## Aggregate function in Dataframe – Part 2

In [8]:
# Get the Spark session, requesting the application name "AggregationExamples"
spark = SparkSession.builder.appName("AggregationExamples").getOrCreate()

# Column layout of the employee table; every field is declared nullable
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("department", StringType()),
        ("salary", IntegerType()),
        ("bonus", IntegerType()),
        ("employee_name", StringType()),
    )
])

# One tuple per employee: (department, salary, bonus, employee_name)
dataItem = [
    ("HR", 10000, 500, "John"),
    ("Finance", 20000, 1500, "Doe"),
    ("HR", 15000, 1000, "Alice"),
    ("Finance", 25000, 2000, "Eve"),
    ("HR", 20000, 1500, "Mark"),
]

# Build the DataFrame with the explicit schema and print it
df = spark.createDataFrame(dataItem, schema)
df.show()


+----------+------+-----+-------------+
|department|salary|bonus|employee_name|
+----------+------+-----+-------------+
|        HR| 10000|  500|         John|
|   Finance| 20000| 1500|          Doe|
|        HR| 15000| 1000|        Alice|
|   Finance| 25000| 2000|          Eve|
|        HR| 20000| 1500|         Mark|
+----------+------+-----+-------------+



### 1. Grouped Aggregation

In [9]:

# Per-department salary statistics: total, mean, highest and lowest salary
salary_stats = [
    sum("salary").alias("Total_Sum_Salary"),
    avg("salary").alias("Avaarage_Salary"),
    max("salary").alias("Max_Salary"),
    min("salary").alias("Min_Salary"),
]
salary_by_department = df.groupBy("department")
salary_by_department.agg(*salary_stats).show()

+----------+----------------+---------------+----------+----------+
|department|Total_Sum_Salary|Avaarage_Salary|Max_Salary|Min_Salary|
+----------+----------------+---------------+----------+----------+
|        HR|           45000|        15000.0|     20000|     10000|
|   Finance|           45000|        22500.0|     25000|     20000|
+----------+----------------+---------------+----------+----------+



**Explanation:**
* sum: Adds up the salary values in each department group.
* avg: Calculates the average salary in each group.
* max: Finds the highest salary in each group.
* min: Finds the lowest salary in each group.

### 2. Multiple Aggregations

In [10]:
# Several source columns aggregated in one agg() call:
# total salary, mean bonus and top salary for each department.
department_groups = df.groupBy("department")
department_summary = department_groups.agg(
    sum("salary").alias("Total_Sum_Salary"),
    avg("bonus").alias("Avaarage_Bonus"),
    max("salary").alias("Max_Salary"),
)
department_summary.show()

+----------+----------------+--------------+----------+
|department|Total_Sum_Salary|Avaarage_Bonus|Max_Salary|
+----------+----------------+--------------+----------+
|        HR|           45000|        1000.0|     20000|
|   Finance|           45000|        1750.0|     25000|
+----------+----------------+--------------+----------+



**Explanation:**
 * sum: Adds up the salary values in each group.
 * avg: Computes the average of the bonus column in each group.
 * max: Finds the maximum salary in each group.

### 3. Concatenate Strings

In [13]:
# Gather every employee_name into one array, then join the array into a
# single comma-separated string for the whole DataFrame (no grouping).
all_names = collect_list("employee_name")
joined_names = concat_ws(", ", all_names).alias("concatenated_names")
df.agg(joined_names).show()


+--------------------+
|  concatenated_names|
+--------------------+
|John, Doe, Alice,...|
+--------------------+



In [14]:
# Comma-separated employee names, one result row per department
names_per_department = concat_ws(", ", collect_list("employee_name")).alias("concated_names")
df.groupBy("department").agg(names_per_department).show()

+----------+-----------------+
|department|   concated_names|
+----------+-----------------+
|        HR|John, Alice, Mark|
|   Finance|         Doe, Eve|
+----------+-----------------+



**Explanation:**

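* collect_list: Gathers the employee_name values of each group (or of the whole DataFrame when no grouping is used) into an array.
* concat_ws: Joins the elements of that array into a single string, separating them with the specified delimiter (", ").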

### 4. First and Last

In [15]:
# First and last employee_name encountered within each department group
name_bounds = [first("employee_name"), last("employee_name")]
df.groupBy("department").agg(*name_bounds).show()

+----------+--------------------+-------------------+
|department|first(employee_name)|last(employee_name)|
+----------+--------------------+-------------------+
|   Finance|                 Doe|                Eve|
|        HR|                John|               Mark|
+----------+--------------------+-------------------+



**Explanation:**

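* first: Retrieves the first value of the employee_name column within each group.
* last: Retrieves the last value of the employee_name column within each group.
* Note: Without an explicit ordering, "first" and "last" depend on the order in which rows reach the aggregation, which is not guaranteed after a shuffle.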

### 5. Standard Deviation and Variance

In [16]:
# Print the employee DataFrame again as a reference for the statistics that follow
df.show()

+----------+------+-----+-------------+
|department|salary|bonus|employee_name|
+----------+------+-----+-------------+
|        HR| 10000|  500|         John|
|   Finance| 20000| 1500|          Doe|
|        HR| 15000| 1000|        Alice|
|   Finance| 25000| 2000|          Eve|
|        HR| 20000| 1500|         Mark|
+----------+------+-----+-------------+



In [17]:
# Spread of the salary column over the whole DataFrame
salary_spread = df.select(
    stddev("salary"),
    variance("salary"),
)
salary_spread.show()

+----------------+----------------+
|  stddev(salary)|var_samp(salary)|
+----------------+----------------+
|5700.87712549569|          3.25E7|
+----------------+----------------+



**Explanation:**
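* stddev: Calculates the sample standard deviation of the column (an alias for stddev_samp).
* variance: Calculates the sample variance of the column (an alias for var_samp, which is why the output header reads var_samp(salary)).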

### 6. Aggregation with Alias

In [18]:
# Whole-table salary total and mean; alias() names the result columns
overall_salary_stats = [
    sum("salary").alias("Total_Salary"),
    avg("salary").alias("Avarage_Salary"),
]
df.agg(*overall_salary_stats).show()

+------------+--------------+
|Total_Salary|Avarage_Salary|
+------------+--------------+
|       90000|       18000.0|
+------------+--------------+



**Explanation:**
* alias(): Used to rename the resulting columns from the aggregation.

### 7. Sum of Distinct Values

In [20]:
# Add up each distinct salary value once
distinct_salary_total = sum_distinct("salary")
df.select(distinct_salary_total).show()

+--------------------+
|sum(DISTINCT salary)|
+--------------------+
|               70000|
+--------------------+



**Explanation:**
* sum_distinct: Sums only the distinct values in the column, so a value that appears more than once (the salary 20000 here) is added only once.