## Spark-DataFrame  Fundamental Practice


In [153]:
import os
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

In [154]:
#set the PySpark environment veriable
os.environ['JAVA_HOME']="C:\Program Files\Java\jdk-22"
os.environ['SPARK_HOME'] = "E:\spark-3.5.1-bin-hadoop3"
os.environ['PYSPARK_DRIVER_PYTHON'] = "jupyter"
os.environ['PYSPARK_DRIVER_PYTHON_OPS'] = "notebook"
os.environ['PYSPARK_PYTHON'] = "python"

In [155]:
#Sart the Sparksession
spark = SparkSession.builder.appName("Practice")\
.config("spark.executor.memory","4g")\
.config("spark.sql.shuffle.partitions","4")\
.config("spark.jars","E:\jdbc-driver\postgresql-42.7.3.jar")\
.getOrCreate()

In [156]:
sc = spark.sparkContext

In [157]:
sc

### 1. Creating DataFrame and Loading CSV, Text, Json,  Parquet File Into DataFrame

In [158]:
# Create DataFrame
data = [
  ('James','Smith','1991-04-01','M',3000),
  ('Michael','Rose','2000-05-19','M',4000),
  ('Robert','Williams','1978-09-05','M',4000),
  ('Maria','Jones','1967-12-01','F',4000),
  ('Jen','Brown','1980-02-17','F',8500),
  ('Keith','Lina','1982-09-26','F',8000)
]

columns = ["firstname","lastname","dob","gender","salary"]

pyspark_df = spark.createDataFrame(data=data, schema = columns)

In [159]:
#show the list of Schema
pyspark_df.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



In [160]:
#show data values
pyspark_df.show()

+---------+--------+----------+------+------+
|firstname|lastname|       dob|gender|salary|
+---------+--------+----------+------+------+
|    James|   Smith|1991-04-01|     M|  3000|
|  Michael|    Rose|2000-05-19|     M|  4000|
|   Robert|Williams|1978-09-05|     M|  4000|
|    Maria|   Jones|1967-12-01|     F|  4000|
|      Jen|   Brown|1980-02-17|     F|  8500|
|    Keith|    Lina|1982-09-26|     F|  8000|
+---------+--------+----------+------+------+



In [161]:
#Read git from csv and store pysparkDataFrame
url_github = r"https://raw.githubusercontent.com/muttinenisairohith/Datasets/b0bb96f293adbb803e24c26b7780e078372d3703/data/test2.csv"
pd_df = pd.read_csv(url_github)
df_pyspark1 = spark.createDataFrame(pd_df)
df_pyspark1.show()

+-------+------------+------+
|   Name| Departments|salary|
+-------+------------+------+
| chandu|Data Science| 10000|
| chandu|         IOT|  5000|
| Rohith|    Big Data|  4000|
| chandu|    Big Data|  4000|
| Rohith|Data Science|  3000|
|krishna|Data Science| 20000|
|krishna|         IOT| 10000|
|krishna|    Big Data|  5000|
| rashmi|Data Science| 10000|
| rashmi|    Big Data|  2000|
+-------+------------+------+



In [162]:
#group by Departments which gives summation of salaries
df_pyspark1.groupBy("Departments").sum("salary").show()

+------------+-----------+
| Departments|sum(salary)|
+------------+-----------+
|Data Science|      43000|
|         IOT|      15000|
|    Big Data|      15000|
+------------+-----------+



In [163]:
df_pyspark1.groupBy("Departments").agg(({"salary":"sum"})).show()

+------------+-----------+
| Departments|sum(salary)|
+------------+-----------+
|Data Science|      43000|
|         IOT|      15000|
|    Big Data|      15000|
+------------+-----------+



In [164]:
df_pyspark1.groupBy("Departments").min("salary").show()
df_pyspark1.groupBy("Departments").max("salary").show()
df_pyspark1.groupBy("Departments").avg("salary").show()
df_pyspark1.groupBy("Departments").mean("salary").show()

+------------+-----------+
| Departments|min(salary)|
+------------+-----------+
|Data Science|       3000|
|         IOT|       5000|
|    Big Data|       2000|
+------------+-----------+

+------------+-----------+
| Departments|max(salary)|
+------------+-----------+
|Data Science|      20000|
|         IOT|      10000|
|    Big Data|       5000|
+------------+-----------+

+------------+-----------+
| Departments|avg(salary)|
+------------+-----------+
|Data Science|    10750.0|
|         IOT|     7500.0|
|    Big Data|     3750.0|
+------------+-----------+

+------------+-----------+
| Departments|avg(salary)|
+------------+-----------+
|Data Science|    10750.0|
|         IOT|     7500.0|
|    Big Data|     3750.0|
+------------+-----------+



In [165]:
#Using SparkFiles Load the dataframe
from pyspark import SparkFiles
student_info = r"https://raw.githubusercontent.com/AISCIENCES/course-master-big-data-with-pyspark-and-aws/main/Code/03-Spark%20DFs/StudentData.csv"
spark.sparkContext.addFile(student_info)
student_df = spark.read.csv(SparkFiles.get("StudentData.csv"),inferSchema=True, header=True)
student_df.show()

+---+------+----------------+------+------+-----+--------------------+
|age|gender|            name|course|  roll|marks|               email|
+---+------+----------------+------+------+-----+--------------------+
| 28|Female| Hubert Oliveras|    DB|  2984|   59|Annika Hoffman_Na...|
| 29|Female|Toshiko Hillyard| Cloud| 12899|   62|Margene Moores_Ma...|
| 28|  Male|  Celeste Lollis|    PF| 21267|   45|Jeannetta Golden_...|
| 29|Female|    Elenore Choy|    DB| 32877|   29|Billi Clore_Mitzi...|
| 28|  Male|  Sheryll Towler|   DSA| 41487|   41|Claude Panos_Judi...|
| 28|  Male|  Margene Moores|   MVC| 52771|   32|Toshiko Hillyard_...|
| 28|  Male|     Neda Briski|   OOP| 61973|   69|Alberta Freund_El...|
| 28|Female|    Claude Panos| Cloud| 72409|   85|Sheryll Towler_Al...|
| 28|  Male|  Celeste Lollis|   MVC| 81492|   64|Nicole Harwood_Cl...|
| 29|  Male|  Cordie Harnois|   OOP| 92882|   51|Judie Chipps_Clem...|
| 29|Female|       Kena Wild|   DSA|102285|   35|Dustin Feagins_Ma...|
| 29| 

In [166]:
print(student_df.count())
print(len(student_df.dtypes))

1000
7


In [167]:
#Dropping rows based on null values
student_df.na.drop().show()

+---+------+----------------+------+------+-----+--------------------+
|age|gender|            name|course|  roll|marks|               email|
+---+------+----------------+------+------+-----+--------------------+
| 28|Female| Hubert Oliveras|    DB|  2984|   59|Annika Hoffman_Na...|
| 29|Female|Toshiko Hillyard| Cloud| 12899|   62|Margene Moores_Ma...|
| 28|  Male|  Celeste Lollis|    PF| 21267|   45|Jeannetta Golden_...|
| 29|Female|    Elenore Choy|    DB| 32877|   29|Billi Clore_Mitzi...|
| 28|  Male|  Sheryll Towler|   DSA| 41487|   41|Claude Panos_Judi...|
| 28|  Male|  Margene Moores|   MVC| 52771|   32|Toshiko Hillyard_...|
| 28|  Male|     Neda Briski|   OOP| 61973|   69|Alberta Freund_El...|
| 28|Female|    Claude Panos| Cloud| 72409|   85|Sheryll Towler_Al...|
| 28|  Male|  Celeste Lollis|   MVC| 81492|   64|Nicole Harwood_Cl...|
| 29|  Male|  Cordie Harnois|   OOP| 92882|   51|Judie Chipps_Clem...|
| 29|Female|       Kena Wild|   DSA|102285|   35|Dustin Feagins_Ma...|
| 29| 

In [168]:
#show Pandase Data Frame from sparkDataframe
student_df.toPandas()

Unnamed: 0,age,gender,name,course,roll,marks,email
0,28,Female,Hubert Oliveras,DB,2984,59,Annika Hoffman_Naoma Fritts@OOP.com
1,29,Female,Toshiko Hillyard,Cloud,12899,62,Margene Moores_Marylee Capasso@DB.com
2,28,Male,Celeste Lollis,PF,21267,45,Jeannetta Golden_Jenna Montague@DSA.com
3,29,Female,Elenore Choy,DB,32877,29,Billi Clore_Mitzi Seldon@DB.com
4,28,Male,Sheryll Towler,DSA,41487,41,Claude Panos_Judie Chipps@OOP.com
...,...,...,...,...,...,...,...
995,28,Female,Celeste Lollis,DB,9952416,59,Gonzalo Ferebee_Jalisa Swenson@DB.com
996,29,Female,Somer Stoecker,Cloud,9962277,84,Clementina Menke_Paris Hutton@OOP.com
997,28,Male,Tamera Blakley,DSA,9971217,26,Anna Santos_Claude Panos@PF.com
998,28,Female,Tamera Blakley,MVC,9982779,71,Toshiko Hillyard_Anna Santos@DSA.com


## Using RDDs texFile Reading storing data

In [169]:
rdd_data = spark.sparkContext.textFile("./data/data.txt")
result_rdd = rdd_data.flatMap(lambda line: line.split(" "))\
    .map(lambda word: (word, 1))\
    .reduceByKey(lambda a, b: a + b)\
    .sortBy(lambda x: x[1], ascending=False)

In [170]:
result_rdd.take(10)

[('the', 12),
 ('of', 7),
 ('a', 7),
 ('in', 5),
 ('distributed', 5),
 ('Spark', 4),
 ('is', 3),
 ('as', 3),
 ('API', 3),
 ('on', 3)]

### Using DataFrames

In [171]:
from pyspark.sql.functions import desc
df_textData = spark.read.text("./data/data.txt")
result_textData = df_textData.selectExpr("explode(split(value, ' ')) as word") \
    .groupBy("word").count().orderBy(desc("count"))

In [172]:
result_textData.show()

+-----------+-----+
|       word|count|
+-----------+-----+
|        the|   12|
|          a|    7|
|         of|    7|
|         in|    5|
|distributed|    5|
|      Spark|    4|
|        API|    3|
|         as|    3|
|        RDD|    3|
|         is|    3|
|    Dataset|    3|
|         on|    3|
|        its|    2|
|       that|    2|
|        The|    2|
|  MapReduce|    2|
|       API.|    2|
|        and|    2|
|   function|    2|
|    cluster|    2|
+-----------+-----+
only showing top 20 rows



In [173]:
result_textData.take(10)

[Row(word='the', count=12),
 Row(word='a', count=7),
 Row(word='of', count=7),
 Row(word='in', count=5),
 Row(word='distributed', count=5),
 Row(word='Spark', count=4),
 Row(word='RDD', count=3),
 Row(word='API', count=3),
 Row(word='as', count=3),
 Row(word='is', count=3)]

## Read CSV file into DataFrame

In [174]:
%%bash 
head -10 ./data/products.csv

your 131072x1 screen size is bogus. expect trouble
your 131072x1 screen size is bogus. expect trouble


id,name,category,quantity,price
1,iPhone 12,Electronics,10,899.99
2,Nike Air Max 90,Clothing,25,119.99
3,KitchenAid Stand Mixer,Home Appliances,5,299.99
4,The Great Gatsby,Books,50,12.99
5,L'Oreal Paris Mascara,Beauty,100,9.99
6,Yoga Mat,Sports,30,29.99
7,Samsung 4K Smart TV,Electronics,8,799.99
8,Levi's Jeans,Clothing,15,49.99
9,Dyson Vacuum Cleaner,Home Appliances,3,399.99


## Read CSV with header

In [175]:
# Read CSV file into DataFrame
csv_file_path = "./data/products.csv"
product_df = spark.read.csv(csv_file_path, header=True)

In [176]:
# Display schema of DataFrame
product_df.printSchema()

# Display content of DataFrame
product_df.show(5)

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- quantity: string (nullable = true)
 |-- price: string (nullable = true)

+---+--------------------+---------------+--------+------+
| id|                name|       category|quantity| price|
+---+--------------------+---------------+--------+------+
|  1|           iPhone 12|    Electronics|      10|899.99|
|  2|     Nike Air Max 90|       Clothing|      25|119.99|
|  3|KitchenAid Stand ...|Home Appliances|       5|299.99|
|  4|    The Great Gatsby|          Books|      50| 12.99|
|  5|L'Oreal Paris Mas...|         Beauty|     100|  9.99|
+---+--------------------+---------------+--------+------+
only showing top 5 rows



### Read CSV with an explicit schema definition

In [177]:
# import necessary types
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

In [178]:
# Define the schema
schema = StructType([
    StructField(name="id", dataType=IntegerType(), nullable=True),
    StructField(name="name", dataType=StringType(), nullable=True),
    StructField(name="category", dataType=StringType(), nullable=True),
    StructField(name="quantity", dataType=IntegerType(), nullable=True),
    StructField(name="price", dataType=DoubleType(), nullable=True)
])

# Read CSV file into DataFrame with schema definition
csv_file_path = "./data/products.csv"
product_df1 = spark.read.csv(csv_file_path, header=True, schema=schema)

In [179]:
# Display schema of DataFrame
product_df1.printSchema()

# Display content of DataFrame
product_df1.show(5)

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price: double (nullable = true)

+---+--------------------+---------------+--------+------+
| id|                name|       category|quantity| price|
+---+--------------------+---------------+--------+------+
|  1|           iPhone 12|    Electronics|      10|899.99|
|  2|     Nike Air Max 90|       Clothing|      25|119.99|
|  3|KitchenAid Stand ...|Home Appliances|       5|299.99|
|  4|    The Great Gatsby|          Books|      50| 12.99|
|  5|L'Oreal Paris Mas...|         Beauty|     100|  9.99|
+---+--------------------+---------------+--------+------+
only showing top 5 rows



#### Read CSV with using inferSchema Atomatically define schema type by using InferSchema=True

In [180]:
# Read CSV file into DataFrame with inferSchema
csv_file_path = "./data/products.csv"
product_df2 = spark.read.csv(csv_file_path, header=True, inferSchema=True)

In [181]:
# Display schema of DataFrame
product_df2.printSchema()
# Display content of DataFrame
product_df2.show(5)

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price: double (nullable = true)

+---+--------------------+---------------+--------+------+
| id|                name|       category|quantity| price|
+---+--------------------+---------------+--------+------+
|  1|           iPhone 12|    Electronics|      10|899.99|
|  2|     Nike Air Max 90|       Clothing|      25|119.99|
|  3|KitchenAid Stand ...|Home Appliances|       5|299.99|
|  4|    The Great Gatsby|          Books|      50| 12.99|
|  5|L'Oreal Paris Mas...|         Beauty|     100|  9.99|
+---+--------------------+---------------+--------+------+
only showing top 5 rows



### Read JSON file into DataFrame
### Single Line JSON

In [182]:
%%bash
head -15 ./data/products_singleline.json

your 131072x1 screen size is bogus. expect trouble
your 131072x1 screen size is bogus. expect trouble


{"id":1,"name":"iPhone 12","category":"Electronics","quantity":10,"price":899.99}
{"id":2,"name":"Nike Air Max 90","category":"Clothing","quantity":25,"price":119.99}
{"id":3,"name":"KitchenAid Stand Mixer","category":"Home Appliances","quantity":5,"price":299.99}
{"id":4,"name":"The Great Gatsby","category":"Books","quantity":50,"price":12.99}
{"id":5,"name":"L'Oreal Paris Mascara","category":"Beauty","quantity":100,"price":9.99}
{"id":6,"name":"Yoga Mat","category":"Sports","quantity":30,"price":29.99}
{"id":7,"name":"Samsung 4K Smart TV","category":"Electronics","quantity":8,"price":799.99}
{"id":8,"name":"Levi's Jeans","category":"Clothing","quantity":15,"price":49.99}
{"id":9,"name":"Dyson Vacuum Cleaner","category":"Home Appliances","quantity":3,"price":399.99}
{"id":10,"name":"Harry Potter Series","category":"Books","quantity":20,"price":15.99}
{"id":11,"name":"MAC Lipstick","category":"Beauty","quantity":75,"price":16.99}
{"id":12,"name":"Adidas Running Shoes","category":"Sport

In [183]:
# Read single line JSON
# Each row is a JSON record, records are separated by new line
json_file_path = "./data/products_singleline.json"
json_data_df = spark.read.json(json_file_path)

In [184]:
# Display schema of DataFrame
json_data_df.printSchema()
# Display content of DataFrame
json_data_df.show(15)

root
 |-- category: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- quantity: long (nullable = true)

+---------------+---+--------------------+------+--------+
|       category| id|                name| price|quantity|
+---------------+---+--------------------+------+--------+
|    Electronics|  1|           iPhone 12|899.99|      10|
|       Clothing|  2|     Nike Air Max 90|119.99|      25|
|Home Appliances|  3|KitchenAid Stand ...|299.99|       5|
|          Books|  4|    The Great Gatsby| 12.99|      50|
|         Beauty|  5|L'Oreal Paris Mas...|  9.99|     100|
|         Sports|  6|            Yoga Mat| 29.99|      30|
|    Electronics|  7| Samsung 4K Smart TV|799.99|       8|
|       Clothing|  8|        Levi's Jeans| 49.99|      15|
|Home Appliances|  9|Dyson Vacuum Cleaner|399.99|       3|
|          Books| 10| Harry Potter Series| 15.99|      20|
|         Beauty| 11|        MAC Lipstick| 1

## Multi-lines JSON and hass of Array

In [185]:
%%bash
head -20 ./data/products_multiline.json

your 131072x1 screen size is bogus. expect trouble
your 131072x1 screen size is bogus. expect trouble


[
  {
    "id": 1,
    "name": "iPhone 12",
    "category": "Electronics",
    "quantity": 10,
    "price": 899.99
  },
  {
    "id": 2,
    "name": "Nike Air Max 90",
    "category": "Clothing",
    "quantity": 25,
    "price": 119.99
  },
  {
    "id": 3,
    "name": "KitchenAid Stand Mixer",
    "category": "Home Appliances",
    "quantity": 5,


In [186]:
# Read multi-line JSON
# JSON is an array of record, records are separated by a comma.
# each record is defined in multiple lines
json_file_path = "./data/products_multiline.json"
json_data_df = spark.read.json(json_file_path, multiLine=True)

In [187]:
# Display schema of DataFrame
json_data_df.printSchema()
# Display content of DataFrame
json_data_df.show(5)

root
 |-- category: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- quantity: long (nullable = true)

+---------------+---+--------------------+------+--------+
|       category| id|                name| price|quantity|
+---------------+---+--------------------+------+--------+
|    Electronics|  1|           iPhone 12|899.99|      10|
|       Clothing|  2|     Nike Air Max 90|119.99|      25|
|Home Appliances|  3|KitchenAid Stand ...|299.99|       5|
|          Books|  4|    The Great Gatsby| 12.99|      50|
|         Beauty|  5|L'Oreal Paris Mas...|  9.99|     100|
+---------------+---+--------------------+------+--------+
only showing top 5 rows



## Save in Parquet file 

In [189]:
# write dataframe into parquet file First time execute second time will fail
parquet_file_path = "./data/products.parquet"
json_data_df.write.parquet(parquet_file_path)

## Read parquet file into DataFrame

In [190]:
#Reading the Parquet fiels 
parquet_df = spark.read.parquet(parquet_file_path)

In [191]:
# Display schema of DataFrame
parquet_df.printSchema()
# Display content of DataFrame
parquet_df.show(5)

root
 |-- category: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- quantity: long (nullable = true)

+---------------+---+--------------------+------+--------+
|       category| id|                name| price|quantity|
+---------------+---+--------------------+------+--------+
|    Electronics|  1|           iPhone 12|899.99|      10|
|       Clothing|  2|     Nike Air Max 90|119.99|      25|
|Home Appliances|  3|KitchenAid Stand ...|299.99|       5|
|          Books|  4|    The Great Gatsby| 12.99|      50|
|         Beauty|  5|L'Oreal Paris Mas...|  9.99|     100|
+---------------+---+--------------------+------+--------+
only showing top 5 rows



## 2. DataFrame Operation

In [192]:
%%bash
head -10 ./data/stocks.txt

your 131072x1 screen size is bogus. expect trouble
your 131072x1 screen size is bogus. expect trouble


id,name,category,quantity,price
1,iPhone,Electronics,10,899.99
2,Macbook,Electronics,5,1299.99
3,iPad,Electronics,15,499.99
4,Samsung TV,Electronics,8,799.99
5,LG TV,Electronics,10,699.99
6,Nike Shoes,Clothing,30,99.99
7,Adidas Shoes,Clothing,25,89.99
8,Sony Headphones,Electronics,12,149.99
9,Beats Headphones,Electronics,20,199.99


In [193]:
# Load the synthetic data into a DataFrame
data_file_path = "./data/stocks.txt"
stocks_data_df = spark.read.csv(data_file_path, header=True, inferSchema=True)

In [194]:
stocks_data_df.printSchema()
stocks_data_df.show(10)

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price: double (nullable = true)

+---+----------------+-----------+--------+-------+
| id|            name|   category|quantity|  price|
+---+----------------+-----------+--------+-------+
|  1|          iPhone|Electronics|      10| 899.99|
|  2|         Macbook|Electronics|       5|1299.99|
|  3|            iPad|Electronics|      15| 499.99|
|  4|      Samsung TV|Electronics|       8| 799.99|
|  5|           LG TV|Electronics|      10| 699.99|
|  6|      Nike Shoes|   Clothing|      30|  99.99|
|  7|    Adidas Shoes|   Clothing|      25|  89.99|
|  8| Sony Headphones|Electronics|      12| 149.99|
|  9|Beats Headphones|Electronics|      20| 199.99|
| 10|    Dining Table|  Furniture|      10| 249.99|
+---+----------------+-----------+--------+-------+
only showing top 10 rows



### Select: Choose specific columns.

In [195]:
# Select specific columns 
selected_columns = stocks_data_df.select("id", "name", "price")
print("Selected Columns:")
selected_columns.show(10)

Selected Columns:
+---+----------------+-------+
| id|            name|  price|
+---+----------------+-------+
|  1|          iPhone| 899.99|
|  2|         Macbook|1299.99|
|  3|            iPad| 499.99|
|  4|      Samsung TV| 799.99|
|  5|           LG TV| 699.99|
|  6|      Nike Shoes|  99.99|
|  7|    Adidas Shoes|  89.99|
|  8| Sony Headphones| 149.99|
|  9|Beats Headphones| 199.99|
| 10|    Dining Table| 249.99|
+---+----------------+-------+
only showing top 10 rows



### Filter: Apply conditions to filter rows.

In [196]:
# Filter rows based on a condition
filtered_data = stocks_data_df.filter(stocks_data_df.quantity > 15)
print("Filtered Data:", filtered_data.count())
filtered_data.show()

Filtered Data: 14
+---+----------------+-----------+--------+------+
| id|            name|   category|quantity| price|
+---+----------------+-----------+--------+------+
|  6|      Nike Shoes|   Clothing|      30| 99.99|
|  7|    Adidas Shoes|   Clothing|      25| 89.99|
|  9|Beats Headphones|Electronics|      20|199.99|
| 12|          Apples|       Food|     100|   0.5|
| 13|         Bananas|       Food|     150|  0.25|
| 14|         Oranges|       Food|     120|  0.75|
| 15|  Chicken Breast|       Food|      50|  3.99|
| 16|   Salmon Fillet|       Food|      30|  5.99|
| 19|        Yoga Mat|     Sports|      20| 19.99|
| 24|      Laptop Bag|Accessories|      25| 29.99|
| 25|        Backpack|Accessories|      30| 24.99|
| 28|           Jeans|   Clothing|      30| 59.99|
| 29|         T-shirt|   Clothing|      50| 14.99|
| 30|        Sneakers|   Clothing|      40| 79.99|
+---+----------------+-----------+--------+------+



**GroupBy: Group data based on specific columns**

**Aggregations: Perform functions like sum, average, etc., on grouped data**

In [197]:
# GroupBy and Aggregations
grouped_data = stocks_data_df.groupBy("category").agg({"quantity": "sum", "price": "avg"})
print("Grouped and Aggregated Data:")
grouped_data.show()

print("Count Row no:",grouped_data.count())

Grouped and Aggregated Data:
+-----------+-------------+------------------+
|   category|sum(quantity)|        avg(price)|
+-----------+-------------+------------------+
|Electronics|           98| 586.6566666666665|
|       Food|          450|2.2960000000000003|
|   Clothing|          200|  99.2757142857143|
|  Furniture|           41|            141.99|
|     Sports|           35|             34.99|
|Accessories|           55|             27.49|
+-----------+-------------+------------------+

Count Row no: 6


### Join: Combine multiple DataFrames based on specified columns.

In [198]:
# Join with another DataFrame
df2 = stocks_data_df.select("id", "category").limit(15)
joined_data = stocks_data_df.join(df2, "id", "inner")
print("Joined Data:")
joined_data.show()

Joined Data:
+---+----------------+-----------+--------+-------+-----------+
| id|            name|   category|quantity|  price|   category|
+---+----------------+-----------+--------+-------+-----------+
|  1|          iPhone|Electronics|      10| 899.99|Electronics|
|  2|         Macbook|Electronics|       5|1299.99|Electronics|
|  3|            iPad|Electronics|      15| 499.99|Electronics|
|  4|      Samsung TV|Electronics|       8| 799.99|Electronics|
|  5|           LG TV|Electronics|      10| 699.99|Electronics|
|  6|      Nike Shoes|   Clothing|      30|  99.99|   Clothing|
|  7|    Adidas Shoes|   Clothing|      25|  89.99|   Clothing|
|  8| Sony Headphones|Electronics|      12| 149.99|Electronics|
|  9|Beats Headphones|Electronics|      20| 199.99|Electronics|
| 10|    Dining Table|  Furniture|      10| 249.99|  Furniture|
| 11|      Study Desk|  Furniture|       8| 149.99|  Furniture|
| 12|          Apples|       Food|     100|    0.5|       Food|
| 13|         Bananas|     

### Sort: Arrange rows based on one or more columns.

In [199]:
# Sort by a column
sorted_data = stocks_data_df.orderBy("price")
print("Sorted Data:")
sorted_data.show(10)

Sorted Data:
+---+--------------+-----------+--------+-----+
| id|          name|   category|quantity|price|
+---+--------------+-----------+--------+-----+
| 13|       Bananas|       Food|     150| 0.25|
| 12|        Apples|       Food|     100|  0.5|
| 14|       Oranges|       Food|     120| 0.75|
| 15|Chicken Breast|       Food|      50| 3.99|
| 16| Salmon Fillet|       Food|      30| 5.99|
| 29|       T-shirt|   Clothing|      50|14.99|
| 19|      Yoga Mat|     Sports|      20|19.99|
| 25|      Backpack|Accessories|      30|24.99|
| 24|    Laptop Bag|Accessories|      25|29.99|
| 20|  Dumbbell Set|     Sports|      15|49.99|
+---+--------------+-----------+--------+-----+
only showing top 10 rows



In [200]:
# Sort by a column desc
from pyspark.sql.functions import col, desc
sorted_data = stocks_data_df.orderBy(col("price").desc(), col("id").desc())
print("Sorted Data Descending:")
sorted_data.show(10)

Sorted Data Descending:
+---+----------------+-----------+--------+-------+
| id|            name|   category|quantity|  price|
+---+----------------+-----------+--------+-------+
|  2|         Macbook|Electronics|       5|1299.99|
|  1|          iPhone|Electronics|      10| 899.99|
|  4|      Samsung TV|Electronics|       8| 799.99|
|  5|           LG TV|Electronics|      10| 699.99|
| 26|          Camera|Electronics|      10| 599.99|
|  3|            iPad|Electronics|      15| 499.99|
| 10|    Dining Table|  Furniture|      10| 249.99|
| 17|  Leather Jacket|   Clothing|      15| 199.99|
|  9|Beats Headphones|Electronics|      20| 199.99|
| 18|     Winter Coat|   Clothing|      10| 149.99|
+---+----------------+-----------+--------+-------+
only showing top 10 rows



## Distinct: Get unique rows.

In [201]:
# Get distinct product category
distinct_rows = stocks_data_df.select("category").distinct()
print("Distinct Product Categories:")
distinct_rows.show()

Distinct Product Categories:
+-----------+
|   category|
+-----------+
|Electronics|
|       Food|
|   Clothing|
|  Furniture|
|     Sports|
|Accessories|
+-----------+



### Drop: Remove specified columns.

In [202]:
# Drop columns
dropped_columns = stocks_data_df.drop("quantity", "category")
print("Dropped Columns:")
dropped_columns.show(10)

Dropped Columns:
+---+----------------+-------+
| id|            name|  price|
+---+----------------+-------+
|  1|          iPhone| 899.99|
|  2|         Macbook|1299.99|
|  3|            iPad| 499.99|
|  4|      Samsung TV| 799.99|
|  5|           LG TV| 699.99|
|  6|      Nike Shoes|  99.99|
|  7|    Adidas Shoes|  89.99|
|  8| Sony Headphones| 149.99|
|  9|Beats Headphones| 199.99|
| 10|    Dining Table| 249.99|
+---+----------------+-------+
only showing top 10 rows



## WithColumn: Add new calculated columns.

In [203]:
# Add a new calculated column
df_with_new_column = stocks_data_df.withColumn("revenue", stocks_data_df.quantity * stocks_data_df.price)
print("DataFrame with New Column:")
df_with_new_column.show(10)

DataFrame with New Column:
+---+----------------+-----------+--------+-------+-------+
| id|            name|   category|quantity|  price|revenue|
+---+----------------+-----------+--------+-------+-------+
|  1|          iPhone|Electronics|      10| 899.99| 8999.9|
|  2|         Macbook|Electronics|       5|1299.99|6499.95|
|  3|            iPad|Electronics|      15| 499.99|7499.85|
|  4|      Samsung TV|Electronics|       8| 799.99|6399.92|
|  5|           LG TV|Electronics|      10| 699.99| 6999.9|
|  6|      Nike Shoes|   Clothing|      30|  99.99| 2999.7|
|  7|    Adidas Shoes|   Clothing|      25|  89.99|2249.75|
|  8| Sony Headphones|Electronics|      12| 149.99|1799.88|
|  9|Beats Headphones|Electronics|      20| 199.99| 3999.8|
| 10|    Dining Table|  Furniture|      10| 249.99| 2499.9|
+---+----------------+-----------+--------+-------+-------+
only showing top 10 rows



### Alias: Rename columns for better readability.

In [204]:
# Rename columns using alias
df_with_alias = stocks_data_df.withColumnRenamed("price", "product_price").withColumnRenamed('category', 'Category_Name')
print("DataFrame with Aliased Column:")
df_with_alias.show(10)

DataFrame with Aliased Column:
+---+----------------+-------------+--------+-------------+
| id|            name|Category_Name|quantity|product_price|
+---+----------------+-------------+--------+-------------+
|  1|          iPhone|  Electronics|      10|       899.99|
|  2|         Macbook|  Electronics|       5|      1299.99|
|  3|            iPad|  Electronics|      15|       499.99|
|  4|      Samsung TV|  Electronics|       8|       799.99|
|  5|           LG TV|  Electronics|      10|       699.99|
|  6|      Nike Shoes|     Clothing|      30|        99.99|
|  7|    Adidas Shoes|     Clothing|      25|        89.99|
|  8| Sony Headphones|  Electronics|      12|       149.99|
|  9|Beats Headphones|  Electronics|      20|       199.99|
| 10|    Dining Table|    Furniture|      10|       249.99|
+---+----------------+-------------+--------+-------------+
only showing top 10 rows



## 3. Spark SQL Operation 

In [205]:
%%bash
head -10 ./data/persons.csv

your 131072x1 screen size is bogus. expect trouble
your 131072x1 screen size is bogus. expect trouble


name,age,gender,salary
John Doe,30,Male,50000
Jane Smith,25,Female,45000
David Johnson,35,Male,60000
Emily Davis,28,Female,52000
Michael Wilson,40,Male,75000
Sarah Brown,32,Female,58000
Robert Lee,29,Male,51000
Lisa Garcia,27,Female,49000
James Martinez,38,Male,70000


In [206]:
# Load the synthetic data into a DataFrame
data_file_path = "./data/persons.csv"
person_data_df = spark.read.csv(data_file_path, header=True, inferSchema=True)

In [207]:
# Display schema of DataFrame
person_data_df.printSchema()
# Show the initial DataFrame
print("Initial DataFrame:")
person_data_df.show(10)

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

Initial DataFrame:
+------------------+---+------+------+
|              name|age|gender|salary|
+------------------+---+------+------+
|          John Doe| 30|  Male| 50000|
|        Jane Smith| 25|Female| 45000|
|     David Johnson| 35|  Male| 60000|
|       Emily Davis| 28|Female| 52000|
|    Michael Wilson| 40|  Male| 75000|
|       Sarah Brown| 32|Female| 58000|
|        Robert Lee| 29|  Male| 51000|
|       Lisa Garcia| 27|Female| 49000|
|    James Martinez| 38|  Male| 70000|
|Jennifer Rodriguez| 26|Female| 47000|
+------------------+---+------+------+
only showing top 10 rows



## Register the DataFrame as a Temporary Table

In [208]:
# Register the DataFrame as a Temporary Table
person_data_df.createOrReplaceTempView("person_table")

## Now, We Can Perform SQL-like Queries

In [209]:
# Select all rows where age is greater than 25
result = spark.sql("SELECT * FROM person_table WHERE age > 25")
result.show()

+------------------+---+------+------+
|              name|age|gender|salary|
+------------------+---+------+------+
|          John Doe| 30|  Male| 50000|
|     David Johnson| 35|  Male| 60000|
|       Emily Davis| 28|Female| 52000|
|    Michael Wilson| 40|  Male| 75000|
|       Sarah Brown| 32|Female| 58000|
|        Robert Lee| 29|  Male| 51000|
|       Lisa Garcia| 27|Female| 49000|
|    James Martinez| 38|  Male| 70000|
|Jennifer Rodriguez| 26|Female| 47000|
|  William Anderson| 33|  Male| 62000|
|   Karen Hernandez| 31|Female| 55000|
|Christopher Taylor| 37|  Male| 69000|
|     Matthew Davis| 36|  Male| 67000|
|    Patricia White| 29|Female| 50000|
|     Daniel Miller| 34|  Male| 64000|
| Elizabeth Jackson| 30|Female| 52000|
|     Joseph Harris| 28|  Male| 53000|
|      Linda Martin| 39|Female| 71000|
+------------------+---+------+------+



In [210]:
# Compute the average salary by gender
avg_salary_by_gender = spark.sql("SELECT gender, AVG(salary) as avg_salary FROM person_table GROUP BY gender")
avg_salary_by_gender.show()

+------+----------+
|gender|avg_salary|
+------+----------+
|Female|   52300.0|
|  Male|   62100.0|
+------+----------+



## Creating and managing temporary views.

In [211]:
# Create a temporary view
person_data_df.createOrReplaceTempView("people")

In [213]:
# Query the temporary view
result = spark.sql("SELECT * FROM people WHERE age > 30")
result.show()

+------------------+---+------+------+
|              name|age|gender|salary|
+------------------+---+------+------+
|     David Johnson| 35|  Male| 60000|
|    Michael Wilson| 40|  Male| 75000|
|       Sarah Brown| 32|Female| 58000|
|    James Martinez| 38|  Male| 70000|
|  William Anderson| 33|  Male| 62000|
|   Karen Hernandez| 31|Female| 55000|
|Christopher Taylor| 37|  Male| 69000|
|     Matthew Davis| 36|  Male| 67000|
|     Daniel Miller| 34|  Male| 64000|
|      Linda Martin| 39|Female| 71000|
+------------------+---+------+------+



In [214]:
# Check if a temporary view exists
view_exists = spark.catalog.tableExists("people")
view_exists

True

In [215]:
# Drop a temporary view
spark.catalog.dropTempView("people")

True

In [216]:
# Check if a temporary view exists
view_exists = spark.catalog.tableExists("people")
view_exists

False

## SQL Subquries

In [217]:
# Create DataFrames
employee_data = [
    (1, "John"), (2, "Alice"), (3, "Bob"), (4, "Emily"),
    (5, "David"), (6, "Sarah"), (7, "Michael"), (8, "Lisa"),
    (9, "William")
]
employees = spark.createDataFrame(employee_data, ["id", "name"])

salary_data = [
    ("HR", 1, 60000), ("HR", 2, 55000), ("HR", 3, 58000),
    ("IT", 4, 70000), ("IT", 5, 72000), ("IT", 6, 68000),
    ("Sales", 7, 75000), ("Sales", 8, 78000), ("Sales", 9, 77000)
]
salaries = spark.createDataFrame(salary_data, ["department", "id", "salary"])

employees.show()

salaries.show()

+---+-------+
| id|   name|
+---+-------+
|  1|   John|
|  2|  Alice|
|  3|    Bob|
|  4|  Emily|
|  5|  David|
|  6|  Sarah|
|  7|Michael|
|  8|   Lisa|
|  9|William|
+---+-------+

+----------+---+------+
|department| id|salary|
+----------+---+------+
|        HR|  1| 60000|
|        HR|  2| 55000|
|        HR|  3| 58000|
|        IT|  4| 70000|
|        IT|  5| 72000|
|        IT|  6| 68000|
|     Sales|  7| 75000|
|     Sales|  8| 78000|
|     Sales|  9| 77000|
+----------+---+------+



In [218]:
# Register as temporary views
employees.createOrReplaceTempView("employees")
salaries.createOrReplaceTempView("salaries")

In [220]:
# Subquery to find employees with salaries above average
result = spark.sql("""
    SELECT name
    FROM employees
    WHERE id IN (
        SELECT id
        FROM salaries
        WHERE salary > (SELECT AVG(salary) FROM salaries)
    )
""")
result.show()

+-------+
|   name|
+-------+
|  Emily|
|  David|
|Michael|
|   Lisa|
|William|
+-------+



### Working with Window Functions in PySpark

#### Reference
https://www.analyticsvidhya.com/blog/2024/03/working-with-window-functions-in-pyspark/#:~:text=Approach%20for%20PySpark%20code,sal%E2%80%9D%20column%20in%20descending%20order.

In [221]:
from pyspark.sql.window import Window
from pyspark.sql import functions as F

In [222]:
employee_salary = spark.sql("""
    select  salaries.*, employees.name
    from salaries 
    left join employees on salaries.id = employees.id
""")

employee_salary.show()

+----------+---+------+-------+
|department| id|salary|   name|
+----------+---+------+-------+
|        HR|  1| 60000|   John|
|        HR|  2| 55000|  Alice|
|        HR|  3| 58000|    Bob|
|        IT|  4| 70000|  Emily|
|        IT|  5| 72000|  David|
|        IT|  6| 68000|  Sarah|
|     Sales|  7| 75000|Michael|
|     Sales|  8| 78000|   Lisa|
|     Sales|  9| 77000|William|
+----------+---+------+-------+



In [230]:
# Create a window specification
window_spec = Window.partitionBy("department").orderBy(F.desc("salary"))

In [231]:
# Calculate the rank of employees within each department based on salary
employee_salary.withColumn("rank", F.rank().over(window_spec)).show()

+----------+---+------+-------+----+
|department| id|salary|   name|rank|
+----------+---+------+-------+----+
|        HR|  1| 60000|   John|   1|
|        HR|  3| 58000|    Bob|   2|
|        HR|  2| 55000|  Alice|   3|
|        IT|  5| 72000|  David|   1|
|        IT|  4| 70000|  Emily|   2|
|        IT|  6| 68000|  Sarah|   3|
|     Sales|  8| 78000|   Lisa|   1|
|     Sales|  9| 77000|William|   2|
|     Sales|  7| 75000|Michael|   3|
+----------+---+------+-------+----+



In [232]:
# Stop the SparkSession
spark.stop()

### Reference

### pySpark Tutorials 

https://sparkbyexamples.com/pyspark/pyspark-read-csv-file-into-dataframe/

https://medium.com/codex/pyspark-for-begineers-part-2-pyspark-dataframe-60008da53e30

https://www.tutorialspoint.com/pyspark/pyspark_sparkcontext.htm

https://blog.devgenius.io/pyspark-for-begineers-part-3-pyspark-dataframe-db02f0fcd275


### Big Data Hadoop Setup & Configuration and Command Line Data processing 

https://medium.com/@jonty2245/install-apache-hadoop-on-windows-11-a-beginners-guide-45a149f47f8a

https://intellipaat.com/blog/tutorial/hadoop-tutorial/hdfs-operations/

https://www.projectpro.io/hadoop-tutorial/hadoop-hdfs-commands

https://www.geeksforgeeks.org/hdfs-commands/

https://medium.com/@ashwin_kumar_/hadoop-hdfs-commands-with-examples-and-usage-570038cbef07
