In [None]:
import pyspark

In [None]:
# Importing the Libraries
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Creating a SparkContext
sc = SparkContext.getOrCreate()

# Create a SparkSession
spark = SparkSession.builder.appName('PySpark DataFrame From RDD').getOrCreate()

# Create an RDD
rdd = sc.parallelize([('C',85,76,87,91), ('B',85,76,87,91), ("A", 85,78,96,92), ("A", 92,76,89,96)], 4)

print(type(rdd))

# Converting the RDD into PySpark DataFrame
# Define schema
sub = ['Division','English','Mathematics','Physics','Chemistry']

# Create DataFrame
marks_df = spark.createDataFrame(rdd, schema=sub)

# Output the type and content of DataFrame
print(type(marks_df))
marks_df.show()


<class 'pyspark.rdd.RDD'>
<class 'pyspark.sql.dataframe.DataFrame'>
+--------+-------+-----------+-------+---------+
|Division|English|Mathematics|Physics|Chemistry|
+--------+-------+-----------+-------+---------+
|       C|     85|         76|     87|       91|
|       B|     85|         76|     87|       91|
|       A|     85|         78|     96|       92|
|       A|     92|         76|     89|       96|
+--------+-------+-----------+-------+---------+



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

# Start Spark session
spark = SparkSession.builder.appName("Life Stage Classification").getOrCreate()

# Create DataFrame
df = spark.createDataFrame(
    [
        ("sue", 32),
        ("lil", 3),
        ("bob", 75),
        ("heo", 13)
    ],
    ["first_name", "age"]
)
# Show result
df1.show()



+----------+---+----------+
|first_name|age|life_stage|
+----------+---+----------+
|       sue| 32|     adult|
|       lil|  3|     child|
|       bob| 75|     adult|
|       heo| 13|  teenager|
+----------+---+----------+



In [None]:
# Add 'life_stage' column based on age
df1 = df.withColumn(
    "life_stage",
    when(col("age") < 13, "child")
    .when(col("age").between(13, 19), "teenager")
    .otherwise("adult")
)

# Show result
df1.show()

+----------+---+----------+
|first_name|age|life_stage|
+----------+---+----------+
|       sue| 32|     adult|
|       lil|  3|     child|
|       bob| 75|     adult|
|       heo| 13|  teenager|
+----------+---+----------+



In [None]:
from pyspark.sql import SparkSession

# Create SparkSession
spark = SparkSession \
    .builder \
    .appName("Python Spark create RDD example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

# Create DataFrame
Employee = spark.createDataFrame([
    ("1", "Joe", "70000", "1"),
    ("2", "Henry", "80000", "2"),
    ("3", "Sam", "60000", "2"),
    ("4", "Max", "90000", "1")
], ["Id", "Name", "Salary", "DepartmentId"])

# Show the DataFrame
Employee.show()



+---+-----+------+------------+
| Id| Name|Salary|DepartmentId|
+---+-----+------+------------+
|  1|  Joe| 70000|           1|
|  2|Henry| 80000|           2|
|  3|  Sam| 60000|           2|
|  4|  Max| 90000|           1|
+---+-----+------+------------+



**practice**

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Create SparkSession
spark = SparkSession.builder.appName("FileDFs").getOrCreate()

# Define schema
txt_schema = StructType([
    StructField("Name", StringType(), True),
    StructField("Age", IntegerType(), True)
])

# Read txt file (comma-separated)
df_txt = spark.read.option("delimiter", ",").schema(txt_schema).csv("data.txt")

# Print schema and show
df_txt.printSchema()
df_txt.show()



root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)

+-------+----+
|   Name| Age|
+-------+----+
|  Johan|NULL|
|   Amar|NULL|
|Pavitra|NULL|
|   Raja|NULL|
+-------+----+



In [None]:
df_csv = spark.read.option("header", True).csv("data.csv", inferSchema=True)

# Print schema and show
df_csv.printSchema()
df_csv.show()


root
 |-- Name,Age: string (nullable = true)

+--------+
|Name,Age|
+--------+
| John,25|
|Alice,30|
|  Bob,22|
+--------+



In [None]:
df_json = spark.read.json("data.json")

# Print schema and show
df_json.printSchema()
df_json.show()


root
 |-- Age: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- _corrupt_record: string (nullable = true)

+----+-------+---------------+
| Age|   Name|_corrupt_record|
+----+-------+---------------+
|NULL|   NULL|              [|
|  25|  Ramar|           NULL|
|  30|Amritha|           NULL|
|  22|   Babu|           NULL|
|NULL|   NULL|              ]|
+----+-------+---------------+



In [None]:
# For CSV
print("CSV Schema:")
df_csv.printSchema()

# For JSON
print("JSON Schema:")
df_json.printSchema()

# For TXT
print("TXT Schema:")
df_txt.printSchema()

# Or check with dtypes
print("CSV types:", df_csv.dtypes)
print("JSON types:", df_json.dtypes)
print("TXT types:", df_txt.dtypes)


CSV Schema:
root
 |-- Name,Age: string (nullable = true)

JSON Schema:
root
 |-- Age: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- _corrupt_record: string (nullable = true)

TXT Schema:
root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)

CSV types: [('Name,Age', 'string')]
JSON types: [('Age', 'bigint'), ('Name', 'string'), ('_corrupt_record', 'string')]
TXT types: [('Name', 'string'), ('Age', 'int')]


**Reading External Files into PySpark DataFrames (CSV, TXT, JSON)**

In [None]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("PySpark DataFrame From External Files").getOrCreate()
csv_file = spark.read.csv('/content/fish.csv', sep=',', inferSchema=True, header=True)
txt_file = spark.read.text("/content/example.txt")
json_file = spark.read.json("/content/sample.json", multiLine=True)
print("Type of csv_file:", type(csv_file))
print("Type of txt_file:", type(txt_file))
print("Type of json_file:", type(json_file))
print("Schema of csv_file:")
csv_file.printSchema()
print("Schema of txt_file:")
txt_file.printSchema()
print("Schema of json_file:")
json_file.printSchema()


Type of csv_file: <class 'pyspark.sql.dataframe.DataFrame'>
Type of txt_file: <class 'pyspark.sql.dataframe.DataFrame'>
Type of json_file: <class 'pyspark.sql.dataframe.DataFrame'>
Schema of csv_file:
root
 |-- Species: string (nullable = true)
 |-- Weight: integer (nullable = true)
 |-- Length1: double (nullable = true)
 |-- Length2: double (nullable = true)
 |-- Length3: double (nullable = true)
 |-- Height: double (nullable = true)
 |-- Width: double (nullable = true)

Schema of txt_file:
root
 |-- value: string (nullable = true)

Schema of json_file:
root
 |-- age: long (nullable = true)
 |-- city: string (nullable = true)
 |-- name: string (nullable = true)



In [None]:
df = spark.read.text("/content/example.txt")
df.selectExpr("split(value, ' ') as\
Text_Data_In_Rows_Using_Text").show(4,False)

+-----------------------------------------------------+
|asText_Data_In_Rows_Using_Text                       |
+-----------------------------------------------------+
|[This, is, an, example, line, from, the, text, file.]|
|[Another, line, follows.]                            |
|[PySpark, is, great, for, processing, big, data.]    |
+-----------------------------------------------------+



**Creating and Modifying a DataFrame**

In [None]:
# Importing necessary modules
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit

# Creating SparkSession and giving an app name
spark = SparkSession.builder.appName('sparkdf').getOrCreate()

# List of employee data
data = [
    ["1", "sravan", "company 1"],
    ["2", "ojaswi", "company 1"],
    ["3", "rohith", "company 2"],
    ["4", "sridevi", "company 1"],
    ["5", "bobby", "company 1"]
]

# Specifying column names
columns = ['ID', 'NAME', 'Company']

# Creating a DataFrame from the list of data
df = spark.createDataFrame(data, columns)

# Adding a new column 'salary' with a constant value
df.withColumn("salary", lit(30000)).show()


+---+-------+---------+------+
| ID|   NAME|  Company|salary|
+---+-------+---------+------+
|  1| sravan|company 1| 30000|
|  2| ojaswi|company 1| 30000|
|  3| rohith|company 2| 30000|
|  4|sridevi|company 1| 30000|
|  5|  bobby|company 1| 30000|
+---+-------+---------+------+



**Transfrmation & actions**

Action

In [None]:
from pyspark import SparkContext
from operator import add

# Initialize SparkContext
sc = SparkContext("local", "RDD Actions Example")

# Create RDD
rdd = sc.parallelize([1, 2, 3, 4, 5])

# 1. collect() - Returns all elements as a list
print("collect():", rdd.collect())

# 2. count() - Returns the number of elements in the RDD
print("count():", rdd.count())

# 3. first() - Returns the first element of the RDD
print("first():", rdd.first())

# 4. reduce() - Aggregates the elements using a function
print("reduce():", rdd.reduce(add))  # Same as sum of elements

# 5. take(n) - Returns the first 'n' elements
print("take(3):", rdd.take(3))

# 6. saveAsTextFile(path) - Saves the RDD to a text file (each element on a new line)
rdd.saveAsTextFile("output_rdd")
print("RDD saved successfully to 'output_rdd' directory.")

# Stop the SparkContext
sc.stop()


collect(): [1, 2, 3, 4, 5]
count(): 5
first(): 1
reduce(): 15
take(3): [1, 2, 3]
RDD saved successfully to 'output_rdd' directory.


**Spark SQL**

In [None]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Fish CSV Reader").getOrCreate()
df = spark.read.option("header", True)	.csv("/content/fish.csv")
df.printSchema()
df.show()

root
 |-- Species: string (nullable = true)
 |-- Weight: string (nullable = true)
 |-- Length1: string (nullable = true)
 |-- Length2: string (nullable = true)
 |-- Length3: string (nullable = true)
 |-- Height: string (nullable = true)
 |-- Width: string (nullable = true)

+-------+------+-------+-------+-------+------+-----+
|Species|Weight|Length1|Length2|Length3|Height|Width|
+-------+------+-------+-------+-------+------+-----+
|  Bream|   242|   23.2|   25.4|   30.0| 11.52| 4.02|
|  Bream|   290|   24.0|   26.3|   31.2| 12.48| 4.30|
|  Roach|   120|   20.0|   22.0|   24.0|  8.02| 3.40|
|  Smelt|    12|    9.5|   10.5|   11.0|  2.00| 1.00|
|   Pike|   500|   35.0|   37.3|   40.0| 15.50| 6.70|
+-------+------+-------+-------+-------+------+-----+



ETL

# ETL stands for

# Extract: extract the data from the different sources

# Transform: Transform the unstructured data into structured data. Transformations like cleaning, manipulation, etc.

# Load : Load the transformed data into a location or date warehouse.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat, lit, floor, rand

spark = SparkSession.builder.appName("ETLPractice").getOrCreate()

source_path = "/content/orders (1).csv"  # Correct path
target_path = "/content/order_result.csv"  # Output path

# Use the correct source path here
load_data = spark.read.csv(source_path, header=True, inferSchema=True)


In [None]:
load_data.columns
load_data.show(5)

+-------+----------+----------+----------+-----------+
|cust_id|cust_fname|cust_lname|cust_order|cust_status|
+-------+----------+----------+----------+-----------+
|      1|      john|       doe|         5|     active|
|      2|      jane|     smith|         8|     active|
|      3|   micheal|   jhonson|         3|   inactive|
|      4|      abhi|   wiliams|         1|     active|
|      5|       ram|     brown|         4|   inactive|
+-------+----------+----------+----------+-----------+
only showing top 5 rows



In [None]:
 #Transformation 1: Concatenate First and Last Names
load_data = load_data.withColumn('full_name', concat(col('cust_fname'), lit(' '), col('cust_lname')))
load_data.show(10)

+-------+----------+----------+----------+-----------+---------------+
|cust_id|cust_fname|cust_lname|cust_order|cust_status|      full_name|
+-------+----------+----------+----------+-----------+---------------+
|      1|      john|       doe|         5|     active|       john doe|
|      2|      jane|     smith|         8|     active|     jane smith|
|      3|   micheal|   jhonson|         3|   inactive|micheal jhonson|
|      4|      abhi|   wiliams|         1|     active|   abhi wiliams|
|      5|       ram|     brown|         4|   inactive|      ram brown|
|      6|     emily|  anderson|         2|     active| emily anderson|
|      7|   william|     jones|        10|     active|  william jones|
|      8|     susan|     davis|         7|   inactive|    susan davis|
|      9|     david|    miller|         9|     active|   david miller|
|     10|      sara|     moore|         2|   inactive|     sara moore|
+-------+----------+----------+----------+-----------+---------------+
only s

In [None]:
# Transformation 2: Calculate Net Salary (subtract 10% as taxes)
load_data = load_data.withColumn('net_salary', floor(lit(10000) + rand() * lit(50)))
load_data.show(10)

+-------+----------+----------+----------+-----------+---------------+----------+
|cust_id|cust_fname|cust_lname|cust_order|cust_status|      full_name|net_salary|
+-------+----------+----------+----------+-----------+---------------+----------+
|      1|      john|       doe|         5|     active|       john doe|     10018|
|      2|      jane|     smith|         8|     active|     jane smith|     10044|
|      3|   micheal|   jhonson|         3|   inactive|micheal jhonson|     10006|
|      4|      abhi|   wiliams|         1|     active|   abhi wiliams|     10005|
|      5|       ram|     brown|         4|   inactive|      ram brown|     10033|
|      6|     emily|  anderson|         2|     active| emily anderson|     10002|
|      7|   william|     jones|        10|     active|  william jones|     10035|
|      8|     susan|     davis|         7|   inactive|    susan davis|     10005|
|      9|     david|    miller|         9|     active|   david miller|     10033|
|     10|      s

In [None]:
#adding age column
load_data = load_data.withColumn('age', floor(lit(20) + rand() * lit(31)))
load_data.show(10)

+-------+----------+----------+----------+-----------+---------------+----------+---+
|cust_id|cust_fname|cust_lname|cust_order|cust_status|      full_name|net_salary|age|
+-------+----------+----------+----------+-----------+---------------+----------+---+
|      1|      john|       doe|         5|     active|       john doe|     10018| 45|
|      2|      jane|     smith|         8|     active|     jane smith|     10044| 41|
|      3|   micheal|   jhonson|         3|   inactive|micheal jhonson|     10006| 20|
|      4|      abhi|   wiliams|         1|     active|   abhi wiliams|     10005| 22|
|      5|       ram|     brown|         4|   inactive|      ram brown|     10033| 42|
|      6|     emily|  anderson|         2|     active| emily anderson|     10002| 34|
|      7|   william|     jones|        10|     active|  william jones|     10035| 33|
|      8|     susan|     davis|         7|   inactive|    susan davis|     10005| 31|
|      9|     david|    miller|         9|     active|

In [None]:
# # Transformation 3: Filter by Age (age >= 30)
load_data = load_data.filter(col('age')>= 30)
load_data.show()


+-------+----------+----------+----------+-----------+---------------+----------+---+
|cust_id|cust_fname|cust_lname|cust_order|cust_status|      full_name|net_salary|age|
+-------+----------+----------+----------+-----------+---------------+----------+---+
|      1|      john|       doe|         5|     active|       john doe|     10018| 45|
|      2|      jane|     smith|         8|     active|     jane smith|     10044| 41|
|      5|       ram|     brown|         4|   inactive|      ram brown|     10033| 42|
|      6|     emily|  anderson|         2|     active| emily anderson|     10002| 34|
|      7|   william|     jones|        10|     active|  william jones|     10035| 33|
|      8|     susan|     davis|         7|   inactive|    susan davis|     10005| 31|
|      9|     david|    miller|         9|     active|   david miller|     10033| 33|
|     11|     james|    tailor|         5|   inactive|   james tailor|     10032| 35|
|     13|    robert|     evans|        11|     active|

In [None]:
# Transformation 4: Group by Age and Calculate Average Salary
avg_salary_by_age = load_data.groupBy('age').agg({'net_salary' :'avg'}).withColumnRenamed('avg(salary)', 'avg_salary')
avg_salary_by_age.show()

+---+------------------+
|age|   avg(net_salary)|
+---+------------------+
| 34|           10023.0|
| 32|           10034.0|
| 31|           10008.0|
| 41|           10044.0|
| 33|10033.333333333334|
| 35|           10032.0|
| 42|           10033.0|
| 45|           10017.0|
| 47|           10026.5|
+---+------------------+



In [None]:
load_data = load_data.orderBy("age")
load_data.show()

+-------+----------+----------+----------+-----------+---------------+----------+---+
|cust_id|cust_fname|cust_lname|cust_order|cust_status|      full_name|net_salary|age|
+-------+----------+----------+----------+-----------+---------------+----------+---+
|      8|     susan|     davis|         7|   inactive|    susan davis|     10005| 31|
|     13|    robert|     evans|        11|     active|   robert evans|     10011| 31|
|     19|chrisopher|      basa|         8|   inactive|chrisopher basa|     10034| 32|
|      7|   william|     jones|        10|     active|  william jones|     10035| 33|
|      9|     david|    miller|         9|     active|   david miller|     10033| 33|
|     20|       ava|    joesph|         3|     active|     ava joesph|     10032| 33|
|      6|     emily|  anderson|         2|     active| emily anderson|     10002| 34|
|     16|  isabella|     white|         6|   inactive| isabella white|     10044| 34|
|     11|     james|    tailor|         5|   inactive|

In [None]:
# Save the transformed data to an external CSV file
load_data.write.csv(target_path, mode='overwrite', header=True)

**SparkContext**

In [None]:
# Create SparkSession from builder
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[1]") \
                    .appName('SparkByExamples.com') \
                    .getOrCreate()
print(spark.sparkContext)
print("Spark App Name : "+ spark.sparkContext.appName)

<SparkContext master=local[*] appName=Life Stage Classification>
Spark App Name : Life Stage Classification


In [None]:
# SparkContext stop() method
spark.sparkContext.stop()

In [None]:
from pyspark.sql import SparkSession

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("SimpleRDDApp") \
    .master("local[*]") \
    .getOrCreate()

# Get the SparkContext
sc = spark.sparkContext
rdd = sc.range(1, 5)
print(rdd.collect())


[1, 2, 3, 4]


In [None]:
rdd = sc.textFile("test.txt")   # Each line in the file becomes one element in the RDD
data = [1,2,3,4,5]
rdd = sc.parallelize(data)

**creating rdd**

In [None]:
data = [1, 2, 3, 4]
rdd = spark.sparkContext.parallelize(data)
print(rdd.collect())

[1, 2, 3, 4]


In [None]:
from pyspark.sql import SparkSession

# Start SparkSession
spark = SparkSession.builder \
    .master("local[1]") \
    .appName("RDDExample") \
    .getOrCreate()

# Get SparkContext
sc = spark.sparkContext


In [None]:
from pyspark.sql import SparkSession

# Create SparkSession
spark = SparkSession.builder \
    .master("local[1]") \
    .appName("ParallelizeExample") \
    .getOrCreate()
# Create RDD from parallelize
data = [1,2,3,4,5,6,7,8,9,10,11,12]
rdd = spark.sparkContext.parallelize(data)
print(rdd.collect())

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]


In [None]:
empty_rdd = sc.emptyRDD()
print(empty_rdd.collect())

[]


In [None]:
rdd_with_partitions = sc.parallelize([], 10)
print(rdd_with_partitions.collect())

[]


In [None]:
rdd = sc.parallelize([1,2,3], 5)
print(rdd.count())

3


In [None]:
rdd_new = rdd.repartition(4)
print(rdd_new.collect())


[1, 2, 3]


In [None]:
rdd_small = rdd.coalesce(2)
print(rdd_small.collect())

[1, 2, 3]


**Transfermation**

In [None]:
rdd = spark.sparkContext.textFile("/content/sample_text.txt")

38


In [None]:
rdd2 = rdd.flatMap(lambda line: line.split(" "))
print("\nWords after flatMap:")
print(rdd2.collect())


Words after flatMap:
['PySpark', 'is', 'an', 'interface', 'for', 'Apache', 'Spark', 'in', 'Python.', 'It', 'allows', 'you', 'to', 'write', 'Spark', 'applications', 'using', 'Python', 'APIs.', 'PySpark', 'helps', 'data', 'scientists', 'and', 'engineers', 'to', 'work', 'with', 'large-scale', 'data', 'processing.', 'RDD', 'is', 'a', 'core', 'component', 'of', 'PySpark.']


In [None]:
rdd3 = rdd2.map(lambda word: (word, 1))
print("\nAfter map (word, 1):")
print(rdd3.collect())


After map (word, 1):
[('PySpark', 1), ('is', 1), ('an', 1), ('interface', 1), ('for', 1), ('Apache', 1), ('Spark', 1), ('in', 1), ('Python.', 1), ('It', 1), ('allows', 1), ('you', 1), ('to', 1), ('write', 1), ('Spark', 1), ('applications', 1), ('using', 1), ('Python', 1), ('APIs.', 1), ('PySpark', 1), ('helps', 1), ('data', 1), ('scientists', 1), ('and', 1), ('engineers', 1), ('to', 1), ('work', 1), ('with', 1), ('large-scale', 1), ('data', 1), ('processing.', 1), ('RDD', 1), ('is', 1), ('a', 1), ('core', 1), ('component', 1), ('of', 1), ('PySpark.', 1)]


In [None]:
rdd4 = rdd3.reduceByKey(lambda a, b: a + b)
print("\nAfter reduceByKey (word counts):")
print(rdd4.collect())


After reduceByKey (word counts):
[('PySpark', 2), ('an', 1), ('interface', 1), ('for', 1), ('It', 1), ('you', 1), ('to', 2), ('write', 1), ('applications', 1), ('using', 1), ('Python', 1), ('helps', 1), ('and', 1), ('engineers', 1), ('work', 1), ('with', 1), ('core', 1), ('of', 1), ('is', 2), ('Apache', 1), ('Spark', 2), ('in', 1), ('Python.', 1), ('allows', 1), ('APIs.', 1), ('data', 2), ('scientists', 1), ('large-scale', 1), ('processing.', 1), ('RDD', 1), ('a', 1), ('component', 1), ('PySpark.', 1)]


In [None]:
rdd5 = rdd4.map(lambda x: (x[1], x[0])).sortByKey()
print("\nSorted by count:")
print(rdd5.collect())


Sorted by count:
[(1, 'an'), (1, 'interface'), (1, 'for'), (1, 'It'), (1, 'you'), (1, 'write'), (1, 'applications'), (1, 'using'), (1, 'Python'), (1, 'helps'), (1, 'and'), (1, 'engineers'), (1, 'work'), (1, 'with'), (1, 'core'), (1, 'of'), (1, 'Apache'), (1, 'in'), (1, 'Python.'), (1, 'allows'), (1, 'APIs.'), (1, 'scientists'), (1, 'large-scale'), (1, 'processing.'), (1, 'RDD'), (1, 'a'), (1, 'component'), (1, 'PySpark.'), (2, 'PySpark'), (2, 'to'), (2, 'is'), (2, 'Spark'), (2, 'data')]


In [None]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]")\
        .appName("DataFrame Example")\
        .getOrCreate()
data = [
      ('James', '', 'Smith', '1991-04-01', 'M', 3000),
      ('Michael', 'Rose', '', '2000-05-19', 'M', 4000),
      ('Robert', '', 'Williams', '1978-09-05', 'M', 4000),
      ('Maria', 'Anne', 'Jones', '1967-12-01', 'F', 4000),
      ('Jen', 'Mary', 'Brown', '1980-02-17', 'F', -1)
]
#col names
columns = ["FirstName", "MiddleName","LastName", "DOB", "Gender", "Salary"]
df = spark.createDataFrame(data = data, schema = columns)
df.show()
'''
--- Note ---
local[1] → Spark will run with 1 core (single-threaded).
local[2] → Spark will run with 2 cores.
local[*] → Spark will run with all cores available.

'''


+---------+----------+--------+----------+------+------+
|FirstName|MiddleName|LastName|       DOB|Gender|Salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



'\n--- Note ---\nlocal[1] → Spark will run with 1 core (single-threaded).\nlocal[2] → Spark will run with 2 cores.\nlocal[*] → Spark will run with all cores available.\n\n'

In [None]:
# Select columns 2 to 4
df.select(df.columns[2:4])
df.show(3)

+---------+----------+--------+----------+------+------+
|FirstName|MiddleName|LastName|       DOB|Gender|Salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
+---------+----------+--------+----------+------+------+
only showing top 3 rows



In [None]:
df.printSchema()

root
 |-- FirstName: string (nullable = true)
 |-- MiddleName: string (nullable = true)
 |-- LastName: string (nullable = true)
 |-- DOB: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Salary: long (nullable = true)



**FILTERING**

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
spark = SparkSession.builder.appName('DataFrame Created For Filtering.com').getOrCreate()
# Data as nested lists
data = [
    [["James","","Smith"], ["Java","Scala","C++"], "OH", "M"],
    [["Anna","Rose",""], ["Spark","Java","C++"], "NY", "F"],
    [["Julia","","Williams"], ["CSharp","VB"], "OH", "F"],
    [["Maria","Anne","Jones"], ["CSharp","VB"], "NY", "M"],
    [["Jen","Mary","Brown"], ["CSharp","VB"], "NY", "M"],
    [["Mike","Mary","Williams"], ["Python","VB"], "OH", "M"]
]
# Column names
columns = ["name", "languages", "state", "gender"]
# Create DataFrame with schema
df = spark.createDataFrame(data, columns)
df.show(truncate=False)
df.printSchema()


+----------------------+------------------+-----+------+
|name                  |languages         |state|gender|
+----------------------+------------------+-----+------+
|[James, , Smith]      |[Java, Scala, C++]|OH   |M     |
|[Anna, Rose, ]        |[Spark, Java, C++]|NY   |F     |
|[Julia, , Williams]   |[CSharp, VB]      |OH   |F     |
|[Maria, Anne, Jones]  |[CSharp, VB]      |NY   |M     |
|[Jen, Mary, Brown]    |[CSharp, VB]      |NY   |M     |
|[Mike, Mary, Williams]|[Python, VB]      |OH   |M     |
+----------------------+------------------+-----+------+

root
 |-- name: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- languages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)



In [None]:
#Using != operator
df.filter(~(df.state == "OH")).show()
#Using ~ (Negation) operator
df.where(df.state != "NY").show()


+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|      [Anna, Rose, ]|[Spark, Java, C++]|   NY|     F|
|[Maria, Anne, Jones]|      [CSharp, VB]|   NY|     M|
|  [Jen, Mary, Brown]|      [CSharp, VB]|   NY|     M|
+--------------------+------------------+-----+------+

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    [James, , Smith]|[Java, Scala, C++]|   OH|     M|
| [Julia, , Williams]|      [CSharp, VB]|   OH|     F|
|[Mike, Mary, Will...|      [Python, VB]|   OH|     M|
+--------------------+------------------+-----+------+



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.master("local[*]").appName("Filter Example").getOrCreate()

data = [
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("Raman", "Finance", 3300),
    ("Scott", "Finance", 3900),
    ("Jen", "Finance", 3000),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
]

columns = ["Name", "Department", "Salary"]
df = spark.createDataFrame(data, columns)

# Filter: Salary > 3000
df.filter(col("Salary") > 3000).show()



+-------+----------+------+
|   Name|Department|Salary|
+-------+----------+------+
|Michael|     Sales|  4600|
| Robert|     Sales|  4100|
|  Raman|   Finance|  3300|
|  Scott|   Finance|  3900|
+-------+----------+------+



In [None]:
df.filter(col("Name").startswith("J")).show()


+-----+----------+------+
| Name|Department|Salary|
+-----+----------+------+
|James|     Sales|  3000|
|  Jen|   Finance|  3000|
| Jeff| Marketing|  3000|
+-----+----------+------+



In [None]:
df.filter(col("Name").like("J%")).show()   # Names starting with J


+-----+----------+------+
| Name|Department|Salary|
+-----+----------+------+
|James|     Sales|  3000|
|  Jen|   Finance|  3000|
| Jeff| Marketing|  3000|
+-----+----------+------+



In [None]:
df.filter(col("Department").isin("Sales", "Finance")).show()


+-------+----------+------+
|   Name|Department|Salary|
+-------+----------+------+
|  James|     Sales|  3000|
|Michael|     Sales|  4600|
| Robert|     Sales|  4100|
|  Maria|   Finance|  3000|
|  Raman|   Finance|  3300|
|  Scott|   Finance|  3900|
|    Jen|   Finance|  3000|
+-------+----------+------+



**Dropping**

In [None]:
from os import truncate
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
spark = SparkSession.builder.appName("Dropping Example").getOrCreate()
simpleData = [
    ("James", "", "Smith", "36636", "NewYork", 3100),
    ("Michael", "Rose", "", "40288", "California", 4300),
    ("Robert", "", "Williams", "42114", "Florida", 1400),
    ("Maria", "Anne", "Jones", "39192", "Florida", 5500),
    ("Jen", "Mary", "Brown", "34561", "NewYork", 3000)
]
columns  = ["firstname", "middlename", "lastname", "id","location", "salary"]
df = spark.createDataFrame(data = simpleData, schema = columns)
df.printSchema()
df.show(truncate= False)

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- location: string (nullable = true)
 |-- salary: long (nullable = true)

+---------+----------+--------+-----+----------+------+
|firstname|middlename|lastname|id   |location  |salary|
+---------+----------+--------+-----+----------+------+
|James    |          |Smith   |36636|NewYork   |3100  |
|Michael  |Rose      |        |40288|California|4300  |
|Robert   |          |Williams|42114|Florida   |1400  |
|Maria    |Anne      |Jones   |39192|Florida   |5500  |
|Jen      |Mary      |Brown   |34561|NewYork   |3000  |
+---------+----------+--------+-----+----------+------+



In [None]:
df.drop(col("firstname")).printSchema()
df.drop(df.firstname).printSchema()

root
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- location: string (nullable = true)
 |-- salary: long (nullable = true)

root
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- location: string (nullable = true)
 |-- salary: long (nullable = true)



Drop rows

In [None]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('sparkdf').getOrCreate()

data = [["1", None, "company 1"],
        ["2", "ojaswi", "company 2"],
        [None, "bobby", "company 3"],
        ["1", "sravan", "company 1"],
        ["2", "ojaswi", None],
        [None, "rohith", "company 2"],
        ["5", "gnanesh", "company 1"],
        ["2", None, "company 2"],
        ["3", "bobby", "company 3"],
        ["4", "rohith", None]]

columns = ['Employee_ID', 'Employee_NAME', 'Company_Name']
df = spark.createDataFrame(data, columns)
df.show()

+-----------+-------------+------------+
|Employee_ID|Employee_NAME|Company_Name|
+-----------+-------------+------------+
|          1|         NULL|   company 1|
|          2|       ojaswi|   company 2|
|       NULL|        bobby|   company 3|
|          1|       sravan|   company 1|
|          2|       ojaswi|        NULL|
|       NULL|       rohith|   company 2|
|          5|      gnanesh|   company 1|
|          2|         NULL|   company 2|
|          3|        bobby|   company 3|
|          4|       rohith|        NULL|
+-----------+-------------+------------+



In [None]:
df.filter(df["Employee_NAME"].isNotNull()).show()

+-----------+-------------+------------+
|Employee_ID|Employee_NAME|Company_Name|
+-----------+-------------+------------+
|          2|       ojaswi|   company 2|
|       NULL|        bobby|   company 3|
|          1|       sravan|   company 1|
|          2|       ojaswi|        NULL|
|       NULL|       rohith|   company 2|
|          5|      gnanesh|   company 1|
|          3|        bobby|   company 3|
|          4|       rohith|        NULL|
+-----------+-------------+------------+



drop rows - duplicates

In [None]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Duplicate_rows').getOrCreate()
data = [["1", "sravan", "company 1"],
        ["2", "ojaswi", "company 2"],
        ["3", "bobby", "company 3"],
        ["1", "sravan", "company 1"],
        ["2", "ojaswi", "company 2"],
        ["6", "rohith", "company 2"],
        ["5", "gnanesh", "company 1"],
        ["2", "ojaswi", "company 2"],
        ["3", "bobby", "company 3"],
        ["4", "rohith", "company 2"]]

columns = ['ID', 'Employee_NAME', 'Company_Name']
df = spark.createDataFrame(data, columns)
df.dropDuplicates().show()

+---+-------------+------------+
| ID|Employee_NAME|Company_Name|
+---+-------------+------------+
|  2|       ojaswi|   company 2|
|  1|       sravan|   company 1|
|  3|        bobby|   company 3|
|  4|       rohith|   company 2|
|  6|       rohith|   company 2|
|  5|      gnanesh|   company 1|
+---+-------------+------------+



Remove duplicate rows by using a distinct function

In [None]:
df.distinct().show()

+---+-------------+------------+
| ID|Employee_NAME|Company_Name|
+---+-------------+------------+
|  2|       ojaswi|   company 2|
|  1|       sravan|   company 1|
|  3|        bobby|   company 3|
|  4|       rohith|   company 2|
|  6|       rohith|   company 2|
|  5|      gnanesh|   company 1|
+---+-------------+------------+



**Using** **Conditons**

In [None]:
from pyspark.sql.functions import col
df.where(col("ID").cast("int")>4).show()

+---+-------------+------------+
| ID|Employee_NAME|Company_Name|
+---+-------------+------------+
|  6|       rohith|   company 2|
|  5|      gnanesh|   company 1|
+---+-------------+------------+



In [None]:
df.where(col("Company_Name") != 'company 1').show()

+---+-------------+------------+
| ID|Employee_NAME|Company_Name|
+---+-------------+------------+
|  2|       ojaswi|   company 2|
|  3|        bobby|   company 3|
|  2|       ojaswi|   company 2|
|  6|       rohith|   company 2|
|  2|       ojaswi|   company 2|
|  3|        bobby|   company 3|
|  4|       rohith|   company 2|
+---+-------------+------------+



filter

In [None]:
df.filter(df.ID != 4).show()

+---+-------------+------------+
| ID|Employee_NAME|Company_Name|
+---+-------------+------------+
|  1|       sravan|   company 1|
|  2|       ojaswi|   company 2|
|  3|        bobby|   company 3|
|  1|       sravan|   company 1|
|  2|       ojaswi|   company 2|
|  6|       rohith|   company 2|
|  5|      gnanesh|   company 1|
|  2|       ojaswi|   company 2|
|  3|        bobby|   company 3|
+---+-------------+------------+



**sorting**

In [None]:
from pyspark.sql import SparkSession
import pandas as pd
from google.colab import drive
spark = SparkSession.builder.master("local[1]").appName("Sorting Example").getOrCreate()
sc = spark.sparkContext
df = spark.read.csv('/content/college_student_placement_dataset (1).csv', header= True, inferSchema=True)
df.show()

+----------+---+---------------+-----+--------------------+---------------------+----------------------+--------------------+------------------+---------+
|College_ID| IQ|Prev_Sem_Result| CGPA|Academic_Performance|Internship_Experience|Extra_Curricular_Score|Communication_Skills|Projects_Completed|Placement|
+----------+---+---------------+-----+--------------------+---------------------+----------------------+--------------------+------------------+---------+
|   CLG0030|107|           6.61| 6.28|                   8|                   No|                     8|                   8|                 4|       No|
|   CLG0061| 97|           5.52| 5.37|                   8|                   No|                     7|                   8|                 0|       No|
|   CLG0036|109|           5.36| 5.83|                   9|                   No|                     3|                   1|                 1|       No|
|   CLG0055|122|           5.47| 5.75|                   6|           

asc

In [None]:
df.sort("CGPA").show()


+----------+---+---------------+----+--------------------+---------------------+----------------------+--------------------+------------------+---------+
|College_ID| IQ|Prev_Sem_Result|CGPA|Academic_Performance|Internship_Experience|Extra_Curricular_Score|Communication_Skills|Projects_Completed|Placement|
+----------+---+---------------+----+--------------------+---------------------+----------------------+--------------------+------------------+---------+
|   CLG0044|113|           5.03|4.54|                   6|                  Yes|                     9|                   5|                 4|       No|
|   CLG0017| 99|           5.01|4.56|                   5|                  Yes|                     0|                   9|                 4|       No|
|   CLG0025| 97|           5.06|4.57|                   6|                   No|                     7|                   5|                 2|       No|
|   CLG0005|123|           5.02|4.58|                   3|                  

In [None]:
from pyspark.sql.functions import col
df.sort("Prev_Sem_Result","CGPA").show()

+----------+---+---------------+----+--------------------+---------------------+----------------------+--------------------+------------------+---------+
|College_ID| IQ|Prev_Sem_Result|CGPA|Academic_Performance|Internship_Experience|Extra_Curricular_Score|Communication_Skills|Projects_Completed|Placement|
+----------+---+---------------+----+--------------------+---------------------+----------------------+--------------------+------------------+---------+
|   CLG0069|105|            5.0|4.61|                   8|                   No|                     7|                   3|                 1|       No|
|   CLG0076| 73|            5.0|4.62|                   9|                   No|                     1|                   6|                 4|       No|
|   CLG0004| 88|            5.0|4.69|                   3|                   No|                     0|                   2|                 0|       No|
|   CLG0038|108|            5.0| 4.7|                   6|                  

In [None]:
from pyspark.sql.functions import col
df.sort(col("Prev_Sem_Result").asc(), col("CGPA").desc()).show()

+----------+---+---------------+----+--------------------+---------------------+----------------------+--------------------+------------------+---------+
|College_ID| IQ|Prev_Sem_Result|CGPA|Academic_Performance|Internship_Experience|Extra_Curricular_Score|Communication_Skills|Projects_Completed|Placement|
+----------+---+---------------+----+--------------------+---------------------+----------------------+--------------------+------------------+---------+
|   CLG0036|118|            5.0|5.41|                   2|                   No|                     9|                   5|                 1|       No|
|   CLG0046| 92|            5.0|4.95|                   4|                  Yes|                     1|                   5|                 5|       No|
|   CLG0097|117|            5.0|4.87|                   3|                  Yes|                     8|                   6|                 1|       No|
|   CLG0088| 73|            5.0|4.86|                   8|                  

In [None]:
df.orderBy(col("CGPA").asc(), col("Academic_Performance").asc()).show()
df.orderBy(col("CGPA").desc(), col("Academic_Performance").desc()).show()
df.orderBy(col("Academic_Performance").desc()).show()

+----------+---+---------------+----+--------------------+---------------------+----------------------+--------------------+------------------+---------+
|College_ID| IQ|Prev_Sem_Result|CGPA|Academic_Performance|Internship_Experience|Extra_Curricular_Score|Communication_Skills|Projects_Completed|Placement|
+----------+---+---------------+----+--------------------+---------------------+----------------------+--------------------+------------------+---------+
|   CLG0044|113|           5.03|4.54|                   6|                  Yes|                     9|                   5|                 4|       No|
|   CLG0017| 99|           5.01|4.56|                   5|                  Yes|                     0|                   9|                 4|       No|
|   CLG0025| 97|           5.06|4.57|                   6|                   No|                     7|                   5|                 2|       No|
|   CLG0005|123|           5.02|4.58|                   3|                  

In [None]:
from pyspark.sql import SparkSession
import pandas as pd
from google.colab import drive
spark = SparkSession.builder.master("local[1]").appName("Sorting Example").getOrCreate()
sc = spark.sparkContext
df = spark.read.csv('/content/LoanData (1).csv', header= True, inferSchema=True)
df.show()

+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
| Loan_ID|Gender|Married|Dependents|   Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|Loan_Status|
+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
|LP001002|  Male|     No|         0|    Graduate|           No|           5849|              0.0|      NULL|             360|             1|        Urban|          Y|
|LP001003|  Male|    Yes|         1|    Graduate|           No|           4583|           1508.0|       128|             360|             1|        Rural|          N|
|LP001005|  Male|    Yes|         0|    Graduate|          Yes|           3000|              0.0|        66|             360|             1|        Urban|          Y

In [None]:
from pyspark.sql.functions import asc_nulls_first, asc_nulls_last, desc_nulls_first, desc_nulls_last
df.orderBy(asc_nulls_first("Credit_History")).show() # Ascending, nulls at top
df.orderBy(desc_nulls_last("Credit_History")).show() # Descending, nulls at bottom

+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
| Loan_ID|Gender|Married|Dependents|   Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|Loan_Status|
+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
|LP001405|  Male|    Yes|         1|    Graduate|           No|           2214|           1398.0|        85|             360|          NULL|        Urban|          Y|
|LP002178|  Male|    Yes|         0|    Graduate|           No|           3013|           3033.0|        95|             300|          NULL|        Urban|          Y|
|LP001443|Female|     No|         0|    Graduate|           No|           3692|              0.0|        93|             360|          NULL|        Rural|          Y

Aggregate function

In [None]:
from pyspark.sql import SparkSession
import pandas as pd
from google.colab import drive
spark = SparkSession.builder.master("local[1]").appName("Sorting Example").getOrCreate()
sc = spark.sparkContext
df = spark.read.csv('/content/salary.csv', header= True, inferSchema=True)
df.show(30, truncate = False)

+------+---+----+----------+--------+
|name  |id |age |department|salary  |
+------+---+----+----------+--------+
|user1 |1  |25.0|Jr manager|98000.0 |
|user2 |2  |30.0|sr manager|100000.0|
|user3 |6  |35.0|sr manager|100000.0|
|user4 |4  |32.0|head      |70000.0 |
|user5 |1  |45.0|Jr manager|60000.0 |
|user6 |6  |47.0|head2     |45000.0 |
|user7 |5  |21.0|worker    |25000.0 |
|user8 |1  |22.0|Jr manager|50000.0 |
|user9 |10 |54.0|lead      |45000.0 |
|user10|59 |52.0|lead2     |50000.0 |
|user11|6  |25.0|head2     |50000.0 |
|user12|2  |27.0|sr manager|70000.0 |
|user13|59 |54.0|lead2     |45000.0 |
|user14|2  |25.0|sr manager|70000.0 |
|user15|1  |32.0|Jr manager|50000.0 |
|user16|3  |37.0|worker    |25000.0 |
|user17|74 |63.0|Manager   |68000.0 |
|user18|7  |25.0|head      |45000.0 |
|user19|10 |32.0|lvl2 head |52000.0 |
|user20|10 |32.0|lvl2 head |52000.0 |
|user21|12 |NULL|NULL      |NULL    |
|user22|13 |NULL|NULL      |12000.0 |
|user23|13 |NULL|NULL      |140000.0|
|user24|99 |

In [None]:
from pyspark.sql import functions
df.groupby('department').agg(functions.min('salary'),
                       functions.max('salary'),
                       functions.sum('salary'),
                       functions.mean('salary'),
                       functions.count('salary'),
                       functions.avg('salary'),
                       functions.first('salary'),
                       functions.last('salary'),
                       functions.kurtosis('salary'),
                       functions.skewness('salary'),
                       functions.stddev('salary'),
                       functions.stddev_pop('salary'),
                       functions.stddev_samp('salary'),
                       functions.sumDistinct('salary'),
                       functions.variance('salary'),
                       functions.var_samp('salary'),
                       functions.var_pop('salary'),).show()



+----------+-----------+-----------+-----------+------------------+-------------+------------------+-------------+------------+-------------------+------------------+------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+
|department|min(salary)|max(salary)|sum(salary)|       avg(salary)|count(salary)|       avg(salary)|first(salary)|last(salary)|   kurtosis(salary)|  skewness(salary)|    stddev(salary)|stddev_pop(salary)|stddev_samp(salary)|sum(DISTINCT salary)|   var_samp(salary)|   var_samp(salary)|     var_pop(salary)|
+----------+-----------+-----------+-----------+------------------+-------------+------------------+-------------+------------+-------------------+------------------+------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+
|      NULL|    12000.0|   140000.0|   164000.0|54666.666666666664|            

In [None]:
from pyspark.sql import functions
df.groupby('department', 'age').agg(functions.min('salary'),
                       functions.max('salary'),
                       functions.sum('salary'),
                       functions.mean('salary'),
                       functions.count('salary'),
                       functions.avg('salary')).show()

+----------+----+-----------+-----------+-----------+-----------+-------------+-----------+
|department| age|min(salary)|max(salary)|sum(salary)|avg(salary)|count(salary)|avg(salary)|
+----------+----+-----------+-----------+-----------+-----------+-------------+-----------+
|      lead|54.0|    45000.0|    45000.0|    45000.0|    45000.0|            1|    45000.0|
|    worker|37.0|    25000.0|    25000.0|    25000.0|    25000.0|            1|    25000.0|
|     head2|25.0|    50000.0|    50000.0|    50000.0|    50000.0|            1|    50000.0|
|    worker|21.0|    25000.0|    25000.0|    25000.0|    25000.0|            1|    25000.0|
|      NULL|NULL|    12000.0|   140000.0|   152000.0|    76000.0|            2|    76000.0|
|   Manager|63.0|    68000.0|    68000.0|    68000.0|    68000.0|            1|    68000.0|
|sr manager|30.0|   100000.0|   100000.0|   100000.0|   100000.0|            1|   100000.0|
|Jr manager|45.0|    60000.0|    60000.0|    60000.0|    60000.0|            1| 

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import approx_count_distinct,collect_list
from pyspark.sql.functions import collect_set,sum,avg,max,countDistinct,count
from pyspark.sql.functions import first, last, kurtosis, min, mean, skewness
from pyspark.sql.functions import stddev, stddev_samp, stddev_pop, sumDistinct
from pyspark.sql.functions import variance,var_samp,  var_pop

spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

simpleData = [("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100)
  ]
schema = ["employee_name", "department", "salary"]


df = spark.createDataFrame(data=simpleData, schema = schema)
df.printSchema()
df.show(truncate=False)

print("approx_count_distinct: " + \
      str(df.select(approx_count_distinct("salary")).collect()[0][0]))

print("avg: " + str(df.select(avg("salary")).collect()[0][0]))

df.select(collect_list("salary")).show(truncate=False)

df.select(collect_set("salary")).show(truncate=False)

df2 = df.select(countDistinct("department", "salary"))
df2.show(truncate=False)
print("Distinct Count of Department &amp; Salary: "+str(df2.collect()[0][0]))

print("count: "+str(df.select(count("salary")).collect()[0]))
df.select(first("salary")).show(truncate=False)
df.select(last("salary")).show(truncate=False)
df.select(kurtosis("salary")).show(truncate=False)
df.select(max("salary")).show(truncate=False)
df.select(min("salary")).show(truncate=False)
df.select(mean("salary")).show(truncate=False)
df.select(skewness("salary")).show(truncate=False)
df.select(stddev("salary"), stddev_samp("salary"), \
    stddev_pop("salary")).show(truncate=False)
df.select(sum("salary")).show(truncate=False)
df.select(sumDistinct("salary")).show(truncate=False)
df.select(variance("salary"),var_samp("salary"),var_pop("salary")) \
  .show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: long (nullable = true)

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|James        |Sales     |3000  |
|Michael      |Sales     |4600  |
|Robert       |Sales     |4100  |
|Maria        |Finance   |3000  |
|James        |Sales     |3000  |
|Scott        |Finance   |3300  |
|Jen          |Finance   |3900  |
|Jeff         |Marketing |3000  |
|Kumar        |Marketing |2000  |
|Saif         |Sales     |4100  |
+-------------+----------+------+

approx_count_distinct: 6
avg: 3400.0
+------------------------------------------------------------+
|collect_list(salary)                                        |
+------------------------------------------------------------+
|[3000, 4600, 4100, 3000, 3000, 3300, 3900, 3000, 2000, 4100]|
+------------------------------------------------------------+

+------------------------------------+
|c

group by

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions
spark = SparkSession.builder \
                    .appName('Group By examples') \
                    .getOrCreate()
data = [("James","Sales","NY",90000,34,10000),
    ("Michael","Sales","NV",86000,56,20000),
    ("Robert","Sales","CA",81000,30,23000),
    ("Maria","Finance","CA",90000,24,23000),
    ("Raman","Finance","DE",99000,40,24000),
    ("Scott","Finance","NY",83000,36,19000),
    ("Jen","Finance","NY",79000,53,15000),
    ("Jeff","Marketing","NV",80000,25,18000),
    ("Kumar","Marketing","NJ",91000,50,21000)]
schema = ['employee_name', "department", "state", "salary", "age", "bonus"]
df = spark.createDataFrame(data = data, schema = schema)
df.printSchema()
df.show(truncate = False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NV   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Raman        |Finance   |DE   |99000 |40 |24000|
|Scott        |Finance   |NY   |83000 |36 |19000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |NV   |80000 |25 |18000|
|Kumar        |Marketing |NJ   |91000 |50 |21000|
+-------------+----------+-----+------+---+-----+



In [None]:
df.groupBy("state").sum("salary").show()

+-----+-----------+
|state|sum(salary)|
+-----+-----------+
|   NV|     166000|
|   CA|     171000|
|   NY|     252000|
|   NJ|      91000|
|   DE|      99000|
+-----+-----------+



In [None]:
dfgroup = df.groupBy("state").agg(functions.sum("salary").alias("sum_salary"))
dfgroup.show()

+-----+----------+
|state|sum_salary|
+-----+----------+
|   NV|    166000|
|   CA|    171000|
|   NY|    252000|
|   NJ|     91000|
|   DE|     99000|
+-----+----------+



In [None]:
df_filter = dfgroup.filter(dfgroup.sum_salary >100000)
df_filter.show()

+-----+----------+
|state|sum_salary|
+-----+----------+
|   NV|    166000|
|   CA|    171000|
|   NY|    252000|
+-----+----------+



In [None]:
df_filter.sort("sum_salary").show()
df_filter.sort(functions.desc("sum_salary")).show()

+-----+----------+
|state|sum_salary|
+-----+----------+
|   NV|    166000|
|   CA|    171000|
|   NY|    252000|
+-----+----------+

+-----+----------+
|state|sum_salary|
+-----+----------+
|   NY|    252000|
|   CA|    171000|
|   NV|    166000|
+-----+----------+



**Joins**

In [None]:
d1 = [["1", "sravan", "company 1"],     #first dataframe
        ["2", "ojaswi", "company 1"],
        ["3", "rohith", "company 2"],
        ["4", "sridevi", "company 1"]]
cols = ['id', 'Name', 'Company']
df1 = spark.createDataFrame(d1, cols)
df1.show()
d2 = [("1", "45000", "IT"),             #second dataframe
      ("2", "145000", "Manager"),
      ("6", "45000", "HR"),
      ("5", "34000", "Sales")]
cols2 = ['id', 'salary', 'department']
df2 = spark.createDataFrame(d2, cols2)
df2.show()

+---+-------+---------+
| id|   Name|  Company|
+---+-------+---------+
|  1| sravan|company 1|
|  2| ojaswi|company 1|
|  3| rohith|company 2|
|  4|sridevi|company 1|
+---+-------+---------+

+---+------+----------+
| id|salary|department|
+---+------+----------+
|  1| 45000|        IT|
|  2|145000|   Manager|
|  6| 45000|        HR|
|  5| 34000|     Sales|
+---+------+----------+



inner

In [None]:
df1.join(df2, df1.id == df2.id, "inner").show()

+---+------+---------+---+------+----------+
| id|  Name|  Company| id|salary|department|
+---+------+---------+---+------+----------+
|  1|sravan|company 1|  1| 45000|        IT|
|  2|ojaswi|company 1|  2|145000|   Manager|
+---+------+---------+---+------+----------+



fullouter

In [None]:
df1.join(df2, df1.id == df2.id, "outer").show()

+----+-------+---------+----+------+----------+
|  id|   Name|  Company|  id|salary|department|
+----+-------+---------+----+------+----------+
|   1| sravan|company 1|   1| 45000|        IT|
|   2| ojaswi|company 1|   2|145000|   Manager|
|   3| rohith|company 2|NULL|  NULL|      NULL|
|   4|sridevi|company 1|NULL|  NULL|      NULL|
|NULL|   NULL|     NULL|   5| 34000|     Sales|
|NULL|   NULL|     NULL|   6| 45000|        HR|
+----+-------+---------+----+------+----------+



In [None]:
df1.join(df2, df1.id == df2.id, "full").show()

+----+-------+---------+----+------+----------+
|  id|   Name|  Company|  id|salary|department|
+----+-------+---------+----+------+----------+
|   1| sravan|company 1|   1| 45000|        IT|
|   2| ojaswi|company 1|   2|145000|   Manager|
|   3| rohith|company 2|NULL|  NULL|      NULL|
|   4|sridevi|company 1|NULL|  NULL|      NULL|
|NULL|   NULL|     NULL|   5| 34000|     Sales|
|NULL|   NULL|     NULL|   6| 45000|        HR|
+----+-------+---------+----+------+----------+



In [None]:
df1.join(df2, df1.id == df2.id, "fullouter").show()

+----+-------+---------+----+------+----------+
|  id|   Name|  Company|  id|salary|department|
+----+-------+---------+----+------+----------+
|   1| sravan|company 1|   1| 45000|        IT|
|   2| ojaswi|company 1|   2|145000|   Manager|
|   3| rohith|company 2|NULL|  NULL|      NULL|
|   4|sridevi|company 1|NULL|  NULL|      NULL|
|NULL|   NULL|     NULL|   5| 34000|     Sales|
|NULL|   NULL|     NULL|   6| 45000|        HR|
+----+-------+---------+----+------+----------+



Left

In [None]:
df1.join(df2, df1.id == df2.id, "left").show()

+---+-------+---------+----+------+----------+
| id|   Name|  Company|  id|salary|department|
+---+-------+---------+----+------+----------+
|  1| sravan|company 1|   1| 45000|        IT|
|  2| ojaswi|company 1|   2|145000|   Manager|
|  3| rohith|company 2|NULL|  NULL|      NULL|
|  4|sridevi|company 1|NULL|  NULL|      NULL|
+---+-------+---------+----+------+----------+



In [None]:
df1.join(df2, df1.id == df2.id, "leftouter").show()

+---+-------+---------+----+------+----------+
| id|   Name|  Company|  id|salary|department|
+---+-------+---------+----+------+----------+
|  1| sravan|company 1|   1| 45000|        IT|
|  2| ojaswi|company 1|   2|145000|   Manager|
|  3| rohith|company 2|NULL|  NULL|      NULL|
|  4|sridevi|company 1|NULL|  NULL|      NULL|
+---+-------+---------+----+------+----------+



right

In [None]:
df1.join(df2, df1.id == df2.id, "right").show()

+----+------+---------+---+------+----------+
|  id|  Name|  Company| id|salary|department|
+----+------+---------+---+------+----------+
|   1|sravan|company 1|  1| 45000|        IT|
|   2|ojaswi|company 1|  2|145000|   Manager|
|NULL|  NULL|     NULL|  5| 34000|     Sales|
|NULL|  NULL|     NULL|  6| 45000|        HR|
+----+------+---------+---+------+----------+



In [None]:
df1.join(df2, df1.id == df2.id, "rightouter").show()

+----+------+---------+---+------+----------+
|  id|  Name|  Company| id|salary|department|
+----+------+---------+---+------+----------+
|   1|sravan|company 1|  1| 45000|        IT|
|   2|ojaswi|company 1|  2|145000|   Manager|
|NULL|  NULL|     NULL|  5| 34000|     Sales|
|NULL|  NULL|     NULL|  6| 45000|        HR|
+----+------+---------+---+------+----------+



Left semi

In [None]:
df1.join(df2, df1.id == df2.id, "leftsemi").show()

+---+------+---------+
| id|  Name|  Company|
+---+------+---------+
|  1|sravan|company 1|
|  2|ojaswi|company 1|
+---+------+---------+



Left anti

In [None]:
df1.join(df2, df1.id == df2.id, "leftanti").show()

+---+-------+---------+
| id|   Name|  Company|
+---+-------+---------+
|  3| rohith|company 2|
|  4|sridevi|company 1|
+---+-------+---------+



Null

In [None]:
from pyspark.sql import SparkSession
# Create Spark session
spark = SparkSession.builder.appName("NullValuesExample").getOrCreate()
# Data with None for NULL values
data = [
    (1, "John", "IT", 29, 4, 5000, 101),
    (2, "Alice", "HR", None, 2, None, 102),
    (3, None, "Sales", 35, None, 7000, None),
    (4, "Bob", None, None, None, None, None),
    (5, "Eve", "Finance", 40, 10, 10000, 105),
    (6, None, None, None, None, None, None),
]
# Column names
columns = ["Employee_ID", "Name", "Department", "Age", "Experience (yrs)", "Salary ($)", "Manager_ID"]
# Create DataFrame
df = spark.createDataFrame(data, columns)
# Show DataFrame
df.show()
df.printSchema()


+-----------+-----+----------+----+----------------+----------+----------+
|Employee_ID| Name|Department| Age|Experience (yrs)|Salary ($)|Manager_ID|
+-----------+-----+----------+----+----------------+----------+----------+
|          1| John|        IT|  29|               4|      5000|       101|
|          2|Alice|        HR|NULL|               2|      NULL|       102|
|          3| NULL|     Sales|  35|            NULL|      7000|      NULL|
|          4|  Bob|      NULL|NULL|            NULL|      NULL|      NULL|
|          5|  Eve|   Finance|  40|              10|     10000|       105|
|          6| NULL|      NULL|NULL|            NULL|      NULL|      NULL|
+-----------+-----+----------+----+----------------+----------+----------+

root
 |-- Employee_ID: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Age: long (nullable = true)
 |-- Experience (yrs): long (nullable = true)
 |-- Salary ($): long (nullable = true)
 |-- M

In [None]:
df.printSchema()

root
 |-- Employee_ID: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Age: long (nullable = true)
 |-- Experience (yrs): long (nullable = true)
 |-- Salary ($): long (nullable = true)
 |-- Manager_ID: long (nullable = true)



In [None]:
df.na.drop().show()

+-----------+----+----------+---+----------------+----------+----------+
|Employee_ID|Name|Department|Age|Experience (yrs)|Salary ($)|Manager_ID|
+-----------+----+----------+---+----------------+----------+----------+
|          1|John|        IT| 29|               4|      5000|       101|
|          5| Eve|   Finance| 40|              10|     10000|       105|
+-----------+----+----------+---+----------------+----------+----------+



In [None]:
df.na.drop(how="any").show()
df.na.drop(how="all").show()

+-----------+----+----------+---+----------------+----------+----------+
|Employee_ID|Name|Department|Age|Experience (yrs)|Salary ($)|Manager_ID|
+-----------+----+----------+---+----------------+----------+----------+
|          1|John|        IT| 29|               4|      5000|       101|
|          5| Eve|   Finance| 40|              10|     10000|       105|
+-----------+----+----------+---+----------------+----------+----------+

+-----------+-----+----------+----+----------------+----------+----------+
|Employee_ID| Name|Department| Age|Experience (yrs)|Salary ($)|Manager_ID|
+-----------+-----+----------+----+----------------+----------+----------+
|          1| John|        IT|  29|               4|      5000|       101|
|          2|Alice|        HR|NULL|               2|      NULL|       102|
|          3| NULL|     Sales|  35|            NULL|      7000|      NULL|
|          4|  Bob|      NULL|NULL|            NULL|      NULL|      NULL|
|          5|  Eve|   Finance|  40| 

In [None]:
df.na.drop(thresh=2).show() #Keeps rows with at least 2 non-null values, drops others.

+-----------+-----+----------+----+----------------+----------+----------+
|Employee_ID| Name|Department| Age|Experience (yrs)|Salary ($)|Manager_ID|
+-----------+-----+----------+----+----------------+----------+----------+
|          1| John|        IT|  29|               4|      5000|       101|
|          2|Alice|        HR|NULL|               2|      NULL|       102|
|          3| NULL|     Sales|  35|            NULL|      7000|      NULL|
|          4|  Bob|      NULL|NULL|            NULL|      NULL|      NULL|
|          5|  Eve|   Finance|  40|              10|     10000|       105|
+-----------+-----+----------+----+----------------+----------+----------+



In [None]:
df.na.drop(how="any", subset=["Experience (yrs)"]).show()

+-----------+-----+----------+----+----------------+----------+----------+
|Employee_ID| Name|Department| Age|Experience (yrs)|Salary ($)|Manager_ID|
+-----------+-----+----------+----+----------------+----------+----------+
|          1| John|        IT|  29|               4|      5000|       101|
|          2|Alice|        HR|NULL|               2|      NULL|       102|
|          5|  Eve|   Finance|  40|              10|     10000|       105|
+-----------+-----+----------+----+----------------+----------+----------+



In [None]:
df.na.drop(how="any", subset=["Age", "Department"]).show()

+-----------+----+----------+---+----------------+----------+----------+
|Employee_ID|Name|Department|Age|Experience (yrs)|Salary ($)|Manager_ID|
+-----------+----+----------+---+----------------+----------+----------+
|          1|John|        IT| 29|               4|      5000|       101|
|          3|NULL|     Sales| 35|            NULL|      7000|      NULL|
|          5| Eve|   Finance| 40|              10|     10000|       105|
+-----------+----+----------+---+----------------+----------+----------+



Fill null

In [None]:
df.na.fill('NA', 'Name').show() #Fill String Column
df.na.fill(0, subset =["Age","Experience (yrs)","Salary ($)","Manager_ID"]).show() #Fill Numeric Column
df.na.fill('NA', subset=["Name", "Department"]).show()

+-----------+-----+----------+----+----------------+----------+----------+
|Employee_ID| Name|Department| Age|Experience (yrs)|Salary ($)|Manager_ID|
+-----------+-----+----------+----+----------------+----------+----------+
|          1| John|        IT|  29|               4|      5000|       101|
|          2|Alice|        HR|NULL|               2|      NULL|       102|
|          3|   NA|     Sales|  35|            NULL|      7000|      NULL|
|          4|  Bob|      NULL|NULL|            NULL|      NULL|      NULL|
|          5|  Eve|   Finance|  40|              10|     10000|       105|
|          6|   NA|      NULL|NULL|            NULL|      NULL|      NULL|
+-----------+-----+----------+----+----------------+----------+----------+

+-----------+-----+----------+---+----------------+----------+----------+
|Employee_ID| Name|Department|Age|Experience (yrs)|Salary ($)|Manager_ID|
+-----------+-----+----------+---+----------------+----------+----------+
|          1| John|        

In [None]:
df.na.fill('').show() # Replaces NULL with empty string for all string columns
df.na.fill(0).show()  # Replaces NULL with 0 for all numeric columns

+-----------+-----+----------+----+----------------+----------+----------+
|Employee_ID| Name|Department| Age|Experience (yrs)|Salary ($)|Manager_ID|
+-----------+-----+----------+----+----------------+----------+----------+
|          1| John|        IT|  29|               4|      5000|       101|
|          2|Alice|        HR|NULL|               2|      NULL|       102|
|          3|     |     Sales|  35|            NULL|      7000|      NULL|
|          4|  Bob|          |NULL|            NULL|      NULL|      NULL|
|          5|  Eve|   Finance|  40|              10|     10000|       105|
|          6|     |          |NULL|            NULL|      NULL|      NULL|
+-----------+-----+----------+----+----------------+----------+----------+

+-----------+-----+----------+---+----------------+----------+----------+
|Employee_ID| Name|Department|Age|Experience (yrs)|Salary ($)|Manager_ID|
+-----------+-----+----------+---+----------------+----------+----------+
|          1| John|        

In [None]:
df.na.fill({"Name":"Unknown", "Salary ($)": 0}).show()

+-----------+-------+----------+----+----------------+----------+----------+
|Employee_ID|   Name|Department| Age|Experience (yrs)|Salary ($)|Manager_ID|
+-----------+-------+----------+----+----------------+----------+----------+
|          1|   John|        IT|  29|               4|      5000|       101|
|          2|  Alice|        HR|NULL|               2|         0|       102|
|          3|Unknown|     Sales|  35|            NULL|      7000|      NULL|
|          4|    Bob|      NULL|NULL|            NULL|         0|      NULL|
|          5|    Eve|   Finance|  40|              10|     10000|       105|
|          6|Unknown|      NULL|NULL|            NULL|         0|      NULL|
+-----------+-------+----------+----+----------------+----------+----------+



Mean


In [None]:
from pyspark.ml.feature import Imputer
imputer = Imputer(
    inputCols = ['Age', 'Experience (yrs)', 'Salary ($)'],
    outputCols=["{}_imputed".format(c) for c in ['Age', 'Experience (yrs)', 'Salary ($)']]).setStrategy("mean")
df_imputed = imputer.fit(df).transform(df)
df_imputed.show()

+-----------+-----+----------+----+----------------+----------+----------+-----------+------------------------+------------------+
|Employee_ID| Name|Department| Age|Experience (yrs)|Salary ($)|Manager_ID|Age_imputed|Experience (yrs)_imputed|Salary ($)_imputed|
+-----------+-----+----------+----+----------------+----------+----------+-----------+------------------------+------------------+
|          1| John|        IT|  29|               4|      5000|       101|         29|                       4|              5000|
|          2|Alice|        HR|NULL|               2|      NULL|       102|         34|                       2|              7333|
|          3| NULL|     Sales|  35|            NULL|      7000|      NULL|         35|                       5|              7000|
|          4|  Bob|      NULL|NULL|            NULL|      NULL|      NULL|         34|                       5|              7333|
|          5|  Eve|   Finance|  40|              10|     10000|       105|         

Median

In [None]:
from pyspark.ml.feature import Imputer
imputer = Imputer(
    inputCols = ['Age', 'Experience (yrs)', 'Salary ($)'],
    outputCols=["{}_imputed".format(c) for c in ['Age', 'Experience (yrs)', 'Salary ($)']]).setStrategy("median")
df_imputed = imputer.fit(df).transform(df)
df_imputed.show()

+-----------+-----+----------+----+----------------+----------+----------+-----------+------------------------+------------------+
|Employee_ID| Name|Department| Age|Experience (yrs)|Salary ($)|Manager_ID|Age_imputed|Experience (yrs)_imputed|Salary ($)_imputed|
+-----------+-----+----------+----+----------------+----------+----------+-----------+------------------------+------------------+
|          1| John|        IT|  29|               4|      5000|       101|         29|                       4|              5000|
|          2|Alice|        HR|NULL|               2|      NULL|       102|         35|                       2|              7000|
|          3| NULL|     Sales|  35|            NULL|      7000|      NULL|         35|                       4|              7000|
|          4|  Bob|      NULL|NULL|            NULL|      NULL|      NULL|         35|                       4|              7000|
|          5|  Eve|   Finance|  40|              10|     10000|       105|         

Mode


In [None]:
from pyspark.ml.feature import Imputer
imputer = Imputer(
    inputCols = ['Age', 'Experience (yrs)', 'Salary ($)'],
    outputCols=["{}_imputed".format(c) for c in ['Age', 'Experience (yrs)', 'Salary ($)']]).setStrategy("mode")
df_imputed = imputer.fit(df).transform(df)
df_imputed.show()

+-----------+-----+----------+----+----------------+----------+----------+-----------+------------------------+------------------+
|Employee_ID| Name|Department| Age|Experience (yrs)|Salary ($)|Manager_ID|Age_imputed|Experience (yrs)_imputed|Salary ($)_imputed|
+-----------+-----+----------+----+----------------+----------+----------+-----------+------------------------+------------------+
|          1| John|        IT|  29|               4|      5000|       101|         29|                       4|              5000|
|          2|Alice|        HR|NULL|               2|      NULL|       102|         29|                       2|              5000|
|          3| NULL|     Sales|  35|            NULL|      7000|      NULL|         35|                       2|              7000|
|          4|  Bob|      NULL|NULL|            NULL|      NULL|      NULL|         29|                       2|              5000|
|          5|  Eve|   Finance|  40|              10|     10000|       105|         

In [None]:
from pyspark.sql.functions import col
df.filter(col("Age").isNull()).show() # Only rows with NULL salary
df.filter(col("Salary ($)").isNotNull()).show() # Only rows without NULL salary

+-----------+-----+----------+----+----------------+----------+----------+
|Employee_ID| Name|Department| Age|Experience (yrs)|Salary ($)|Manager_ID|
+-----------+-----+----------+----+----------------+----------+----------+
|          2|Alice|        HR|NULL|               2|      NULL|       102|
|          4|  Bob|      NULL|NULL|            NULL|      NULL|      NULL|
|          6| NULL|      NULL|NULL|            NULL|      NULL|      NULL|
+-----------+-----+----------+----+----------------+----------+----------+

+-----------+----+----------+---+----------------+----------+----------+
|Employee_ID|Name|Department|Age|Experience (yrs)|Salary ($)|Manager_ID|
+-----------+----+----------+---+----------------+----------+----------+
|          1|John|        IT| 29|               4|      5000|       101|
|          3|NULL|     Sales| 35|            NULL|      7000|      NULL|
|          5| Eve|   Finance| 40|              10|     10000|       105|
+-----------+----+----------+---+---

In [None]:
from pyspark.sql.functions import coalesce
df.withColumn("Final_Salary", coalesce(col("Salary ($)"), col("Experience (yrs)"))).show()

+-----------+-----+----------+----+----------------+----------+----------+------------+
|Employee_ID| Name|Department| Age|Experience (yrs)|Salary ($)|Manager_ID|Final_Salary|
+-----------+-----+----------+----+----------------+----------+----------+------------+
|          1| John|        IT|  29|               4|      5000|       101|        5000|
|          2|Alice|        HR|NULL|               2|      NULL|       102|           2|
|          3| NULL|     Sales|  35|            NULL|      7000|      NULL|        7000|
|          4|  Bob|      NULL|NULL|            NULL|      NULL|      NULL|        NULL|
|          5|  Eve|   Finance|  40|              10|     10000|       105|       10000|
|          6| NULL|      NULL|NULL|            NULL|      NULL|      NULL|        NULL|
+-----------+-----+----------+----+----------------+----------+----------+------------+



AGG FUNCTION - NULL

In [None]:
from pyspark.sql.functions import avg
df.select(avg("Salary ($)")).show()

+-----------------+
|  avg(Salary ($))|
+-----------------+
|7333.333333333333|
+-----------------+



null - join

In [None]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("NullValuesExample").getOrCreate()
d1 = [(1, "John", "IT", 29, 4, 5000, 101),
    (2, "Alice", "HR", None, 2, None, 102),
    (3, None, "Sales", 35, None, 7000, None),
    (4, "Bob", None, None, None, None, None),
    (5, "Eve", "Finance", 40, 10, 10000, 105),
    (6, None, None, None, None, None, None),]
d2 = [(None, "Alice", "HR", None, 2, None, 102),
    (3, None, "Sales", 35, None, 7000, None),
    (5, "Eve", "Finance", 40, 10, 10000, 105),
    (None, None, None, None, None, None, None),]
columns = ["Employee_ID", "Name", "Department", "Age", "Experience (yrs)", "Salary ($)", "Manager_ID"]
df1 = spark.createDataFrame(d1, columns)
df2 = spark.createDataFrame(d1, columns)
df1.show()
df1.printSchema()
df2.show()
df2.printSchema()


+-----------+-----+----------+----+----------------+----------+----------+
|Employee_ID| Name|Department| Age|Experience (yrs)|Salary ($)|Manager_ID|
+-----------+-----+----------+----+----------------+----------+----------+
|          1| John|        IT|  29|               4|      5000|       101|
|          2|Alice|        HR|NULL|               2|      NULL|       102|
|          3| NULL|     Sales|  35|            NULL|      7000|      NULL|
|          4|  Bob|      NULL|NULL|            NULL|      NULL|      NULL|
|          5|  Eve|   Finance|  40|              10|     10000|       105|
|          6| NULL|      NULL|NULL|            NULL|      NULL|      NULL|
+-----------+-----+----------+----+----------------+----------+----------+

root
 |-- Employee_ID: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Age: long (nullable = true)
 |-- Experience (yrs): long (nullable = true)
 |-- Salary ($): long (nullable = true)
 |-- M

In [None]:
df1.join(df2, df1.Employee_ID == df2.Employee_ID, "left").show()

+-----------+-----+----------+----+----------------+----------+----------+-----------+-----+----------+----+----------------+----------+----------+
|Employee_ID| Name|Department| Age|Experience (yrs)|Salary ($)|Manager_ID|Employee_ID| Name|Department| Age|Experience (yrs)|Salary ($)|Manager_ID|
+-----------+-----+----------+----+----------------+----------+----------+-----------+-----+----------+----+----------------+----------+----------+
|          1| John|        IT|  29|               4|      5000|       101|          1| John|        IT|  29|               4|      5000|       101|
|          3| NULL|     Sales|  35|            NULL|      7000|      NULL|          3| NULL|     Sales|  35|            NULL|      7000|      NULL|
|          2|Alice|        HR|NULL|               2|      NULL|       102|          2|Alice|        HR|NULL|               2|      NULL|       102|
|          6| NULL|      NULL|NULL|            NULL|      NULL|      NULL|          6| NULL|      NULL|NULL|    

When - otherwise

In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

data = [("James","M",60000),
        ("Michael","M",70000),
        ("Robert",None,400000),
        ("Maria","F",500000),
        ("Jen","",None)]

columns = ["name","gender","salary"]

df = spark.createDataFrame(data, schema=columns)
df.show()

+-------+------+------+
|   name|gender|salary|
+-------+------+------+
|  James|     M| 60000|
|Michael|     M| 70000|
| Robert|  NULL|400000|
|  Maria|     F|500000|
|    Jen|      |  NULL|
+-------+------+------+



In [None]:
from pyspark.sql.functions import when,col
df2 = df.withColumn(
    "new_gender",
    when(df.gender == "M", "Male")
    .when(df.gender == "F", "Female")
    .when(df.gender.isNull(), "Not Specified")
    .when(df.gender == "", "Not Specified")
    .otherwise(df.gender))
df2.show()

+-------+------+------+-------------+
|   name|gender|salary|   new_gender|
+-------+------+------+-------------+
|  James|     M| 60000|         Male|
|Michael|     M| 70000|         Male|
| Robert|  NULL|400000|Not Specified|
|  Maria|     F|500000|       Female|
|    Jen|      |  NULL|Not Specified|
+-------+------+------+-------------+



In [None]:
df2 = df.select(
    col("*"),
    when(df.gender == "M", "Male")
    .when(df.gender == "F", "Female")
    .when(df.gender.isNull(), "Not Specified")
    .when(df.gender == "", "Not Specified")
    .otherwise(df.gender)
    .alias("New_Gender"))
df2.show()

+-------+------+------+-------------+
|   name|gender|salary|   New_Gender|
+-------+------+------+-------------+
|  James|     M| 60000|         Male|
|Michael|     M| 70000|         Male|
| Robert|  NULL|400000|Not Specified|
|  Maria|     F|500000|       Female|
|    Jen|      |  NULL|Not Specified|
+-------+------+------+-------------+



In [None]:
from pyspark.sql.functions import expr

df3 = df.withColumn(
    "New_gender",
    expr("CASE WHEN gender == 'M' THEN 'Male' "
          "WHEN gender == 'F' THEN 'Female' "
          "WHEN gender IS NULL THEN 'Not Specified' "
          "WHEN gender == '' THEN 'Not Specified' "
          "ELSE gender END")
)
df3.show()

+-------+------+------+-------------+
|   name|gender|salary|   New_gender|
+-------+------+------+-------------+
|  James|     M| 60000|         Male|
|Michael|     M| 70000|         Male|
| Robert|  NULL|400000|Not Specified|
|  Maria|     F|500000|       Female|
|    Jen|      |  NULL|Not Specified|
+-------+------+------+-------------+



In [None]:
df4 = df.select(
    col("*"),
    expr(
        "CASE WHEN gender == 'M' THEN 'Male' "
        "WHEN gender == 'F' THEN 'Female' "
        "WHEN gender IS NULL THEN 'Not Specified' "
        "WHEN gender == '' THEN 'Not Specified' "
        "ELSE gender END").alias("New_Gender"))
df4.show()

+-------+------+------+-------------+
|   name|gender|salary|   New_Gender|
+-------+------+------+-------------+
|  James|     M| 60000|         Male|
|Michael|     M| 70000|         Male|
| Robert|  NULL|400000|Not Specified|
|  Maria|     F|500000|       Female|
|    Jen|      |  NULL|Not Specified|
+-------+------+------+-------------+



In [None]:
df.createOrReplaceTempView("Emp")
spark.sql("""
    select name,
    case when gender = 'M' then 'Male'
         when gender = 'F' then 'Female'
         when gender = '' then 'Not Specified'
         when gender is null then 'Not Specified'
         else gender end as new_gender
    from Emp
""").show()

+-------+-------------+
|   name|   new_gender|
+-------+-------------+
|  James|         Male|
|Michael|         Male|
| Robert|Not Specified|
|  Maria|       Female|
|    Jen|Not Specified|
+-------+-------------+



Union , union all

In [None]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

# First DataFrame
data1 = [("James","Sales","NY",90000,34,10000),
         ("Michael","Sales","NY",86000,56,20000),
         ("Robert","Sales","CA",81000,30,23000),
         ("Maria","Finance","CA",90000,24,23000)]
columns = ["employee_name","department","state","salary","age","bonus"]
df = spark.createDataFrame(data1, schema=columns)

# Second DataFrame
data2 = [("James","Sales","NY",90000,34,10000),
         ("Maria","Finance","CA",90000,24,23000),
         ("Jen","Finance","NY",79000,53,15000),
         ("Jeff","Marketing","CA",80000,25,18000),
         ("Kumar","Marketing","NY",91000,50,21000)]
df2 = spark.createDataFrame(data2, schema=columns)


In [None]:
union_df = df.union(df2)
union_df.show(truncate=False)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|James        |Sales     |NY   |90000 |34 |10000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
+-------------+----------+-----+------+---+-----+



In [None]:
union_df = df.union(df2).distinct()
union_df.show(truncate=False)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
+-------------+----------+-----+------+---+-----+



In [None]:
unionall_df = df.unionAll(df2)
unionall_df.show(truncate=False)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|James        |Sales     |NY   |90000 |34 |10000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
+-------------+----------+-----+------+---+-----+



In [None]:
unionall_df = df.unionAll(df2).distinct()
unionall_df.show(truncate=False)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
+-------------+----------+-----+------+---+-----+



window function

In [None]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('WindowFunctions').getOrCreate()
data = [
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100)]
columns = ["employee_name", "department", "salary"]
df = spark.createDataFrame(data, schema=columns)
df.show()


+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|        James|     Sales|  3000|
|      Michael|     Sales|  4600|
|       Robert|     Sales|  4100|
|        Maria|   Finance|  3000|
|        James|     Sales|  3000|
|        Scott|   Finance|  3300|
|          Jen|   Finance|  3900|
|         Jeff| Marketing|  3000|
|        Kumar| Marketing|  2000|
|         Saif|     Sales|  4100|
+-------------+----------+------+



In [None]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, rank, dense_rank, percent_rank, ntile

windowSpec = Window.partitionBy("department").orderBy("salary")

#row number
df.withColumn("row_number", row_number().over(windowSpec)).show()

#rank
df.withColumn("rank", rank().over(windowSpec)).show()

#dense rank
df.withColumn("Dense_rank", dense_rank().over(windowSpec)).show()

#percent rank
df.withColumn("Percent_rank", percent_rank().over(windowSpec)).show()

# ntile
df.withColumn("Ntile", ntile(2).over(windowSpec)).show()


+-------------+----------+------+----------+
|employee_name|department|salary|row_number|
+-------------+----------+------+----------+
|        Maria|   Finance|  3000|         1|
|        Scott|   Finance|  3300|         2|
|          Jen|   Finance|  3900|         3|
|        Kumar| Marketing|  2000|         1|
|         Jeff| Marketing|  3000|         2|
|        James|     Sales|  3000|         1|
|        James|     Sales|  3000|         2|
|       Robert|     Sales|  4100|         3|
|         Saif|     Sales|  4100|         4|
|      Michael|     Sales|  4600|         5|
+-------------+----------+------+----------+

+-------------+----------+------+----+
|employee_name|department|salary|rank|
+-------------+----------+------+----+
|        Maria|   Finance|  3000|   1|
|        Scott|   Finance|  3300|   2|
|          Jen|   Finance|  3900|   3|
|        Kumar| Marketing|  2000|   1|
|         Jeff| Marketing|  3000|   2|
|        James|     Sales|  3000|   1|
|        James|   

In [None]:
from pyspark.sql.functions import cume_dist, lag, lead
#cume_dist
df.withColumn("cume_dist", cume_dist().over(windowSpec)).show()
#lag
df.withColumn("Lag", lag("salary", 2).over(windowSpec)).show()
#lead
df.withColumn("Lead", lead("salary",2).over(windowSpec)).show()

+-------------+----------+------+------------------+
|employee_name|department|salary|         cume_dist|
+-------------+----------+------+------------------+
|        Maria|   Finance|  3000|0.3333333333333333|
|        Scott|   Finance|  3300|0.6666666666666666|
|          Jen|   Finance|  3900|               1.0|
|        Kumar| Marketing|  2000|               0.5|
|         Jeff| Marketing|  3000|               1.0|
|        James|     Sales|  3000|               0.4|
|        James|     Sales|  3000|               0.4|
|       Robert|     Sales|  4100|               0.8|
|         Saif|     Sales|  4100|               0.8|
|      Michael|     Sales|  4600|               1.0|
+-------------+----------+------+------------------+

+-------------+----------+------+----+
|employee_name|department|salary| Lag|
+-------------+----------+------+----+
|        Maria|   Finance|  3000|NULL|
|        Scott|   Finance|  3300|NULL|
|          Jen|   Finance|  3900|3000|
|        Kumar| Marketi

In [None]:
from pyspark.sql.functions import col, avg, sum, min, max
windowSpecAgg = Window.partitionBy("department")
df.withColumn("avg_salary", avg(col("salary")).over(windowSpecAgg))\
  .withColumn("total_salary", sum(col("salary")).over(windowSpecAgg))\
  .withColumn("min_salary", min(col("salary")).over(windowSpecAgg))\
  .withColumn("max_salary", max(col("salary")).over(windowSpecAgg))\
  .show()

+-------------+----------+------+----------+------------+----------+----------+
|employee_name|department|salary|avg_salary|total_salary|min_salary|max_salary|
+-------------+----------+------+----------+------------+----------+----------+
|        Maria|   Finance|  3000|    3400.0|       10200|      3000|      3900|
|        Scott|   Finance|  3300|    3400.0|       10200|      3000|      3900|
|          Jen|   Finance|  3900|    3400.0|       10200|      3000|      3900|
|         Jeff| Marketing|  3000|    2500.0|        5000|      2000|      3000|
|        Kumar| Marketing|  2000|    2500.0|        5000|      2000|      3000|
|        James|     Sales|  3000|    3760.0|       18800|      3000|      4600|
|      Michael|     Sales|  4600|    3760.0|       18800|      3000|      4600|
|       Robert|     Sales|  4100|    3760.0|       18800|      3000|      4600|
|        James|     Sales|  3000|    3760.0|       18800|      3000|      4600|
|         Saif|     Sales|  4100|    376