# `What is a DataFrame?`

  - A DataFrame is a distributed collection of data organized into rows and columns (similar to a table).

###  `Properties of DataFrame`:
  1. **Distributed:** Data is distributed across multiple nodes in a cluster.
  2. **Immutable:** Once created, it cannot be changed. Transformations produce new DataFrames.
  3. **Lazy Evaluation:** Transformations on DataFrames are not computed immediately. Spark only records them and runs the computation when an action (for example `show()`, `count()`, or a write) is triggered.
  4. **Schema:** Each DataFrame has a schema, representing the structure of the data, including column names and types.
  5. **Supports various data formats:** Can read and write data in various formats like CSV, JSON, Parquet, Avro, etc.


# Create DataFrames by Reading Multiple File Formats

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

# Explicit schema for the books file, declared with StructType / StructField.
# NOTE(review): `books_shema` is presumably a typo for `books_schema`; the name is
# kept so that any other cell referencing it keeps working.
books_shema = StructType(
    [
        StructField("book_id", StringType()),
        StructField("title", StringType()),
        StructField("author", StringType()),
        StructField("category", StringType()),
        StructField("price", IntegerType()),
    ]
)

# Alternative that lets Spark infer the column types instead of using the schema above:
# df_csv = spark.read.csv("dbfs:/mnt/adls_container/books-data.csv", header=True, sep=";", inferSchema=True)
df_csv = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("sep", ";")
    .schema(books_shema)
    .load("dbfs:/mnt/adls_container/books-data.csv")
)

display(df_csv)
df_csv.printSchema()

book_id,title,author,category,price
B10,Beginning Database Design Solutions,Rod Stephens,Computer Science,44
B11,Business Intelligence for Dummies,Swain Scheps,Computer Science,38
B12,Big Data in Practice,Bernard Marr,Computer Science,30


root
 |-- book_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- author: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: integer (nullable = true)



In [0]:
# Read the customers JSON file; no schema is supplied, so Spark infers it.
df_json = spark.read.format("json").load("dbfs:/mnt/adls_container/customers-data.json")

display(df_json)
df_json.printSchema()

customer_id,email,profile,updated
C00001,dabby2y@japanpost.jp,"{""first_name"":""Dniren"",""last_name"":""Abby"",""gender"":""Female"",""address"":{""street"":""768 Mesta Terrace"",""city"":""Annecy"",""country"":""France""}}",2021-12-14T23:15:43.375Z
C00002,eabbysc1@github.com,"{""first_name"":""Etti"",""last_name"":""Abbys"",""gender"":""Female"",""address"":{""street"":""1748 Vidon Plaza"",""city"":""Varge Mondar"",""country"":""Portugal""}}",2021-12-14T23:15:43.375Z
C00003,rabelovd1@wikispaces.com,"{""first_name"":""Ronnie"",""last_name"":""Abelov"",""gender"":""Male"",""address"":{""street"":""363 Randy Park"",""city"":""San Celestio"",""country"":""Philippines""}}",2021-12-14T23:15:43.375Z
C00004,rabels9g@behance.net,"{""first_name"":""Ray"",""last_name"":""Abels"",""gender"":""Female"",""address"":{""street"":""613 Lyons Way"",""city"":""Oudtshoorn"",""country"":""South Africa""}}",2021-12-14T23:15:43.375Z
C00005,sabendrothin@cargocollective.com,"{""first_name"":""Shanon"",""last_name"":""Abendroth"",""gender"":""Female"",""address"":{""street"":""30292 Manufacturers Junction"",""city"":""Ani-e"",""country"":""Philippines""}}",2021-12-14T23:15:43.375Z
C00006,,"{""first_name"":""Norman"",""last_name"":""Abernethy"",""gender"":""Male"",""address"":{""street"":""9292 Oxford Center"",""city"":""Gibara"",""country"":""Cuba""}}",2021-12-14T23:15:43.375Z
C00007,sabrahmson3h@blinklist.com,"{""first_name"":""Skell"",""last_name"":""Abrahmson"",""gender"":""Male"",""address"":{""street"":""90941 Hallows Park"",""city"":""Huarong Chengguanzhen"",""country"":""China""}}",2021-12-14T23:15:43.375Z
C00008,dacheson2h@mapy.cz,"{""first_name"":""Darsey"",""last_name"":""Acheson"",""gender"":""Non-binary"",""address"":{""street"":""29579 Grim Plaza"",""city"":""Dārayyā"",""country"":""Syria""}}",2021-12-14T23:15:43.375Z
C00009,fackwoodji@gravatar.com,"{""first_name"":""Fredrick"",""last_name"":""Ackwood"",""gender"":""Male"",""address"":{""street"":""67 Dunning Plaza"",""city"":""Santo Domingo"",""country"":""Cuba""}}",2021-12-14T23:15:43.375Z
C00010,,"{""first_name"":""Doralynne"",""last_name"":""Adamkiewicz"",""gender"":""Female"",""address"":{""street"":""84126 Glendale Center"",""city"":""Ugep"",""country"":""Nigeria""}}",2021-12-14T23:15:43.375Z


root
 |-- customer_id: string (nullable = true)
 |-- email: string (nullable = true)
 |-- profile: string (nullable = true)
 |-- updated: string (nullable = true)



In [0]:
# Load the Parquet data stored under the directory; the schema comes from the files.
df_parquet = spark.read.format("parquet").load("dbfs:/FileStore/parquet/")

display(df_parquet)
df_parquet.printSchema()
# Last expression of the cell: the row count is rendered as the cell result.
df_parquet.count()

order_id,order_timestamp,customer_id,quantity,total,books
6341,1657520256,C00788,1,41,"List(List(B08, 1, 41))"
6342,1657520256,C00788,1,41,"List(List(B08, 1, 41))"
6343,1657531717,C00654,1,28,"List(List(B02, 1, 28))"
6344,1657531717,C00654,1,28,"List(List(B02, 1, 28))"
6345,1657543676,C00762,1,49,"List(List(B01, 1, 49))"
6346,1657543676,C00762,1,49,"List(List(B01, 1, 49))"
6347,1657546079,C01014,1,28,"List(List(B02, 1, 28))"
6348,1657546658,C00633,1,24,"List(List(B09, 1, 24))"
6349,1657546658,C00633,1,24,"List(List(B09, 1, 24))"
6350,1657547177,C00638,1,35,"List(List(B03, 1, 35))"


root
 |-- order_id: string (nullable = true)
 |-- order_timestamp: long (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- quantity: long (nullable = true)
 |-- total: integer (nullable = true)
 |-- books: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- book_id: string (nullable = true)
 |    |    |-- quantity: integer (nullable = true)
 |    |    |-- subtotal: long (nullable = true)



3000

In [0]:
# JDBC connection settings for the Azure SQL database.
driver = "com.microsoft.sqlserver.jdbc.SQLServerDriver"
url = "jdbc:sqlserver://azure-sql-server-1111.database.windows.net:1433;database=azure-sql-db-111"
table = "[SalesLT].[Customer]"

# Credentials are read from a Databricks secret scope instead of being hardcoded in
# the notebook, so they do not end up in source control or in exported notebooks.
# NOTE(review): the scope and key names below are placeholders - replace them with
# the secret scope configured for this workspace.
username = dbutils.secrets.get(scope="azure-sql", key="sql-username")
password = dbutils.secrets.get(scope="azure-sql", key="sql-password")

# Read the whole table through the SQL Server JDBC driver.
customer_df = (
    spark.read
    .format("jdbc")
    .option("driver", driver)
    .option("url", url)
    .option("dbtable", table)
    .option("user", username)
    .option("password", password)
    .load()
)

# NOTE(review): the table includes PasswordHash / PasswordSalt columns; consider
# dropping them before display() if this notebook's output is shared.
display(customer_df)
customer_df.count()

CustomerID,NameStyle,Title,FirstName,MiddleName,LastName,Suffix,CompanyName,SalesPerson,EmailAddress,Phone,PasswordHash,PasswordSalt,rowguid,ModifiedDate
1,False,Mr.,Orlando,N.,Gee,,A Bike Store,adventure-works\pamela0,orlando0@adventure-works.com,245-555-0173,L/Rlwxzp4w7RWmEgXX+/A7cXaePEPcp+KwQhl2fJL7w=,1KjXYs4=,3F5AE95E-B87D-4AED-95B4-C3797AFCB74F,2005-08-01T00:00:00Z
2,False,Mr.,Keith,,Harris,,Progressive Sports,adventure-works\david8,keith0@adventure-works.com,170-555-0127,YPdtRdvqeAhj6wyxEsFdshBDNXxkCXn+CRgbvJItknw=,fs1ZGhY=,E552F657-A9AF-4A7D-A645-C429D6E02491,2006-08-01T00:00:00Z
3,False,Ms.,Donna,F.,Carreras,,Advanced Bike Components,adventure-works\jillian0,donna0@adventure-works.com,279-555-0130,LNoK27abGQo48gGue3EBV/UrlYSToV0/s87dCRV7uJk=,YTNH5Rw=,130774B1-DB21-4EF3-98C8-C104BCD6ED6D,2005-09-01T00:00:00Z
4,False,Ms.,Janet,M.,Gates,,Modular Cycle Systems,adventure-works\jillian0,janet1@adventure-works.com,710-555-0173,ElzTpSNbUW1Ut+L5cWlfR7MF6nBZia8WpmGaQPjLOJA=,nm7D5e4=,FF862851-1DAA-4044-BE7C-3E85583C054D,2006-07-01T00:00:00Z
5,False,Mr.,Lucy,,Harrington,,Metropolitan Sports Supply,adventure-works\shu0,lucy0@adventure-works.com,828-555-0186,KJqV15wsX3PG8TS5GSddp6LFFVdd3CoRftZM/tP0+R4=,cNFKU4w=,83905BDC-6F5E-4F71-B162-C98DA069F38A,2006-09-01T00:00:00Z
6,False,Ms.,Rosmarie,J.,Carroll,,Aerobic Exercise Company,adventure-works\linda3,rosmarie0@adventure-works.com,244-555-0112,OKT0scizCdIzymHHOtyJKQiC/fCILSooSZ8dQ2Y34VM=,ihWf50M=,1A92DF88-BFA2-467D-BD54-FCB9E647FDD7,2007-09-01T00:00:00Z
7,False,Mr.,Dominic,P.,Gash,,Associated Bikes,adventure-works\shu0,dominic0@adventure-works.com,192-555-0173,ZccoP/jZGQm+Xpzc7RKwDhS11YFNybwcPVRYTSNcnSg=,sPoUBSQ=,03E9273E-B193-448E-9823-FE0C44AEED78,2006-07-01T00:00:00Z
10,False,Ms.,Kathleen,M.,Garza,,Rural Cycle Emporium,adventure-works\josé1,kathleen0@adventure-works.com,150-555-0127,Qa3aMCxNbVLGrc0b99KsbQqiVgwYDfHcsK9GZSUxcTM=,Ls05W3g=,CDB6698D-2FF1-4FBA-8F22-60AD1D11DABD,2006-09-01T00:00:00Z
11,False,Ms.,Katherine,,Harding,,Sharp Bikes,adventure-works\josé1,katherine0@adventure-works.com,926-555-0159,uRlorVzDGNJIX9I+ehTlRK+liT4UKRgWhApJgUMC2d4=,jpHKbqE=,750F3495-59C4-48A0-80E1-E37EC60E77D9,2005-08-01T00:00:00Z
12,False,Mr.,Johnny,A.,Caprio,Jr.,Bikes and Motorbikes,adventure-works\garrett1,johnny0@adventure-works.com,112-555-0191,jtF9jBoFYeJTaET7x+eJDkd7BzMz15Wo9odbGPBaIak=,wVLnvHo=,947BCAF1-1F32-44F3-B9C3-0011F95FBE54,2006-08-01T00:00:00Z


847

In [0]:
# In-memory employee rows: (empId, empName, empGender, empSalary, empCountry).
# Several rows are repeated verbatim (empId 5, 9, 10, 14, 15) - presumably on
# purpose, so that dropDuplicates() has something to remove later in the notebook.
emp_data = [
    (1, "John Doe", "Male", 60000.0, "USA"),
    (2, "Jane Smith", "Female", 55000.0, "Canada"),
    (3, "Alice Johnson", "Female", 65000.0, "UK"),
    (4, "Bob Williams", "Male", 62000.0, "Australia"),
    (5, "Eve Davis", "Female", 70000.0, "India"),
    (5, "Eve Davis", "Female", 70000.0, "India"),
    (6, "Charlie Brown", "Male", 58000.0, "Germany"),
    (7, "Diana Miller", "Female", 60000.0, "France"),
    (8, "Frank Johnson", "Male", 62000.0, "Spain"),
    (9, "Grace Wilson", "Female", 54000.0, "Italy"),
    (10, "Henry Davis", "Male", 68000.0, "Japan"),
    (9, "Grace Wilson", "Female", 54000.0, "Italy"),
    (10, "Henry Davis", "Male", 68000.0, "Japan"),
    (11, "Isabel Clark", "Female", 59000.0, "Brazil"),
    (12, "Jack Turner", "Male", 63000.0, "Mexico"),
    (13, "Katherine White", "Female", 67000.0, "South Africa"),
    (14, "Louis Harris", "Male", 56000.0, "Russia"),
    (15, "Mia Lee", "Female", 61000.0, "China"),
    (14, "Louis Harris", "Male", 56000.0, "Russia"),
    (15, "Mia Lee", "Female", 61000.0, "China")
]

# Column layout for the employee rows: name, Spark type and nullability of each field.
emp_schema = StructType(
    [
        StructField("empId", IntegerType(), nullable=True),
        StructField("empName", StringType(), nullable=True),
        StructField("empGender", StringType(), nullable=True),
        StructField("empSalary", FloatType(), nullable=True),
        StructField("empCountry", StringType(), nullable=True),
    ]
)

# Build the DataFrame from the Python rows using the explicit schema.
df_sample = spark.createDataFrame(emp_data, schema=emp_schema)

# Preview the first five rows.
df_sample.show(n=5)

+-----+-------------+---------+---------+----------+
|empId|      empName|empGender|empSalary|empCountry|
+-----+-------------+---------+---------+----------+
|    1|     John Doe|     Male|  60000.0|       USA|
|    2|   Jane Smith|   Female|  55000.0|    Canada|
|    3|Alice Johnson|   Female|  65000.0|        UK|
|    4| Bob Williams|     Male|  62000.0| Australia|
|    5|    Eve Davis|   Female|  70000.0|     India|
+-----+-------------+---------+---------+----------+
only showing top 5 rows



# Transformations on DataFrames

In [0]:
# withColumn() : add columns
#   origin ==> constant column ==> "india"
#   tax    ==> derived column  ==> 12% of empSalary

from pyspark.sql.functions import lit

df2 = (
    df_sample
    .withColumn("origin", lit("india"))
    .withColumn("tax", df_sample["empSalary"] * 0.12)
)

display(df2.limit(5))

empId,empName,empGender,empSalary,empCountry,origin,tax
1,John Doe,Male,60000.0,USA,india,7200.0
2,Jane Smith,Female,55000.0,Canada,india,6600.0
3,Alice Johnson,Female,65000.0,UK,india,7800.0
4,Bob Williams,Male,62000.0,Australia,india,7440.0
5,Eve Davis,Female,70000.0,India,india,8400.0


In [0]:
# withColumnRenamed() : rename columns
#   origin ==> empOrigin
#   tax    ==> empTax

df3 = (
    df2
    .withColumnRenamed("origin", "empOrigin")
    .withColumnRenamed("tax", "empTax")
)

display(df3.limit(5))

empId,empName,empGender,empSalary,empCountry,empOrigin,empTax
1,John Doe,Male,60000.0,USA,india,7200.0
2,Jane Smith,Female,55000.0,Canada,india,6600.0
3,Alice Johnson,Female,65000.0,UK,india,7800.0
4,Bob Williams,Male,62000.0,Australia,india,7440.0
5,Eve Davis,Female,70000.0,India,india,8400.0


In [0]:
from pyspark.sql.functions import col

# select() accepts columns in several interchangeable forms.
(
    df3.select(
        "empId",           # by column-name string
        df3.empName,       # by DataFrame attribute
        col("empGender"),  # by col() expression
    )
    .show(5)
)

+-----+-------------+---------+
|empId|      empName|empGender|
+-----+-------------+---------+
|    1|     John Doe|     Male|
|    2|   Jane Smith|   Female|
|    3|Alice Johnson|   Female|
|    4| Bob Williams|     Male|
|    5|    Eve Davis|   Female|
+-----+-------------+---------+
only showing top 5 rows



In [0]:
# Preview the first five rows of the renamed DataFrame.
df3.show(n=5)

+-----+-------------+---------+---------+----------+---------+------+
|empId|      empName|empGender|empSalary|empCountry|empOrigin|empTax|
+-----+-------------+---------+---------+----------+---------+------+
|    1|     John Doe|     Male|  60000.0|       USA|    india|7200.0|
|    2|   Jane Smith|   Female|  55000.0|    Canada|    india|6600.0|
|    3|Alice Johnson|   Female|  65000.0|        UK|    india|7800.0|
|    4| Bob Williams|     Male|  62000.0| Australia|    india|7440.0|
|    5|    Eve Davis|   Female|  70000.0|     India|    india|8400.0|
+-----+-------------+---------+---------+----------+---------+------+
only showing top 5 rows



In [0]:
# case conditions : when(), otherwise()
#   Male          ==> m
#   Female        ==> f
#   anything else ==> u (unknown)

from pyspark.sql.functions import when

# Reusing the existing column name makes withColumn() overwrite empGender in place,
# so the column order stays the same as in df3.
df4 = df3.withColumn(
    "empGender",
    when(df3["empGender"] == "Male", "m")
    .when(df3["empGender"] == "Female", "f")
    .otherwise("u"),
)

df4.show(5)

+-----+-------------+---------+---------+----------+---------+------+
|empId|      empName|empGender|empSalary|empCountry|empOrigin|empTax|
+-----+-------------+---------+---------+----------+---------+------+
|    1|     John Doe|        m|  60000.0|       USA|    india|7200.0|
|    2|   Jane Smith|        f|  55000.0|    Canada|    india|6600.0|
|    3|Alice Johnson|        f|  65000.0|        UK|    india|7800.0|
|    4| Bob Williams|        m|  62000.0| Australia|    india|7440.0|
|    5|    Eve Davis|        f|  70000.0|     India|    india|8400.0|
+-----+-------------+---------+---------+----------+---------+------+
only showing top 5 rows



In [0]:
# orderBy() or sort() : order the rows by one or more columns

# Highest salary first.
df4.sort("empSalary", ascending=False).show()

+-----+---------------+---------+---------+------------+---------+------+
|empId|        empName|empGender|empSalary|  empCountry|empOrigin|empTax|
+-----+---------------+---------+---------+------------+---------+------+
|    5|      Eve Davis|        f|  70000.0|       India|    india|8400.0|
|    5|      Eve Davis|        f|  70000.0|       India|    india|8400.0|
|   10|    Henry Davis|        m|  68000.0|       Japan|    india|8160.0|
|   10|    Henry Davis|        m|  68000.0|       Japan|    india|8160.0|
|   13|Katherine White|        f|  67000.0|South Africa|    india|8040.0|
|    3|  Alice Johnson|        f|  65000.0|          UK|    india|7800.0|
|   12|    Jack Turner|        m|  63000.0|      Mexico|    india|7560.0|
|    8|  Frank Johnson|        m|  62000.0|       Spain|    india|7440.0|
|    4|   Bob Williams|        m|  62000.0|   Australia|    india|7440.0|
|   15|        Mia Lee|        f|  61000.0|       China|    india|7320.0|
|   15|        Mia Lee|        f|  610

In [0]:
# dropDuplicates() : remove rows that are identical in every column

(
    df4
    .dropDuplicates()
    .orderBy("empSalary", ascending=False)
    .show()
)

+-----+---------------+---------+---------+------------+---------+------+
|empId|        empName|empGender|empSalary|  empCountry|empOrigin|empTax|
+-----+---------------+---------+---------+------------+---------+------+
|    5|      Eve Davis|        f|  70000.0|       India|    india|8400.0|
|   10|    Henry Davis|        m|  68000.0|       Japan|    india|8160.0|
|   13|Katherine White|        f|  67000.0|South Africa|    india|8040.0|
|    3|  Alice Johnson|        f|  65000.0|          UK|    india|7800.0|
|   12|    Jack Turner|        m|  63000.0|      Mexico|    india|7560.0|
|    4|   Bob Williams|        m|  62000.0|   Australia|    india|7440.0|
|    8|  Frank Johnson|        m|  62000.0|       Spain|    india|7440.0|
|   15|        Mia Lee|        f|  61000.0|       China|    india|7320.0|
|    1|       John Doe|        m|  60000.0|         USA|    india|7200.0|
|    7|   Diana Miller|        f|  60000.0|      France|    india|7200.0|
|   11|   Isabel Clark|        f|  590

In [0]:
## where() or filter() : keep only the rows that satisfy a condition

# Female employees earning more than 55000 whose name ends in "e".
(
    df4.dropDuplicates()
    .where(
        (df4["empSalary"] > 55000)
        & (df4["empGender"] == "f")
        & df4["empName"].like("%e")
    )
    .orderBy(df4["empSalary"].desc())
    .show()
)

+-----+---------------+---------+---------+------------+---------+------+
|empId|        empName|empGender|empSalary|  empCountry|empOrigin|empTax|
+-----+---------------+---------+---------+------------+---------+------+
|   13|Katherine White|        f|  67000.0|South Africa|    india|8040.0|
|   15|        Mia Lee|        f|  61000.0|       China|    india|7320.0|
+-----+---------------+---------+---------+------------+---------+------+



In [0]:
# Employee rows: (empId, empName, empGender, empSalary, empDepartment)
data = [
    (1, 'anil', 'M', 5000, 'IT'),
    (2, 'sandeep', 'M', 6000, 'IT'),
    (3, 'riya', 'F', 2500, 'payroll'),
    (4, 'prteek', 'M', 4000, 'HR'),
    (5, 'vani', 'F', 2000, 'HR'),
    (6, 'sunil', 'M', 2000, 'payroll'),
    (7, 'diksha', 'F', 3000, 'IT'),
    (8, 'rajesh', 'M', 4500, 'Finance'),
    (9, 'neha', 'F', 3500, 'Finance'),
    (10, 'amit', 'M', 3000, 'HR'),
    (11, 'pooja', 'F', 5500, 'IT'),
    (12, 'rohit', 'M', 6000, 'IT'),
]

# Schema for the employee rows: every field is nullable.
schema = StructType(
    [
        StructField("empId", IntegerType(), nullable=True),
        StructField("empName", StringType(), nullable=True),
        StructField("empGender", StringType(), nullable=True),
        StructField("empSalary", IntegerType(), nullable=True),
        StructField("empDepartment", StringType(), nullable=True),
    ]
)

df = spark.createDataFrame(data=data, schema=schema)
df.show()
df.printSchema()

+-----+-------+---------+---------+-------------+
|empId|empName|empGender|empSalary|empDepartment|
+-----+-------+---------+---------+-------------+
|    1|   anil|        M|     5000|           IT|
|    2|sandeep|        M|     6000|           IT|
|    3|   riya|        F|     2500|      payroll|
|    4| prteek|        M|     4000|           HR|
|    5|   vani|        F|     2000|           HR|
|    6|  sunil|        M|     2000|      payroll|
|    7| diksha|        F|     3000|           IT|
|    8| rajesh|        M|     4500|      Finance|
|    9|   neha|        F|     3500|      Finance|
|   10|   amit|        M|     3000|           HR|
|   11|  pooja|        F|     5500|           IT|
|   12|  rohit|        M|     6000|           IT|
+-----+-------+---------+---------+-------------+

root
 |-- empId: integer (nullable = true)
 |-- empName: string (nullable = true)
 |-- empGender: string (nullable = true)
 |-- empSalary: integer (nullable = true)
 |-- empDepartment: string (nullab

In [0]:
# aggregate functions : count, max, min, sum, avg

# NOTE: this import rebinds the Python built-ins max, min and sum to the Spark
# column functions for the rest of the notebook.
from pyspark.sql.functions import count, max, min, sum, avg

# Each aggregate is evaluated and shown on its own, one single-row result per call.
for agg_expr in (
    count("*").alias("totalEmpCount"),
    max("empSalary").alias("maxSalary"),
    min("empSalary").alias("minSalary"),
    avg("empSalary").alias("avgSalary"),
    sum("empSalary").alias("sumSalary"),
):
    df.agg(agg_expr).show()


+-------------+
|totalEmpCount|
+-------------+
|           12|
+-------------+

+---------+
|maxSalary|
+---------+
|     6000|
+---------+

+---------+
|minSalary|
+---------+
|     2000|
+---------+

+------------------+
|         avgSalary|
+------------------+
|3916.6666666666665|
+------------------+

+---------+
|sumSalary|
+---------+
|    47000|
+---------+



In [0]:
# All five aggregates returned together as a single row; groupBy() with no
# columns aggregates over the whole DataFrame.
(
    df.groupBy()
    .agg(
        count("*").alias("totalEmpCount"),
        max("empSalary").alias("maxSalary"),
        min("empSalary").alias("minSalary"),
        avg("empSalary").alias("avgSalary"),
        sum("empSalary").alias("sumSalary"),
    )
    .show()
)

+-------------+---------+---------+------------------+---------+
|totalEmpCount|maxSalary|minSalary|         avgSalary|sumSalary|
+-------------+---------+---------+------------------+---------+
|           12|     6000|     2000|3916.6666666666665|    47000|
+-------------+---------+---------+------------------+---------+



In [0]:
# groupBy() : group the rows, then aggregate within each group

# One result row per department.
(
    df.groupBy("empDepartment")
    .agg(
        count("*").alias("totalEmpCount"),
        max("empSalary").alias("maxSalary"),
        min("empSalary").alias("minSalary"),
        avg("empSalary").alias("avgSalary"),
        sum("empSalary").alias("sumSalary"),
    )
    .show()
)

+-------------+-------------+---------+---------+---------+---------+
|empDepartment|totalEmpCount|maxSalary|minSalary|avgSalary|sumSalary|
+-------------+-------------+---------+---------+---------+---------+
|           IT|            5|     6000|     3000|   5100.0|    25500|
|      payroll|            2|     2500|     2000|   2250.0|     4500|
|           HR|            3|     4000|     2000|   3000.0|     9000|
|      Finance|            2|     4500|     3500|   4000.0|     8000|
+-------------+-------------+---------+---------+---------+---------+



In [0]:
# union / unionAll : stack DataFrames vertically (both work the same way and keep duplicates)
# NOTE: df2 and df3 are rebound here; from this cell on they no longer refer to the
# employee DataFrames built in the earlier transformation cells.

# jan
data1 = [
    (1, 'Anil', 27),
    (2, 'sandeep', 28),
    (3, 'riya', 29),
]
schema1 = ['id', 'name', 'age']

# feb - repeats the (3, 'riya', 29) row from jan
data2 = [
    (3, 'riya', 29),
    (4, 'rani', 26),
]
schema2 = ['id', 'name', 'age']

# march
data3 = [
    (5, 'liya', 29),
    (6, 'mani', 26),
]
schema3 = ['id', 'name', 'age']

df1 = spark.createDataFrame(data1, schema=schema1)
df2 = spark.createDataFrame(data2, schema=schema2)
df3 = spark.createDataFrame(data3, schema=schema3)

df1.show()
df2.show()
df3.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|   Anil| 27|
|  2|sandeep| 28|
|  3|   riya| 29|
+---+-------+---+

+---+----+---+
| id|name|age|
+---+----+---+
|  3|riya| 29|
|  4|rani| 26|
+---+----+---+

+---+----+---+
| id|name|age|
+---+----+---+
|  5|liya| 29|
|  6|mani| 26|
+---+----+---+



In [0]:
# Stack the three monthly DataFrames on top of each other (duplicates are kept).
df_union = (
    df1
    .union(df2)
    .union(df3)
)

# Remove the repeated rows and order by id for display.
df_union.dropDuplicates().orderBy("id").show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|   Anil| 27|
|  2|sandeep| 28|
|  3|   riya| 29|
|  4|   rani| 26|
|  5|   liya| 29|
|  6|   mani| 26|
+---+-------+---+



In [0]:
# Employee rows: (empId, empName, deptId, empSalary, cityId).
# Contains repeated employees and None values in deptId / cityId.
# NOTE: emp_data and emp_schema are rebound here, replacing the employee rows and
# StructType schema defined in an earlier cell.
emp_data = [
    (1, "John", 1, 50000, 1),
    (2, "Alice", 2, 60000, 2),
    (3, "Bob", 3, 55000, 2),
    (2, "Alice", 2, 60000, 2),
    (3, "Bob", 3, 55000, 2),
    (4, "Jane", 4, 52000, 3),
    (5, "Eve", None, 48000, 4),
    (6, "Charlie", 4, 47000, None),
    (7, "David", 2, 55000, 3),
    (8, "Linda", 3, 53000, 1),
    (9, "Frank", None, 59000, 4),
    (7, "David", 2, 55000, 3),
    (8, "Linda", 3, 53000, None),
    (9, "Frank", 1, 59000, 4),
    (10, "Grace", 1, 49000, None),
    (10, "Grace", 1, 49000, 4),
]
emp_schema = ["empId", "empName", "deptId", "empSalary", "cityId"]

# Department lookup: (deptId, deptName)
dept_data = [
    (1, "HR"),
    (2, "IT"),
    (3, "Sales"),
    (4, "Finance"),
]
dept_schema = ["deptId", "deptName"]

# City lookup: (cityId, cityName)
address_data = [
    (1, "hyd"),
    (2, "blr"),
    (3, "chn"),
    (4, "kkt"),
]
add_schema = ["cityId", "cityName"]

# Build and show one DataFrame per input; each label is printed before its table.
print("emp_df :")
emp_df = spark.createDataFrame(emp_data, schema=emp_schema)
emp_df.show()

print("dept_df :")
dept_df = spark.createDataFrame(dept_data, schema=dept_schema)
dept_df.show()

print("address_df :")
address_df = spark.createDataFrame(address_data, schema=add_schema)
address_df.show()

emp_df :
+-----+-------+------+---------+------+
|empId|empName|deptId|empSalary|cityId|
+-----+-------+------+---------+------+
|    1|   John|     1|    50000|     1|
|    2|  Alice|     2|    60000|     2|
|    3|    Bob|     3|    55000|     2|
|    2|  Alice|     2|    60000|     2|
|    3|    Bob|     3|    55000|     2|
|    4|   Jane|     4|    52000|     3|
|    5|    Eve|  NULL|    48000|     4|
|    6|Charlie|     4|    47000|  NULL|
|    7|  David|     2|    55000|     3|
|    8|  Linda|     3|    53000|     1|
|    9|  Frank|  NULL|    59000|     4|
|    7|  David|     2|    55000|     3|
|    8|  Linda|     3|    53000|  NULL|
|    9|  Frank|     1|    59000|     4|
|   10|  Grace|     1|    49000|  NULL|
|   10|  Grace|     1|    49000|     4|
+-----+-------+------+---------+------+

dept_df :
+------+--------+
|deptId|deptName|
+------+--------+
|     1|      HR|
|     2|      IT|
|     3|   Sales|
|     4| Finance|
+------+--------+

address_df :
+------+--------+
|cit

In [0]:
## join types : inner, left, right, full, cross, self

from pyspark.sql.functions import col, current_timestamp

# Inner join: employees -> departments (on deptId) -> addresses (on cityId),
# restricted to employees located in "chn".
df_inner = (
    emp_df.join(dept_df, emp_df.deptId == dept_df.deptId, "inner")
    .join(address_df, emp_df.cityId == address_df.cityId, "inner")
    # Both sides carry the join keys; drop the employee-side copies so each key
    # column appears only once in the result.
    .drop(emp_df.cityId, emp_df.deptId)
    # Use the column's exact name (cityName) so the filter does not depend on
    # case-insensitive column resolution.
    .filter(col("cityName") == "chn")
    .dropDuplicates()
    .withColumn("createdAt", current_timestamp())
    # Sort last so the displayed order does not rely on later steps preserving it.
    .sort("empId")
)


df_inner.show(truncate=False)

+-----+-------+---------+------+--------+------+--------+-----------------------+
|empId|empName|empSalary|deptId|deptName|cityId|cityName|createdAt              |
+-----+-------+---------+------+--------+------+--------+-----------------------+
|4    |Jane   |52000    |4     |Finance |3     |chn     |2024-10-14 06:41:21.463|
|7    |David  |55000    |2     |IT      |3     |chn     |2024-10-14 06:41:21.463|
+-----+-------+---------+------+--------+------+--------+-----------------------+



In [0]:
# Full outer join: employees <-> departments (on deptId) <-> addresses (on cityId).
df_full = (
    emp_df.join(dept_df, emp_df.deptId == dept_df.deptId, "full")
    .join(address_df, emp_df.cityId == address_df.cityId, "full")
    # Drop the employee-side join keys; the department/address copies remain.
    .drop(emp_df.cityId, emp_df.deptId)
    # NOTE(review): emp_data holds rows that share an empId but differ in
    # deptId / cityId; de-duplicating on empId alone keeps only one of them, and
    # which one survives is presumably not guaranteed - confirm this is acceptable.
    .dropDuplicates(["empId"])
    # Use the column's exact name (cityName) so the filter does not depend on
    # case-insensitive column resolution. Rows with a NULL cityName are filtered out.
    .filter(col("cityName") == "chn")
    .withColumn("createdAt", current_timestamp())
    # Sort last so the displayed order does not rely on later steps preserving it.
    .sort("empId")
)


df_full.show(truncate=False)

+-----+-------+---------+------+--------+------+--------+-----------------------+
|empId|empName|empSalary|deptId|deptName|cityId|cityName|createdAt              |
+-----+-------+---------+------+--------+------+--------+-----------------------+
|4    |Jane   |52000    |4     |Finance |3     |chn     |2024-10-14 06:45:46.018|
|7    |David  |55000    |2     |IT      |3     |chn     |2024-10-14 06:45:46.018|
+-----+-------+---------+------+--------+------+--------+-----------------------+

