#Spark DF Creation with List

In [0]:
emp_data = [("Sagar","M","70000"),
            ("Bhavya","F","100000"),
            ("Krishna","M","100000"),
            ("Prabha","F","90000"),
            ("Romeo",None,"50000")]
columns = ["name" ,"gender", "Salary"]
emp_df = spark.createDataFrame(data = emp_data, schema = columns)

emp_df.show()

+-------+------+------+
|   name|gender|Salary|
+-------+------+------+
|  Sagar|     M| 70000|
| Bhavya|     F|100000|
|Krishna|     M|100000|
| Prabha|     F| 90000|
|  Romeo|  null| 50000|
+-------+------+------+



#When and Otherwise

In [0]:
from pyspark.sql.functions import when

emp_updated_df = (emp_df.withColumn("sex", when(emp_df.gender == "M", "Male")
                  .when(emp_df.gender == "F", "Female")
                  .when(emp_df.gender.isNull(), "")
                  .otherwise(emp_df.gender)))
emp_updated_df.show()

+-------+------+------+------+
|   name|gender|Salary|   sex|
+-------+------+------+------+
|  Sagar|     M| 70000|  Male|
| Bhavya|     F|100000|Female|
|Krishna|     M|100000|  Male|
| Prabha|     F| 90000|Female|
|  Romeo|  null| 50000|      |
+-------+------+------+------+



In [0]:
emp_new_df = emp_updated_df.drop("gender")

In [0]:
emp_new_df.show()

+-------+------+------+
|   name|Salary|   sex|
+-------+------+------+
|  Sagar| 70000|  Male|
| Bhavya|100000|Female|
|Krishna|100000|  Male|
| Prabha| 90000|Female|
|  Romeo| 50000|      |
+-------+------+------+



#expr

In [0]:
from pyspark.sql.functions import expr

new_df = emp_df.withColumn("sex", expr("""CASE WHEN gender = 'M' THEN "male"
                                       WHEN gender = 'F' THEN "female"
                                       WHEN gender IS NULL THEN ''
                                       ELSE gender
                                       END"""))

new_df.show()

+-------+------+------+------+
|   name|gender|Salary|   sex|
+-------+------+------+------+
|  Sagar|     M| 70000|  male|
| Bhavya|     F|100000|female|
|Krishna|     M|100000|  male|
| Prabha|     F| 90000|female|
|  Romeo|  null| 50000|      |
+-------+------+------+------+



#lit

In [0]:
from pyspark.sql.functions import lit

new_df = emp_df.withColumn("Organization", lit("Facebook"))
new_df.show()

+-------+------+------+------------+
|   name|gender|Salary|Organization|
+-------+------+------+------------+
|  Sagar|     M| 70000|    Facebook|
| Bhavya|     F|100000|    Facebook|
|Krishna|     M|100000|    Facebook|
| Prabha|     F| 90000|    Facebook|
|  Romeo|  null| 50000|    Facebook|
+-------+------+------+------------+



#Split

In [0]:
from pyspark.sql.functions import split

emp_data = [("Siva,Sagar,Kolachina","M","70000"),
            ("Bhavya,Sree,Kolachina","F","100000"),
            ("Krishna,Mohan,Kolachina","M","100000"),
            ("Chandra,Prabha,Kolachina","F","90000"),
            ("Romeo,Mario,Antonio",None,"50000")]
col = ["name", "gender","salary"]
emp_df = spark.createDataFrame(data= emp_data, schema = col)
emp_df.show()

+--------------------+------+------+
|                name|gender|salary|
+--------------------+------+------+
|Siva,Sagar,Kolachina|     M| 70000|
|Bhavya,Sree,Kolac...|     F|100000|
|Krishna,Mohan,Kol...|     M|100000|
|Chandra,Prabha,Ko...|     F| 90000|
| Romeo,Mario,Antonio|  null| 50000|
+--------------------+------+------+



In [0]:
new_df =(emp_df
         .withColumn("First_name",split("name",",")[0])
         .withColumn("Middle_name", split("name", ",")[1])
         .withColumn("Last_name",split("name",",")[2])
         )

new_df.show()

+--------------------+------+------+----------+-----------+---------+
|                name|gender|salary|First_name|Middle_name|Last_name|
+--------------------+------+------+----------+-----------+---------+
|Siva,Sagar,Kolachina|     M| 70000|      Siva|      Sagar|Kolachina|
|Bhavya,Sree,Kolac...|     F|100000|    Bhavya|       Sree|Kolachina|
|Krishna,Mohan,Kol...|     M|100000|   Krishna|      Mohan|Kolachina|
|Chandra,Prabha,Ko...|     F| 90000|   Chandra|     Prabha|Kolachina|
| Romeo,Mario,Antonio|  null| 50000|     Romeo|      Mario|  Antonio|
+--------------------+------+------+----------+-----------+---------+



#explode

In [0]:
emp_data = [("Sagar","M",["python", "sql"]),
            ("Bhavya","F",["C","java"]),
            ("Krishna","M",["c#","c++"]),
            ("Prabha","F",["python"])]
col = ["name","gender", "languages"]

emp_df = spark.createDataFrame(data = emp_data, schema = col)
emp_df.show()

+-------+------+-------------+
|   name|gender|    languages|
+-------+------+-------------+
|  Sagar|     M|[python, sql]|
| Bhavya|     F|    [C, java]|
|Krishna|     M|    [c#, c++]|
| Prabha|     F|     [python]|
+-------+------+-------------+



In [0]:
from pyspark.sql.functions import explode

new_df =emp_df.select(explode("languages"))
new_df.show()


+------+
|   col|
+------+
|python|
|   sql|
|     C|
|  java|
|    c#|
|   c++|
|python|
+------+



In [0]:
emp_data = [("Sagar","Marketing","70000"),
            ("Bhavya","Finance","100000"),
            ("Krishna","Finance","100000"),
            ("Prabha","Sales","90000"),
            ("Romeo","Marketing","50000")]
schema = ["Emp_name", "department", "salary"]

emp_df = spark.createDataFrame(data = emp_data, schema = schema)
emp_df.show()

+--------+----------+------+
|Emp_name|department|salary|
+--------+----------+------+
|   Sagar| Marketing| 70000|
|  Bhavya|   Finance|100000|
| Krishna|   Finance|100000|
|  Prabha|     Sales| 90000|
|   Romeo| Marketing| 50000|
+--------+----------+------+



# collect_list and collect_set

In [0]:
from pyspark.sql.functions import collect_list, collect_set

display(emp_df.select(collect_list("department").alias("department_list")))

department_list
"List(Marketing, Finance, Finance, Sales, Marketing)"


In [0]:
display(emp_df.select(collect_set("department").alias("department_set")))

department_set
"List(Finance, Sales, Marketing)"


#countDistinct, count

In [0]:
from pyspark.sql.functions import countDistinct, count

display(emp_df.select(count("department")))

display(emp_df.select(countDistinct("department")))


count(department)
5


count(DISTINCT department)
3


#first and last

In [0]:
from pyspark.sql.functions import first, last
display(emp_df.select(first("salary")))

display(emp_df.select(last("salary")))

first(salary)
70000


last(salary)
50000


In [0]:
from pyspark.sql.functions import max, min

display(emp_df.select(max("salary")))

display(emp_df.select(min("salary")))


max(salary)
90000


min(salary)
100000


#sum

In [0]:
from pyspark.sql.functions import sum

display(emp_df.select(sum("salary")))

sum(salary)
410000.0


#rank
Second highes salary in each department

In [0]:
emp_data = [("Sagar","Marketing","70000"),
            ("Bhavya","Finance","100000"),
            ("Krishna","Finance","150000"),
            ("Prabha","Sales","90000"),
            ("siva","Sales","50000"),
            ("Chile","Marketing","40000"),
            ("jeff","analyst","70000"),
            ("Romeo","Marketing","50000"),
            ("Dan","Marketing","50000")]

schema = ["Emp_name", "department", "salary"]

emp_df = spark.createDataFrame(data = emp_data, schema = schema)
emp_df.show()

+--------+----------+------+
|Emp_name|department|salary|
+--------+----------+------+
|   Sagar| Marketing| 70000|
|  Bhavya|   Finance|100000|
| Krishna|   Finance|150000|
|  Prabha|     Sales| 90000|
|    siva|     Sales| 50000|
|   Chile| Marketing| 40000|
|    jeff|   analyst| 70000|
|   Romeo| Marketing| 50000|
|     Dan| Marketing| 50000|
+--------+----------+------+



In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import desc, rank

windowSpec = Window.partitionBy("department").orderBy(desc("salary") )
display(windowSpec)

<pyspark.sql.window.WindowSpec at 0x7fd37660d520>

In [0]:
new_df = emp_df.withColumn("rank",rank().over(windowSpec))
new_df.show()

+--------+----------+------+----+
|Emp_name|department|salary|rank|
+--------+----------+------+----+
| Krishna|   Finance|150000|   1|
|  Bhavya|   Finance|100000|   2|
|   Sagar| Marketing| 70000|   1|
|   Romeo| Marketing| 50000|   2|
|     Dan| Marketing| 50000|   2|
|   Chile| Marketing| 40000|   4|
|  Prabha|     Sales| 90000|   1|
|    siva|     Sales| 50000|   2|
|    jeff|   analyst| 70000|   1|
+--------+----------+------+----+



filter

In [0]:
final_df = new_df.filter("rank = 2").drop("rank")
final_df.show()

+--------+----------+------+
|Emp_name|department|salary|
+--------+----------+------+
|  Bhavya|   Finance|100000|
|   Romeo| Marketing| 50000|
|     Dan| Marketing| 50000|
|    siva|     Sales| 50000|
+--------+----------+------+



#dense_rank

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import desc, dense_rank

windowSpec = Window.partitionBy("department").orderBy(desc("salary") )
display(windowSpec)

<pyspark.sql.window.WindowSpec at 0x7fd374fd4100>

In [0]:
new_df = emp_df.withColumn("dense_rank",dense_rank().over(windowSpec))
display(new_df)

Emp_name,department,salary,dense_rank
Krishna,Finance,150000,1
Bhavya,Finance,100000,2
Sagar,Marketing,70000,1
Romeo,Marketing,50000,2
Dan,Marketing,50000,2
Chile,Marketing,40000,3
Prabha,Sales,90000,1
siva,Sales,50000,2
jeff,analyst,70000,1


#row_number

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import desc, row_number

windowSpec = Window.partitionBy("department").orderBy(desc("salary") )
display(windowSpec)

<pyspark.sql.window.WindowSpec at 0x7fd37c180790>

In [0]:
new_df = emp_df.withColumn("row_number",row_number().over(windowSpec))
display(new_df)

Emp_name,department,salary,row_number
Krishna,Finance,150000,1
Bhavya,Finance,100000,2
Sagar,Marketing,70000,1
Romeo,Marketing,50000,2
Dan,Marketing,50000,3
Chile,Marketing,40000,4
Prabha,Sales,90000,1
siva,Sales,50000,2
jeff,analyst,70000,1


#lag

In [0]:
from pyspark.sql.functions import lag

new_df = emp_df.withColumn("lag",lag("salary",1).over(windowSpec))
display(new_df)

Emp_name,department,salary,lag
Krishna,Finance,150000,
Bhavya,Finance,100000,150000.0
Sagar,Marketing,70000,
Romeo,Marketing,50000,70000.0
Dan,Marketing,50000,50000.0
Chile,Marketing,40000,50000.0
Prabha,Sales,90000,
siva,Sales,50000,90000.0
jeff,analyst,70000,


#lead

In [0]:
from pyspark.sql.functions import lead

new_df = emp_df.withColumn("lead",lead("salary",1).over(windowSpec))
display(new_df)

Emp_name,department,salary,lead
Krishna,Finance,150000,100000.0
Bhavya,Finance,100000,
Sagar,Marketing,70000,50000.0
Romeo,Marketing,50000,50000.0
Dan,Marketing,50000,40000.0
Chile,Marketing,40000,
Prabha,Sales,90000,50000.0
siva,Sales,50000,
jeff,analyst,70000,


#partitionBy

In [0]:
type(emp_df.write)

Out[71]: pyspark.sql.readwriter.DataFrameWriter

In [0]:
(emp_df
 .write
 .format("csv")
 .option("header","true")
 .partitionBy("department")
 .save("/FileStore/tables/partition_example"))

In [0]:
%fs ls /FileStore/tables/partition_example

path,name,size,modificationTime
dbfs:/FileStore/tables/partition_example/_SUCCESS,_SUCCESS,0,1708491636000
dbfs:/FileStore/tables/partition_example/department=Finance/,department=Finance/,0,0
dbfs:/FileStore/tables/partition_example/department=Marketing/,department=Marketing/,0,0
dbfs:/FileStore/tables/partition_example/department=Sales/,department=Sales/,0,0
dbfs:/FileStore/tables/partition_example/department=analyst/,department=analyst/,0,0


In [0]:
%fs ls dbfs:/FileStore/tables/partition_example/department=Marketing/

path,name,size,modificationTime
dbfs:/FileStore/tables/partition_example/department=Marketing/_SUCCESS,_SUCCESS,0,1708491635000
dbfs:/FileStore/tables/partition_example/department=Marketing/_committed_6056881838087723233,_committed_6056881838087723233,291,1708491635000
dbfs:/FileStore/tables/partition_example/department=Marketing/_started_6056881838087723233,_started_6056881838087723233,0,1708491634000
dbfs:/FileStore/tables/partition_example/department=Marketing/part-00000-tid-6056881838087723233-536c4efb-6780-4ea6-b85c-dff1517090b2-341-1.c000.csv,part-00000-tid-6056881838087723233-536c4efb-6780-4ea6-b85c-dff1517090b2-341-1.c000.csv,28,1708491635000
dbfs:/FileStore/tables/partition_example/department=Marketing/part-00005-tid-6056881838087723233-536c4efb-6780-4ea6-b85c-dff1517090b2-346-1.c000.csv,part-00005-tid-6056881838087723233-536c4efb-6780-4ea6-b85c-dff1517090b2-346-1.c000.csv,28,1708491635000
dbfs:/FileStore/tables/partition_example/department=Marketing/part-00007-tid-6056881838087723233-536c4efb-6780-4ea6-b85c-dff1517090b2-348-1.c000.csv,part-00007-tid-6056881838087723233-536c4efb-6780-4ea6-b85c-dff1517090b2-348-1.c000.csv,38,1708491635000


In [0]:
%fs head dbfs:/FileStore/tables/partition_example/department=Marketing/part-00000-tid-6056881838087723233-536c4efb-6780-4ea6-b85c-dff1517090b2-341-1.c000.csv

#Student Highest Marks

In [0]:
student_data = [("Sagar","85,81,82"),
            ("Bhavya","89,95,100"),
            ("Krishna","95,96,99"),
            ("Prabha","94,91,92")]
col = ["name","marks"]

student_df = spark.createDataFrame(data = student_data, schema = col)
student_df.show()

+-------+---------+
|   name|    marks|
+-------+---------+
|  Sagar| 85,81,82|
| Bhavya|89,95,100|
|Krishna| 95,96,99|
| Prabha| 94,91,92|
+-------+---------+



In [0]:
from pyspark.sql.functions import split, explode, col

new_df = student_df.select("name", explode(split("marks",",")).alias("mark"))
display(new_df)

name,mark
Sagar,85
Sagar,81
Sagar,82
Bhavya,89
Bhavya,95
Bhavya,100
Krishna,95
Krishna,96
Krishna,99
Prabha,94


In [0]:
new_df.printSchema()

root
 |-- name: string (nullable = true)
 |-- mark: string (nullable = false)



In [0]:
dtype_df = new_df.select("name", col("mark").cast("int"))
display(dtype_df)

name,mark
Sagar,85
Sagar,81
Sagar,82
Bhavya,89
Bhavya,95
Bhavya,100
Krishna,95
Krishna,96
Krishna,99
Prabha,94


In [0]:
dtype_df.printSchema()

root
 |-- name: string (nullable = true)
 |-- mark: integer (nullable = true)



In [0]:
final_df = dtype_df.groupBy("name").max("mark")
display(final_df)

name,max(mark)
Sagar,85
Bhavya,100
Krishna,99
Prabha,94


#Department wise Salary

In [0]:
emp_data = [("Sagar","Marketing",70000),
            ("Bhavya","Finance",100000),
            ("Krishna","Finance",150000),
            ("Prabha","Sales",90000),
            ("siva","Sales",50000),
            ("Chile","Marketing",40000),
            ("jeff","analyst",70000),
            ("Romeo","Marketing",50000),
            ("Dan","Marketing",50000)]

schema = ["Emp_name", "department", "salary"]

emp_df = spark.createDataFrame(data = emp_data, schema = schema)
emp_df.show()

+--------+----------+------+
|Emp_name|department|salary|
+--------+----------+------+
|   Sagar| Marketing| 70000|
|  Bhavya|   Finance|100000|
| Krishna|   Finance|150000|
|  Prabha|     Sales| 90000|
|    siva|     Sales| 50000|
|   Chile| Marketing| 40000|
|    jeff|   analyst| 70000|
|   Romeo| Marketing| 50000|
|     Dan| Marketing| 50000|
+--------+----------+------+



In [0]:
final_df = emp_df.groupBy("department").sum("salary")
display(final_df)

department,sum(salary)
Marketing,210000
Finance,250000
Sales,140000
analyst,70000
