In [None]:
pip install pyspark
from pyspark.sql import SparkSession
spark=SparkSession.builder.getOrCreate()

In [3]:
# Student records: (student_name, id, salary, address).
data1 = [
    ("Alice", 1, 1000, "Address1"),
    ("Bob", 2, 2000, "Address2"),
    ("Charlie", 3, 3000, "Address3"),
    ("David", 4, 4000, "Address4"),
    ("Eve", 5, 5000, "Address5"),
]

# Per-student details keyed by the same id: (id, age, married_status).
data2 = [
    (1, 25, "Married"),
    (2, 30, "Single"),
    (3, 28, "Married"),
    (4, 32, "Single"),
    (5, 27, "Married"),
]

In [4]:
# Build one DataFrame per dataset; the list passed as the schema supplies the
# column names.
df1 = spark.createDataFrame(
    data1,
    schema=["student_name", "id", "salary", "address"],
)
df2 = spark.createDataFrame(
    data2,
    schema=["id", "age", "married_status"],
)


In [24]:
# Join the two frames on their shared "id" column and display the result.
joined_df = df1.join(df2, on="id")
joined_df.show()

+---+------------+------+--------+---+--------------+
| id|student_name|salary| address|age|married_status|
+---+------------+------+--------+---+--------------+
|  1|       Alice|  1000|Address1| 25|       Married|
|  2|         Bob|  2000|Address2| 30|        Single|
|  3|     Charlie|  3000|Address3| 28|       Married|
|  4|       David|  4000|Address4| 32|        Single|
|  5|         Eve|  5000|Address5| 27|       Married|
+---+------------+------+--------+---+--------------+



# <mark>_<u>**Broadcast Join**</u>_</mark>

In [25]:
# Department lookup rows: (id, department).
data3 = [
    (1, "DeptA"),
    (2, "DeptB"),
    (3, "DeptC"),
    (4, "DeptD"),
    (5, "DeptE"),
]

df3 = spark.createDataFrame(data3, schema=["id", "department"])

# Join students with their details, then attach the department frame with a
# "broadcast" hint on it.
joined_df = (
    df1
    .join(df2, "id")
    .join(df3.hint("broadcast"), "id")
)

joined_df.show()

+---+------------+------+--------+---+--------------+----------+
| id|student_name|salary| address|age|married_status|department|
+---+------------+------+--------+---+--------------+----------+
|  1|       Alice|  1000|Address1| 25|       Married|     DeptA|
|  2|         Bob|  2000|Address2| 30|        Single|     DeptB|
|  3|     Charlie|  3000|Address3| 28|       Married|     DeptC|
|  4|       David|  4000|Address4| 32|        Single|     DeptD|
|  5|         Eve|  5000|Address5| 27|       Married|     DeptE|
+---+------------+------+--------+---+--------------+----------+



# <mark>_<u>**Distinct Function**</u>_</mark>

In [27]:
# Drop rows that are duplicated across all columns, then report how many
# rows remain and display them without truncating cell values.
distinctDF = joined_df.distinct()
distinct_total = distinctDF.count()
print("Distinct count: " + str(distinct_total))
distinctDF.show(truncate=False)

Distinct count: 5
+---+------------+------+--------+---+--------------+----------+
|id |student_name|salary|address |age|married_status|department|
+---+------------+------+--------+---+--------------+----------+
|1  |Alice       |1000  |Address1|25 |Married       |DeptA     |
|2  |Bob         |2000  |Address2|30 |Single        |DeptB     |
|3  |Charlie     |3000  |Address3|28 |Married       |DeptC     |
|4  |David       |4000  |Address4|32 |Single        |DeptD     |
|5  |Eve         |5000  |Address5|27 |Married       |DeptE     |
+---+------------+------+--------+---+--------------+----------+



# <mark>_<u>**dropDuplicates**</u>_</mark>

In [28]:
# dropDuplicates() with no arguments removes rows duplicated across all
# columns. The result is bound to its own name rather than to `df2`: `df2` is
# the age/marital-status frame that the join cells read, so overwriting it
# here would make those cells join against the wrong data when re-run.
dedupedDF = joined_df.dropDuplicates()
print("Distinct count: " + str(dedupedDF.count()))
dedupedDF.show(truncate=False)

Distinct count: 5


[Stage 57:>                                                         (0 + 2) / 2]                                                                                

# <mark>_<u>**dropDuplicates_2_Columns**</u>_</mark>

In [29]:
# Keep one row per (department, salary) combination.
dedup_keys = ["department", "salary"]
dropDisDF = joined_df.dropDuplicates(dedup_keys)

remaining_rows = dropDisDF.count()
print("Distinct count of department & salary : " + str(remaining_rows))
dropDisDF.show(truncate=False)

Distinct count of department & salary : 5
+---+------------+------+--------+---+--------------+----------+
|id |student_name|salary|address |age|married_status|department|
+---+------------+------+--------+---+--------------+----------+
|1  |Alice       |1000  |Address1|25 |Married       |DeptA     |
|2  |Bob         |2000  |Address2|30 |Single        |DeptB     |
|3  |Charlie     |3000  |Address3|28 |Married       |DeptC     |
|4  |David       |4000  |Address4|32 |Single        |DeptD     |
|5  |Eve         |5000  |Address5|27 |Married       |DeptE     |
+---+------------+------+--------+---+--------------+----------+



# <mark>_<u>**repartition**</u>_</mark>

In [30]:
# Redistribute the joined frame into 6 partitions and confirm the count.
# The result gets its own name instead of reusing `df2`, which is the
# age/marital-status frame consumed by the join cells; overwriting it would
# break those cells when they are re-run.
repartitionedDF = joined_df.repartition(6)
print(repartitionedDF.rdd.getNumPartitions())

6


# <mark>_<u>**UDF Example**</u>_</mark>

In [32]:
# Sample rows for the UDF demo: (Seqno as text, lower-case full name).
columns = ["Seqno", "Name"]
data = [
    ("1", "john jones"),
    ("2", "tracey smith"),
    ("3", "amy sanders"),
]

df = spark.createDataFrame(data, schema=columns)

# Show the untouched input so it can be compared with the converted output.
df.show(truncate=False)

from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType

def convertCase(str):
    """Upper-case the first letter of every space-separated word.

    Only the first character of each word is changed; the remaining
    characters keep their original case.

    Args:
        str: The text to convert, or None. The parameter name shadows the
            builtin ``str``; it is kept so existing keyword callers keep
            working.

    Returns:
        The converted text, with the original single-space separators
        preserved and no trailing space, or None when the input is None.
    """
    # Pass None through (e.g. a null column value when used as a Spark UDF)
    # instead of failing on None.split().
    if str is None:
        return None
    # Joining the words avoids the trailing " " that appending
    # `word + " "` in a loop leaves behind; word[:1] is safe on empty words.
    return " ".join(word[:1].upper() + word[1:] for word in str.split(" "))

# Register convertCase under the SQL name "convertUDF" with a string return
# type, then expose the DataFrame as a temp view so SQL can query it.
spark.udf.register("convertUDF", convertCase, StringType())
df.createOrReplaceTempView("NAME_TABLE")

# Run the UDF from SQL and display the converted names untruncated.
converted_names = spark.sql("select Seqno, convertUDF(Name) as Name from NAME_TABLE")
converted_names.show(truncate=False)

+-----+------------+
|Seqno|Name        |
+-----+------------+
|1    |john jones  |
|2    |tracey smith|
|3    |amy sanders |
+-----+------------+

+-----+-------------+
|Seqno|Name         |
+-----+-------------+
|1    |John Jones   |
|2    |Tracey Smith |
|3    |Amy Sanders  |
+-----+-------------+



[Stage 94:>                                                         (0 + 1) / 1]