<a href="https://colab.research.google.com/github/sirishaallarapu/AdvancedPySpark-/blob/main/Day5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, BooleanType

# Obtain the notebook's Spark session: an already-active session is reused,
# otherwise a new one is started under the given application name.
spark = (
    SparkSession.builder
    .appName("SparkUDFExample")
    .getOrCreate()
)

def to_uppercase(s):
    """Return ``s`` converted to upper case.

    Args:
        s: The string to convert, or ``None``.

    Returns:
        The upper-cased string, or ``None`` when ``s`` is ``None``.
    """
    # Compare against None explicitly: a plain truthiness test would also map
    # the empty string to None instead of returning "" unchanged.
    return None if s is None else s.upper()

# Wrap the plain Python function as a Spark UDF that yields a string column.
uppercase_udf = udf(to_uppercase, returnType=StringType())

def is_prime(n):
    """Return whether ``n`` is a prime number, using trial division.

    Args:
        n: The number to test, or ``None``.

    Returns:
        ``True`` if no integer in ``[2, floor(sqrt(n))]`` divides ``n``;
        ``False`` if one does, or if ``n`` is ``None`` or smaller than 2.
    """
    import math  # local import keeps the function self-contained

    if n is None or n < 2:
        return False

    # math.isqrt is exact for arbitrarily large integers, whereas
    # int(n ** 0.5) goes through a float: it can be off by one above 2**53
    # and raises OverflowError for ints too large for a float.
    limit = math.isqrt(int(n))

    # n is prime exactly when every candidate divisor leaves a remainder.
    return all(n % divisor for divisor in range(2, limit + 1))

# Wrap the primality check as a Spark UDF that yields a boolean column.
is_prime_udf = udf(is_prime, returnType=BooleanType())

# Apply the upper-casing UDF to a small single-column frame of words.
df1 = (
    spark.createDataFrame([("hello",), ("world",)], ["text"])
    .withColumn("uppercase_text", uppercase_udf("text"))
)
df1.show()

# Apply the primality UDF to a handful of integers.
df2 = (
    spark.createDataFrame([(2,), (4,), (7,), (9,), (11,)], ["number"])
    .withColumn("is_prime", is_prime_udf("number"))
)
df2.show()


+-----+--------------+
| text|uppercase_text|
+-----+--------------+
|hello|         HELLO|
|world|         WORLD|
+-----+--------------+

+------+--------+
|number|is_prime|
+------+--------+
|     2|    true|
|     4|   false|
|     7|    true|
|     9|   false|
|    11|    true|
+------+--------+



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr
from pyspark.sql.types import StringType, BooleanType

# Reuse the active session if there is one; otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("Spark4Features")
    .getOrCreate()
)

# Single-column frame (`id`) holding the integers 0 through 4.
df1 = spark.range(0, 5)
df1.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



In [None]:
# One value that parses as an integer and one that does not.
df2 = spark.createDataFrame([("10",), ("abc",)], ["value"])

# TRY_CAST yields NULL for input that cannot be converted. A plain CAST only
# does so when ANSI mode is off; with ANSI mode on (the Spark 4.0 default) it
# raises an error for "abc". The alias keeps the result column named `value`.
df2.selectExpr("TRY_CAST(value AS INT) AS value").show()

+-----+
|value|
+-----+
|   10|
| NULL|
+-----+



In [None]:
# Streaming frame from the built-in "rate" source, configured for one row per
# second; the `timestamp` and `value` columns are selected below.
df3 = spark.readStream.format("rate").option("rowsPerSecond", 1).load()

# `value % 2 = 0` makes `is_even` a real boolean; the bare remainder
# `value % 2` would be 1 for *odd* values, the opposite of what the name says.
rate_query = (
    df3.selectExpr("timestamp", "value", "value % 2 = 0 AS is_even")
    .writeStream
    .format("console")
    .start()
)

# Let the query run for a few seconds, then always stop it so no streaming
# query is left running in the background of the notebook kernel.
try:
    rate_query.awaitTermination(5)  # timeout is in seconds
finally:
    rate_query.stop()

<pyspark.sql.streaming.query.StreamingQuery at 0x7f927babc1d0>

In [None]:
# Names with mixed-case initials, to show how ascending string order treats
# them: in the output below, the capitalised names sort before "puffy".
df4 = spark.createDataFrame(
    [("Zimmy",), ("Snoopi",), ("puffy",)],
    ["name"],
)
df4.sort(col("name").asc()).show()


+------+
|  name|
+------+
|Snoopi|
| Zimmy|
| puffy|
+------+



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr

# Reuse the active session if there is one; otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("Spark4Features")
    .getOrCreate()
)

# Each row carries a single Python dict in the `data` column.
df5 = spark.createDataFrame(
    [({"key": "value"},), ({"number": 10},)],
    ["data"],
)

# Add a second column holding the string rendering of `data`.
df5.withColumn("data_as_string", col("data").cast("string")).show()


+--------------+--------------+
|          data|data_as_string|
+--------------+--------------+
|{key -> value}|{key -> value}|
|{number -> 10}|{number -> 10}|
+--------------+--------------+



In [None]:
# Same upper-casing as before, but with Spark's built-in SQL `upper` function
# rather than a Python UDF.
df6 = spark.createDataFrame(
    [("hello",), ("world",)],
    ["text"],
)
df6.select(expr("upper(text)")).show()


+-----------+
|upper(text)|
+-----------+
|      HELLO|
|      WORLD|
+-----------+



In [None]:
# Run a literal SQL query. The `version` value is a hard-coded string, not the
# version reported by the running Spark session.
df7 = spark.sql(
    "SELECT 'Spark 4.0' AS version"
)
df7.show()

+---------+
|  version|
+---------+
|Spark 4.0|
+---------+

