In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Column types used by the schema below.
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType, BinaryType, FloatType

# Reuse the active Spark session if there is one; otherwise start a new one
# registered under the application name "PySparkExamples".
spark = (
    SparkSession.builder
    .appName("PySparkExamples")
    .getOrCreate()
)

# Schema for the sample employee records. Every column is nullable.
# The columns are grouped by the family of functions they are meant to exercise.
schema = (
    StructType()
    # Scalar attributes
    .add("Name", StringType(), True)
    .add("Age", IntegerType(), True)
    .add("Salary", FloatType(), True)
    .add("City", StringType(), True)
    # The join date is deliberately kept as a "yyyy-MM-dd" string, not a DateType.
    .add("JoinDate", StringType(), True)
    # Array columns
    .add("Skills1", ArrayType(StringType()), True)
    .add("Skills2", ArrayType(StringType()), True)
    .add("Scores", ArrayType(IntegerType()), True)
    # Contact details (some rows leave these as None)
    .add("Phone", StringType(), True)
    .add("Email", StringType(), True)
    # Body measurements
    .add("Height", FloatType(), True)
    .add("Weight", FloatType(), True)
    # Raw bytes and an integer identifier
    .add("BinaryData", BinaryType(), True)
    .add("ID", IntegerType(), True)
)

# Three sample rows, one tuple per person, with values in schema column order.
# Alice has no phone number and Bob has no email, so both columns contain a null.
data = [
    (
        "Alice", 30, 60000.50, "New York", "2020-01-15",
        ["Python", "SQL"], ["SQL", "Java"], [90, 85, 88],
        None, "alice@example.com",
        165.2, 60.5, b"AliceBinary", 1,
    ),
    (
        "Bob", 25, 50000.75, "London", "2019-07-21",
        ["Java", "C++"], ["Python", "C++"], [78, 80, 82],
        "1234567890", None,
        170.4, 68.2, b"BobBinary", 2,
    ),
    (
        "Charlie", 35, 75000.00, "San Francisco", "2018-05-10",
        ["JavaScript", "Scala"], ["Scala", "Rust"], [95, 92, 89],
        "0987654321", "charlie@example.com",
        180.3, 75.1, b"CharlieBinary", 3,
    ),
]

# Materialise the rows as a DataFrame with the explicit schema, then print it
# without truncating wide cells so the array and binary columns are fully visible.
df = spark.createDataFrame(data, schema)
df.show(truncate=False)

# # 🔹 Numeric Functions
# df.select("Name", "Age", abs(df["Age"]).alias("AbsoluteAge")).show()
# df.select("Name", "Salary", ceil(df["Salary"]).alias("CeilingSalary")).show()
# df.select("Name", "Age", cbrt(df["Age"]).alias("CubeRootAge")).show()
# df.select("Name", "Age", acos(df["Age"]/100).alias("AcosAge")).show()
# df.select("Name", "Age", asin(df["Age"]/100).alias("AsinAge")).show()
# df.select("Name", "Age", atan(df["Age"]/100).alias("AtanAge")).show()
# df.select("Name", "Height", "Weight", atan2(df["Height"], df["Weight"]).alias("Atan2HeightWeight")).show()

# # 🔹 Array Functions
# df.select("Name", "Skills1", "Skills2", array_intersect(df["Skills1"], df["Skills2"]).alias("CommonSkills")).show()
# df.select("Name", "Scores", array_max(df["Scores"]).alias("MaxScore")).show()
# df.select("Name", "Scores", array_min(df["Scores"]).alias("MinScore")).show()
# df.select("Name", "Skills1", array_join(df["Skills1"], ", ").alias("SkillsAsString")).show()
# df.select("Name", "Skills1", array_repeat(df["Skills1"], 2).alias("RepeatedSkills")).show()
# df.select("Name", "Skills1", array_sort(df["Skills1"]).alias("SortedSkills")).show()
# df.select("Name", "Skills1", "Skills2", arrays_zip(df["Skills1"], df["Skills2"]).alias("ZippedSkills")).show()

# # 🔹 String Functions
# df.select("Name", ascii(df["Name"]).alias("ASCII_FirstChar")).show()
# df.select("Name", base64(df["BinaryData"]).alias("Base64Encoded")).show()

# # 🔹 Bitwise & Binary Functions
# df.select("Name", "ID", bin(df["ID"]).alias("BinaryRepresentation")).show()
# df.select("Name", "ID", bitwise_not(df["ID"]).alias("BitwiseNotID")).show()

# # 🔹 Date Functions
# df.select("Name", "JoinDate", add_months(df["JoinDate"], 3).alias("DateAfter3Months")).show()

# # 🔹 Aggregation Functions
# df.groupBy("City").agg(avg(df["Salary"]).alias("AverageSalary")).show()
# df.groupBy("City").agg(collect_list(df["Name"]).alias("PeopleInCity")).show()
# df.groupBy("City").agg(collect_set(df["Name"]).alias("UniquePeopleInCity")).show()


SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/home/glue_user/spark/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/glue_user/spark/jars/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/glue_user/aws-glue-libs/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/glue_user/aws-glue-libs/jars/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/01 23:21:53 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                       

+-------+---+--------+-------------+----------+-------------------+-------------+------------+----------+-------------------+------+------+----------------------------------------+---+
|Name   |Age|Salary  |City         |JoinDate  |Skills1            |Skills2      |Scores      |Phone     |Email              |Height|Weight|BinaryData                              |ID |
+-------+---+--------+-------------+----------+-------------------+-------------+------------+----------+-------------------+------+------+----------------------------------------+---+
|Alice  |30 |60000.5 |New York     |2020-01-15|[Python, SQL]      |[SQL, Java]  |[90, 85, 88]|null      |alice@example.com  |165.2 |60.5  |[41 6C 69 63 65 42 69 6E 61 72 79]      |1  |
|Bob    |25 |50000.75|London       |2019-07-21|[Java, C++]        |[Python, C++]|[78, 80, 82]|1234567890|null               |170.4 |68.2  |[42 6F 62 42 69 6E 61 72 79]            |2  |
|Charlie|35 |75000.0 |San Francisco|2018-05-10|[JavaScript, Scala]|[Scala, 

                                                                                

In [2]:
# Preview the DataFrame with show()'s defaults written out explicitly:
# at most 20 rows, and cell values longer than 20 characters are truncated.
df.show(n=20, truncate=True)

+-------+---+--------+-------------+----------+-------------------+-------------+------------+----------+-------------------+------+------+--------------------+---+
|   Name|Age|  Salary|         City|  JoinDate|            Skills1|      Skills2|      Scores|     Phone|              Email|Height|Weight|          BinaryData| ID|
+-------+---+--------+-------------+----------+-------------------+-------------+------------+----------+-------------------+------+------+--------------------+---+
|  Alice| 30| 60000.5|     New York|2020-01-15|      [Python, SQL]|  [SQL, Java]|[90, 85, 88]|      null|  alice@example.com| 165.2|  60.5|[41 6C 69 63 65 4...|  1|
|    Bob| 25|50000.75|       London|2019-07-21|        [Java, C++]|[Python, C++]|[78, 80, 82]|1234567890|               null| 170.4|  68.2|[42 6F 62 42 69 6...|  2|
|Charlie| 35| 75000.0|San Francisco|2018-05-10|[JavaScript, Scala]|[Scala, Rust]|[95, 92, 89]|0987654321|charlie@example.com| 180.3|  75.1|[43 68 61 72 6C 6...|  3|
+-------+-

In [None]:
# Show each person's join date next to the date three months later.
# df[...] looks a column up by name, so it must be given the "JoinDate" column;
# the previous df["2012-12-12"] asked for a column that does not exist in the
# schema and fails with an AnalysisException.
# JoinDate holds "yyyy-MM-dd" strings, which add_months accepts as a date input.
# To shift a fixed date instead of a column, pass lit("2012-12-12").
df.select(
    "Name",
    "JoinDate",
    add_months(df["JoinDate"], 3).alias("DateAfter3Months"),
).show()
