In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import initcap
from pyspark.sql.functions import split, col, expr

# Start (or reuse) the Spark session that every later cell relies on
spark = (
    SparkSession.builder
    .appName("SampleData")
    .getOrCreate()
)

# Lower-case sample names, one single-field tuple per row
data = [
    ("virat kohli",),
    ("p v sindhu",),
]

# The DataFrame has a single string column
columns = ["name"]

# Build the DataFrame from the rows and column names above
df = spark.createDataFrame(data, schema=columns)

# Render the raw names
df.display()

name
virat kohli
p v sindhu


In [0]:
# Capitalise the first letter of every word with the built-in initcap,
# keeping the column name "name"
df_capitalized = df.select(
    initcap(col("name")).alias("name")
)

# Render the capitalised names
df_capitalized.display()

name
Virat Kohli
P V Sindhu


In [0]:
# Hand-rolled equivalent of initcap written as a SQL expression:
#   1. split the name on single spaces,
#   2. for each word, upper-case its first character and lower-case the rest,
#   3. join the words back together with single spaces.
# The adjacent string literals below concatenate into one SQL expression.
df_capitalized = df.select(
    expr(
        "concat_ws(' ', "
        "transform(split(name, ' '), "
        "x -> concat(upper(substring(x, 1, 1)), lower(substring(x, 2, length(x) - 1)))"
        "))"
    ).alias("name")
)

# Print the result as a plain-text table
df_capitalized.show()

+-----------+
|       name|
+-----------+
|Virat Kohli|
| P V Sindhu|
+-----------+



In [0]:
# Two small (id, name) tables; data2 holds a subset of the rows in data1
data1 = [
    (1, "Bob"),
    (2, "Alice"),
    (3, "Tom"),
]
data2 = [
    (1, "Bob"),
    (3, "Tom"),
]

# Both DataFrames share the same two-column layout
df1 = spark.createDataFrame(data1, schema=["id", "name"])
df2 = spark.createDataFrame(data2, schema=["id", "name"])

# Render each table
df1.display()
df2.display()

id,name
1,Bob
2,Alice
3,Tom


id,name
1,Bob
3,Tom


In [0]:
# Expose the sample-names DataFrame to SQL under the view name "dual".
# Unlike Oracle's one-row DUAL table, this view carries df's rows (two of them),
# so a literal `SELECT ... FROM dual` returns one row per row of df.
df.createOrReplaceTempView("dual")

In [0]:
%sql
-- Capitalise the first letter of each word of two literal names with initcap.
-- The literals are selected without a FROM clause (valid in Spark SQL), so the
-- query does not depend on a helper view from another cell and each branch
-- yields exactly one row instead of one row per row of that view.
with t as (
  select
    'virat kohil' as name
  union
  select
    'p v sindhu' as name
)
select
  initcap(name)
from
  t;

initcap(name)
Virat Kohil
P V Sindhu


In [0]:
# Register both DataFrames so they can be queried from SQL
df1.createOrReplaceTempView("t1")
df2.createOrReplaceTempView("t2")

# SQL query to find rows in t1 that have no row with the same id in t2.
# NOT EXISTS is used rather than NOT IN: with NOT IN, a single NULL id in the
# subquery makes the predicate evaluate to NULL for every row, so the query
# would silently return nothing. NOT EXISTS keeps every unmatched t1 row.
query = """
SELECT *
FROM t1
WHERE NOT EXISTS (SELECT 1 FROM t2 WHERE t2.id = t1.id)
"""

# Execute the SQL query
result_df = spark.sql(query)

# Show the result
result_df.display()

id,name
2,Alice


In [0]:
# Same anti-join through the DataFrame API: keep the rows of df1 whose id
# does not appear in df2
result_df = df1.join(df2, ["id"], "left_anti")

# Render the unmatched rows
result_df.display()

id,name
2,Alice
