In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Running (cumulative) sum of `value`, accumulated in `id` order.
# NOTE(review): `spark` is not created in this cell; it is assumed to be
# supplied by the notebook runtime -- confirm before running elsewhere.

# Frame covering every row from the start of the ordering up to the current row.
window_spec = (
    Window.orderBy("id")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Sample rows as (id, value) pairs.
columns = ["id", "value"]
data = [
    ("A", 10),
    ("B", 20),
    ("C", 30),
    ("D", 40),
]

# Build the frame and attach the running total in a single chain.
df = (
    spark.createDataFrame(data, columns)
    .withColumn("cumulative_sum", F.sum("value").over(window_spec))
)

# Print the resulting rows.
df.show()


+---+-----+--------------+
| id|value|cumulative_sum|
+---+-----+--------------+
|  A|   10|            10|
|  B|   20|            30|
|  C|   30|            60|
|  D|   40|           100|
+---+-----+--------------+



In [0]:
from pyspark.sql import Window
from pyspark.sql import functions as F

# Find the employee(s) earning the nth highest salary.
# NOTE(review): `spark` is not created in this cell; it is assumed to be
# supplied by the notebook runtime -- confirm before running elsewhere.

# Sample rows as (employee_name, salary) pairs.
data = [("Alice", 100),
        ("Bob", 150),
        ("Charlie", 120),
        ("David", 200),
        ("Eve", 180)]

columns = ["employee_name", "salary"]

df = spark.createDataFrame(data, columns)

# Specify the desired rank (1 = highest salary)
nth_rank = 2  # Change this to the desired rank

# Order by salary, highest first.
window_spec = Window.orderBy(F.desc("salary"))

# dense_rank() gives tied salaries the same rank and leaves no gaps, so rank n
# always means "the nth distinct salary". row_number() would hand tied rows
# different, arbitrary positions and could report the top salary as the 2nd.
df = df.withColumn("salary_rank", F.dense_rank().over(window_spec))

# Keep every employee whose salary is the nth highest (several rows on ties).
result = df.filter(F.col("salary_rank") == nth_rank).select("employee_name", "salary")

# Show the result
result.show()


+-------------+------+
|employee_name|salary|
+-------------+------+
|          Eve|   180|
+-------------+------+



In [0]:
txt = "apple#banana#cherry#orange"

# With maxsplit=1 only the first "#" acts as a separator, so the result
# always has two elements: the first field and the untouched remainder.
x = txt.split("#", maxsplit=1)

print(x)

['apple', 'banana#cherry#orange']


In [0]:
from pyspark.sql.functions import split, col
# One-row DataFrame whose second column holds a JSON document as plain text.
# NOTE(review): `spark` and `DataFrame.display()` are not defined in this
# cell; they are assumed to come from the notebook runtime -- confirm.
jsonString = '{"Id":"2","Name":"Maheer:Basha", "City": "Hyd"}'

cols = ["c01", "c02"]
data = [(2, jsonString)]

df = spark.createDataFrame(data, cols)

# Render the rows, then print the column types.
df.display()
df.printSchema()

c01,c02
2,"{""Id"":""2"",""Name"":""Maheer:Basha"", ""City"": ""Hyd""}"


root
 |-- c01: long (nullable = true)
 |-- c02: string (nullable = true)



In [0]:
from pyspark.sql.functions import lit, concat, concat_ws
# Strip every ':' from the "Name" value inside the JSON text held in c02,
# leaving the rest of the document exactly as it was.
#   c07 -> cleaned Name value followed by the remainder of the document
#   c08 -> the full document with the cleaned Name value
# NOTE(review): rows whose c02 does not contain the '"Name":"' key are not
# specially handled here; a Name value containing an escaped quote (\") is
# not supported by the simple "up to the next quote" rule below.
from pyspark.sql.functions import regexp_extract, regexp_replace

name_key = '"Name":"'

# Split on the first occurrence of the key only (limit=2), so a repeated key
# later in the text cannot cut off the tail of the document.
key_split = split(col("c02"), name_key, 2)

# Group 1 = the Name value (everything up to its closing quote);
# group 2 = the closing quote and whatever follows. (?s) lets '.' span newlines.
value_and_tail = r'(?s)^([^"]*)(.*)$'
name_value = regexp_extract(col("c05"), value_and_tail, 1)
tail = regexp_extract(col("c05"), value_and_tail, 2)

df1 = (
    df.withColumn("c03", key_split[0])
    .withColumn("c04", lit(name_key))
    .withColumn("c05", key_split[1])
    # Remove colons from the Name value only. Splitting the whole remainder on
    # ':' would also hit colons of later fields (e.g. "City": "Hyd") whenever
    # the name itself has none, and would miss second and later colons.
    .withColumn("c07", concat(regexp_replace(name_value, ":", ""), tail))
    .withColumn("c08", concat(col("c03"), col("c04"), col("c07")))
    .select(col("c07"), col("c08"))
)
df1.display()

c07,c08
"MaheerBasha"", ""City"": ""Hyd""}","{""Id"":""2"",""Name"":""MaheerBasha"", ""City"": ""Hyd""}"
