In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.functions import col, length, regexp_replace
# Obtain the active Spark session, creating one if none exists yet.
spark = (
    SparkSession.builder
    .appName("Create tbl_cnt Table")
    .getOrCreate()
)

# Schema of tbl_cnt: a nullable integer `col1` and a nullable string `col2`.
schema = (
    StructType()
    .add("col1", IntegerType(), True)
    .add("col2", StringType(), True)
)

# Sample rows: col2 carries a comma-separated list of values.
data = [(1, 'a,b,c'), (2, 'a,b')]

# Materialise the rows as a DataFrame that follows the schema above.
df = spark.createDataFrame(data, schema)

# Render the DataFrame contents in the notebook.
df.display()




col1,col2
1,"a,b,c"
2,"a,b"


In [0]:
# Expose the DataFrame to Spark SQL under the name `tbl_cnt`
# (replaces any temporary view that already uses this name).
df.createOrReplaceTempView("tbl_cnt")

# Sanity check: read every row back through SQL and render it.
(
    spark
    .sql("SELECT * FROM tbl_cnt")
    .display()
)

col1,col2
1,"a,b,c"
2,"a,b"


In [0]:
%sql
-- For each row, remove the commas from col2 and report the length of what
-- remains as `count`.
-- NOTE(review): this equals the number of comma-separated items only when
-- every item is a single character -- confirm that is the intent.
with stripped as (
  select col1, replace(col2, ',', '') as col3
  from tbl_cnt
)
select col1, len(col3) as count
from stripped

col1,count
1,3
2,2


In [0]:
# Step 1: Strip the commas out of col2 and keep the result as col3.
# Step 2: Store the length of col3 in a new column named "count".
# NOTE(review): the length matches the number of comma-separated items only
# when every item is a single character -- confirm that is the intent.
df = (
    df
    .withColumn("col3", regexp_replace(col("col2"), ",", ""))
    .withColumn("count", length(col("col3")))
)

# Step 3: Keep only col1 and the computed count.
result = df.select(col("col1"), col("count"))

# Render the result in the notebook.
result.display()

col1,count
1,3
2,2
