In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.window import Window
spark = SparkSession.builder.getOrCreate()
df = spark.read.csv("data\\joins\\user_transactions.csv", header=True, inferSchema=True)
df.show()
df.printSchema()

+--------------+-------+------+---------+----------------+
|transaction_id|user_id|amount| location|transaction_time|
+--------------+-------+------+---------+----------------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|
|           204|      2|   250|  Chennai|10-03-2025 14:01|
|           205|      3|  1000| Banglore|10-03-2025 15:30|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|
+--------------+-------+------+---------+----------------+

root
 |-- transaction_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- amount: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- transaction_time: string (nullable = true)



In [4]:
from pyspark.sql import functions
df.groupBy('user_id').agg(functions.sum('amount')).show()

+-------+-----------+
|user_id|sum(amount)|
+-------+-----------+
|      1|       1200|
|      3|       2500|
|      2|        450|
+-------+-----------+



In [10]:
window_spec = Window.partitionBy('user_id')
print(window_spec)

<pyspark.sql.window.WindowSpec object at 0x00000223498EFC20>


In [11]:
window_spec = Window.partitionBy('user_id')
print(dir(window_spec))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_jspec', 'orderBy', 'partitionBy', 'rangeBetween', 'rowsBetween']


In [12]:
df.orderBy('user_id').show()

+--------------+-------+------+---------+----------------+
|transaction_id|user_id|amount| location|transaction_time|
+--------------+-------+------+---------+----------------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|
|           204|      2|   250|  Chennai|10-03-2025 14:01|
|           205|      3|  1000| Banglore|10-03-2025 15:30|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|
+--------------+-------+------+---------+----------------+



In [14]:
df.orderBy(col('user_id').desc()).show()

+--------------+-------+------+---------+----------------+
|transaction_id|user_id|amount| location|transaction_time|
+--------------+-------+------+---------+----------------+
|           205|      3|  1000| Banglore|10-03-2025 15:30|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|
|           204|      2|   250|  Chennai|10-03-2025 14:01|
|           201|      1|   500|Hyderabad|10-03-2025 12:00|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|
+--------------+-------+------+---------+----------------+



In [15]:
w = Window.partitionBy('user_id').orderBy('amount')
print(w)

<pyspark.sql.window.WindowSpec object at 0x000002234AB5B800>


In [16]:
w = Window.partitionBy('user_id').orderBy('amount')
w.show()

AttributeError: 'WindowSpec' object has no attribute 'show'

In [17]:
w = Window.partitionBy('user_id').orderBy('amount')
df.withColumn('row_numer', functions.row_number().over(w)).show()

+--------------+-------+------+---------+----------------+---------+
|transaction_id|user_id|amount| location|transaction_time|row_numer|
+--------------+-------+------+---------+----------------+---------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|        1|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|        2|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|        1|
|           204|      2|   250|  Chennai|10-03-2025 14:01|        2|
|           205|      3|  1000| Banglore|10-03-2025 15:30|        1|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|        2|
+--------------+-------+------+---------+----------------+---------+



In [18]:
w = Window.partitionBy('user_id').orderBy(col('amount').desc())
df.withColumn('row_numer', functions.row_number().over(w)).show()

+--------------+-------+------+---------+----------------+---------+
|transaction_id|user_id|amount| location|transaction_time|row_numer|
+--------------+-------+------+---------+----------------+---------+
|           202|      1|   700|Hyderabad|10-03-2025 12:04|        1|
|           201|      1|   500|Hyderabad|10-03-2025 12:00|        2|
|           204|      2|   250|  Chennai|10-03-2025 14:01|        1|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|        2|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|        1|
|           205|      3|  1000| Banglore|10-03-2025 15:30|        2|
+--------------+-------+------+---------+----------------+---------+



In [19]:
w = Window.partitionBy('user_id').orderBy(col('amount').desc())
df.withColumn('row_numer', functions.rank().over(w)).show()

+--------------+-------+------+---------+----------------+---------+
|transaction_id|user_id|amount| location|transaction_time|row_numer|
+--------------+-------+------+---------+----------------+---------+
|           202|      1|   700|Hyderabad|10-03-2025 12:04|        1|
|           201|      1|   500|Hyderabad|10-03-2025 12:00|        2|
|           204|      2|   250|  Chennai|10-03-2025 14:01|        1|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|        2|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|        1|
|           205|      3|  1000| Banglore|10-03-2025 15:30|        2|
+--------------+-------+------+---------+----------------+---------+



In [20]:
w = Window.orderBy(col('amount').desc())
df.withColumn('row_numer', functions.rank().over(w)).show()

+--------------+-------+------+---------+----------------+---------+
|transaction_id|user_id|amount| location|transaction_time|row_numer|
+--------------+-------+------+---------+----------------+---------+
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|        1|
|           205|      3|  1000| Banglore|10-03-2025 15:30|        2|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|        3|
|           201|      1|   500|Hyderabad|10-03-2025 12:00|        4|
|           204|      2|   250|  Chennai|10-03-2025 14:01|        5|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|        6|
+--------------+-------+------+---------+----------------+---------+



In [21]:
w = Window.orderBy(col('amount').desc())
df.withColumn('row_numer', functions.rank()).show()

AnalysisException: [WINDOW_FUNCTION_WITHOUT_OVER_CLAUSE] Window function "RANK()" requires an OVER clause.;
Project [transaction_id#158, user_id#159, amount#160, location#161, transaction_time#162, rank() AS row_numer#415]
+- Relation [transaction_id#158,user_id#159,amount#160,location#161,transaction_time#162] csv


In [23]:
w = Window.orderBy(col('amount').desc())
df.withColumn('row_numer', functions.rank().over(w)).show()

+--------------+-------+------+---------+----------------+---------+
|transaction_id|user_id|amount| location|transaction_time|row_numer|
+--------------+-------+------+---------+----------------+---------+
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|        1|
|           205|      3|  1000| Banglore|10-03-2025 15:30|        2|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|        3|
|           201|      1|   500|Hyderabad|10-03-2025 12:00|        4|
|           204|      2|   250|  Chennai|10-03-2025 14:01|        5|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|        6|
+--------------+-------+------+---------+----------------+---------+



In [26]:
w = Window.orderBy(col('amount').desc())
df.withColumn('row_numer', functions.dense_rank().over(w)).show()

+--------------+-------+------+---------+----------------+---------+
|transaction_id|user_id|amount| location|transaction_time|row_numer|
+--------------+-------+------+---------+----------------+---------+
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|        1|
|           205|      3|  1000| Banglore|10-03-2025 15:30|        2|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|        3|
|           201|      1|   500|Hyderabad|10-03-2025 12:00|        4|
|           204|      2|   250|  Chennai|10-03-2025 14:01|        5|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|        6|
+--------------+-------+------+---------+----------------+---------+



In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.window import Window
spark = SparkSession.builder.getOrCreate()
df = spark.read.csv("data\\joins\\user_transactions.csv", header=True, inferSchema=True)
df.show()
df.printSchema()


+--------------+-------+------+---------+----------------+
|transaction_id|user_id|amount| location|transaction_time|
+--------------+-------+------+---------+----------------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|
|           204|      2|   250|  Chennai|10-03-2025 14:01|
|           205|      3|  1000| Banglore|10-03-2025 15:30|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|
+--------------+-------+------+---------+----------------+

root
 |-- transaction_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- amount: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- transaction_time: string (nullable = true)



In [4]:
from pyspark.sql.functions import col, sum
from pyspark.sql.window import Window
df = spark.read.csv("data\\joins\\user_transactions.csv", header=True, inferSchema=True)

spec = Window.partitionBy('user_id')
df.withColumn("sum", sum('amount').over(spec)).show()
df.groupBy("user_id").agg(sum('amount')).show()

+--------------+-------+------+---------+----------------+----+
|transaction_id|user_id|amount| location|transaction_time| sum|
+--------------+-------+------+---------+----------------+----+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|1200|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|1200|
|           203|      2|   200|Hyderabad|10-03-2025 14:00| 450|
|           204|      2|   250|  Chennai|10-03-2025 14:01| 450|
|           205|      3|  1000| Banglore|10-03-2025 15:30|2500|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|2500|
+--------------+-------+------+---------+----------------+----+

+-------+-----------+
|user_id|sum(amount)|
+-------+-----------+
|      1|       1200|
|      3|       2500|
|      2|        450|
+-------+-----------+



In [1]:
from pyspark.sql.window import Window
print(dir(Window))

['_FOLLOWING_THRESHOLD', '_JAVA_MAX_LONG', '_JAVA_MIN_LONG', '_PRECEDING_THRESHOLD', '__annotations__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'currentRow', 'orderBy', 'partitionBy', 'rangeBetween', 'rowsBetween', 'unboundedFollowing', 'unboundedPreceding']


In [13]:
from pyspark.sql.functions import col, sum
from pyspark.sql.window import Window
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
df = spark.read.csv("data\\joins\\user_transactions.csv", header=True, inferSchema=True)
# df.agg(functions.sum('amount')).show()
# df.groupBy('user_id').agg(f.sum('amount')).show()
window_spec = Window.partitionBy('user_id')
#df.agg(functions.sum('amount').over(window_spec)).show()
df.withColumn('sum_amount',f.sum('amount').over(window_spec)).show()

+--------------+-------+------+---------+----------------+----------+
|transaction_id|user_id|amount| location|transaction_time|sum_amount|
+--------------+-------+------+---------+----------------+----------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|      1200|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|      1200|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|       450|
|           204|      2|   250|  Chennai|10-03-2025 14:01|       450|
|           205|      3|  1000| Banglore|10-03-2025 15:30|      2500|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|      2500|
+--------------+-------+------+---------+----------------+----------+



In [14]:
from pyspark.sql.functions import col, sum
from pyspark.sql.window import Window
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
df = spark.read.csv("data\\joins\\user_transactions.csv", header=True, inferSchema=True)
df.agg(functions.sum('amount')).show()
df.groupBy('user_id').agg(f.sum('amount')).show()
window_spec = Window.rowsBetween(Window.unboundedPreceding, Window.currentRow)
df.withColumn('cum_sum_amount',f.sum('amount').over(window_spec)).show()

+-----------+
|sum(amount)|
+-----------+
|       4150|
+-----------+

+-------+-----------+
|user_id|sum(amount)|
+-------+-----------+
|      1|       1200|
|      3|       2500|
|      2|        450|
+-------+-----------+

+--------------+-------+------+---------+----------------+----------+
|transaction_id|user_id|amount| location|transaction_time|sum_amount|
+--------------+-------+------+---------+----------------+----------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|       500|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|      1200|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|      1400|
|           204|      2|   250|  Chennai|10-03-2025 14:01|      1650|
|           205|      3|  1000| Banglore|10-03-2025 15:30|      2650|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|      4150|
+--------------+-------+------+---------+----------------+----------+



In [15]:
from pyspark.sql.functions import col, sum
from pyspark.sql.window import Window
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
df = spark.read.csv("data\\joins\\user_transactions.csv", header=True, inferSchema=True)
df.withColumn('amount+2', lambda x:x+2).show()

PySparkTypeError: [NOT_COLUMN] Argument `col` should be a Column, got function.

In [16]:
print(dir(functions))



Py4JJavaError: An error occurred while calling o345.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 68.0 failed 1 times, most recent failure: Lost task 0.0 in stage 68.0 (TID 56) (192.168.100.3 executor driver): org.apache.spark.SparkException: Python worker exited unexpectedly (crashed)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:612)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:594)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:99)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:75)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:842)
Caused by: java.io.EOFException
	at java.base/java.io.DataInputStream.readInt(DataInputStream.java:398)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:83)
	... 26 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2414)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2433)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:530)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:483)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:61)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:4333)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:3316)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4323)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:546)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4321)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4321)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:3316)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:3539)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:280)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:315)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:842)
Caused by: org.apache.spark.SparkException: Python worker exited unexpectedly (crashed)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:612)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:594)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:99)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:75)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	... 1 more
Caused by: java.io.EOFException
	at java.base/java.io.DataInputStream.readInt(DataInputStream.java:398)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:83)
	... 26 more
