## Reading libraries

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, trim, regexp_replace, sum, lit, max, min, first,to_timestamp, trim, upper,to_date, date_format, when
from pyspark.sql.types import DateType

from pyspark.sql.types import IntegerType, FloatType

In [2]:
spark = SparkSession.builder.appName("read_csv").getOrCreate()

In [3]:
spark

# helper functions

In [4]:
from pyspark.sql.functions import udf
from pyspark.sql.types import DateType
from datetime import datetime

# Define a UDF to parse the date string
@udf(DateType())
def parse_date(date_str):
    try:
        return datetime.strptime(date_str, "%m/%d/%Y").date()
    except ValueError:
        return None  # Handle invalid date strings as needed


In [5]:
def change_dtype(dataframe, column_name,change_to):
    return dataframe.withColumn(column_name, col(column_name).cast(change_to))

In [6]:
def remove_illegal_char(dataframe,column_name):
    illegal_character = ['Can','\$',',',"\(","\)"]
    for character in illegal_character:
        dataframe = dataframe.withColumn(column_name, regexp_replace(col(column_name), character, ''))
#                              .withColumn('Open Balance', regexp_replace(col('Open Balance'), 'Can', ''))
    return dataframe

In [7]:
def trim_col(dataframe, column_name):
    return dataframe.withColumn(column_name, trim(col(column_name)))

## Reading CSV

In [8]:
sales_report_kmpg_july=spark.read.options(inferSchema='True').csv('csv_converted\saleReprtKmpg_july - SalesReportKPMGAuditResults.csv', header=True, inferSchema=True, sep=',')

## Cleaning and Transforming

In [9]:
sales_report_kmpg_july.columns

['Transaction ID',
 'Document Number',
 'Transaction Type',
 'Project ID',
 'Customer',
 'Terms',
 'Currency',
 'Gross Amount',
 'Payment',
 'Open Balance',
 'Transaction Create Date',
 'Document Date',
 'Due Date',
 'Days Open',
 'Service Dates',
 'Payment Receipt Date']

In [10]:
object_columns = ['Transaction ID','Document Number','Transaction Type', 'Project ID','Customer','Terms','Currency',]
numeric_columns = ['Gross Amount','Payment','Open Balance','Days Open']
date_columns = ['Transaction Create Date','Document Date','Due Date','Service Dates','Payment Receipt Date']

### Trim column

In [11]:
# sales_report_kmpg_july.show()

In [12]:
# sales_report_kmpg_july = trim_col(sales_report_kmpg_july,column_name='Open Balance')
for column in sales_report_kmpg_july.columns:
    sales_report_kmpg_july = trim_col(sales_report_kmpg_july,column_name=column)
    

### Remove illegal characters from numeric columns

In [13]:
# sales_report_kmpg_july = remove_illegal_char(sales_report_kmpg_july,column_name='Open Balance')
for column in numeric_columns:
#     print(column)
    sales_report_kmpg_july = remove_illegal_char(sales_report_kmpg_july,column_name=column)
#     print(sales_report_kmpg_july.show(1))
    

### Change datatype

In [14]:
# sales_report_kmpg_july = sales_report_kmpg_july.withColumn('Open Balance', col('Open Balance').cast('float'))
# for column in date_columns:
#     sales_report_kmpg_july = sales_report_kmpg_july.withColumn(column, col(column).cast(to_timestamp(column)))
    

In [15]:
sales_report_kmpg_july.dtypes

[('Transaction ID', 'string'),
 ('Document Number', 'string'),
 ('Transaction Type', 'string'),
 ('Project ID', 'string'),
 ('Customer', 'string'),
 ('Terms', 'string'),
 ('Currency', 'string'),
 ('Gross Amount', 'string'),
 ('Payment', 'string'),
 ('Open Balance', 'string'),
 ('Transaction Create Date', 'string'),
 ('Document Date', 'string'),
 ('Due Date', 'string'),
 ('Days Open', 'string'),
 ('Service Dates', 'string'),
 ('Payment Receipt Date', 'string')]

In [16]:
sales_report_kmpg_july.show(1)

+--------------+---------------+----------------+----------+--------------------+-----+--------+------------+-------+------------+-----------------------+-------------+--------+---------+--------------------+--------------------+
|Transaction ID|Document Number|Transaction Type|Project ID|            Customer|Terms|Currency|Gross Amount|Payment|Open Balance|Transaction Create Date|Document Date|Due Date|Days Open|       Service Dates|Payment Receipt Date|
+--------------+---------------+----------------+----------+--------------------+-----+--------+------------+-------+------------+-----------------------+-------------+--------+---------+--------------------+--------------------+
|       3190973|      CUMC15527|     Credit Memo|   1277343|CUMC : Columbia U...| null|     USA|      145.83| 145.83|        0.00|          7/31/23 11:02|    7/31/2023|    null|        0|07/01/2023 to 07/...|           7/30/2023|
+--------------+---------------+----------------+----------+--------------------

## Operation 1:
- 3 lines
    - 1. Open Balance != 0----------------------output ---> temp_df , Sum_Open_Balance
    - 2. Tranasaction Type  == "Credit Memo"----output ----> process_df_result
    - 3. Tranasaction Type  != "Credit Memo"----output-----> temp_df_process_3

### 1. Open Balance != 0 
    - filtering the recored `Open Balance !=0`
    - Adding Column `Outstanding = "Open"`
    - Sum of `Open Balance`

In [17]:
sales_report_kmpg_july.filter(sales_report_kmpg_july['Open Balance'] != '0.00').count()

3948

In [18]:
temp_df = sales_report_kmpg_july.filter(sales_report_kmpg_july['Open Balance'] != '0.00')

In [19]:
print("type of df",type(temp_df), "no of records",temp_df.count())

type of df <class 'pyspark.sql.dataframe.DataFrame'> no of records 3948


#### adding column Outstanding = open

In [20]:
# df_spark.withColumn('year after 2', df_spark['year']+1000).show() refrence
temp_df = temp_df.withColumn('Outstanding', lit('Open'))

In [21]:
temp_df = change_dtype(temp_df, column_name='Open Balance',change_to=FloatType())

In [22]:
Sum_Open_Balance = temp_df.select(sum(col('Open Balance'))).collect()[0][0]
Sum_Open_Balance

29295611.528552473

In [23]:
sales_report_kmpg_july.filter(sales_report_kmpg_july['Open Balance'] != '0.00').select(sum(col('Open Balance')))

DataFrame[sum(Open Balance): double]

#### Output
Sum_Open_Balance, temp_df

In [24]:
print(temp_df.count(),Sum_Open_Balance)

3948 29295611.528552473


### 2. Tranasaction Type  = "Credit Memo"

In [25]:
temp_df_process_2=sales_report_kmpg_july.filter(sales_report_kmpg_july['Transaction Type'] == 'Credit Memo')
print("Input Parameters: ", temp_df_process_2.count())

Input Parameters:  15459


In [26]:
temp_df_process_2 = temp_df_process_2.withColumn('Gross Amount', col('Gross Amount').cast('float'))
temp_df_process_2 = temp_df_process_2.withColumn('Payment', col('Payment').cast('float'))
temp_df_process_2 = temp_df_process_2.withColumn('Days Open', col('Days Open').cast('float'))

In [27]:
groupby_df_process_2 =temp_df_process_2.groupBy(['Transaction ID',"Document Number"])

In [28]:
temp_df_process_2.dtypes

[('Transaction ID', 'string'),
 ('Document Number', 'string'),
 ('Transaction Type', 'string'),
 ('Project ID', 'string'),
 ('Customer', 'string'),
 ('Terms', 'string'),
 ('Currency', 'string'),
 ('Gross Amount', 'float'),
 ('Payment', 'float'),
 ('Open Balance', 'string'),
 ('Transaction Create Date', 'string'),
 ('Document Date', 'string'),
 ('Due Date', 'string'),
 ('Days Open', 'float'),
 ('Service Dates', 'string'),
 ('Payment Receipt Date', 'string')]

In [29]:
#     "Transaction Type": first(col("Transaction Type")),  # Pick the first occurrence of "Payment Type"
#     "Project ID": max(col("Profit"))  # Calculate the maximum profit
#     "Customer": first(col("Customer")),
#     "Terms": first(col("Terms")),
#     "Currency": first(col("Currency")),
#     "Gross Amount": min(col("Gross Amount")),
#     "Payment": max(col("Payment")),
#     "Open Balance": max(col("Open Balance")),
#     "Transaction Create Date": max(col("Transaction Create Date")),
#     "Document Date": max(col("Document Date")),
#     "Due Date": max(col("Due Date")),
#     "Days Open": max(col("Days Open")),
#     "Service Dates": max(col("Service Dates")),
#     "Payment Receipt Date": col("Payment Receipt Date")

In [30]:
agg_exprs = {
    "Transaction Type": 'first',  # Pick the first occurrence of "Payment Type"
    "Project ID": 'first',  # Calculate the maximum profit
    "Customer": 'first',
    "Terms": 'first',
    "Currency": 'first',
    "Gross Amount": 'min',
    "Payment": 'max',
    "Open Balance": 'max',
    "Transaction Create Date": 'max',
    "Document Date": 'max',
    "Due Date": 'max',
    "Days Open": 'max',
    "Service Dates": 'max',
    "Payment Receipt Date": 'first'}


In [31]:
type(first(col("Transaction Type")))

pyspark.sql.column.Column

In [32]:
# groupby_df_process_2.agg(agg_exprs).count()

In [33]:
process_df_result = groupby_df_process_2.agg(agg_exprs)

In [34]:
# column got renamed need to look for other way, there is one commented above but causing some error
for column in process_df_result.columns:
#     print(column)
    new_column = column
    remove_list = ["max","min","(",")","first"]
    for char in remove_list:
#         print(char)
        if char in new_column:
            new_column=new_column.replace(char,"")
    process_df_result=process_df_result.withColumnRenamed(column,new_column)
    
    

In [35]:
# process_df_result.columns

In [36]:
process_df_result = process_df_result.withColumn('Outstanding', lit('Cleared'))
process_df_result=process_df_result.withColumn('Applying Link Amount', -col("Payment"))

In [37]:
process_df_result.select(["Transaction ID", "Outstanding","Applying Link Amount"]).show()

+--------------+-----------+--------------------+
|Transaction ID|Outstanding|Applying Link Amount|
+--------------+-----------+--------------------+
|       3191846|    Cleared|              -84.87|
|       3196377|    Cleared|              -243.0|
|       3152869|    Cleared|            -1394.96|
|       3136755|    Cleared|                -0.0|
|       3117490|    Cleared|             -111.29|
|       3119866|    Cleared|            -1397.43|
|       3051990|    Cleared|             -525.98|
|       3034098|    Cleared|              -19.48|
|       2979232|    Cleared|              -119.6|
|       2889570|    Cleared|            -2857.77|
|       2840729|    Cleared|             -1000.0|
|       3195531|    Cleared|                -0.0|
|       3198467|    Cleared|             -446.38|
|       3154622|    Cleared|                -0.0|
|       3114388|    Cleared|               -0.08|
|       3049865|    Cleared|                -0.0|
|       3050589|    Cleared|                -0.0|


In [38]:
print("Output Result :", process_df_result.count())

Output Result : 14286


### Output : process_df_result

## 3. Tranasaction Type  != "Credit Memo"

In [39]:
sales_report_kmpg_july.count()

382148

In [40]:
sales_report_kmpg_july.select("Transaction Type").distinct().show()

+----------------+
|Transaction Type|
+----------------+
|     Credit Memo|
|         Invoice|
|            null|
+----------------+



In [41]:
temp_df_process_3=sales_report_kmpg_july.filter(sales_report_kmpg_july['Transaction Type'] != 'Credit Memo')
print("Input :", temp_df_process_3.count())

Input : 366688


In [42]:
temp_df_process_3 = temp_df_process_3.withColumn('Customer', upper(trim(col('Customer'))))

In [43]:
temp_df_process_3.select('Customer').count()

366688

In [44]:
temp_df_process_3.filter(col('Transaction Type').like('% %')).select('Transaction Type')

DataFrame[Transaction Type: string]

In [45]:
temp_df_process_3.count()

366688

# customer_payment_history 
customer_payment_history =customer_payment_history.withColumn('Date', date_format(to_date(customer_payment_history['Date'],'M/d/yyyy'),'yyyy-MM-dd' ))

In [46]:
# customer_payment_history.filter(col("payment Date").isNull()).count()
# customer_payment_history.filter(col("Payment Date").rlike(r"^\d{1,2}/\d{1,2}/\d{4}$")).count()
# customer_payment_history.filter(col('Payment Date').like('%1/1/1900%')).select('Payment Date').show()
# customer_payment_history =customer_payment_history.withColumn('Date', to_date(customer_payment_history['Date'],'M/d/yyyy'))
# customer_payment_history =customer_payment_history.withColumn('Date', date_format(to_date(customer_payment_history['Date'],'M/d/yyyy'),'yyyy-MM-dd' ))
# customer_payment_history =customer_payment_history.withColumn('Date', date_format(customer_payment_history['Date'], 'MM/dd/yyyy'))
# customer_payment_history =customer_payment_history.withColumn('Date', to_date(customer_payment_history['Date'],'yyyy-MM-dd' ))
# customer_payment_history = customer_payment_history.fillna('1/1/1900',subset=['Payment Date'])
# customer_payment_history =customer_payment_history.withColumn('Payment Date', to_date(date_format(customer_payment_history['Payment Date'],'yyyy-MM-dd'),'M/d/yyyy' ))
# customer_payment_history = customer_payment_history.withColumn('Document Number', upper(trim(col('Document Number'))))

In [47]:
customer_payment_history = spark.read.options(inferSchema='True').csv('csv_converted\Customer_Payment_history_july.csv', header=True, inferSchema=True, sep=',')

In [48]:
customer_payment_history=customer_payment_history.withColumnRenamed('Currency0','Currency')
customer_payment_history=customer_payment_history.withColumnRenamed('Currency24','Currency2')
customer_payment_history=customer_payment_history.withColumnRenamed('Approved for Email10','Approved for Email')
customer_payment_history=customer_payment_history.withColumnRenamed('Approved for Email44','Approved for Email')

In [49]:
customer_payment_history =customer_payment_history.withColumn('Date', to_date(customer_payment_history['Date'],'M/d/yyyy'))

In [50]:
customer_payment_history =customer_payment_history.withColumn('Payment Date', to_date(customer_payment_history['Payment Date'],'M/d/yyyy'))

In [51]:
customer_payment_history= customer_payment_history.withColumnRenamed("Payment Date", "Payment Date2")

In [52]:
customer_payment_history= customer_payment_history.withColumn("Payment Date", customer_payment_history["Date"])

In [53]:
customer_payment_history.filter((customer_payment_history['Payment Date'] >= "2018-10-01") & (customer_payment_history['Payment Date'] <= "2023-07-31")).count()

315676

# Inner join output of 2 and 3

In [82]:
type(df)

NameError: name 'df' is not defined

In [62]:
process_df_result_copy = process_df_result.select("*")
customer_payment_history_copy = customer_payment_history.select("*")
temp_df_copy = temp_df.select("*")

In [63]:
print("Rows : ",process_df_result_copy.count(),customer_payment_history_copy.count(), temp_df_copy.count())
print("columns : ",len(process_df_result_copy.columns), len(customer_payment_history_copy.columns),len(temp_df_copy.columns))

Rows :  14286 315677 3948
columns :  18 48 17


In [64]:
column_names=set()
for df in [process_df_result, temp_df,customer_payment_history]:
    for column in df.columns:
        column_names.add(column)
del df
column_names = list(column_names)
print(len(column_names))

62


In [65]:
# column_names

In [66]:
# temp_df.columns

In [67]:
# column_names

In [68]:
df.columns

NameError: name 'df' is not defined

In [69]:
process_df_result_copy.

SyntaxError: invalid syntax (3203806025.py, line 1)

In [87]:
spark.stop()

In [3]:
# Create a Spark session
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, trim, regexp_replace, lit
spark = SparkSession.builder.appName("example").getOrCreate()

# Create a sample DataFrame
data = [("Alice", 30), ("Bob", 25), ("Charlie", 35)]
columns = ["Name", "Age"]
df = spark.createDataFrame(data, columns)

# List of column names to add
column_names = ["Address", "Email"]

# Default value for the new columns
default_value = "Unknown"

# Loop through the column names and add them if they don't exist
for column_name in column_names:
    if column_name not in df.columns:
        df = df.withColumn(column_name, lit(default_value).cast("string"))

# Show the DataFrame with the added columns
df.show()


Py4JJavaError: An error occurred while calling o69.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0) (LAPTOP-3UT4836R executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:192)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:166)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	at java.base/java.lang.Thread.run(Thread.java:1623)
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.base/sun.nio.ch.NioSocketImpl.timedAccept(NioSocketImpl.java:699)
	at java.base/sun.nio.ch.NioSocketImpl.accept(NioSocketImpl.java:743)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:690)
	at java.base/java.net.ServerSocket.platformImplAccept(ServerSocket.java:655)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:631)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:588)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:546)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:179)
	... 30 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2785)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2721)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2720)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2720)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1206)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2984)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2923)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2912)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:971)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2263)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2284)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2303)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:530)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:483)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:61)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:4177)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:3161)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4167)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:526)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4165)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:118)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:195)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:103)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4165)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:3161)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:3382)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:284)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:323)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:75)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:578)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1623)
Caused by: org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:192)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:166)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	... 1 more
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.base/sun.nio.ch.NioSocketImpl.timedAccept(NioSocketImpl.java:699)
	at java.base/sun.nio.ch.NioSocketImpl.accept(NioSocketImpl.java:743)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:690)
	at java.base/java.net.ServerSocket.platformImplAccept(ServerSocket.java:655)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:631)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:588)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:546)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:179)
	... 30 more


In [83]:
for i,df in enumerate([process_df_result, temp_df,customer_payment_history]):
    for column in column_names:
        if column not in df.columns:
            df = df.withColumn(column,lit(None).cast("string"))


In [84]:
print("Rows : ",process_df_result_copy.count(),customer_payment_history_copy.count(), temp_df_copy.count())
print("columns : ",len(process_df_result.columns), len(customer_payment_history_copy.columns),len(temp_df_copy.columns))

Rows :  14286 315677 3948
columns :  18 48 17


In [72]:
temp_df_process_3_copy = temp_df_process_3_copy.withColumn('apples',lit("yes"))

NameError: name 'temp_df_process_3_copy' is not defined

In [109]:
temp_df_process_3_copy.select("apples").show()

+------+
|apples|
+------+
|   yes|
|   yes|
|   yes|
|   yes|
|   yes|
|   yes|
|   yes|
|   yes|
|   yes|
|   yes|
|   yes|
|   yes|
|   yes|
|   yes|
|   yes|
|   yes|
|   yes|
|   yes|
|   yes|
|   yes|
+------+
only showing top 20 rows



In [102]:
temp_df_process_3_copy.columns

['Transaction ID',
 'Document Number',
 'Transaction Type',
 'Project ID',
 'Customer',
 'Terms',
 'Currency',
 'Gross Amount',
 'Payment',
 'Open Balance',
 'Transaction Create Date',
 'Document Date',
 'Due Date',
 'Days Open',
 'Service Dates',
 'Payment Receipt Date',
 'apples']

In [75]:
process3_paymenthistory_inner = temp_df_process_3.join(customer_payment_history, on='Document Number', how='inner')

In [76]:
process3_paymenthistory_inner.count()

315678

In [77]:
# process3_paymenthistory_inner.tail(1)

In [78]:
process3_paymenthistory_inner = process3_paymenthistory_inner.withColumn('Outstanding', lit('Cleared'))

In [79]:
print(process_df_result.count(), temp_df.count(),process3_paymenthistory_inner.count() )

14286 3948 315678


In [80]:
column_names=set()
for df in [process_df_result, temp_df,process3_paymenthistory_inner]:
    for column in df.columns:
        column_names.add(column)
column_names = list(column_names)

In [81]:
for df in [process_df_result, temp_df,process3_paymenthistory_inner]:
    for column in column_names:
        if column not in df.columns:
            df = df.withColumn(column,lit(None).cast("string"))

In [83]:
temp_df

DataFrame[Transaction ID: string, Document Number: string, Transaction Type: string, Project ID: string, Customer: string, Terms: string, Currency: string, Gross Amount: string, Payment: string, Open Balance: float, Transaction Create Date: string, Document Date: string, Due Date: string, Days Open: string, Service Dates: string, Payment Receipt Date: string, Outstanding: string]

In [82]:
print(len(process_df_result.columns), len(temp_df.columns),len(process3_paymenthistory_inner.columns))

18 17 64


In [62]:
process_df_result.columns

['Transaction ID',
 'Document Number',
 'Payment Receipt Date',
 'Terms',
 'Due Date',
 'Days Open',
 'Service Dates',
 'Transaction Type',
 'Currency',
 'Customer',
 'Open Balance',
 'Gross Amount',
 'Payment',
 'Project ID',
 'Document Date',
 'Transaction Create Date',
 'Outstanding',
 'Applying Link Amount']

In [63]:
temp_df.unionByName

<bound method DataFrame.unionByName of DataFrame[Transaction ID: string, Document Number: string, Transaction Type: string, Project ID: string, Customer: string, Terms: string, Currency: string, Gross Amount: string, Payment: string, Open Balance: float, Transaction Create Date: string, Document Date: string, Due Date: string, Days Open: string, Service Dates: string, Payment Receipt Date: string, Outstanding: string]>

In [64]:
combined_df = temp_df.unionByName(process_df_result)

AnalysisException: [NUM_COLUMNS_MISMATCH] UNION can only be performed on inputs with the same number of columns, but the first input has 17 columns and the second input has 18 columns.;
'Union false, false
:- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#390, Payment#475, cast(Open Balance#560 as float) AS Open Balance#805, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#645, Service Dates#288, Payment Receipt Date#305, Open AS Outstanding#787]
:  +- Filter NOT (Open Balance#560 = 0.00)
:     +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#390, Payment#475, Open Balance#560, Transaction Create Date#220, Document Date#237, Due Date#254, regexp_replace(Days Open#628, \), , 1) AS Days Open#645, Service Dates#288, Payment Receipt Date#305]
:        +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#390, Payment#475, Open Balance#560, Transaction Create Date#220, Document Date#237, Due Date#254, regexp_replace(Days Open#611, \(, , 1) AS Days Open#628, Service Dates#288, Payment Receipt Date#305]
:           +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#390, Payment#475, Open Balance#560, Transaction Create Date#220, Document Date#237, Due Date#254, regexp_replace(Days Open#594, ,, , 1) AS Days Open#611, Service Dates#288, Payment Receipt Date#305]
:              +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#390, Payment#475, Open Balance#560, Transaction Create Date#220, Document Date#237, Due Date#254, regexp_replace(Days Open#577, \$, , 1) AS Days Open#594, Service Dates#288, Payment Receipt Date#305]
:                 +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#390, Payment#475, Open Balance#560, Transaction Create Date#220, Document Date#237, Due Date#254, regexp_replace(Days Open#271, Can, , 1) AS Days Open#577, Service Dates#288, Payment Receipt Date#305]
:                    +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#390, Payment#475, regexp_replace(Open Balance#543, \), , 1) AS Open Balance#560, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#271, Service Dates#288, Payment Receipt Date#305]
:                       +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#390, Payment#475, regexp_replace(Open Balance#526, \(, , 1) AS Open Balance#543, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#271, Service Dates#288, Payment Receipt Date#305]
:                          +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#390, Payment#475, regexp_replace(Open Balance#509, ,, , 1) AS Open Balance#526, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#271, Service Dates#288, Payment Receipt Date#305]
:                             +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#390, Payment#475, regexp_replace(Open Balance#492, \$, , 1) AS Open Balance#509, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#271, Service Dates#288, Payment Receipt Date#305]
:                                +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#390, Payment#475, regexp_replace(Open Balance#203, Can, , 1) AS Open Balance#492, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#271, Service Dates#288, Payment Receipt Date#305]
:                                   +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#390, regexp_replace(Payment#458, \), , 1) AS Payment#475, Open Balance#203, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#271, Service Dates#288, Payment Receipt Date#305]
:                                      +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#390, regexp_replace(Payment#441, \(, , 1) AS Payment#458, Open Balance#203, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#271, Service Dates#288, Payment Receipt Date#305]
:                                         +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#390, regexp_replace(Payment#424, ,, , 1) AS Payment#441, Open Balance#203, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#271, Service Dates#288, Payment Receipt Date#305]
:                                            +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#390, regexp_replace(Payment#407, \$, , 1) AS Payment#424, Open Balance#203, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#271, Service Dates#288, Payment Receipt Date#305]
:                                               +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#390, regexp_replace(Payment#186, Can, , 1) AS Payment#407, Open Balance#203, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#271, Service Dates#288, Payment Receipt Date#305]
:                                                  +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, regexp_replace(Gross Amount#373, \), , 1) AS Gross Amount#390, Payment#186, Open Balance#203, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#271, Service Dates#288, Payment Receipt Date#305]
:                                                     +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, regexp_replace(Gross Amount#356, \(, , 1) AS Gross Amount#373, Payment#186, Open Balance#203, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#271, Service Dates#288, Payment Receipt Date#305]
:                                                        +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, regexp_replace(Gross Amount#339, ,, , 1) AS Gross Amount#356, Payment#186, Open Balance#203, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#271, Service Dates#288, Payment Receipt Date#305]
:                                                           +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, regexp_replace(Gross Amount#322, \$, , 1) AS Gross Amount#339, Payment#186, Open Balance#203, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#271, Service Dates#288, Payment Receipt Date#305]
:                                                              +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, regexp_replace(Gross Amount#169, Can, , 1) AS Gross Amount#322, Payment#186, Open Balance#203, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#271, Service Dates#288, Payment Receipt Date#305]
:                                                                 +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#169, Payment#186, Open Balance#203, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#271, Service Dates#288, trim(Payment Receipt Date#32, None) AS Payment Receipt Date#305]
:                                                                    +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#169, Payment#186, Open Balance#203, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#271, trim(Service Dates#31, None) AS Service Dates#288, Payment Receipt Date#32]
:                                                                       +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#169, Payment#186, Open Balance#203, Transaction Create Date#220, Document Date#237, Due Date#254, trim(cast(Days Open#30 as string), None) AS Days Open#271, Service Dates#31, Payment Receipt Date#32]
:                                                                          +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#169, Payment#186, Open Balance#203, Transaction Create Date#220, Document Date#237, trim(Due Date#29, None) AS Due Date#254, Days Open#30, Service Dates#31, Payment Receipt Date#32]
:                                                                             +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#169, Payment#186, Open Balance#203, Transaction Create Date#220, trim(Document Date#28, None) AS Document Date#237, Due Date#29, Days Open#30, Service Dates#31, Payment Receipt Date#32]
:                                                                                +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#169, Payment#186, Open Balance#203, trim(Transaction Create Date#27, None) AS Transaction Create Date#220, Document Date#28, Due Date#29, Days Open#30, Service Dates#31, Payment Receipt Date#32]
:                                                                                   +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#169, Payment#186, trim(Open Balance#26, None) AS Open Balance#203, Transaction Create Date#27, Document Date#28, Due Date#29, Days Open#30, Service Dates#31, Payment Receipt Date#32]
:                                                                                      +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#169, trim(Payment#25, None) AS Payment#186, Open Balance#26, Transaction Create Date#27, Document Date#28, Due Date#29, Days Open#30, Service Dates#31, Payment Receipt Date#32]
:                                                                                         +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, trim(Gross Amount#24, None) AS Gross Amount#169, Payment#25, Open Balance#26, Transaction Create Date#27, Document Date#28, Due Date#29, Days Open#30, Service Dates#31, Payment Receipt Date#32]
:                                                                                            +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, trim(Currency#23, None) AS Currency#152, Gross Amount#24, Payment#25, Open Balance#26, Transaction Create Date#27, Document Date#28, Due Date#29, Days Open#30, Service Dates#31, Payment Receipt Date#32]
:                                                                                               +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, trim(Terms#22, None) AS Terms#135, Currency#23, Gross Amount#24, Payment#25, Open Balance#26, Transaction Create Date#27, Document Date#28, Due Date#29, Days Open#30, Service Dates#31, Payment Receipt Date#32]
:                                                                                                  +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, trim(Customer#21, None) AS Customer#118, Terms#22, Currency#23, Gross Amount#24, Payment#25, Open Balance#26, Transaction Create Date#27, Document Date#28, Due Date#29, Days Open#30, Service Dates#31, Payment Receipt Date#32]
:                                                                                                     +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, trim(cast(Project ID#20 as string), None) AS Project ID#101, Customer#21, Terms#22, Currency#23, Gross Amount#24, Payment#25, Open Balance#26, Transaction Create Date#27, Document Date#28, Due Date#29, Days Open#30, Service Dates#31, Payment Receipt Date#32]
:                                                                                                        +- Project [Transaction ID#49, Document Number#67, trim(Transaction Type#19, None) AS Transaction Type#84, Project ID#20, Customer#21, Terms#22, Currency#23, Gross Amount#24, Payment#25, Open Balance#26, Transaction Create Date#27, Document Date#28, Due Date#29, Days Open#30, Service Dates#31, Payment Receipt Date#32]
:                                                                                                           +- Project [Transaction ID#49, trim(Document Number#18, None) AS Document Number#67, Transaction Type#19, Project ID#20, Customer#21, Terms#22, Currency#23, Gross Amount#24, Payment#25, Open Balance#26, Transaction Create Date#27, Document Date#28, Due Date#29, Days Open#30, Service Dates#31, Payment Receipt Date#32]
:                                                                                                              +- Project [trim(cast(Transaction ID#17 as string), None) AS Transaction ID#49, Document Number#18, Transaction Type#19, Project ID#20, Customer#21, Terms#22, Currency#23, Gross Amount#24, Payment#25, Open Balance#26, Transaction Create Date#27, Document Date#28, Due Date#29, Days Open#30, Service Dates#31, Payment Receipt Date#32]
:                                                                                                                 +- Relation [Transaction ID#17,Document Number#18,Transaction Type#19,Project ID#20,Customer#21,Terms#22,Currency#23,Gross Amount#24,Payment#25,Open Balance#26,Transaction Create Date#27,Document Date#28,Due Date#29,Days Open#30,Service Dates#31,Payment Receipt Date#32] csv
+- Project [Transaction ID#49 AS Transaction ID#991, Document Number#67 AS Document Number#1008, first(Transaction Type)#952 AS Transaction Type#1110, first(Project ID)#958 AS Project ID#1212, first(Customer)#954 AS Customer#1144, first(Terms)#948 AS Terms#1042, first(Currency)#953 AS Currency#1127, min(Gross Amount)#956 AS Gross Amount#1178, max(Payment)#957 AS Payment#1195, max(Open Balance)#955 AS Open Balance#1161, max(Transaction Create Date)#960 AS Transaction Create Date#1246, max(Document Date)#959 AS Document Date#1229, max(Due Date)#949 AS Due Date#1059, max(Days Open)#950 AS Days Open#1076, max(Service Dates)#951 AS Service Dates#1093, first(Payment Receipt Date)#947 AS Payment Receipt Date#1025, Cleared AS Outstanding#1263, -max(Payment)#957 AS Applying Link Amount#1281]
   +- Aggregate [Transaction ID#49, Document Number#67], [Transaction ID#49, Document Number#67, first(Payment Receipt Date#305, false) AS first(Payment Receipt Date)#947, first(Terms#135, false) AS first(Terms)#948, max(Due Date#254) AS max(Due Date)#949, max(Days Open#913) AS max(Days Open)#950, max(Service Dates#288) AS max(Service Dates)#951, first(Transaction Type#84, false) AS first(Transaction Type)#952, first(Currency#152, false) AS first(Currency)#953, first(Customer#118, false) AS first(Customer)#954, max(Open Balance#560) AS max(Open Balance)#955, min(Gross Amount#879) AS min(Gross Amount)#956, max(Payment#896) AS max(Payment)#957, first(Project ID#101, false) AS first(Project ID)#958, max(Document Date#237) AS max(Document Date)#959, max(Transaction Create Date#220) AS max(Transaction Create Date)#960]
      +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#879, Payment#896, Open Balance#560, Transaction Create Date#220, Document Date#237, Due Date#254, cast(Days Open#645 as float) AS Days Open#913, Service Dates#288, Payment Receipt Date#305]
         +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#879, cast(Payment#475 as float) AS Payment#896, Open Balance#560, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#645, Service Dates#288, Payment Receipt Date#305]
            +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, cast(Gross Amount#390 as float) AS Gross Amount#879, Payment#475, Open Balance#560, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#645, Service Dates#288, Payment Receipt Date#305]
               +- Filter (Transaction Type#84 = Credit Memo)
                  +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#390, Payment#475, Open Balance#560, Transaction Create Date#220, Document Date#237, Due Date#254, regexp_replace(Days Open#628, \), , 1) AS Days Open#645, Service Dates#288, Payment Receipt Date#305]
                     +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#390, Payment#475, Open Balance#560, Transaction Create Date#220, Document Date#237, Due Date#254, regexp_replace(Days Open#611, \(, , 1) AS Days Open#628, Service Dates#288, Payment Receipt Date#305]
                        +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#390, Payment#475, Open Balance#560, Transaction Create Date#220, Document Date#237, Due Date#254, regexp_replace(Days Open#594, ,, , 1) AS Days Open#611, Service Dates#288, Payment Receipt Date#305]
                           +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#390, Payment#475, Open Balance#560, Transaction Create Date#220, Document Date#237, Due Date#254, regexp_replace(Days Open#577, \$, , 1) AS Days Open#594, Service Dates#288, Payment Receipt Date#305]
                              +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#390, Payment#475, Open Balance#560, Transaction Create Date#220, Document Date#237, Due Date#254, regexp_replace(Days Open#271, Can, , 1) AS Days Open#577, Service Dates#288, Payment Receipt Date#305]
                                 +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#390, Payment#475, regexp_replace(Open Balance#543, \), , 1) AS Open Balance#560, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#271, Service Dates#288, Payment Receipt Date#305]
                                    +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#390, Payment#475, regexp_replace(Open Balance#526, \(, , 1) AS Open Balance#543, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#271, Service Dates#288, Payment Receipt Date#305]
                                       +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#390, Payment#475, regexp_replace(Open Balance#509, ,, , 1) AS Open Balance#526, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#271, Service Dates#288, Payment Receipt Date#305]
                                          +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#390, Payment#475, regexp_replace(Open Balance#492, \$, , 1) AS Open Balance#509, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#271, Service Dates#288, Payment Receipt Date#305]
                                             +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#390, Payment#475, regexp_replace(Open Balance#203, Can, , 1) AS Open Balance#492, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#271, Service Dates#288, Payment Receipt Date#305]
                                                +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#390, regexp_replace(Payment#458, \), , 1) AS Payment#475, Open Balance#203, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#271, Service Dates#288, Payment Receipt Date#305]
                                                   +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#390, regexp_replace(Payment#441, \(, , 1) AS Payment#458, Open Balance#203, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#271, Service Dates#288, Payment Receipt Date#305]
                                                      +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#390, regexp_replace(Payment#424, ,, , 1) AS Payment#441, Open Balance#203, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#271, Service Dates#288, Payment Receipt Date#305]
                                                         +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#390, regexp_replace(Payment#407, \$, , 1) AS Payment#424, Open Balance#203, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#271, Service Dates#288, Payment Receipt Date#305]
                                                            +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#390, regexp_replace(Payment#186, Can, , 1) AS Payment#407, Open Balance#203, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#271, Service Dates#288, Payment Receipt Date#305]
                                                               +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, regexp_replace(Gross Amount#373, \), , 1) AS Gross Amount#390, Payment#186, Open Balance#203, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#271, Service Dates#288, Payment Receipt Date#305]
                                                                  +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, regexp_replace(Gross Amount#356, \(, , 1) AS Gross Amount#373, Payment#186, Open Balance#203, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#271, Service Dates#288, Payment Receipt Date#305]
                                                                     +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, regexp_replace(Gross Amount#339, ,, , 1) AS Gross Amount#356, Payment#186, Open Balance#203, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#271, Service Dates#288, Payment Receipt Date#305]
                                                                        +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, regexp_replace(Gross Amount#322, \$, , 1) AS Gross Amount#339, Payment#186, Open Balance#203, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#271, Service Dates#288, Payment Receipt Date#305]
                                                                           +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, regexp_replace(Gross Amount#169, Can, , 1) AS Gross Amount#322, Payment#186, Open Balance#203, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#271, Service Dates#288, Payment Receipt Date#305]
                                                                              +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#169, Payment#186, Open Balance#203, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#271, Service Dates#288, trim(Payment Receipt Date#6080, None) AS Payment Receipt Date#305]
                                                                                 +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#169, Payment#186, Open Balance#203, Transaction Create Date#220, Document Date#237, Due Date#254, Days Open#271, trim(Service Dates#6079, None) AS Service Dates#288, Payment Receipt Date#6080]
                                                                                    +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#169, Payment#186, Open Balance#203, Transaction Create Date#220, Document Date#237, Due Date#254, trim(cast(Days Open#6078 as string), None) AS Days Open#271, Service Dates#6079, Payment Receipt Date#6080]
                                                                                       +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#169, Payment#186, Open Balance#203, Transaction Create Date#220, Document Date#237, trim(Due Date#6077, None) AS Due Date#254, Days Open#6078, Service Dates#6079, Payment Receipt Date#6080]
                                                                                          +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#169, Payment#186, Open Balance#203, Transaction Create Date#220, trim(Document Date#6076, None) AS Document Date#237, Due Date#6077, Days Open#6078, Service Dates#6079, Payment Receipt Date#6080]
                                                                                             +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#169, Payment#186, Open Balance#203, trim(Transaction Create Date#6075, None) AS Transaction Create Date#220, Document Date#6076, Due Date#6077, Days Open#6078, Service Dates#6079, Payment Receipt Date#6080]
                                                                                                +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#169, Payment#186, trim(Open Balance#6074, None) AS Open Balance#203, Transaction Create Date#6075, Document Date#6076, Due Date#6077, Days Open#6078, Service Dates#6079, Payment Receipt Date#6080]
                                                                                                   +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, Gross Amount#169, trim(Payment#6073, None) AS Payment#186, Open Balance#6074, Transaction Create Date#6075, Document Date#6076, Due Date#6077, Days Open#6078, Service Dates#6079, Payment Receipt Date#6080]
                                                                                                      +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, Currency#152, trim(Gross Amount#6072, None) AS Gross Amount#169, Payment#6073, Open Balance#6074, Transaction Create Date#6075, Document Date#6076, Due Date#6077, Days Open#6078, Service Dates#6079, Payment Receipt Date#6080]
                                                                                                         +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, Terms#135, trim(Currency#6071, None) AS Currency#152, Gross Amount#6072, Payment#6073, Open Balance#6074, Transaction Create Date#6075, Document Date#6076, Due Date#6077, Days Open#6078, Service Dates#6079, Payment Receipt Date#6080]
                                                                                                            +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, Customer#118, trim(Terms#6070, None) AS Terms#135, Currency#6071, Gross Amount#6072, Payment#6073, Open Balance#6074, Transaction Create Date#6075, Document Date#6076, Due Date#6077, Days Open#6078, Service Dates#6079, Payment Receipt Date#6080]
                                                                                                               +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, Project ID#101, trim(Customer#6069, None) AS Customer#118, Terms#6070, Currency#6071, Gross Amount#6072, Payment#6073, Open Balance#6074, Transaction Create Date#6075, Document Date#6076, Due Date#6077, Days Open#6078, Service Dates#6079, Payment Receipt Date#6080]
                                                                                                                  +- Project [Transaction ID#49, Document Number#67, Transaction Type#84, trim(cast(Project ID#6068 as string), None) AS Project ID#101, Customer#6069, Terms#6070, Currency#6071, Gross Amount#6072, Payment#6073, Open Balance#6074, Transaction Create Date#6075, Document Date#6076, Due Date#6077, Days Open#6078, Service Dates#6079, Payment Receipt Date#6080]
                                                                                                                     +- Project [Transaction ID#49, Document Number#67, trim(Transaction Type#6067, None) AS Transaction Type#84, Project ID#6068, Customer#6069, Terms#6070, Currency#6071, Gross Amount#6072, Payment#6073, Open Balance#6074, Transaction Create Date#6075, Document Date#6076, Due Date#6077, Days Open#6078, Service Dates#6079, Payment Receipt Date#6080]
                                                                                                                        +- Project [Transaction ID#49, trim(Document Number#6066, None) AS Document Number#67, Transaction Type#6067, Project ID#6068, Customer#6069, Terms#6070, Currency#6071, Gross Amount#6072, Payment#6073, Open Balance#6074, Transaction Create Date#6075, Document Date#6076, Due Date#6077, Days Open#6078, Service Dates#6079, Payment Receipt Date#6080]
                                                                                                                           +- Project [trim(cast(Transaction ID#6065 as string), None) AS Transaction ID#49, Document Number#6066, Transaction Type#6067, Project ID#6068, Customer#6069, Terms#6070, Currency#6071, Gross Amount#6072, Payment#6073, Open Balance#6074, Transaction Create Date#6075, Document Date#6076, Due Date#6077, Days Open#6078, Service Dates#6079, Payment Receipt Date#6080]
                                                                                                                              +- Relation [Transaction ID#6065,Document Number#6066,Transaction Type#6067,Project ID#6068,Customer#6069,Terms#6070,Currency#6071,Gross Amount#6072,Payment#6073,Open Balance#6074,Transaction Create Date#6075,Document Date#6076,Due Date#6077,Days Open#6078,Service Dates#6079,Payment Receipt Date#6080] csv
