## Reading libraries

In [76]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, trim, regexp_replace, sum, lit, max, min, first,to_timestamp, trim, upper,to_date, date_format, when
from pyspark.sql.types import DateType

from pyspark.sql.types import IntegerType, FloatType

In [77]:
spark = SparkSession.builder.appName("read_csv").getOrCreate()

In [78]:
spark

# helper functions

In [129]:
from pyspark.sql.functions import udf
from pyspark.sql.types import DateType
from datetime import datetime

# Define a UDF to parse the date string
@udf(DateType())
def parse_date(date_str):
    try:
        return datetime.strptime(date_str, "%m/%d/%Y").date()
    except ValueError:
        return None  # Handle invalid date strings as needed


In [79]:
def change_dtype(dataframe, column_name,change_to):
    return dataframe.withColumn(column_name, col(column_name).cast(change_to))

In [80]:
def remove_illegal_char(dataframe,column_name):
    illegal_character = ['Can','\$',',',"\(","\)"]
    for character in illegal_character:
        dataframe = dataframe.withColumn(column_name, regexp_replace(col(column_name), character, ''))
#                              .withColumn('Open Balance', regexp_replace(col('Open Balance'), 'Can', ''))
    return dataframe

In [81]:
def trim_col(dataframe, column_name):
    return dataframe.withColumn(column_name, trim(col(column_name)))

## Reading CSV

In [82]:
sales_report_kmpg_july=spark.read.options(inferSchema='True').csv('csv_converted\saleReprtKmpg_july - SalesReportKPMGAuditResults.csv', header=True, inferSchema=True, sep=',')

## Cleaning and Transforming

In [83]:
sales_report_kmpg_july.columns

['Transaction ID',
 'Document Number',
 'Transaction Type',
 'Project ID',
 'Customer',
 'Terms',
 'Currency',
 'Gross Amount',
 'Payment',
 'Open Balance',
 'Transaction Create Date',
 'Document Date',
 'Due Date',
 'Days Open',
 'Service Dates',
 'Payment Receipt Date']

In [84]:
object_columns = ['Transaction ID','Document Number','Transaction Type', 'Project ID','Customer','Terms','Currency',]
numeric_columns = ['Gross Amount','Payment','Open Balance','Days Open']
date_columns = ['Transaction Create Date','Document Date','Due Date','Service Dates','Payment Receipt Date']

### Trim column

In [85]:
# sales_report_kmpg_july.show()

In [86]:
# sales_report_kmpg_july = trim_col(sales_report_kmpg_july,column_name='Open Balance')
for column in sales_report_kmpg_july.columns:
    sales_report_kmpg_july = trim_col(sales_report_kmpg_july,column_name=column)
    

### Remove illegal characters from numeric columns

In [87]:
# sales_report_kmpg_july = remove_illegal_char(sales_report_kmpg_july,column_name='Open Balance')
for column in numeric_columns:
#     print(column)
    sales_report_kmpg_july = remove_illegal_char(sales_report_kmpg_july,column_name=column)
#     print(sales_report_kmpg_july.show(1))
    

### Change datatype

In [88]:
# sales_report_kmpg_july = sales_report_kmpg_july.withColumn('Open Balance', col('Open Balance').cast('float'))
# for column in date_columns:
#     sales_report_kmpg_july = sales_report_kmpg_july.withColumn(column, col(column).cast(to_timestamp(column)))
    

In [89]:
sales_report_kmpg_july.dtypes

[('Transaction ID', 'string'),
 ('Document Number', 'string'),
 ('Transaction Type', 'string'),
 ('Project ID', 'string'),
 ('Customer', 'string'),
 ('Terms', 'string'),
 ('Currency', 'string'),
 ('Gross Amount', 'string'),
 ('Payment', 'string'),
 ('Open Balance', 'string'),
 ('Transaction Create Date', 'string'),
 ('Document Date', 'string'),
 ('Due Date', 'string'),
 ('Days Open', 'string'),
 ('Service Dates', 'string'),
 ('Payment Receipt Date', 'string')]

In [90]:
sales_report_kmpg_july.show(1)

+--------------+---------------+----------------+----------+--------------------+-----+--------+------------+-------+------------+-----------------------+-------------+--------+---------+--------------------+--------------------+
|Transaction ID|Document Number|Transaction Type|Project ID|            Customer|Terms|Currency|Gross Amount|Payment|Open Balance|Transaction Create Date|Document Date|Due Date|Days Open|       Service Dates|Payment Receipt Date|
+--------------+---------------+----------------+----------+--------------------+-----+--------+------------+-------+------------+-----------------------+-------------+--------+---------+--------------------+--------------------+
|       3190973|      CUMC15527|     Credit Memo|   1277343|CUMC : Columbia U...| null|     USA|      145.83| 145.83|        0.00|          7/31/23 11:02|    7/31/2023|    null|        0|07/01/2023 to 07/...|           7/30/2023|
+--------------+---------------+----------------+----------+--------------------

## Operation 1:
- 3 lines
    - 1. Open Balance != 0----------------------output ---> temp_df , Sum_Open_Balance
    - 2. Tranasaction Type  == "Credit Memo"----output ----> process_df_result
    - 3. Tranasaction Type  != "Credit Memo"----output-----> temp_df_process_3

### 1. Open Balance != 0 
    - filtering the recored `Open Balance !=0`
    - Adding Column `Outstanding = "Open"`
    - Sum of `Open Balance`

In [91]:
sales_report_kmpg_july.filter(sales_report_kmpg_july['Open Balance'] != '0.00').count()

3948

In [92]:
temp_df = sales_report_kmpg_july.filter(sales_report_kmpg_july['Open Balance'] != '0.00')

In [93]:
print("type of df",type(temp_df), "no of records",temp_df.count())

type of df <class 'pyspark.sql.dataframe.DataFrame'> no of records 3948


#### adding column Outstanding = open

In [94]:
# df_spark.withColumn('year after 2', df_spark['year']+1000).show() refrence
temp_df = temp_df.withColumn('Outstanding', lit('Open'))

In [95]:
temp_df = change_dtype(temp_df, column_name='Open Balance',change_to=FloatType())

In [96]:
Sum_Open_Balance = temp_df.select(sum(col('Open Balance'))).collect()[0][0]
Sum_Open_Balance

29295611.528552473

In [97]:
sales_report_kmpg_july.filter(sales_report_kmpg_july['Open Balance'] != '0.00').select(sum(col('Open Balance')))

DataFrame[sum(Open Balance): double]

#### Output
Sum_Open_Balance, temp_df

In [98]:
print(temp_df.count(),Sum_Open_Balance)

3948 29295611.528552473


### 2. Tranasaction Type  = "Credit Memo"

In [99]:
temp_df_process_2=sales_report_kmpg_july.filter(sales_report_kmpg_july['Transaction Type'] == 'Credit Memo')
print("Input Parameters: ", temp_df_process_2.count())

Input Parameters:  15459


In [100]:
temp_df_process_2 = temp_df_process_2.withColumn('Gross Amount', col('Gross Amount').cast('float'))
temp_df_process_2 = temp_df_process_2.withColumn('Payment', col('Payment').cast('float'))
temp_df_process_2 = temp_df_process_2.withColumn('Days Open', col('Days Open').cast('float'))

In [101]:
groupby_df_process_2 =temp_df_process_2.groupBy(['Transaction ID',"Document Number"])

In [102]:
temp_df_process_2.dtypes

[('Transaction ID', 'string'),
 ('Document Number', 'string'),
 ('Transaction Type', 'string'),
 ('Project ID', 'string'),
 ('Customer', 'string'),
 ('Terms', 'string'),
 ('Currency', 'string'),
 ('Gross Amount', 'float'),
 ('Payment', 'float'),
 ('Open Balance', 'string'),
 ('Transaction Create Date', 'string'),
 ('Document Date', 'string'),
 ('Due Date', 'string'),
 ('Days Open', 'float'),
 ('Service Dates', 'string'),
 ('Payment Receipt Date', 'string')]

In [103]:
#     "Transaction Type": first(col("Transaction Type")),  # Pick the first occurrence of "Payment Type"
#     "Project ID": max(col("Profit"))  # Calculate the maximum profit
#     "Customer": first(col("Customer")),
#     "Terms": first(col("Terms")),
#     "Currency": first(col("Currency")),
#     "Gross Amount": min(col("Gross Amount")),
#     "Payment": max(col("Payment")),
#     "Open Balance": max(col("Open Balance")),
#     "Transaction Create Date": max(col("Transaction Create Date")),
#     "Document Date": max(col("Document Date")),
#     "Due Date": max(col("Due Date")),
#     "Days Open": max(col("Days Open")),
#     "Service Dates": max(col("Service Dates")),
#     "Payment Receipt Date": col("Payment Receipt Date")

In [104]:
agg_exprs = {
    "Transaction Type": 'first',  # Pick the first occurrence of "Payment Type"
    "Project ID": 'first',  # Calculate the maximum profit
    "Customer": 'first',
    "Terms": 'first',
    "Currency": 'first',
    "Gross Amount": 'min',
    "Payment": 'max',
    "Open Balance": 'max',
    "Transaction Create Date": 'max',
    "Document Date": 'max',
    "Due Date": 'max',
    "Days Open": 'max',
    "Service Dates": 'max',
    "Payment Receipt Date": 'first'}


In [105]:
type(first(col("Transaction Type")))

pyspark.sql.column.Column

In [106]:
# groupby_df_process_2.agg(agg_exprs).count()

In [107]:
process_df_result = groupby_df_process_2.agg(agg_exprs)

In [108]:
# column got renamed need to look for other way, there is one commented above but causing some error
for column in process_df_result.columns:
#     print(column)
    new_column = column
    remove_list = ["max","min","(",")","first"]
    for char in remove_list:
#         print(char)
        if char in new_column:
            new_column=new_column.replace(char,"")
    process_df_result=process_df_result.withColumnRenamed(column,new_column)
    
    

In [109]:
# process_df_result.columns

In [110]:
process_df_result = process_df_result.withColumn('Outstanding', lit('Cleared'))
process_df_result=process_df_result.withColumn('Applying Link Amount', -col("Payment"))

In [111]:
process_df_result.select(["Transaction ID", "Outstanding","Applying Link Amount"]).show()

+--------------+-----------+--------------------+
|Transaction ID|Outstanding|Applying Link Amount|
+--------------+-----------+--------------------+
|       3191846|    Cleared|              -84.87|
|       3196377|    Cleared|              -243.0|
|       3152869|    Cleared|            -1394.96|
|       3136755|    Cleared|                -0.0|
|       3117490|    Cleared|             -111.29|
|       3119866|    Cleared|            -1397.43|
|       3051990|    Cleared|             -525.98|
|       3034098|    Cleared|              -19.48|
|       2979232|    Cleared|              -119.6|
|       2889570|    Cleared|            -2857.77|
|       2840729|    Cleared|             -1000.0|
|       3195531|    Cleared|                -0.0|
|       3198467|    Cleared|             -446.38|
|       3154622|    Cleared|                -0.0|
|       3114388|    Cleared|               -0.08|
|       3049865|    Cleared|                -0.0|
|       3050589|    Cleared|                -0.0|


In [112]:
print("Output Result :", process_df_result.count())

Output Result : 14286


### Output : process_df_result

## 3. Tranasaction Type  != "Credit Memo"

In [113]:
sales_report_kmpg_july.count()

382148

In [114]:
sales_report_kmpg_july.select("Transaction Type").distinct().show()

+----------------+
|Transaction Type|
+----------------+
|     Credit Memo|
|         Invoice|
|            null|
+----------------+



In [115]:
temp_df_process_3=sales_report_kmpg_july.filter(sales_report_kmpg_july['Transaction Type'] != 'Credit Memo')
print("Input :", temp_df_process_3.count())

Input : 366688


In [116]:
temp_df_process_3 = temp_df_process_3.withColumn('Customer', upper(trim(col('Customer'))))

In [117]:
temp_df_process_3.select('Customer').count()

366688

In [118]:
temp_df_process_3.filter(col('Transaction Type').like('% %')).select('Transaction Type')

DataFrame[Transaction Type: string]

In [119]:
temp_df_process_3

DataFrame[Transaction ID: string, Document Number: string, Transaction Type: string, Project ID: string, Customer: string, Terms: string, Currency: string, Gross Amount: string, Payment: string, Open Balance: string, Transaction Create Date: string, Document Date: string, Due Date: string, Days Open: string, Service Dates: string, Payment Receipt Date: string]

In [None]:
customer_payment_history = customer_payment_history.withColumn('Payment Date', date_format(to_date(df['Payment Date'], 'M/d/yyyy'), 'yyyy-MM-dd'))


In [55]:
customer_payment_history = customer_payment_history.withColumn()

0

In [212]:
# customer_payment_history = customer_payment_history.limit(customer_payment_history.count() - 1)

In [211]:
customer_payment_history = spark.read.options(inferSchema='True').csv('csv_converted\Customer_Payment_history_july.csv', header=True, inferSchema=True, sep=',')
customer_payment_history.count()

315677

In [201]:
# customer_payment_history.filter(col("payment Date").isNull()).count()
# customer_payment_history.filter(col("Payment Date").rlike(r"^\d{1,2}/\d{1,2}/\d{4}$")).count()
# customer_payment_history.filter(col('Payment Date').like('%1/1/1900%')).select('Payment Date').show()

# customer_payment_history 

In [221]:
customer_payment_history = spark.read.options(inferSchema='True').csv('csv_converted\Customer_Payment_history_july.csv', header=True, inferSchema=True, sep=',')

In [222]:
customer_payment_history=customer_payment_history.withColumnRenamed('Currency0','Currency')
customer_payment_history=customer_payment_history.withColumnRenamed('Currency24','Currency2')
customer_payment_history=customer_payment_history.withColumnRenamed('Approved for Email10','Approved for Email')
customer_payment_history=customer_payment_history.withColumnRenamed('Approved for Email44','Approved for Email')

In [223]:
customer_payment_history.tail(1)

[Row(Currency=None, Custom Form=None, Detailed Invoice=None, Region=None, Period=None, Billing Cycle=None, Date=None, Date Deposited/Refunded=None, Invoice to Time Action Queue=None, Revised Invoice=None, Approved for Email=None, Print=None, Type=None, Payroll Batch=None, Document Number=None, SA Entity=None, Scribe Period Display on Printout=None, Transaction Number=None, Name=None, Account=None, PO/Check Number=None, Status=None, Tracking Numbers=None, Memo=None, Currency2=None, Amount (Foreign Currency)='$1,627,361,480.84 ', Amount='1,624,379,826.75 ', Posting=None, Applying Link Amount='1,343,025,945.55 ', Applying Link Amount (Foreign Currency)='1,343,920,193.55 ', Applying Link Type=None, Applying Transaction=None, Payment Date=None, Contract Signature Date=None, On Hold Reason=None, JE Convert=None, Implementation Status=None, Actual End Date=None, Processed Rules=None, Pipeline=None, City=None, State=None, FACILITY NAME=None, Pre-EVP Approval Opp=None, Approved for Email=None, 

In [821]:
customer_payment_history = customer_payment_history.withColumnRenamed('Payment Date2','temp name')
customer_payment_history = customer_payment_history.withColumnRenamed('Payment Date','Payment Date2')
customer_payment_history = customer_payment_history.withColumnRenamed('temp name','Payment Date')

In [817]:
customer_payment_history.select('Payment Date').dtypes

[('Payment Date', 'date')]

In [816]:
customer_payment_history = customer_payment_history.withColumn('Document Number', upper(trim(col('Document Number'))))

In [815]:
customer_payment_history =customer_payment_history.withColumn('Payment Date', to_date(customer_payment_history['Payment Date'],'M/d/yyyy'))

In [873]:
customer_payment_history.select('Date').show()

+----------+
|      Date|
+----------+
|2019-01-15|
|2019-01-15|
|2019-01-15|
|2019-01-15|
|2019-01-15|
|2019-01-15|
|2019-01-15|
|2019-01-15|
|2019-01-15|
|2019-01-15|
|2019-01-15|
|2019-01-15|
|2019-01-15|
|2019-01-15|
|2019-01-15|
|2019-01-15|
|2019-01-15|
|2019-01-15|
|2019-01-15|
|2019-01-15|
+----------+
only showing top 20 rows



In [883]:
customer_payment_history.select('Date').dtypes

[('Date', 'date')]

In [973]:
from datetime import datetime
date_filter = datetime.strptime("2019-01-15", "%Y-%m-%d")
customer_payment_history.filter(customer_payment_history['Payment Date'].isNull(

SyntaxError: incomplete input (2203485108.py, line 3)

In [972]:
customer_payment_history = customer_payment_history.withColumn('Date', date_format(customer_payment_history['Date'], 'yyyy-MM-dd'))

In [971]:
customer_payment_history = customer_payment_history.withColumn('Date', date_format(to_date(customer_payment_history['Date'], 'MM/dd/yyyy'), 'yyyy-MM-dd'))


In [970]:

#  customer_payment_history =customer_payment_history.withColumn('Date', date_format(customer_payment_history['Date'], 'MM/dd/yyyy'))
# customer_payment_history = customer_payment_history.fillna('1/1/1900',subset=['Payment Date'])


In [969]:
# customer_payment_history =customer_payment_history.withColumn('Payment Date', to_date(customer_payment_history['Payment Date'],'M/d/yyyy'))
# customer_payment_history = customer_payment_history.withColumn('Document Number', upper(trim(col('Document Number'))))

In [984]:
# customer_payment_history =customer_payment_history.withColumn('Date', date_format(customer_payment_history['Date'], 'MM/dd/yyyy'))
# customer_payment_history =customer_payment_history.withColumn('Date', to_date(customer_payment_history['Date'],'yyyy-MM-dd' ))
# customer_payment_history = customer_payment_history.fillna('1/1/1900',subset=['Payment Date'])
# customer_payment_history =customer_payment_history.withColumn('Payment Date', to_date(date_format(customer_payment_history['Payment Date'],'yyyy-MM-dd'),'M/d/yyyy' ))
# customer_payment_history = customer_payment_history.withColumn('Document Number', upper(trim(col('Document Number'))))

In [1033]:
customer_payment_history.select(variable).show()

+----------+
|      Date|
+----------+
|2019-01-15|
|2019-01-15|
|2019-01-15|
|2019-01-15|
|2019-01-15|
|2019-01-15|
|2019-01-15|
|2019-01-15|
|2019-01-15|
|2019-01-15|
|2019-01-15|
|2019-01-15|
|2019-01-15|
|2019-01-15|
|2019-01-15|
|2019-01-15|
|2019-01-15|
|2019-01-15|
|2019-01-15|
|2019-01-15|
+----------+
only showing top 20 rows



In [1032]:
variable = 'Date'
print(customer_payment_history.select(variable).count(), customer_payment_history.select(variable).dtypes)

315676 [('Date', 'date')]


In [1031]:
customer_payment_history =customer_payment_history.withColumn('Date', to_date(customer_payment_history['Date'],'M/dd/yyyy' ))

In [1023]:
#correct
customer_payment_history =customer_payment_history.withColumn('Date', date_format(customer_payment_history['Date'], 'M/dd/yyyy'))


In [50]:
customer_payment_history.show(100)

+--------+------------------+----------------+----------+--------+-------------+---------+-----------------------+----------------------------+---------------+------------------+-----+-------+-------------+--------------------+---------+---------------------------------+------------------+--------------------+--------------------+---------------+------------+----------------+-----------------+---------+-------------------------+----------+-------+--------------------+---------------------------------------+------------------+--------------------+------------+-----------------------+--------------+----------+---------------------+---------------+---------------+--------+----+-----+-------------+--------------------+------------------+----------+-----------+
|Currency|       Custom Form|Detailed Invoice|    Region|  Period|Billing Cycle|     Date|Date Deposited/Refunded|Invoice to Time Action Queue|Revised Invoice|Approved for Email|Print|   Type|Payroll Batch|     Document Number|SA Ent

0

In [865]:
customer_payment_history =customer_payment_history.withColumn('Date', date_format(customer_payment_history['Date'],'M/dd/yyyy' ))

In [311]:
customer_payment_history.count()

315677

In [313]:
customer_payment_history.count()

315677

In [359]:
customer_payment_history.count()

315677

In [268]:
customer_payment_history.columns

['Currency',
 'Custom Form',
 'Detailed Invoice',
 'Region',
 'Period',
 'Billing Cycle',
 'Date',
 'Date Deposited/Refunded',
 'Invoice to Time Action Queue',
 'Revised Invoice',
 'Approved for Email',
 'Print',
 'Type',
 'Payroll Batch',
 'Document Number',
 'SA Entity',
 'Scribe Period Display on Printout',
 'Transaction Number',
 'Name',
 'Account',
 'PO/Check Number',
 'Status',
 'Tracking Numbers',
 'Memo',
 'Currency2',
 'Amount (Foreign Currency)',
 'Amount',
 'Posting',
 'Applying Link Amount',
 'Applying Link Amount (Foreign Currency)',
 'Applying Link Type',
 'Applying Transaction',
 'Payment Date',
 'Contract Signature Date',
 'On Hold Reason',
 'JE Convert',
 'Implementation Status',
 'Actual End Date',
 'Processed Rules',
 'Pipeline',
 'City',
 'State',
 'FACILITY NAME',
 'Pre-EVP Approval Opp',
 'Approved for Email',
 'Email Sent',
 'Internal ID']

In [372]:
#converting date format
customer_payment_history =customer_payment_history.withColumn('Date', date_format(to_date(customer_payment_history['Date'],'M/dd/yyyy'), 'yyyy-MM-dd'))

In [384]:
# customer_payment_history.dtypes

In [542]:
customer_payment_history.select('Payment Date').show()

+------------+
|Payment Date|
+------------+
|  2019-01-24|
|  2019-02-15|
|  2018-11-21|
|  2019-02-15|
|  2019-02-15|
|  2019-02-19|
|  2019-02-19|
|  2019-02-15|
|  2019-02-15|
|  2019-02-15|
|  2019-02-14|
|  2019-02-15|
|  2019-02-15|
|  2019-02-15|
|  2019-04-24|
|  2019-02-19|
|  2019-02-15|
|  2019-02-15|
|  2019-02-15|
|  2019-04-08|
+------------+
only showing top 20 rows



In [541]:
customer_payment_history.select('Payment Date').show()

+------------+
|Payment Date|
+------------+
|  2019-01-24|
|  2019-02-15|
|  2018-11-21|
|  2019-02-15|
|  2019-02-15|
|  2019-02-19|
|  2019-02-19|
|  2019-02-15|
|  2019-02-15|
|  2019-02-15|
|  2019-02-14|
|  2019-02-15|
|  2019-02-15|
|  2019-02-15|
|  2019-04-24|
|  2019-02-19|
|  2019-02-15|
|  2019-02-15|
|  2019-02-15|
|  2019-04-08|
+------------+
only showing top 20 rows



In [540]:
customer_payment_history.columns

['Currency',
 'Custom Form',
 'Detailed Invoice',
 'Region',
 'Period',
 'Billing Cycle',
 'Date',
 'Date Deposited/Refunded',
 'Invoice to Time Action Queue',
 'Revised Invoice',
 'Approved for Email',
 'Print',
 'Type',
 'Payroll Batch',
 'Document Number',
 'SA Entity',
 'Scribe Period Display on Printout',
 'Transaction Number',
 'Name',
 'Account',
 'PO/Check Number',
 'Status',
 'Tracking Numbers',
 'Memo',
 'Currency2',
 'Amount (Foreign Currency)',
 'Amount',
 'Posting',
 'Applying Link Amount',
 'Applying Link Amount (Foreign Currency)',
 'Applying Link Type',
 'Applying Transaction',
 'Payment Date',
 'Contract Signature Date',
 'On Hold Reason',
 'JE Convert',
 'Implementation Status',
 'Actual End Date',
 'Processed Rules',
 'Pipeline',
 'City',
 'State',
 'FACILITY NAME',
 'Pre-EVP Approval Opp',
 'Approved for Email',
 'Email Sent',
 'Internal ID']

In [598]:
customer_payment_history.select('Payment Date').dtypes

[('Payment Date', 'date')]

0

In [592]:
# customer_payment_history.filter(customer_payment_history['Paymen Date']=='1/01/1900').select('Payment Date').count()


In [537]:
customer_payment_history.dtypes

[('Currency', 'string'),
 ('Custom Form', 'string'),
 ('Detailed Invoice', 'string'),
 ('Region', 'string'),
 ('Period', 'string'),
 ('Billing Cycle', 'string'),
 ('Date', 'string'),
 ('Date Deposited/Refunded', 'string'),
 ('Invoice to Time Action Queue', 'string'),
 ('Revised Invoice', 'string'),
 ('Approved for Email', 'string'),
 ('Print', 'string'),
 ('Type', 'string'),
 ('Payroll Batch', 'string'),
 ('Document Number', 'string'),
 ('SA Entity', 'string'),
 ('Scribe Period Display on Printout', 'string'),
 ('Transaction Number', 'int'),
 ('Name', 'string'),
 ('Account', 'string'),
 ('PO/Check Number', 'string'),
 ('Status', 'string'),
 ('Tracking Numbers', 'string'),
 ('Memo', 'string'),
 ('Currency2', 'string'),
 ('Amount (Foreign Currency)', 'string'),
 ('Amount', 'string'),
 ('Posting', 'string'),
 ('Applying Link Amount', 'string'),
 ('Applying Link Amount (Foreign Currency)', 'string'),
 ('Applying Link Type', 'string'),
 ('Applying Transaction', 'string'),
 ('Payment Date', 

In [657]:
customer_payment_history.select('Date').dtypes

[('Date', 'date')]

In [666]:
customer_payment_history.select('Payment Date').show()

+------------+
|Payment Date|
+------------+
|   1/24/2019|
|   2/15/2019|
|  11/21/2018|
|   2/15/2019|
|   2/15/2019|
|   2/19/2019|
|   2/19/2019|
|   2/15/2019|
|   2/15/2019|
|   2/15/2019|
|   2/14/2019|
|   2/15/2019|
|   2/15/2019|
|   2/15/2019|
|   4/24/2019|
|   2/19/2019|
|   2/15/2019|
|   2/15/2019|
|   2/15/2019|
|    4/8/2019|
+------------+
only showing top 20 rows



In [None]:
customer_payment_history =customer_payment_history.withColumn('Date', to_date(date_format(customer_payment_history['Date'],'M/dd/yyyy'),'yyyy-MM-dd'))


0

In [700]:
customer_payment_history.select('Payment Date').dtypes

[('Payment Date', 'date')]

In [685]:
customer_payment_history.select('Payment Date').dtypes

[('Payment Date', 'date')]

In [590]:
# .filter(col('Open Balance').like('% %')).select('Open Balance').count()


0

315677

+------------+
|Payment Date|
+------------+
|   1/24/2019|
|   2/15/2019|
|  11/21/2018|
|   2/15/2019|
|   2/15/2019|
|   2/19/2019|
|   2/19/2019|
|   2/15/2019|
|   2/15/2019|
|   2/15/2019|
|   2/14/2019|
|   2/15/2019|
|   2/15/2019|
|   2/15/2019|
|   4/24/2019|
|   2/19/2019|
|   2/15/2019|
|   2/15/2019|
|   2/15/2019|
|    4/8/2019|
+------------+
only showing top 20 rows

