## Reading libraries

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, trim, regexp_replace, sum, lit, max, min, first,to_timestamp, trim, upper,to_date, date_format, when
from pyspark.sql.types import DateType

from pyspark.sql.types import IntegerType, FloatType

In [2]:
# spark = SparkSession.builder.master("local[*]").appName("read_csv").getOrCreate()
spark = SparkSession.builder.appName("read_csv").getOrCreate()

In [3]:
spark

# helper functions

In [4]:
def dub_column(df):
    col = dict()
    res = list()
    for column in df.columns:
        if column in col:
            col[column] +=1
        else:
            col[column]=1
#     for column in col:
#         if col[column]>2:
            
    return col

In [5]:
# show and remove duplicate row
def remove_dup_rows(df, show= True):
    df_grou = df.groupby(*df.columns).count()
    dup = df_grou.filter(col("count")>1)
    if show == True:
        print(dup.show())
#     df = df.drop_duplicates()
    return df.drop_duplicates()

In [6]:
from pyspark.sql.functions import udf
from pyspark.sql.types import DateType
from datetime import datetime

# Define a UDF to parse the date string
@udf(DateType())
def parse_date(date_str):
    try:
        return datetime.strptime(date_str, "%m/%d/%Y").date()
    except ValueError:
        return None  # Handle invalid date strings as needed


In [7]:
def change_dtype(dataframe, column_name,change_to):
    return dataframe.withColumn(column_name, col(column_name).cast(change_to))

In [8]:
def remove_illegal_char(dataframe,column_name):
    illegal_character = ['Can','\$',',',"\(","\)"]
    for character in illegal_character:
        dataframe = dataframe.withColumn(column_name, regexp_replace(col(column_name), character, ''))
#                              .withColumn('Open Balance', regexp_replace(col('Open Balance'), 'Can', ''))
    return dataframe

In [9]:
def trim_col(dataframe, column_name):
    try:
        return dataframe.withColumn(column_name, trim(col(column_name)))
    except Exception as e:
        print(e)
        return None

## Reading CSV

In [10]:
sales_report_kmpg_july=spark.read.options(inferSchema='True').csv('csv_converted\saleReprtKmpg_july - SalesReportKPMGAuditResults.csv', header=True, inferSchema=True, sep=',')

In [11]:
dub_column(sales_report_kmpg_july)

{'Transaction ID': 1,
 'Document Number': 1,
 'Transaction Type': 1,
 'Project ID': 1,
 'Customer': 1,
 'Terms': 1,
 'Currency': 1,
 'Gross Amount': 1,
 'Payment': 1,
 'Open Balance': 1,
 'Transaction Create Date': 1,
 'Document Date': 1,
 'Due Date': 1,
 'Days Open': 1,
 'Service Dates': 1,
 'Payment Receipt Date': 1}

## Cleaning and Transforming

In [12]:
object_columns = ['Transaction ID','Document Number','Transaction Type', 'Project ID','Customer','Terms','Currency']
numeric_columns = ['Gross Amount','Payment','Open Balance','Days Open']
date_columns = ['Transaction Create Date','Document Date','Due Date','Service Dates','Payment Receipt Date']

### Trim column

In [13]:
# sales_report_kmpg_july.show()

In [14]:
sales_report_kmpg_july = trim_col(sales_report_kmpg_july,column_name='Open Balance')
for column in sales_report_kmpg_july.columns:
    sales_report_kmpg_july = trim_col(sales_report_kmpg_july,column_name=column)
    

### Remove illegal characters from numeric columns

In [15]:
# sales_report_kmpg_july = remove_illegal_char(sales_report_kmpg_july,column_name='Open Balance')
for column in numeric_columns:
#     print(column)
    sales_report_kmpg_july = remove_illegal_char(sales_report_kmpg_july,column_name=column)
#     print(sales_report_kmpg_july.show(1))
    

### Change datatype

In [16]:
# sales_report_kmpg_july = sales_report_kmpg_july.withColumn('Open Balance', col('Open Balance').cast('float'))
# for column in date_columns:
#     sales_report_kmpg_july = sales_report_kmpg_july.withColumn(column, col(column).cast(to_timestamp(column)))
    

In [17]:
sales_report_kmpg_july.dtypes

[('Transaction ID', 'string'),
 ('Document Number', 'string'),
 ('Transaction Type', 'string'),
 ('Project ID', 'string'),
 ('Customer', 'string'),
 ('Terms', 'string'),
 ('Currency', 'string'),
 ('Gross Amount', 'string'),
 ('Payment', 'string'),
 ('Open Balance', 'string'),
 ('Transaction Create Date', 'string'),
 ('Document Date', 'string'),
 ('Due Date', 'string'),
 ('Days Open', 'string'),
 ('Service Dates', 'string'),
 ('Payment Receipt Date', 'string')]

In [18]:
sales_report_kmpg_july.show(1)

+--------------+---------------+----------------+----------+--------------------+-----+--------+------------+-------+------------+-----------------------+-------------+--------+---------+--------------------+--------------------+
|Transaction ID|Document Number|Transaction Type|Project ID|            Customer|Terms|Currency|Gross Amount|Payment|Open Balance|Transaction Create Date|Document Date|Due Date|Days Open|       Service Dates|Payment Receipt Date|
+--------------+---------------+----------------+----------+--------------------+-----+--------+------------+-------+------------+-----------------------+-------------+--------+---------+--------------------+--------------------+
|       3190973|      CUMC15527|     Credit Memo|   1277343|CUMC : Columbia U...| null|     USA|      145.83| 145.83|        0.00|          7/31/23 11:02|    7/31/2023|    null|        0|07/01/2023 to 07/...|           7/30/2023|
+--------------+---------------+----------------+----------+--------------------

## Operation 1:
- 3 lines
    - 1. Open Balance != 0----------------------output ---> temp_df , Sum_Open_Balance
    - 2. Tranasaction Type  == "Credit Memo"----output ----> process_df_result
    - 3. Tranasaction Type  != "Credit Memo"----output-----> temp_df_process_3

### 1. Open Balance != 0 
    - filtering the recored `Open Balance !=0`
    - Adding Column `Outstanding = "Open"`
    - Sum of `Open Balance`

In [19]:
sales_report_kmpg_july.filter(sales_report_kmpg_july['Open Balance'] != '0.00').count()
temp_df = sales_report_kmpg_july.filter(sales_report_kmpg_july['Open Balance'] != '0.00')
print("no of records",temp_df.count())
# df_spark.withColumn('year after 2', df_spark['year']+1000).show() refrence

# adding column Outstanding = open
temp_df = temp_df.withColumn('Outstanding', lit('Open'))
temp_df = temp_df.withColumn('Applying Link Amount', temp_df["Open Balance"])
temp_df = temp_df.withColumn("FileName", lit("saleReprtKmpg_july"))
temp_df = change_dtype(temp_df, column_name='Open Balance',change_to=FloatType())
Sum_Open_Balance = temp_df.select(sum(col('Open Balance'))).collect()[0][0]
print(f"sum : {Sum_Open_Balance}, rows: {temp_df.count()}, columns {len(temp_df.columns)}")

no of records 3948
sum : 29295611.528552473, rows: 3948, columns 19


### 2. Tranasaction Type  = "Credit Memo"

In [20]:
temp_df_process_2=sales_report_kmpg_july.filter(sales_report_kmpg_july['Transaction Type'] == 'Credit Memo')
print("Input Parameters: ", temp_df_process_2.count())
temp_df_process_2 = temp_df_process_2.withColumn('Gross Amount', col('Gross Amount').cast('float'))
temp_df_process_2 = temp_df_process_2.withColumn('Payment', col('Payment').cast('float'))
temp_df_process_2 = temp_df_process_2.withColumn('Days Open', col('Days Open').cast('float'))

Input Parameters:  15459


In [21]:
groupby_df_process_2 =temp_df_process_2.groupBy(['Transaction ID',"Document Number"])

In [22]:
temp_df_process_2.dtypes

[('Transaction ID', 'string'),
 ('Document Number', 'string'),
 ('Transaction Type', 'string'),
 ('Project ID', 'string'),
 ('Customer', 'string'),
 ('Terms', 'string'),
 ('Currency', 'string'),
 ('Gross Amount', 'float'),
 ('Payment', 'float'),
 ('Open Balance', 'string'),
 ('Transaction Create Date', 'string'),
 ('Document Date', 'string'),
 ('Due Date', 'string'),
 ('Days Open', 'float'),
 ('Service Dates', 'string'),
 ('Payment Receipt Date', 'string')]

In [23]:
#     "Transaction Type": first(col("Transaction Type")),  # Pick the first occurrence of "Payment Type"
#     "Project ID": max(col("Profit"))  # Calculate the maximum profit
#     "Customer": first(col("Customer")),
#     "Terms": first(col("Terms")),
#     "Currency": first(col("Currency")),
#     "Gross Amount": min(col("Gross Amount")),
#     "Payment": max(col("Payment")),
#     "Open Balance": max(col("Open Balance")),
#     "Transaction Create Date": max(col("Transaction Create Date")),
#     "Document Date": max(col("Document Date")),
#     "Due Date": max(col("Due Date")),
#     "Days Open": max(col("Days Open")),
#     "Service Dates": max(col("Service Dates")),
#     "Payment Receipt Date": col("Payment Receipt Date")

In [24]:
agg_exprs = {
    "Transaction Type": 'first',  # Pick the first occurrence of "Payment Type"
    "Project ID": 'first',  # Calculate the maximum profit
    "Customer": 'first',
    "Terms": 'first',
    "Currency": 'first',
    "Gross Amount": 'min',
    "Payment": 'max',
    "Open Balance": 'max',
    "Transaction Create Date": 'max',
    "Document Date": 'max',
    "Due Date": 'max',
    "Days Open": 'max',
    "Service Dates": 'max',
    "Payment Receipt Date": 'first'}


In [25]:
type(first(col("Transaction Type")))

pyspark.sql.column.Column

In [26]:
# groupby_df_process_2.agg(agg_exprs).count()

In [27]:
process_df_result = groupby_df_process_2.agg(agg_exprs)

In [28]:
# column got renamed need to look for other way, there is one commented above but causing some error
for column in process_df_result.columns:
#     print(column)
    new_column = column
    remove_list = ["max","min","(",")","first"]
    for char in remove_list:
#         print(char)
        if char in new_column:
            new_column=new_column.replace(char,"")
    process_df_result=process_df_result.withColumnRenamed(column,new_column)
    
    

In [29]:
# process_df_result.columns

In [30]:
process_df_result = process_df_result.withColumn('Outstanding', lit('Cleared'))
process_df_result=process_df_result.withColumn('Applying Link Amount', -col("Payment"))

In [31]:
process_df_result.select(["Transaction ID", "Outstanding","Applying Link Amount"]).show()

+--------------+-----------+--------------------+
|Transaction ID|Outstanding|Applying Link Amount|
+--------------+-----------+--------------------+
|       3191846|    Cleared|              -84.87|
|       3196377|    Cleared|              -243.0|
|       3152869|    Cleared|            -1394.96|
|       3136755|    Cleared|                -0.0|
|       3117490|    Cleared|             -111.29|
|       3119866|    Cleared|            -1397.43|
|       3051990|    Cleared|             -525.98|
|       3034098|    Cleared|              -19.48|
|       2979232|    Cleared|              -119.6|
|       2889570|    Cleared|            -2857.77|
|       2840729|    Cleared|             -1000.0|
|       3195531|    Cleared|                -0.0|
|       3198467|    Cleared|             -446.38|
|       3154622|    Cleared|                -0.0|
|       3114388|    Cleared|               -0.08|
|       3049865|    Cleared|                -0.0|
|       3050589|    Cleared|                -0.0|


In [32]:
print("Output Result :", process_df_result.count())

Output Result : 14286


In [33]:
len(process_df_result.columns)

18

### Output : process_df_result

## 3. Tranasaction Type  != "Credit Memo"

In [34]:
print(sales_report_kmpg_july.count())
sales_report_kmpg_july.select("Transaction Type").distinct().show()

382148
+----------------+
|Transaction Type|
+----------------+
|     Credit Memo|
|         Invoice|
|            null|
+----------------+



In [35]:
temp_df_process_3=sales_report_kmpg_july.filter(sales_report_kmpg_july['Transaction Type'] != 'Credit Memo')

print("Input :", temp_df_process_3.count(), len(temp_df_process_3.columns), end="---->")

temp_df_process_3 = temp_df_process_3.withColumn('Customer', upper(trim(col('Customer'))))
temp_df_process_3 = temp_df_process_3.withColumn("FileName", lit("saleReprtKmpg_july"))
print(temp_df_process_3.count(), len(temp_df_process_3.columns))

Input : 366688 16---->366688 17


# customer_payment_history 
customer_payment_history =customer_payment_history.withColumn('Date', date_format(to_date(customer_payment_history['Date'],'M/d/yyyy'),'yyyy-MM-dd' ))

In [36]:
# customer_payment_history.filter(col("payment Date").isNull()).count()
# customer_payment_history.filter(col("Payment Date").rlike(r"^\d{1,2}/\d{1,2}/\d{4}$")).count()
# customer_payment_history.filter(col('Payment Date').like('%1/1/1900%')).select('Payment Date').show()
# customer_payment_history =customer_payment_history.withColumn('Date', to_date(customer_payment_history['Date'],'M/d/yyyy'))
# customer_payment_history =customer_payment_history.withColumn('Date', date_format(to_date(customer_payment_history['Date'],'M/d/yyyy'),'yyyy-MM-dd' ))
# customer_payment_history =customer_payment_history.withColumn('Date', date_format(customer_payment_history['Date'], 'MM/dd/yyyy'))
# customer_payment_history =customer_payment_history.withColumn('Date', to_date(customer_payment_history['Date'],'yyyy-MM-dd' ))
# customer_payment_history = customer_payment_history.fillna('1/1/1900',subset=['Payment Date'])
# customer_payment_history =customer_payment_history.withColumn('Payment Date', to_date(date_format(customer_payment_history['Payment Date'],'yyyy-MM-dd'),'M/d/yyyy' ))
# customer_payment_history = customer_payment_history.withColumn('Document Number', upper(trim(col('Document Number'))))

In [37]:
customer_payment_history = spark.read.options(inferSchema='True').csv('csv_converted\Customer_Payment_history_july.csv', header=True, inferSchema=True, sep=',')
print(f"Input {customer_payment_history.count()}, {len(customer_payment_history.columns)}, blank billing cycle : {customer_payment_history.filter(customer_payment_history['Billing Cycle'].isNull()).count()}")

Input 315677, 47, blank billing cycle : 22


In [38]:
customer_payment_history=customer_payment_history.withColumnRenamed('Currency0','Currency')
customer_payment_history=customer_payment_history.withColumnRenamed('Currency24','Currency2')
customer_payment_history=customer_payment_history.withColumnRenamed('Approved for Email10','Approved for Email')
customer_payment_history=customer_payment_history.withColumnRenamed('Approved for Email44','Approved for Email')
customer_payment_history =customer_payment_history.withColumn('Date', to_date(customer_payment_history['Date'],'M/d/yyyy'))
customer_payment_history =customer_payment_history.withColumn('Payment Date', to_date(customer_payment_history['Payment Date'],'M/d/yyyy'))
customer_payment_history= customer_payment_history.withColumnRenamed("Payment Date", "Payment Date2")
customer_payment_history= customer_payment_history.withColumn("Payment Date", customer_payment_history["Date"])
customer_payment_history = customer_payment_history.filter((customer_payment_history['Payment Date'] >= "2018-10-01") & (customer_payment_history['Payment Date'] <= "2023-07-31"))
print(f"Input {customer_payment_history.count()}, columns : {len(customer_payment_history.columns)}")
print(f"Output {customer_payment_history.count()}, columns : {len(customer_payment_history.columns)}, blank billing cycle : {customer_payment_history.filter(customer_payment_history['Billing Cycle'].isNull()).count()}")

Input 315676, columns : 48
Output 315676, columns : 48, blank billing cycle : 21


# Inner join output of 2 and  customer_payment_history

In [39]:
print(f"{temp_df_process_3.count()} {len(temp_df_process_3.columns)}, : {customer_payment_history.count()} {len(customer_payment_history.columns)}")

366688 17, : 315676 48


In [40]:
df_temp_1 =  customer_payment_history.withColumnRenamed("Currency","temp")
df_temp_1 = df_temp_1.withColumnRenamed("Approved for Email","temp2")
joined_df_2_payment_history = temp_df_process_3.join(df_temp_1, on='Document Number', how='inner')
joined_df_2_payment_history = joined_df_2_payment_history.withColumn("Outstanding", lit("Cleared"))

In [41]:
print(len(joined_df_2_payment_history.columns), joined_df_2_payment_history.count())

65 315678


In [42]:
joined_df_2_payment_history.filter(joined_df_2_payment_history['Billing Cycle'].isNull()).count()

21

# Union

In [43]:
temp_df_copy = temp_df.select("*")
joined_df_2_payment_history_copy = joined_df_2_payment_history.select("*")
process_df_result_copy = process_df_result.select("*")

In [44]:
print("Rows : ",process_df_result.count(),joined_df_2_payment_history.count(), temp_df_copy.count())
print("columns : ",len(process_df_result_copy.columns), len(joined_df_2_payment_history_copy.columns),len(temp_df_copy.columns))
print("columns : ",len(process_df_result.columns), len(joined_df_2_payment_history.columns),len(temp_df.columns))

Rows :  14286 315678 3948
columns :  18 65 19
columns :  18 65 19


In [45]:
column_names=set()
for df in [temp_df, joined_df_2_payment_history,process_df_result]:
    for column in df.columns:
        column_names.add(column)
column_names = list(column_names)
print(len(column_names))

65


In [46]:
print(len(temp_df_copy.columns), end= "-->")
for column in column_names:
    if column not in temp_df_copy.columns:
#         print(column)
        temp_df_copy = temp_df_copy.withColumn(column,lit(None).cast("string"))
print(len(temp_df_copy.columns))

19-->65


In [47]:
print(len(joined_df_2_payment_history_copy.columns), end= "-->")
for column in column_names:
    if column not in joined_df_2_payment_history_copy.columns:
        print(column)
        joined_df_2_payment_history_copy = joined_df_2_payment_history_copy.withColumn(column,lit(None).cast("string"))
print(len(joined_df_2_payment_history_copy.columns))

65-->65


In [48]:
print(len(process_df_result_copy.columns), end= "-->")
for column in column_names:
    if column not in process_df_result_copy.columns:
#         print(column)
        process_df_result_copy = process_df_result_copy.withColumn(column,lit(None).cast("string"))
print(len(process_df_result_copy.columns))

18-->65


In [49]:
union_df = temp_df_copy.unionByName(joined_df_2_payment_history_copy).unionByName(process_df_result_copy)

In [50]:
print(union_df.count(), len(union_df.columns))
union_df.select(sum(col('Open Balance'))).collect()[0][0]

333912 65


43992937.400000066

In [51]:
union_df.filter(union_df["Billing Cycle"].isNull()).count()

18255

In [52]:
union_df.filter(union_df["Billing Cycle"]== None).count()

0

In [53]:
union_df.filter(union_df["Billing Cycle"]=="").count()

0

In [54]:
union_df.filter(union_df["Billing Cycle"]==" ").count()

0

## Adding column Billing Cycle2 - Contract Terms per Client (1) - Union


In [85]:
import pandas as pd
df = pd.read_csv('csv_converted\Contract Terms per Client (1).csv', encoding = 'latin1' )

  df = pd.read_csv('csv_converted\Contract Terms per Client (1).csv', encoding = 'latin1' )


In [80]:
contract_terms_df=spark.read.csv('csv_converted\Contract Terms per Client (1).csv', header=True)
# contract_terms_df= contract_terms_df.withColumnRenamed("Name","Customer")
# contract_terms_df = contract_terms_df.select(["Customer","Billing Cycle"])
# contract_terms_df = contract_terms_df.withColumnRenamed("Billing Cycle","Billing Cycle2")
# df_block1 = union_df.join(contract_terms_df,on='Customer', how='leftouter')
# print(df_block1.count(),len(df_block1.columns))

In [81]:
contract_terms_df.count()

9228

In [78]:
# df = df.na.drop()
d=contract_terms_df.dropDuplicates()

In [79]:
d.count()

8879

In [72]:
d.filter

0

In [363]:
contract_terms_df.count()

9228

In [396]:
len(contract_terms_df.columns)

151

In [332]:
contract_terms_df.filter(contract_terms_df["Billing Cycle2"].isNull()).count()

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `Billing Cycle2` cannot be resolved. Did you mean one of the following? [`Internal ID`, `Name`, `Sales Rep`, `Billing Specialist`, `Billing Cycle`, `Starting regular rate`, `Current regular rate`, `Maximum Daily Hours`, `Rate Variance`, `Rate Variance Billable`, `Rate Variance Rule`, `Balance Bill_cb`, `Balance Bill Applicable`, `Balance Bill Type`, `Balance Bill Hours`, `Balance Bill Combined`, `Balance Bill Combined Sites`, `Holiday`, `Holiday Billable`, `Holiday Rate`, `US Holidays`, `Canada Holidays`, `Australia Holidays`, `Overtime`, `Overtime Billable`, `Overtime Type`, `Overtime Class`, `Overtime Rate`, `Night Shift Differential`, `Night Shift Differential Billable`, `Night Shift Differential Rate`, `PRE-CLINICAL & CLINICAL TRAINING`, `PRE-CLINICAL & CLINICAL TRAINING BILLABLE`, `PRE-CLINICAL & CLINICAL TRAINING CAP TYPES`, `PRE-CLINICAL & CLINICAL TRAINING COMBINED`, `PRE-CLINICAL & CLINICAL TRAINING COMBINED SITES`, `PRE-CLINICAL & CLINICAL TRAINING - CURRENT YEAR CAP`, `PRE-CLINICAL & CLINICAL TRAINING - AMOUNT BILLED TO DATE`, `PRE-CLINICAL TRAINING RATE`, `CLINICAL TRAINING RATE`, `Client Mandated Training_cb`, `CLIENT MANDATED TRAINING BILLABLE`, `CLIENT MANDATED TRAINING TYPE`, `CLIENT MANDATED - CURRENT YEAR CAP`, `CLIENT MANDATED - AMOUNT BILLED TO DATE`, `CLIENT MANDATED TRAINING RATE`, `ORIENTATION & CLIENT MANDATED COMBINED`, `Orientation_cb`, `Orientation Billable`, `Orientation Type`, `ORIENTATION - CURRENT YEAR CAP`, `ORIENTATION - AMOUNT BILLED TO DATE`, `Orientation Rate`, `Customization Fee`, `Customization Fee Billable`, `Customization Fee Type`, `Customization - Other`, `CUSTOMIZATION FEE AMOUNT`, `Customization Fee Cap`, `Customization Cap Amount`, `Shift Cancellation_cb`, `Shift Cancellation Billable`, `Cancellation Billable Hrs Qty`, `Cancellation Billable Shift Percent`, `Cancellation Rate/Shift`, `Cancellation - Equipment Malfunction`, `Uncovered Shift Penalty_cb`, `Uncovered Shift Penalty Billable`, `Uncovered Shift Type`, `Uncovered Shift Penalty Rate`, `Parking Cost_cb`, `Parking Cost Billable`, `Increase Bill Rate By`, `Site Provides Free Parking`, `Parking Increase Applied`, `Travel`, `Travel Responsibility Billable`, `Credentialing Billable_cb`, `Credential Billable`, `Year 1 Cap`, `Subsequent Years Cap`, `3rd party billable`, `SA & Client Split Cost`, `Scribe - Cap Amount`, `Credentialing Comments`, `Minimum Wage`, `Min`.` Wage Applicable`, `Minimum Wage Contract Section`, `Min`.` Wage Applicable if Above`, `Minimum Wage Contract Percent`, `Current Year State Min`.` Wage`, `Current Year State Min`.` Wage Change`, `SA Percent Increase`, `Current Year Rate Increase By`, `Min`.` Wage Last Increase Date`, `Min`.` Wage Date Waived`, `Min`.` Wage Advance Notice`, `COLA`, `CPI`, `COLA / CPI Applicable`, `COLA / CPI Contract Section`, `COLA / CPI Percent`, `COLA / CPI Terms`, `COLA / CPI Last Date of Change`, `COLA / CPI Next Date of Increase`, `COLA / CPI Date Waived`, `COLA Advance Notice`, `Early Termination Fee`, `Early Termination Amount`, `Comments`, `Late Fee Surcharge`, `Late Fee Surcharge Applicable`, `Surcharge If Payment Received After Net Terms`, `Payment Receive After (Days)`, `Additional Surcharge Percent`, `Account Status`, `DNU`.`Takeover Site`, `Transition From`, `Consolidated Billing`, `Setup Per Customer`, `Estimated Starting Coverage`, `Current coverage (hrs/wk)`, `Coverage Combined`, `Coverage Combined Sites`, `Current coverage (shifts/day)`, `Chief Scribe Email`, `Quality Assurance Specialist Email`, `PL (Project Leader)`, `JPL (Jr`.` Project Manager)`, `Reg`.` Mgr (Regional Manager)`, `VP Senior Management`, `RP (Regional President)`, `Region`, `Global Contract Effective Date`, `Global Contract Signed Date`, `Contract Expiration Date`, `Auto Renew`, `SLA/SOW Effective Date`, `Addendum Effective Date`, `Amendment Effective Date`, `Start Date`, `First Worked Date`, `First Service invoice date`, `Est`.` Implementation Date`, `Proposed Implementation Date`, `Actual Implementation Start Date`, `Actual Implementation End Date`, `Shipping City`, `Shipping State/Province`, `CORP STAFFING GROUP`, `Termination Date`].

In [306]:
contract_terms_df.filter(contract_terms_df["Billing Cycle2"].isNull()).count()

822

In [308]:
contract_terms_df.filter(contract_terms_df["Billing Cycle2"]==" ").count()

0

In [85]:
dub_column(contract_terms_df)

{'Customer': 1, 'Billing Cycle2': 1}

# Adding Column region2 - join

In [86]:
"D:\Bluethink\alytrx project\csv_converted\Region (2).csv"
region_df=spark.read.options(inferSchema='True').csv('csv_converted\Region (2).csv', header=True, inferSchema=True, sep=',')
region_df_join_operation = region_df.select(["Name","region"]) # dataframe to perform join operation
region_df_join_operation = region_df_join_operation.withColumnRenamed("Name","Customer")
region_df_join_operation = region_df_join_operation.withColumnRenamed("region","region2")
region_df = region_df.withColumnRenamed("Internal ID0","Internal ID")
region_df = region_df.withColumn('Name',upper(trim(col('Name'))))

In [87]:
# removing row having same customer name and null in region
extra_column_to_remove=region_df_join_operation.groupBy("Customer").count().filter(col("count")>1).select("Customer")
print(region_df_join_operation.count(), end="-->")
for row in extra_column_to_remove.select("Customer").collect():
    for data in row:
        region_df_join_operation=region_df_join_operation.filter((region_df_join_operation["Customer"]!=data) | (region_df_join_operation["region2"].isNotNull()))
print(region_df_join_operation.count())


11464-->11462


In [88]:
df_block1 = df_block1.join(region_df_join_operation, on="Customer", how="left")

In [89]:
print(df_block1.count(), len(df_block1.columns))

333912 67


## Adding Column `Exclude` from `Bankrupt and Other Customers`

In [90]:
bankrupt_df=spark.read.options(inferSchema='True').csv('csv_converted/Bankrupt and Other Customers (1)_sheet1.csv', header=True, inferSchema=True, sep=',')
bankrupt_df = bankrupt_df.withColumn("Customer Name", trim(col("Customer Name")))

In [91]:
a = df_block1.join(bankrupt_df, df_block1["Customer"]==bankrupt_df["Customer Name"], how="left").drop("Customer Name")

In [102]:
df_block1.filter(df_block1["Billing Cycle"]==" ").count()

0

In [92]:
df_block1.filter(df_block1["Billing Cycle2"].isNull()).count()

324128

In [93]:
df_block1.filter(df_block1["Billing Cycle"].isNull()).count()

18255

In [94]:
a.filter(df_block1["Billing Cycle2"].isNull()).count()

324128

In [95]:
a.filter(df_block1["Billing Cycle"].isNull()).count()

18255

In [96]:
c = df_block1.withColumn(
        "Billing Cycle2",
        when(
            (col("Billing Cycle2").isNull()) |
            (col("Billing Cycle2") == "") |
            (col("Billing Cycle2") == " "),
            col("Billing Cycle")
        ).otherwise(col("Billing Cycle2"))
    )

In [215]:
a.filter(a["Billing Cycle2"].isNull()).select("Billing Cycle2").count()

324128

In [98]:
print(a.count(), len(a.columns))

333912 68


<!-- import time
completed = 0
start_time = time.time()
for i, row in enumerate(region_df.rdd.collect()):
    if (i+1)%100==0:
        print(i+1, end="")
        completed += 0.8724480893386843
        print(f"  : {completed:.2f} %", end = " ,")
        print(f"Took {(time.time()-start_time):.2f} seconds")
#         
#         print(region_df.count)
    else:
        if i%2==0:
            print(".", end="")
    c = c.withColumn("region2", when(c["Customer"] == row["Name"], row["region2"]).otherwise(None))
                                                                     
     -->