In [105]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import datediff, avg
import pandas as pd

# Starting the Spark Session

In [2]:
# Reuse the active Spark session if one exists; otherwise start one named 'dateml'.
spark = (
    SparkSession.builder
    .appName('dateml')
    .getOrCreate()
)

# Loading the Data

In [107]:
# Read the sample claims CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
df = spark.read.csv(
    '../tutorial-data/ClaimDataExample1.csv',
    header=True,
    inferSchema=True,
)

In [108]:
# Print the schema as a tree to check which types were inferred for each column.
df.printSchema()

root
 |-- CLM_NO: integer (nullable = true)
 |-- CLM_NO_LN: integer (nullable = true)
 |-- RCV_DT: timestamp (nullable = true)
 |-- SERV_DT: timestamp (nullable = true)
 |-- RESOLVED_DT: timestamp (nullable = true)
 |-- ALLOW_AMT: double (nullable = true)
 |-- RESOLVED_AMT: double (nullable = true)



# Registering the Table

In [109]:
# Expose the DataFrame to Spark SQL under the name 'ClaimMock'.
# createOrReplaceTempView supersedes registerTempTable (deprecated since Spark 2.0)
# and, like it, replaces any existing temporary view with the same name, so this
# cell stays safe to re-run.
df.createOrReplaceTempView('ClaimMock')

In [110]:
# List the tables and views visible to the session catalog to confirm the registration.
spark.catalog.listTables()

[Table(name='claimmock', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]

# Auto-selecting Date Fields

In [111]:
# df.dtypes yields (column name, type name) pairs; keep the names of the timestamp columns.
date_columns = [name for name, dtype in df.dtypes if dtype == 'timestamp']
df.select(date_columns).show()

+-------------------+-------------------+-------------------+
|             RCV_DT|            SERV_DT|        RESOLVED_DT|
+-------------------+-------------------+-------------------+
|2017-02-04 00:00:00|2017-02-14 00:00:00|2017-03-16 00:00:00|
|2017-07-08 00:00:00|2017-07-18 00:00:00|2017-08-17 00:00:00|
|2017-09-15 00:00:00|2017-09-25 00:00:00|2017-10-25 00:00:00|
|2018-03-03 00:00:00|2018-03-13 00:00:00|2018-04-12 00:00:00|
|2018-08-04 00:00:00|2018-08-14 00:00:00|2018-09-13 00:00:00|
|2018-12-22 00:00:00|2019-01-01 00:00:00|2019-01-31 00:00:00|
|2019-06-08 00:00:00|2019-06-18 00:00:00|2019-07-18 00:00:00|
|2019-09-24 00:00:00|2019-10-04 00:00:00|2019-11-03 00:00:00|
|2017-04-19 00:00:00|2017-04-29 00:00:00|2017-05-29 00:00:00|
|2017-09-03 00:00:00|2017-09-13 00:00:00|2017-10-13 00:00:00|
|2017-09-22 00:00:00|2017-10-02 00:00:00|2017-11-01 00:00:00|
|2017-10-09 00:00:00|2017-10-19 00:00:00|2017-11-18 00:00:00|
|2017-11-21 00:00:00|2017-12-01 00:00:00|2017-12-31 00:00:00|
|2018-03

In [112]:
# Preview only (df is not reassigned): 'duration' is RCV_DT minus SERV_DT in days.
(
    df
    .withColumn('duration', datediff('RCV_DT', 'SERV_DT'))
    .show()
)

+--------+---------+-------------------+-------------------+-------------------+------------------+------------------+--------+
|  CLM_NO|CLM_NO_LN|             RCV_DT|            SERV_DT|        RESOLVED_DT|         ALLOW_AMT|      RESOLVED_AMT|duration|
+--------+---------+-------------------+-------------------+-------------------+------------------+------------------+--------+
|10000000|        1|2017-02-04 00:00:00|2017-02-14 00:00:00|2017-03-16 00:00:00| 63.06704903424165| 8.829386864793833|     -10|
|10000000|        2|2017-07-08 00:00:00|2017-07-18 00:00:00|2017-08-17 00:00:00| 25.39506690821225|3.5553093671497153|     -10|
|10000000|        3|2017-09-15 00:00:00|2017-09-25 00:00:00|2017-10-25 00:00:00| 901.7757618741892| 126.2486066623865|     -10|
|10000000|        4|2018-03-03 00:00:00|2018-03-13 00:00:00|2018-04-12 00:00:00| 967.1990971331805| 135.4078735986453|     -10|
|10000000|        5|2018-08-04 00:00:00|2018-08-14 00:00:00|2018-09-13 00:00:00|325.94044087681294| 45.6

# Feature Engineering on Date Fields

In [113]:
# Create a day-difference feature for every ordered pair of distinct date columns.
# '<A>_to_<B>_diff' holds datediff(A, B), i.e. A minus B in days, so each pair
# appears in both directions with opposite signs.
# withColumn replaces a column that already has the same name, so re-running
# this cell does not create duplicate columns.
for target_col in date_columns:
    for other_col in date_columns:
        if target_col != other_col:
            df = df.withColumn(
                f'{target_col}_to_{other_col}_diff',
                datediff(target_col, other_col),
            )

In [116]:
# Show only the engineered columns, i.e. every column whose name contains 'diff'.
df.select([name for name in df.columns if 'diff' in name]).show()

+----------------------+--------------------------+----------------------+---------------------------+--------------------------+---------------------------+
|RCV_DT_to_SERV_DT_diff|RCV_DT_to_RESOLVED_DT_diff|SERV_DT_to_RCV_DT_diff|SERV_DT_to_RESOLVED_DT_diff|RESOLVED_DT_to_RCV_DT_diff|RESOLVED_DT_to_SERV_DT_diff|
+----------------------+--------------------------+----------------------+---------------------------+--------------------------+---------------------------+
|                   -10|                       -40|                    10|                        -30|                        40|                         30|
|                   -10|                       -40|                    10|                        -30|                        40|                         30|
|                   -10|                       -40|                    10|                        -30|                        40|                         30|
|                   -10|                       -40| 