In [4]:
import pyspark
import requests
import pandas as pd

In [175]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat, lpad, length, regexp_replace, initcap, lower, col, lit

In [6]:
spark = SparkSession\
    .builder\
    .appName("extract")\
    .getOrCreate()

23/02/17 18:50:37 WARN Utils: Your hostname, Sules-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.0.17 instead (on interface en0)
23/02/17 18:50:37 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/17 18:50:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Functions

In [7]:
def get_df_size(sparkdf):
    return (sparkdf.count(), len(sparkdf.columns))

### 1. Load data sets

In [191]:
# JSON files
branch = spark.read.json("./Credit Card Dataset/cdw_sapp_branch.json")
credit = spark.read.json("./Credit Card Dataset/cdw_sapp_credit.json")
customer = spark.read.json("./Credit Card Dataset/cdw_sapp_custmer.json")

In [10]:
get_df_size(credit), get_df_size(branch), get_df_size(customer)

((46694, 9), (115, 8), (952, 14))

In [11]:
# Loan application API
# note add try except for accessing the api
url = "https://raw.githubusercontent.com/platformps/LoanDataset/main/loan_data.json"
response = requests.get(url)
print(response)
pandas_df = pd.DataFrame(response.json())
loan = spark.createDataFrame(pandas_df)

<Response [200]>


  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


In [12]:
get_df_size(loan)

(511, 10)


### 2. Transform data according to the specifications at [Mapping document](https://docs.google.com/spreadsheets/d/1t8UxBrUV6dxx0pM1VIIGZpSf4IKbzjdJ/edit#gid=1823293337)

In [155]:
table_names = {'credit':'CDW_SAPP_CREDIT_CARD',
               'branch':'CDW_SAPP_BRANCH',
               'customer':'CDW_SAPP_CUSTOMER'}
# Mapping dictionaries:
# tuple: (old_col, new_col, type)
column_map = {'credit':[('CREDIT_CARD_NO','CUST_CC_NO','string'),
                     #(['DAY','MONTH','YEAR'],'TIMEID','string'),
                     ('CUST_SSN','CUST_SSN','integer'),
                     ('BRANCH_CODE','BRANCH_CODE','integer'),
                     ('TRANSACTION_TYPE','TRANSACTION_TYPE','string'),
                     ('TRANSACTION_VALUE','TRANSACTION_VALUE','float'),
                     ('TRANSACTION_ID','TRANSACTION_ID','integer')],
           'branch':[('BRANCH_CODE','BRANCH_CODE','integer'),
                     ('BRANCH_NAME','BRANCH_NAME','integer'),
                     ('BRANCH_STREET','BRANCH_STREET','string'),
                     ('BRANCH_CITY','BRANCH_CITY','string'),
                     ('BRANCH_STATE','BRANCH_STATE','string'),
                     ('BRANCH_ZIP','BRANCH_ZIP','integer'),
                     ('BRANCH_PHONE','BRANCH_PHONE','string'),
                     ('LAST_UPDATED','LAST_UPDATED','timestamp')],
           'customer':[('SSN','SSN','integer'),
                       ('FIRST_NAME','FIRST_NAME','string'),
                       ('MIDDLE_NAME','MIDDLE_NAME','string'),
                       ('LAST_NAME','LAST_NAME','string'),
                       ('CREDIT_CARD_NO','CREDIT_CARD_NO','string'),
                       #(['STREET_NAME','APT_NO'],'FULL_STREET_ADDRESS','string'),
                       ('CUST_CITY','CUST_CITY','string'),
                       ('CUST_STATE','CUST_STATE','string'),
                       ('CUST_COUNTRY','CUST_COUNTRY','string'),
                       ('CUST_ZIP','CUST_ZIP','integer'),
                       ('CUST_PHONE','CUST_PHONE','string'),
                       ('CUST_EMAIL','CUST_EMAIL','string'),
                       ('LAST_UPDATED','LAST_UPDATED','timestamp')]}

In [233]:
def transform(spark_df, data_name):
    map_dict = column_map[data_name]
    for tup in map_dict:
        #general type casting
        new_spark_df = spark_df.withColumn(tup[1],spark_df[tup[0]].cast(tup[2]))
        
        # special instructions for dataframe transformations
        if data_name == 'credit':
            new_spark_df = spark_df.withColumns({'MONTH': lpad(spark_df['MONTH'],2,'0'),
                                     'DAY':lpad(spark_df['DAY'],2,'0')})
            new_spark_df = new_spark_df.withColumns({'TIMEID':concat("YEAR", "MONTH", "DAY")})
        elif data_name == 'branch':
            # check for null zip codes:
            if new_spark_df.filter(spark_df['BRANCH_ZIP'].isNull()).collect():
                new_spark_df = new_spark_df.fillna(value={'BRANCH_ZIP':99999})
            # format phone_numbers:
            new_spark_df = new_spark_df.withColumn('BRANCH_PHONE',
                                                regexp_replace('BRANCH_PHONE', r'(\d{3})(\d{3})(\d{4})',"($1)$2-$3"))
        elif data_name == 'customer':
            new_spark_df = new_spark_df.withColumns({'FIRST_NAME': initcap('FIRST_NAME'),
                                                 'MIDDLE_NAME': lower('MIDDLE_NAME'),
                                                 'LAST_NAME': initcap('LAST_NAME')})
            new_spark_df = new_spark_df.withColumn('FULL_STREET_ADDRESS', concat(col('STREET_NAME'),lit(', '), col('APT_NO')))
            new_spark_df = new_spark_df.withColumn('CUST_PHONE',
                                                regexp_replace('CUST_PHONE', r'(\d{3})(\d{4})',"(111)$1-$2"))
            
            #new_spark_df.show()
    return new_spark_df

In [234]:
transform(customer,'customer').show(3)
transform(credit,'credit').show(3)
transform(branch,'branch').show(3)

+------+----------------+------------+-------------+-------------------+-------------+----------+--------+----------+---------+-------------------+-----------+---------+-----------------+--------------------+
|APT_NO|  CREDIT_CARD_NO|   CUST_CITY| CUST_COUNTRY|         CUST_EMAIL|   CUST_PHONE|CUST_STATE|CUST_ZIP|FIRST_NAME|LAST_NAME|       LAST_UPDATED|MIDDLE_NAME|      SSN|      STREET_NAME| FULL_STREET_ADDRESS|
+------+----------------+------------+-------------+-------------------+-------------+----------+--------+----------+---------+-------------------+-----------+---------+-----------------+--------------------+
|   656|4210653310061055|     Natchez|United States|AHooper@example.com|(111)123-7818|        MS|   39120|      Alec|   Hooper|2018-04-21 12:49:02|         wm|123456100|Main Street North|Main Street North...|
|   829|4210653310102868|Wethersfield|United States|EHolman@example.com|(111)123-8933|        CT|   06109|      Etta|   Holman|2018-04-21 12:49:02|    brendan|12345