### Read CSV files partitioned by a Hive-style folder structure (`yyyy=…/mm=…/dd=…`)

In [95]:
# Run this notebook from the parent directory of ./test_data/ (paths below are relative to it)
import os

# Root of the partitioned CSV input (sub-folders are named yyyy=…/mm=…/dd=…).
BASE_DATA_FOLDER: str = "./test_data/csv_data"

def tree_printer(root_folder):
    """Print the path of every file found under ``root_folder``.

    ``os.walk`` already descends into every sub-directory, so a single pass
    visits the whole tree. No explicit recursion is done here: recursing with
    a bare directory name would resolve it against the current working
    directory instead of its real parent and could list unrelated files.

    Args:
        root_folder: Directory to walk. Printed paths are built by joining
            each visited directory (as yielded by ``os.walk``) with the file
            name, so they keep the same relative/absolute form as
            ``root_folder``.

    Returns:
        None. The paths are written to stdout, one per line.
    """
    # Use a separate loop variable so the ``root_folder`` argument is not shadowed.
    for current_dir, _sub_dirs, files in os.walk(root_folder):
        for f in files:
            print(os.path.join(current_dir, f))
            
# Show where the notebook is running, then list every input file under the base folder.
working_dir = os.getcwd()
print('Current in folder:', working_dir)
tree_printer(BASE_DATA_FOLDER)

Current in folder: /Users/chawl001/Dev/Python/Spark
./test_data/csv_data/.DS_Store
./test_data/csv_data/yyyy=2020/.DS_Store
./test_data/csv_data/yyyy=2020/mm=05/dd=31/user_orig.csv
./test_data/csv_data/yyyy=2020/mm=06/dd=18/user_latest.csv
./test_data/csv_data/yyyy=2020/mm=06/dd=15/user_day2.csv


In [96]:
# Read every CSV under the base folder in one call, treating the first line of
# each file as the header. No schema is supplied, so the CSV columns are read
# as strings (see the printed schema below).
user_df = spark.read.option('header', True).csv(BASE_DATA_FOLDER)
user_df.printSchema()

root
 |-- USER_ID: string (nullable = true)
 |-- USER_NAME: string (nullable = true)
 |-- SESSION_ID: string (nullable = true)
 |-- LOG_IN_DT_TM: string (nullable = true)
 |-- LOGOUT_DT_TM: string (nullable = true)
 |-- yyyy: integer (nullable = true)
 |-- mm: integer (nullable = true)
 |-- dd: integer (nullable = true)



In [97]:
# Display up to 25 rows of the dataframe exactly as it was read.
print('Original Dataframe:')
user_df.show(n=25)

Original Dataframe:
+-------+----------+-----------+-------------------+-------------------+----+---+---+
|USER_ID| USER_NAME| SESSION_ID|       LOG_IN_DT_TM|       LOGOUT_DT_TM|yyyy| mm| dd|
+-------+----------+-----------+-------------------+-------------------+----+---+---+
|    100|    lalitc|123-456-789|2020-05-30 23:59:50|2020-05-31 20:21:22|2020|  5| 31|
|    007|  bond_007|007-007-007|2020-05-31 07:07:57|2020-05-31 08:07:57|2020|  5| 31|
|    101|  mt_baker|009-009-009|2020-05-31 10:10:10|2020-05-31 22:22:22|2020|  5| 31|
|    007|  bond_007|007-007-777|2020-06-16 00:00:01|2020-06-16 23:59:59|2020|  6| 18|
|    100|    lalitc|999-000-999|2020-06-16 03:30:33|2020-06-16 06:55:06|2020|  6| 18|
|    100|    lalitc|000-888-000|2020-06-16 08:08:08|2020-06-16 09:08:08|2020|  6| 18|
|    007|  bond_007|007-007-777|2020-06-15 01:01:01|2020-06-15 23:59:59|2020|  6| 15|
|    111|mt_rainier|447-447-447|2020-06-15 09:00:00|2020-06-15 17:00:00|2020|  6| 15|
+-------+----------+-----------+--

In [98]:
print('Repartitioned Dataframe:')
# Redistribute the rows across Spark partitions keyed on USER_ID.
user_df_partn_by_UserID = user_df.repartition('USER_ID')
user_df_partn_by_UserID.sort('mm','dd').show(25)

# Write out the repartitioned dataframe.
# The save mode must be set with .mode("overwrite"); .option("overwrite", "true")
# is not a save mode, so the write would fail whenever the target folder
# already exists (e.g. when this cell is re-run).
repartition_data_folder = './test_data/repartitioned_by_user_id/'
user_df_partn_by_UserID.write.mode("overwrite").csv(repartition_data_folder, header=True)

print('Writen repartitioned files at: ', repartition_data_folder)
tree_printer(root_folder=repartition_data_folder)

# Read the files back to confirm the round trip.
reread_df = spark.read.csv(repartition_data_folder, header=True)
reread_df.show(25)

Repartitioned Dataframe:
+-------+----------+-----------+-------------------+-------------------+----+---+---+
|USER_ID| USER_NAME| SESSION_ID|       LOG_IN_DT_TM|       LOGOUT_DT_TM|yyyy| mm| dd|
+-------+----------+-----------+-------------------+-------------------+----+---+---+
|    101|  mt_baker|009-009-009|2020-05-31 10:10:10|2020-05-31 22:22:22|2020|  5| 31|
|    007|  bond_007|007-007-007|2020-05-31 07:07:57|2020-05-31 08:07:57|2020|  5| 31|
|    100|    lalitc|123-456-789|2020-05-30 23:59:50|2020-05-31 20:21:22|2020|  5| 31|
|    007|  bond_007|007-007-777|2020-06-15 01:01:01|2020-06-15 23:59:59|2020|  6| 15|
|    111|mt_rainier|447-447-447|2020-06-15 09:00:00|2020-06-15 17:00:00|2020|  6| 15|
|    100|    lalitc|999-000-999|2020-06-16 03:30:33|2020-06-16 06:55:06|2020|  6| 18|
|    100|    lalitc|000-888-000|2020-06-16 08:08:08|2020-06-16 09:08:08|2020|  6| 18|
|    007|  bond_007|007-007-777|2020-06-16 00:00:01|2020-06-16 23:59:59|2020|  6| 18|
+-------+----------+---------

In [99]:
# Keep a single row per USER_ID.
# NOTE(review): no ordering is applied before de-duplicating, so which row
# survives for each USER_ID is presumably arbitrary — confirm if a specific
# (e.g. latest) session is expected.
dedup_df = reread_df.drop_duplicates(['USER_ID'])
dedup_df.show()

+-------+----------+-----------+-------------------+-------------------+----+---+---+
|USER_ID| USER_NAME| SESSION_ID|       LOG_IN_DT_TM|       LOGOUT_DT_TM|yyyy| mm| dd|
+-------+----------+-----------+-------------------+-------------------+----+---+---+
|    101|  mt_baker|009-009-009|2020-05-31 10:10:10|2020-05-31 22:22:22|2020|  5| 31|
|    100|    lalitc|123-456-789|2020-05-30 23:59:50|2020-05-31 20:21:22|2020|  5| 31|
|    111|mt_rainier|447-447-447|2020-06-15 09:00:00|2020-06-15 17:00:00|2020|  6| 15|
|    007|  bond_007|007-007-007|2020-05-31 07:07:57|2020-05-31 08:07:57|2020|  5| 31|
+-------+----------+-----------+-------------------+-------------------+----+---+---+

