# FDM File Management
1. Convert the CSV dataset to Parquet to reduce file size (deprecated).
2. Create a database of the FDM files: file name, origin file, path, and file size.

## Import

In [2]:
import pandas as pd
import numpy as np
import os
import math

### Constant

In [11]:
# fdm_folder_path = '../dataset/fdm'
# output = '../dataset/database_local/01_fdm_files.csv'

In [1]:
# Root folder that is walked/globbed for FDM CSV files.
fdm_folder_path = '/Volumes/My Passport/desy'
# Destination CSV that the FDM file database (fdm_files) is written to.
output = '../dataset/database/01_fdm_files.csv'

## Read Files
Retrieve information about all FDM files and build a dataset from it.

In [3]:
# Walk the FDM folder tree and record, for every CSV file found, its name,
# the name of the folder that contains it (used as tail id), its size in MB
# and its full path. The four lists stay index-aligned.
filename_list, tail_id_list, fsize_mb_list, fullpath_list = [], [], [], []
for root, dirs, files in os.walk(fdm_folder_path):
    # All files listed for this directory share the same containing folder.
    tail_id = os.path.basename(root)
    for file in files:
        if not file.endswith(".csv"):
            continue

        path = os.path.join(root, file)
        # Bytes -> megabytes (10**6 bytes), rounded to two decimals.
        size_in_bytes = os.stat(path).st_size
        filesize = round(size_in_bytes / 1000000, 2)

        filename_list.append(file)
        fullpath_list.append(path)
        fsize_mb_list.append(filesize)
        tail_id_list.append(tail_id)

### Create File Database

In [4]:
# Combine the index-aligned lists into one table with one row per CSV file.
col_to_list = dict(
    fname=filename_list,
    tail_id=tail_id_list,
    fsize_mb=fsize_mb_list,
    fullpath=fullpath_list,
)
fdm_files = pd.DataFrame(data=col_to_list)

In [5]:
# Show the assembled file table (notebook echo of the last expression).
fdm_files

Unnamed: 0,fname,tail_id,fsize_mb,fullpath
0,686200104111724.csv,tail_686_1,50.99,/Volumes/My Passport/desy/tail_686_1/686200104...
1,686200104120602.csv,tail_686_1,24.43,/Volumes/My Passport/desy/tail_686_1/686200104...
2,686200104120802.csv,tail_686_1,40.53,/Volumes/My Passport/desy/tail_686_1/686200104...
3,686200104121013.csv,tail_686_1,43.39,/Volumes/My Passport/desy/tail_686_1/686200104...
4,686200104121245.csv,tail_686_1,61.52,/Volumes/My Passport/desy/tail_686_1/686200104...
...,...,...,...,...
9311,687201003032120.csv,tail_687_9,16.87,/Volumes/My Passport/desy/tail_687_9/687201003...
9312,687201003040534.csv,tail_687_9,20.63,/Volumes/My Passport/desy/tail_687_9/687201003...
9313,687201003040811.csv,tail_687_9,56.74,/Volumes/My Passport/desy/tail_687_9/687201003...
9314,687201003041105.csv,tail_687_9,55.84,/Volumes/My Passport/desy/tail_687_9/687201003...


In [6]:
# Sanity check: True when no file name occurs more than once in the table.
fdm_files['fname'].is_unique

True

In [7]:
# Persist the file database without the row index.
# to_csv does not create missing folders, so make sure the target folder
# exists first (a bare file name has no folder component to create).
output_dir = os.path.dirname(output)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
fdm_files.to_csv(output, index=False)

## Convert to Parquet (Optional)
Spark is used here to speed up the conversion.

#### Initialize Spark

In [2]:
# findspark.init() runs before the pyspark import below; it is expected to
# make the local Spark installation importable - keep this order.
import findspark
findspark.init()

import pyspark
import random

# Settings for PySpark to work.
# NOTE: only driver_memory ends up in the submit arguments; num_executors and
# executor_memory are defined here but not passed on in this cell.
driver_memory, num_executors, executor_memory = '4g', 2, '1g'
pyspark_submit_args = f' --driver-memory {driver_memory} pyspark-shell'

# Expose the submit arguments through the environment variable PySpark reads.
os.environ["PYSPARK_SUBMIT_ARGS"] = pyspark_submit_args

# Import modules needed for PySpark
from pyspark.sql import SparkSession

In [3]:
import pyspark.sql.functions as F 
from pyspark.sql.types import DoubleType, IntegerType, StringType

In [4]:
# Reuse the active Spark session if there is one, otherwise start a new one
# under the application name "Data Exploration".
spark = (
    SparkSession.builder
    .appName("Data Exploration")
    .getOrCreate()
)

In [5]:
#Helper for pretty formatting for Spark DataFrames
def showDF(df, limitRows=20, truncate=True):
    """Display the first rows of a Spark DataFrame as a pandas table.

    Args:
        df: Object whose ``limit(n)`` result provides ``toPandas()``
            (a Spark DataFrame in this notebook).
        limitRows: Maximum number of rows to fetch and to display.
        truncate: When true, cell text is capped at 50 characters; when
            false, the column width limit is lifted (``None``).

    The pandas display options are changed only while the table is rendered.
    Both ``display.max_colwidth`` and ``display.max_rows`` are put back to the
    values they had before the call, even if fetching or rendering raises.
    """
    max_colwidth = 50 if truncate else None
    with pd.option_context('display.max_colwidth', max_colwidth,
                           'display.max_rows', limitRows):
        # `display` is the rich-output function available inside the notebook.
        display(df.limit(limitRows).toPandas())

#### Extract All CSV

In [10]:
# Read every CSV located one folder below the FDM root (header row enabled)
# into a single DataFrame and tag each row with the file it was read from.
all_fdm_df = (
    spark.read
    .option("header", "true")
    .csv(f'{fdm_folder_path}/*/*.csv')
    .withColumn("orig_fname", F.input_file_name())
)

In [11]:
# Change orig_fname column to filename only
# The UDF is still defined so that any cell referring to it keeps working.
extract_fname_udf = F.udf(os.path.basename, StringType())
# substring_index(col, "/", -1) keeps the text after the last "/", which is
# what os.path.basename returns for the "/"-separated value produced by
# input_file_name(). Being a built-in column function, it avoids sending
# every row through a Python UDF.
fdm_with_fname = all_fdm_df.withColumn(
    "orig_fname", F.substring_index(F.col("orig_fname"), "/", -1)
)

In [12]:
# Preview the first five rows as a pandas DataFrame (notebook echo).
fdm_with_fname.limit(5).toPandas()

Unnamed: 0,airbrk_pos_rad,ail_l_rad,ail_r_rad,hbaro_m,hdot_1_mps,aoa_1_rad,aoa_2_rad,aoac_rad,aoai_rad,auto_thr_status,...,temp_total_degC,psi_rad,psi_mag_selected,chi_rad,chi_mag_rad,az_mps2,wdir_rad,wow,ws_mps,orig_fname
0,2.09410904426956,1.82175343497467,1.79283149314117,337.7184,-0.08128,0.151096090453625,0.322900797260857,0.0,0.236998452179623,0.0,...,32.0,0.814423972265244,1.0,-0.523647275167336,-0.386839432914986,9.7546836659193,0.0,0.0,0.0,686200203171255.csv
1,,,,,,,,,,,...,,,,,,,,,,686200203171255.csv
2,,,,,,,,,,,...,,,,,,9.7546836659193,,,,686200203171255.csv
3,,,,,,,,,,,...,,,,,,,,,,686200203171255.csv
4,,,,337.7184,-0.16256,0.150329116375732,0.322900797260857,0.0,0.236998452179623,,...,,0.814423972265244,,-0.499296052092614,-0.362488209840264,9.77712930648327,0.0,,0.0,686200203171255.csv


#### Write to Parquet

In [None]:
# Write the data as Parquet, partitioned by source file name, replacing
# whatever already exists under the destination folder.
dest_path = '../dataset/fdm_parquet/'
(
    fdm_with_fname.write
    .mode("overwrite")
    .partitionBy('orig_fname')
    .parquet(dest_path)
)

#### Restructure Folder and File Name

In [None]:
import shutil

fname_parquet_list, fname_orig_list, fullpath_list = [], [], []

# Flatten the partitioned output: every "<key>=<csv name>" folder below
# dest_path is replaced by one "<csv stem>.parquet" file directly in dest_path.
for root, dirs, files in os.walk(dest_path):
    partition_dir = os.path.basename(root)
    # Only "key=value" partition folders are restructured. This skips
    # dest_path itself, so parquet files already flattened by an earlier run
    # no longer trigger an IndexError on the '=' split.
    if '=' not in partition_dir:
        continue

    part_files = [name for name in files if name.endswith(".parquet")]
    if not part_files:
        continue

    orig_fname = partition_dir.split('=')[1]

    if len(part_files) > 1:
        # Every part file would be renamed to the same destination, and
        # removing the folder after the first move would delete the others.
        # Leave the folder untouched rather than lose data.
        print(f"Skipped {root}: expected 1 parquet part file, found {len(part_files)}")
        continue

    # Rename File
    dest_fname = os.path.splitext(orig_fname)[0] + '.parquet'

    src_fpath = os.path.join(root, part_files[0])
    dest_fpath = os.path.join(dest_path, dest_fname)

    shutil.move(src_fpath, dest_fpath)
    # Safe now: the only parquet file of this folder has been moved out.
    shutil.rmtree(root)

    # Record metadata only for files that were actually moved.
    fname_parquet_list.append(dest_fname)
    fname_orig_list.append(orig_fname)
    fullpath_list.append(dest_fpath)

    print(orig_fname, dest_fname, dest_fpath)

### Save Parquet File Metadata

In [None]:
# Empty table for the Parquet file metadata; its columns are filled from the
# lists collected while restructuring the Parquet folder.
fdm_parquet_files = pd.DataFrame(columns=['fname', 'orig_fname', 'fullpath'])

In [None]:
# Fill each metadata column from its matching list, in column order.
for column_name, column_values in (
    ('fname', fname_parquet_list),
    ('orig_fname', fname_orig_list),
    ('fullpath', fullpath_list),
):
    fdm_parquet_files[column_name] = column_values

In [None]:
# Show the Parquet metadata table (notebook echo of the last expression).
fdm_parquet_files

In [None]:
# Save the Parquet file metadata as CSV, leaving out the row index.
fdm_parquet_files.to_csv(
    path_or_buf='../dataset/database/01_fdm_parquet_files.csv',
    index=False,
)