# Data Persistent Loader - HDFS + Parquet

In [8]:
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
import os
from tqdm.notebook import tqdm
from datetime import datetime
from project_settings import env
from database_settings import hdfs_utilities as hdfs

In [9]:
# Location of the raw NANDINA headings file inside the temporal landing folder
file = env.TEMPORAL_LANDING_FOLDER+'/headings/NANDINA.txt'

# Read every line except the first one (presumably the header row)
with open(file, 'r') as f:
    file_lines = f.readlines()[1:]

# For each line: strip trailing tabs/newlines, split on tabs and keep only
# the fields that still contain something after whitespace is removed
file_lines = [
    [field for field in row.rstrip('\t\n').split('\t') if field.strip()]
    for row in file_lines
]

# Build the dataframe; the positional column labels are cast to strings
headings = pd.DataFrame(file_lines)
headings.columns = headings.columns.astype(str)

# Stamp every row with today's date (YYYYMMDD) as the loading date
headings['LOAD_DATE'] = datetime.today().strftime('%Y%m%d')

# Create a parquet file
# Convert the dataframe into a pyarrow table (note: `headings` is rebound
# from a pandas DataFrame to a pyarrow Table here)
headings = pa.Table.from_pandas(headings)
# Generate the parquet file in the same folder as the original headings file.
# The writer is used as a context manager so the underlying file is always
# closed, even if write_table raises part-way through.
with pq.ParquetWriter(os.path.dirname(file) + '/headings.parquet', headings.schema) as parquet_writer:
    parquet_writer.write_table(headings)

# Target directory in HDFS where the headings parquet file is stored
hdfs_directory = '/thesis/peru/headings/'

# Upload the parquet file that sits next to the original headings file
result = hdfs.add_file_to_hdfs(
    os.path.dirname(file)+'/headings.parquet',
    hdfs_directory,
    log_context='historical',
)

# A result of 0 is treated as success; anything else is reported as a failure
print(
    'Ingestion finished! headings file added to HDFS'
    if result == 0
    else 'Ingestion of headings in HDFS failed!'
)

Ingestion finished! headings file added to HDFS
