# Overview of Writing Pandas DataFrames to Files

In [1]:
import json 
import pandas as pd

file_path = 'E:/Projects/Data_Engineering/Data-Engineering/data/retail_db/schemas.json'
# Parse the schema JSON inside a context manager so the file handle is closed
# as soon as loading finishes, rather than being left open until it happens
# to be garbage-collected.
with open(file_path) as schema_file:
    schema = json.load(schema_file)

# get column details.

def get_column_name(schema, tableName, sortingKey='column_position'):
    """Return the column names of one table, ordered by a sort key.

    Args:
        schema: Mapping from table name to an iterable of column-detail
            mappings; each mapping must contain 'column_name' and the
            key named by ``sortingKey``.
        tableName: Key of the table to look up in ``schema``.
        sortingKey: Key of each column-detail mapping used for ordering
            (defaults to 'column_position').

    Returns:
        A list of 'column_name' values sorted ascending by ``sortingKey``.

    Raises:
        KeyError: If ``tableName`` is missing from ``schema`` or a column
            entry lacks ``sortingKey`` or 'column_name'.
    """
    def sort_value(column):
        return column[sortingKey]

    # Copy first so the caller's schema is never reordered in place.
    ordered_columns = list(schema[tableName])
    ordered_columns.sort(key=sort_value)

    names = []
    for column in ordered_columns:
        names.append(column['column_name'])
    return names


# Column names are taken from the schema and handed to read_csv via `names=`,
# so they become the header of the resulting DataFrame.
orders_column_names = get_column_name(schema, 'orders')
orders_data_file_path = 'E:/Projects/Data_Engineering/Data-Engineering/data/retail_db/orders/part-00000'
orders = pd.read_csv(orders_data_file_path, names=orders_column_names)
orders

Unnamed: 0,order_id,order_date,order_customer_id,order_status
0,1,2013-07-25 00:00:00.0,11599,CLOSED
1,2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT
2,3,2013-07-25 00:00:00.0,12111,COMPLETE
3,4,2013-07-25 00:00:00.0,8827,CLOSED
4,5,2013-07-25 00:00:00.0,11318,COMPLETE
...,...,...,...,...
68878,68879,2014-07-09 00:00:00.0,778,COMPLETE
68879,68880,2014-07-13 00:00:00.0,1117,COMPLETE
68880,68881,2014-07-19 00:00:00.0,2518,PENDING_PAYMENT
68881,68882,2014-07-22 00:00:00.0,10000,ON_HOLD


- We can use `to_csv`, `to_json`, and the other `to_*` methods to write data to a particular file.

In [2]:
# Write the orders DataFrame to a comma-separated file with a header row.
# Note: `index` defaults to True, so the row index is also written as the
# first (unnamed) column; pass index=False to leave it out.
writen_file_path = 'E:/Projects/Data_Engineering/Data-Engineering/material/02_PYTHON/Pandas/orders_data.csv'
orders.to_csv(writen_file_path, sep=',', header=True)

In [3]:
# Write data into a JSON file.
# The target directory may not exist yet, so create it before writing.

import os

# exist_ok=True keeps makedirs from raising when the directory already exists.
os.makedirs(
    'E:/Projects/Data_Engineering/Data-Engineering/material/02_PYTHON/Pandas/Json data',
    exist_ok=True,
)

In [4]:

json_writen_file_path = 'E:/Projects/Data_Engineering/Data-Engineering/material/02_PYTHON/Pandas/Json data/orders_data.json'

# With only the file path given, to_json uses its default orient
# ('columns' for a DataFrame), so the whole frame is written as one
# column-oriented JSON document.
orders.to_json(json_writen_file_path)

In [5]:
# Row/record format: orient='records' together with lines=True writes one
# JSON object per row, one row per line.
orders.to_json(
    'E:/Projects/Data_Engineering/Data-Engineering/material/02_PYTHON/Pandas/Json data/orders_data_1.json',
    orient='records',
    lines=True,
)

In [6]:
# Read the line-delimited JSON file back; lines=True makes read_json parse
# each line of the file as a separate JSON object.
orders_json_record_format = pd.read_json(
    'E:/Projects/Data_Engineering/Data-Engineering/material/02_PYTHON/Pandas/Json data/orders_data_1.json',
    lines=True,
)

orders_json_record_format

Unnamed: 0,order_id,order_date,order_customer_id,order_status
0,1,2013-07-25 00:00:00.0,11599,CLOSED
1,2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT
2,3,2013-07-25 00:00:00.0,12111,COMPLETE
3,4,2013-07-25 00:00:00.0,8827,CLOSED
4,5,2013-07-25 00:00:00.0,11318,COMPLETE
...,...,...,...,...
68878,68879,2014-07-09 00:00:00.0,778,COMPLETE
68879,68880,2014-07-13 00:00:00.0,1117,COMPLETE
68880,68881,2014-07-19 00:00:00.0,2518,PENDING_PAYMENT
68881,68882,2014-07-22 00:00:00.0,10000,ON_HOLD


In [21]:
# Read the single-document JSON file back using read_json's default settings.
orders_json_column_format = pd.read_json(
    'E:/Projects/Data_Engineering/Data-Engineering/material/02_PYTHON/Pandas/Json data/orders_data.json',
)

orders_json_column_format

Unnamed: 0,order_id,order_date,order_customer_id,order_status
0,1,2013-07-25 00:00:00.0,11599,CLOSED
1,2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT
2,3,2013-07-25 00:00:00.0,12111,COMPLETE
3,4,2013-07-25 00:00:00.0,8827,CLOSED
4,5,2013-07-25 00:00:00.0,11318,COMPLETE
...,...,...,...,...
68878,68879,2014-07-09 00:00:00.0,778,COMPLETE
68879,68880,2014-07-13 00:00:00.0,1117,COMPLETE
68880,68881,2014-07-19 00:00:00.0,2518,PENDING_PAYMENT
68881,68882,2014-07-22 00:00:00.0,10000,ON_HOLD
