#### Pandas export json such that we can read by line (streaming fashion) and parse one line at a time during import - No Compression - 126 bytes

In [1]:
# imports and sample data

import pandas as pd
import json
import gzip

data = {
    "col1" : ["i11","i21","i31"],
    "col2" : ["i12","i22","i32"],
    "col3" : ["i13","i23","i33"]
}

In [2]:
# creating dataframe to test proper way of export

df = pd.DataFrame(data)

df 

Unnamed: 0,col1,col2,col3
0,i11,i12,i13
1,i21,i22,i23
2,i31,i32,i33


In [3]:
# Export DataFrame to JSON with one line per record

df_json = df.to_json(orient='records', lines=True)

print(df_json)

{"col1":"i11","col2":"i12","col3":"i13"}
{"col1":"i21","col2":"i22","col3":"i23"}
{"col1":"i31","col2":"i32","col3":"i33"}



In [4]:
# Write the JSON data to a file

with open('./exportedFiles/output.json', 'w') as file:
    file.write(df_json)

In [5]:
# reading the data from the file in streaming fashion, one line at a time

with open('./exportedFiles/output.json', 'r') as file:
    line = file.readline()

line

'{"col1":"i11","col2":"i12","col3":"i13"}\n'

In [6]:
# parsing json to dictionary

data = json.loads(line)
data

{'col1': 'i11', 'col2': 'i12', 'col3': 'i13'}

In [7]:
# parsser function

def parse_fields(line):
    data = json.loads(line)
    return {
        'col1': data['col1'],
        'col2': data['col2'],
        'col3': data['col3'],
        'col4': data['col3']
    }

In [8]:
# recursively read all the data file lines in streaming fashion

books = []

with open("./exportedFiles/output.json") as f:
    while True:
        # reading the line
        line = f.readline()

        # we will break the infinite loop when we reach the end of the dataset file
        if not line:
            break
        
        # parsing the line
        fields = parse_fields(line)

        books.append(fields)

books

[{'col1': 'i11', 'col2': 'i12', 'col3': 'i13', 'col4': 'i13'},
 {'col1': 'i21', 'col2': 'i22', 'col3': 'i23', 'col4': 'i23'},
 {'col1': 'i31', 'col2': 'i32', 'col3': 'i33', 'col4': 'i33'}]

In [9]:
len(books)

3

In [10]:
# converting the data back to dictionary to check the data integrety

items = pd.DataFrame.from_dict(books)

items

Unnamed: 0,col1,col2,col3,col4
0,i11,i12,i13,i13
1,i21,i22,i23,i23
2,i31,i32,i33,i33


#### Pandas export json such that we can read by line (streaming fashion) and parse one line at a time during import - gzip Compression - 89 bytes

In [11]:
# Export DataFrame to JSON with one line per record

json_data = df.to_json(orient='records', lines=True)

In [12]:
# Compress the JSON string and write to a gzip file

with gzip.open('./exportedFiles/output.json.gz', 'wt', encoding='utf-8') as file:
    file.write(json_data)

In [13]:
# opening gzip file streaming fashion

with gzip.open('./exportedFiles/output.json.gz') as file:
    line = file.readline()

line

b'{"col1":"i11","col2":"i12","col3":"i13"}\r\n'

In [14]:
# parse function

def parse_fields(line):
    data = json.loads(line)
    return {
        'col1': data['col1'],
        'col2': data['col2'],
        'col3': data['col3'],
        'col4': data['col3']
    }

In [15]:
# reading all the lines in streaming fashion

books = []

with gzip.open("./exportedFiles/output.json.gz") as f:
    while True:
        # reading the line
        line = f.readline()

        # we will break the infinite loop when we reach the end of the dataset file
        if not line:
            break
        
        # parsing the line
        fields = parse_fields(line)

        books.append(fields)

books

[{'col1': 'i11', 'col2': 'i12', 'col3': 'i13', 'col4': 'i13'},
 {'col1': 'i21', 'col2': 'i22', 'col3': 'i23', 'col4': 'i23'},
 {'col1': 'i31', 'col2': 'i32', 'col3': 'i33', 'col4': 'i33'}]

In [16]:
len(books)

3

In [17]:
# checking data integrety

items = pd.DataFrame.from_dict(books)

items

Unnamed: 0,col1,col2,col3,col4
0,i11,i12,i13,i13
1,i21,i22,i23,i23
2,i31,i32,i33,i33
