# ex-02 Compare Reading Speed Among Several Spatial Formats

In [1]:
import numpy as np
import pandas as pd 
import fiona
import geopandas as gpd

from pathlib import Path
import matplotlib.pyplot as plt
%matplotlib inline

## 1. Read data

In [2]:
# Input dataset in (Geo)Parquet format, given relative to the notebook's working directory.
# Presumably GADM v4.04 level-1 administrative boundaries, judging by the file name -- TODO confirm.
infile = "data/gadm404_Level1.parquet"

In [3]:
%%time
df_pq = gpd.read_parquet(infile)

CPU times: user 1.33 s, sys: 1.07 s, total: 2.41 s
Wall time: 2.41 s


## 2. Transform parquet format to other formats

- shapefile
- geojson
- GeoPackage


### 2.1 shapefile

In [4]:
%%time
df_pq.to_file("data/gadm404_Level1.shp", index=False, driver="ESRI Shapefile")

CPU times: user 1min 16s, sys: 1.29 s, total: 1min 17s
Wall time: 1min 17s


### 2.2  GeoJSON

In [5]:
%%time
df_pq.to_file("data/gadm404_Level1.geojson", index=False, driver="GeoJSON")

CPU times: user 2min 44s, sys: 2.22 s, total: 2min 46s
Wall time: 2min 46s


### 2.3 GeoPackage

In [6]:
%%time
df_pq.to_file("data/gadm404_Level1.gpkg", index=False, driver="GPKG", layer="layer")

CPU times: user 36 s, sys: 2.98 s, total: 39 s
Wall time: 42.4 s


## 3. Compare reading speed

In [7]:
%%time
gdf_shp = gpd.read_file("data/gadm404_Level1.shp")

CPU times: user 5.91 s, sys: 428 ms, total: 6.34 s
Wall time: 6.34 s


***The produced GeoJSON file is too big to be read by geopandas***

In [8]:
%%time
gdf_gjson = gpd.read_file("data/gadm404_Level1.geojson", )

DriverError: Failed to read GeoJSON data

In [9]:
%%time
gpd.read_file("data/gadm404_Level1.gpkg")

CPU times: user 6.05 s, sys: 412 ms, total: 6.46 s
Wall time: 6.46 s


Unnamed: 0,Country,State,geometry
0,Aruba,,"POLYGON ((-69.97820 12.46986, -69.97847 12.469..."
1,Afghanistan,Badakhshan,"POLYGON ((71.00134 38.47703, 71.00505 38.47462..."
2,Afghanistan,Badghis,"POLYGON ((63.90568 35.87608, 63.90471 35.84964..."
3,Afghanistan,Baghlan,"POLYGON ((68.31232 36.55094, 68.31882 36.54806..."
4,Afghanistan,Balkh,"POLYGON ((67.05399 37.36167, 67.06006 37.35707..."
...,...,...,...
3675,Zimbabwe,Mashonaland West,"POLYGON ((29.90175 -15.62221, 29.90864 -15.623..."
3676,Zimbabwe,Masvingo,"POLYGON ((30.96762 -19.15971, 30.98009 -19.161..."
3677,Zimbabwe,Matabeleland North,"POLYGON ((28.05612 -17.05775, 28.05182 -17.058..."
3678,Zimbabwe,Matabeleland South,"POLYGON ((29.27012 -19.51698, 29.27394 -19.525..."


## Conclusion

The parquet format gives the fastest reading speed (about 2.4 s of wall time, versus roughly 6.3 s for the shapefile and 6.5 s for the GeoPackage). In addition, if you compare the file sizes, you will find that the parquet format also has the smallest size. That is the reason why we use the parquet format to store our global data.