In [2]:
pip install pandas pyarrow

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Input and output locations. All three are bare file names, so they resolve
# against the kernel's current working directory.
tsv_file_path = 'dataDefuncionesPorEdadComunaYSexo_2014_2023.tsv'
parquet_file_path = 'dataDefuncionesPorEdadComunaYSexo_2014_2023.parquet'
arrow_file_path = 'dataDefuncionesPorEdadComunaYSexo_2014_2023.arrow'

# Load the tab-separated data with pandas (read_table uses a tab delimiter by
# default), then convert the DataFrame into an Arrow table.
df = pd.read_table(tsv_file_path)
arrow_table = pa.Table.from_pandas(df)

# Write the table out in Parquet format.
pq.write_table(arrow_table, parquet_file_path)

# Write the same table in the Arrow IPC file format. On leaving the block the
# writer is closed first, then the underlying OS file.
with pa.OSFile(arrow_file_path, 'wb') as sink, pa.ipc.new_file(sink, arrow_table.schema) as writer:
    writer.write_table(arrow_table)

# Report both conversions.
print(f"TSV file {tsv_file_path} has been converted to Parquet format and saved as {parquet_file_path}")
print(f"TSV file {tsv_file_path} has been converted to Arrow format and saved as {arrow_file_path}")


TSV file dataDefuncionesPorEdadComunaYSexo_2014_2023.tsv has been converted to Parquet format and saved as dataDefuncionesPorEdadComunaYSexo_2014_2023.parquet
TSV file dataDefuncionesPorEdadComunaYSexo_2014_2023.tsv has been converted to Arrow format and saved as dataDefuncionesPorEdadComunaYSexo_2014_2023.arrow


In [4]:
# Bare last expression: the notebook renders the table's repr, i.e. its
# schema followed by a truncated preview of each column.
arrow_table

pyarrow.Table
comuna: string
sexo: string
edad: int64
defunciones: int64
----
comuna: [["Aisén","Aisén","Aisén","Aisén","Aisén",...,"Chile","Chile","Chile","Chile","Chile"]]
sexo: [["Hombre","Hombre","Hombre","Hombre","Hombre",...,"Hombre","Mujer","Hombre","Mujer","Mujer"]]
edad: [[0,1,5,13,14,...,52,63,43,74,109]]
defunciones: [[6,1,1,2,1,...,5684,6461,2872,11260,42]]

In [6]:
import pyarrow.compute as pc


In [7]:
# Build the row predicate as a compute expression on the 'edad' column,
# then keep only the rows where it holds (edad strictly below 30).
edad_below_30 = pc.field('edad') < 30
filtered_table = arrow_table.filter(edad_below_30)
print(filtered_table)

pyarrow.Table
comuna: string
sexo: string
edad: int64
defunciones: int64
----
comuna: [["Aisén","Aisén","Aisén","Aisén","Aisén",...,"Panquehue","Pedro Aguirre Cerda","Pedro Aguirre Cerda","Paine","Pedro Aguirre Cerda"],["Penco","Penco","Penco","Penco","Penco",...,"Chile","Chile","Chile","Chile","Chile"]]
sexo: [["Hombre","Hombre","Hombre","Hombre","Hombre",...,"Hombre","Hombre","Mujer","Hombre","Mujer"],["Hombre","Hombre","Hombre","Hombre","Hombre",...,"Mujer","Hombre","Mujer","Hombre","Hombre"]]
edad: [[0,1,5,13,14,...,18,22,14,4,1],[0,4,11,12,14,...,28,14,23,27,11]]
defunciones: [[6,1,1,2,1,...,1,9,1,1,2],[15,1,1,1,1,...,595,287,491,1732,178]]
