In [1]:
# Import dependencies
import os

import boto3
import pandas as pd
import pyarrow
import pyarrow.fs
import pyarrow.parquet as pq

Run the cell below, then open the data bucket and check that the file size has changed from 1.2 MB to 1.4 MB.

In [2]:
# Modify the file in S3 so that there is a new data version to track.
#
# The endpoint and credentials come from the environment. os.environ.get
# returns None for an unset variable rather than raising, so a missing
# variable is passed through to S3FileSystem as None.
# pyarrow.fs is a submodule of pyarrow; it is imported explicitly in the
# import cell instead of relying on another import to load it.
fs = pyarrow.fs.S3FileSystem(
        endpoint_override=os.environ.get('AWS_S3_ENDPOINT'),
        access_key=os.environ.get('AWS_ACCESS_KEY_ID'),
        secret_key=os.environ.get('AWS_SECRET_ACCESS_KEY')
    )

# Read the current contents of the object into a DataFrame.
with fs.open_input_file('data/song_properties.parquet') as file:
    df_original = pd.read_parquet(file)

# Make some change: append a copy of every row. The loaded frame keeps its own
# name so it stays available for inspection after this cell has run.
df = pd.concat([df_original, df_original], ignore_index=True)
assert len(df) == 2 * len(df_original), "expected the row count to double"

# Overwrite the same object with the modified data.
# NOTE: this cell reads and writes the same path, so every re-run doubles the
# stored data again.
pq.write_table(pyarrow.table(df), 'data/song_properties.parquet', filesystem=fs)

Now that we have updated the data, we can create a new data version

In [None]:
# Update our version with the new change.
# Runs `dvc update` on the song_properties.parquet.dvc file with --to-remote.
# NOTE(review): presumably this refreshes the .dvc file from the changed source
# object and transfers the data straight to the DVC remote without keeping a
# local copy - confirm against the DVC docs.
!dvc update song_properties.parquet.dvc --to-remote

And we will track that version in Git...

In [None]:
# Track the change in git.
# Stage only the .dvc file and commit it, so this commit records the state of
# song_properties.parquet.dvc after the update above.
!git add song_properties.parquet.dvc
!git commit -m "updated data"

So now, whenever we want to, we can check out an old version of the `.dvc` file from Git to find out which data version was used with that Git commit.

In [None]:
# Revert to our old dvc file.
# Restores only song_properties.parquet.dvc from the previous commit (HEAD~1);
# the current branch and all other files are left as they are.
!git checkout HEAD~1 song_properties.parquet.dvc

Pull the original file down with DVC, then write it back to the data storage ourselves (we don't have a way to push it there directly through DVC).

In [None]:
!dvc pull
df = pd.read_parquet('song_properties.parquet', engine='pyarrow')
pq.write_table(pyarrow.table(df), 'data/song_properties.parquet', filesystem=fs)

And we are now back at the original data and able to track the revert!

In [None]:
# Update and version dvc again with the reverted data.
# Same sequence as for the first change: run `dvc update --to-remote` on the
# .dvc file, then stage and commit that file so the revert itself is recorded
# as a git commit.
!dvc update song_properties.parquet.dvc --to-remote
!git add song_properties.parquet.dvc
!git commit -m "reverted data"