# Download data, sort by date, and save

### Download Serverless

In [3]:
# Copy the full Redset serverless parquet file from the public
# `redshift-downloads` S3 bucket into the working directory as
# `full_serverless.parquet`. `--no-sign-request` makes the AWS CLI send
# unsigned requests, so no AWS credentials are required.
!aws s3 cp --no-sign-request s3://redshift-downloads/redset/serverless/full.parquet full_serverless.parquet

download: s3://redshift-downloads/redset/serverless/full.parquet to ./full_serverless.parquet


### Download Provisioned 1%

In [6]:
# Copy the Redset provisioned sample file (`sample_0.01.parquet`) from the
# public `redshift-downloads` S3 bucket into the working directory as
# `sample_0.01_provisioned.parquet`. `--no-sign-request` makes the AWS CLI
# send unsigned requests, so no AWS credentials are required.
!aws s3 cp --no-sign-request s3://redshift-downloads/redset/provisioned/sample_0.01.parquet sample_0.01_provisioned.parquet

download: s3://redshift-downloads/redset/provisioned/sample_0.01.parquet to ./sample_0.01_provisioned.parquet


### Combine both datasets

In [7]:
import pandas as pd

# Build a single, chronologically ordered Redset file from the two downloads.
#
# Steps: load both parquet files, tag each row with the dataset it came from,
# concatenate, drop exact duplicate rows, sort by `arrival_timestamp`, and
# write the result back to parquet.

# Load the serverless dataset and flag its rows.
print("Loading serverless dataset...")
serverless_df = pd.read_parquet("full_serverless.parquet")
serverless_df['dataset_type'] = 'serverless'

# Load the provisioned dataset and flag its rows.
print("Loading provisioned dataset...")
provisioned_df = pd.read_parquet("sample_0.01_provisioned.parquet")
provisioned_df['dataset_type'] = 'provisioned'

# Stack both frames; `ignore_index=True` gives the result a fresh 0..n-1 index
# instead of repeating the per-file indexes.
print("Combining datasets...")
combined_df = pd.concat([serverless_df, provisioned_df], ignore_index=True)

# Remove exact duplicate rows *before* sorting so the sort never has to order
# rows that are about to be discarded. Because `dataset_type` is part of the
# comparison, a row is only dropped if it is duplicated within the same source.
print("Droping duplicates...")
combined_df = combined_df.drop_duplicates()

# Use a stable sort so rows sharing the same `arrival_timestamp` keep their
# concatenation order (serverless rows first, then provisioned). The default
# quicksort gives no such guarantee for ties.
print("Sorting by arrival_timestamp...")
combined_df = combined_df.sort_values(by='arrival_timestamp', kind='mergesort')

# Save the sorted and combined dataset as a Parquet file. The index is not
# written: after sorting it is only a permutation of the concat positions.
output_path = "combined_sorted_redset_datasets.parquet"
print(f"Saving combined and sorted dataset to {output_path}...")
combined_df.to_parquet(output_path, index=False)

print("Done! Combined and sorted dataset saved as Parquet.")

# Report the covered time range as a quick sanity check of the result.
# `min()`/`max()` skip missing timestamps, unlike reading the first/last row.
print(f"Earliest arrival_timestamp: {combined_df['arrival_timestamp'].min()}")
print(f"Latest arrival_timestamp:   {combined_df['arrival_timestamp'].max()}")

Loading serverless dataset...
Loading provisioned dataset...
Combining datasets...
Sorting by arrival_timestamp...
Saving combined and sorted dataset to combined_sorted_redset_datasets.parquet...
Done! Combined and sorted dataset saved as Parquet.
2024-02-29 23:59:58.741545
2024-05-30 23:59:42.680457
