# Download data, sort by date, split into static and streaming data, save

In [None]:
# Download the serverless and provisioned Parquet files from S3 into the
# current working directory, renaming them so the two do not collide.
# The leading `!` is IPython/Jupyter shell-escape syntax (these lines are shell
# commands, not Python). `--no-sign-request` tells the AWS CLI not to sign the
# requests, so no AWS credentials are loaded.
!aws s3 cp --no-sign-request s3://redshift-downloads/redset/serverless/full.parquet full_serverless.parquet
!aws s3 cp --no-sign-request s3://redshift-downloads/redset/provisioned/full.parquet full_provisioned.parquet

import pandas as pd
import gc  # Import garbage collector interface

# Read both downloaded Parquet files, tagging every row with the deployment
# type ('serverless' / 'provisioned') of the file it came from.
frames = []
for kind, path in (
    ("serverless", "full_serverless.parquet"),
    ("provisioned", "full_provisioned.parquet"),
):
    print(f"Loading {kind} dataset...")
    frame = pd.read_parquet(path)
    frame['dataset_type'] = kind  # origin flag, kept as a regular column
    frames.append(frame)

# Stack the two frames (serverless rows first) under a fresh 0..n-1 index
print("Combining datasets...")
combined_df = pd.concat(frames, ignore_index=True)

# Drop every remaining reference to the per-source frames so that their
# memory can be reclaimed before the sort that follows.
del frame, frames, kind, path
gc.collect()  # Force garbage collection

# Sort the combined dataset chronologically by 'arrival_timestamp'.
# A stable sort is requested explicitly: the default 'quicksort' gives no
# guarantee about the relative order of rows that share a timestamp, and the
# positional sampling below would then select different rows depending on how
# those ties happen to be ordered. With a stable sort, ties keep the order
# they had before sorting.
print("Sorting by arrival_timestamp...")
combined_df = combined_df.sort_values(by='arrival_timestamp', kind='stable')

# Drop rows that are exact duplicates across all columns (the first
# occurrence of each is kept).
print("Dropping duplicates...")
combined_df = combined_df.drop_duplicates()

# Only take every 10th row (positions 0, 10, 20, ...) to reduce the dataset size.
# .copy() materialises the sample: a bare strided slice may be just a view onto
# the full frame's buffers, which would keep all of the pre-sampling data alive
# in memory for the rest of the run.
print("Sampling the dataset...")
combined_df = combined_df.iloc[::10].copy()

def _report_and_save(partition, label, path):
    """Print the first and last arrival day of `partition`, then write it to `path`.

    The first/last rows are read positionally, so `partition` must be non-empty;
    they bound the covered date range only if the rows are already ordered by
    'arrival_timestamp'. The DataFrame index is not written to the Parquet file.
    """
    arrivals = partition['arrival_timestamp']
    first_day = arrivals.iloc[0].date()
    last_day = arrivals.iloc[-1].date()
    print(f"{label} dataset - Start day: {first_day}, End day: {last_day}")
    print(f"Saving {path}...")
    partition.to_parquet(path, index=False)


# Row position that separates the first 66% of rows from the remaining 34%
split_index = int(len(combined_df) * 0.66)

# First 66% of the rows -> static dataset. The slice only exists for the
# duration of the call, so nothing is left to delete afterwards.
_report_and_save(combined_df.iloc[:split_index], "Static", "static_data.parquet")
gc.collect()  # Force garbage collection

# Remaining 34% of the rows -> streaming dataset
_report_and_save(combined_df.iloc[split_index:], "Streaming", "streaming_data.parquet")
gc.collect()  # Force garbage collection

# Finally release the combined DataFrame itself
del combined_df
gc.collect()  # Force garbage collection

print("Done!")
