In [2]:
!pip install fastparquet google-cloud-storage pandas

import pandas as pd
import fastparquet as fp
from google.cloud import storage

# Set up Google Cloud Storage details
GCS_BUCKET_NAME = "holdout_data"
PARQUET_FILE = "data.parquet"  # Local file name
GCS_PARQUET_PATH = "parquet_files/data.parquet"  # GCS destination path

# Sample DataFrame
data = {
    "id": [1, 2, 3],
    "name": ["Alice", "Bob", "Charlie"],
    "score": [95, 89, 76]
}
df = pd.DataFrame(data)

# 🔹 Save DataFrame to a Parquet file using fastparquet
fp.write(PARQUET_FILE, df, compression="snappy")

print("✅ Data saved as Parquet locally.")

# 🔹 Step 3: Upload Parquet File to Google Cloud Storage
def upload_to_gcs(local_file, bucket_name, destination_blob):
    """Uploads a file to Google Cloud Storage."""
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob)
    
    blob.upload_from_filename(local_file)
    print(f"✅ {local_file} uploaded to GCS: gs://{bucket_name}/{destination_blob}")

upload_to_gcs(PARQUET_FILE, GCS_BUCKET_NAME, GCS_PARQUET_PATH)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
✅ Data saved as Parquet locally.




OSError: Project was not passed and could not be determined from the environment.

In [None]:
def read_parquet_from_gcs(bucket_name, parquet_path):
    """Downloads a Parquet file from GCS and loads it into a Pandas DataFrame."""
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(parquet_path)
    
    # Download Parquet file
    blob.download_to_filename("temp.parquet")
    
    # Read using fastparquet
    df = fp.ParquetFile("temp.parquet").to_pandas()
    
    print("✅ Parquet file read from GCS successfully.")
    return df

# Fetch and display the data
df_from_gcs = read_parquet_from_gcs(GCS_BUCKET_NAME, GCS_PARQUET_PATH)
print(df_from_gcs)


In [7]:
import pandas as pd
import fastparquet as fp
from google.cloud import storage

# 🔹 Define your GCS bucket details
GCS_BUCKET_NAME = "holdout_data"
PROJECT_ID = "logistics-data-storage-staging"  # Replace with your actual GCP project ID
LOCAL_PARQUET_FILE = "data.parquet"  # Parquet file to upload
GCS_PARQUET_PATH = "parquet_files/data.parquet"  # Destination in GCS

# 🔹 Initialize Google Cloud Storage client
storage_client = storage.Client(project=PROJECT_ID)

# 🔹 Connect to the bucket
bucket = storage_client.bucket(GCS_BUCKET_NAME)
print(f"✅ Connected to GCS bucket: {bucket.name}")

# 🔹 Sample DataFrame
data = {
    "id": [1, 2, 3],
    "name": ["Alice", "Bob", "Charlie"],
    "score": [95, 89, 76]
}
df = pd.DataFrame(data)

# 🔹 Save DataFrame as a Parquet file using fastparquet
fp.write(LOCAL_PARQUET_FILE, df, compression="snappy")
print("✅ Parquet file saved locally.")

# 🔹 Function to upload Parquet file to GCS
def upload_parquet_to_gcs(local_file, bucket, destination_blob):
    """Uploads a Parquet file to Google Cloud Storage."""
    blob = bucket.blob(destination_blob)
    blob.upload_from_filename(local_file)
    print(f"✅ {local_file} uploaded to GCS: gs://{bucket.name}/{destination_blob}")

# 🔹 Upload the Parquet file to GCS
upload_parquet_to_gcs(LOCAL_PARQUET_FILE, bucket, GCS_PARQUET_PATH)





✅ Connected to GCS bucket: holdout_data
✅ Parquet file saved locally.
✅ data.parquet uploaded to GCS: gs://holdout_data/parquet_files/data.parquet


In [8]:
import pandas as pd
import fastparquet as fp
from google.cloud import storage
import io

# 🔹 Define your GCS bucket details
GCS_BUCKET_NAME = "holdout_data"
PROJECT_ID = "logistics-data-storage-staging"  # Replace with your actual GCP project ID
GCS_PARQUET_PATH = "parquet_files/data.parquet"  # Destination in GCS

# 🔹 Initialize Google Cloud Storage client
storage_client = storage.Client(project=PROJECT_ID)

# 🔹 Connect to the bucket
bucket = storage_client.bucket(GCS_BUCKET_NAME)
print(f"✅ Connected to GCS bucket: {bucket.name}")

# 🔹 Sample DataFrame (Replace with your actual data)
raw_data = pd.DataFrame({
    "id": [1, 2, 3],
    "name": ["Alice", "Bob", "Charlie"],
    "score": [95, 89, 76]
})

# 🔹 Convert DataFrame to a Parquet file in memory (without saving to disk)
parquet_buffer = io.BytesIO()
fp.write(parquet_buffer, raw_data, compression="snappy")
parquet_buffer.seek(0)  # Reset buffer position

# 🔹 Upload Parquet directly to GCS
def upload_parquet_to_gcs(buffer, bucket, destination_blob):
    """Uploads an in-memory Parquet file to Google Cloud Storage."""
    blob = bucket.blob(destination_blob)
    blob.upload_from_file(buffer, content_type="application/octet-stream")
    print(f"✅ Parquet data uploaded directly to GCS: gs://{bucket.name}/{destination_blob}")

# 🔹 Upload the Parquet file to GCS
upload_parquet_to_gcs(parquet_buffer, bucket, GCS_PARQUET_PATH)




✅ Connected to GCS bucket: holdout_data
✅ Parquet data uploaded directly to GCS: gs://holdout_data/parquet_files/data.parquet
