# Tesla Vehicle Data Storage and Processing
This notebook demonstrates how to efficiently store and process Tesla vehicle data using CSV and Parquet formats, with environment-driven paths for better organization and reproducibility.

## Objectives:
- Configure environment-driven paths for `data/raw/` and `data/processed/`
- Save and load Tesla vehicle data in both CSV and Parquet formats
- Implement utility functions for data I/O operations
- Validate data integrity across different storage formats

In [1]:
import os
import pathlib
import numpy as np
import pandas as pd
from dotenv import load_dotenv
import datetime as dt

In [2]:
# Load environment variables
dotenv_path = "../.env"
load_dotenv(dotenv_path)
VEHICLE_DATA_RAW = os.getenv("DATA_DIR_RAW")
VEHICLE_DATA_PROCESSED = os.getenv("DATA_DIR_PROCESSED")
print(f"Raw data directory: {VEHICLE_DATA_RAW}")
print(f"Processed data directory: {VEHICLE_DATA_PROCESSED}")

Raw data directory: None
Processed data directory: None


In [5]:
# Create sample Tesla vehicle data
tesla_vehicles = pd.DataFrame({
    "model": ["Model 3", "Model S", "Model X", "Model Y", "Cybertruck"],
    "production_year": [2022, 2023, 2023, 2023, 2024],
    "battery_type": ["LFP", "NCA", "NCA", "LFP", "4680"],
    "range_miles": [272, 405, 348, 330, 340],
    "acceleration_0_60": [5.8, 3.1, 3.8, 4.8, 2.9],
    "price_usd": [42990, 88490, 98490, 47490, 79990]
})

# Define file paths
vehicle_metrics_csv = os.path.join(VEHICLE_DATA_RAW, "tesla_vehicles.csv")
vehicle_metrics_parquet = os.path.join(VEHICLE_DATA_PROCESSED, "tesla_vehicles.parquet")

# Display sample data
print("Sample Tesla Vehicle Data:")
tesla_vehicles

TypeError: expected str, bytes or os.PathLike object, not NoneType

In [4]:
# Save data to CSV and Parquet
def save_vehicle_data(df, path, **kwargs):
    """Save vehicle data to either CSV or Parquet format.
    
    Args:
        df: DataFrame containing vehicle data
        path: Output file path (must end with .csv or .parquet)
        **kwargs: Additional arguments passed to pandas to_csv or to_parquet
    """
    os.makedirs(os.path.dirname(path), exist_ok=True)
    if path.endswith(".csv"):
        df.to_csv(path, index=False, **kwargs)
    elif path.endswith(".parquet"):
        try:
            df.to_parquet(path, index=False, **kwargs)
        except ImportError:
            raise RuntimeError("Missing Parquet engine. Install pyarrow or fastparquet.")
    else:
        raise ValueError("Unsupported file type. Use .csv or .parquet")

# Save the data
save_vehicle_data(tesla_vehicles, vehicle_metrics_csv)
save_vehicle_data(tesla_vehicles, vehicle_metrics_parquet, engine="fastparquet")

print(f"Data saved to:\n- {vehicle_metrics_csv}\n- {vehicle_metrics_parquet}")

NameError: name 'vehicle_metrics_csv' is not defined

In [None]:
# Load and validate the saved data
def load_vehicle_data(path, **kwargs):
    """Load vehicle data from either CSV or Parquet format.
    
    Args:
        path: Input file path (must end with .csv or .parquet)
        **kwargs: Additional arguments passed to pandas read_csv or read_parquet
        
    Returns:
        DataFrame containing the loaded vehicle data
    """
    if path.endswith(".csv"):
        return pd.read_csv(path, **kwargs)
    elif path.endswith(".parquet"):
        return pd.read_parquet(path, **kwargs)
    else:
        raise ValueError("Unsupported file type. Use .csv or .parquet")

def validate_vehicle_data(df1, df2, key_cols):
    """Validate that two DataFrames have the same shape and column data types.
    
    Args:
        df1: First DataFrame for comparison
        df2: Second DataFrame for comparison
        key_cols: List of column names to check data types for
        
    Returns:
        Dictionary with validation results
    """
    same_shape = df1.shape == df2.shape
    same_dtypes = all(df1[col].dtype == df2[col].dtype for col in key_cols)
    return {"same_shape": same_shape, "same_dtypes": same_dtypes}

# Load the data
tesla_csv_data = load_vehicle_data(vehicle_metrics_csv)
tesla_parquet_data = load_vehicle_data(vehicle_metrics_parquet, engine="fastparquet")

# Validate the data
print("CSV data shape:", tesla_csv_data.shape)
print("Parquet data shape:", tesla_parquet_data.shape)

# Check data integrity
validation_results = validate_vehicle_data(
    tesla_csv_data, 
    tesla_parquet_data, 
    key_cols=["model", "production_year", "range_miles"]
)

print("\nData Validation Results:")
for key, value in validation_results.items():
    print(f"- {key}: {value}")

# Display first few rows from both formats
print("\nFirst 3 rows from CSV data:")
tesla_csv_data.head(3)

In [None]:
# Example analysis with the loaded data
print("Tesla Vehicle Analysis")
print("=" * 50)
print(f"Total models: {len(tesla_csv_data)}")
print(f"Average price: ${tesla_csv_data['price_usd'].mean():,.2f}")
print(f"Average range: {tesla_csv_data['range_miles'].mean():.1f} miles")
print(f"Fastest 0-60 mph: {tesla_csv_data['acceleration_0_60'].min()} seconds")

# Group by battery type
battery_stats = tesla_csv_data.groupby('battery_type').agg({
    'model': 'count',
    'range_miles': 'mean',
    'price_usd': 'mean'
}).round(2)

print("\nBattery Type Analysis:")
battery_stats