# Data Processing Notebook

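This notebook loads the raw weather data generated in the previous step (`data/raw_weather_data.csv`), adds day-of-week and month columns, 7-day rolling averages for temperature, humidity and pressure, and an extreme-temperature flag, then saves the result to `data/processed_weather_data.csv`.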

In [None]:
import os

import numpy as np
import pandas as pd

# Announce the start of the run in the cell output.
print("Starting data processing...")

In [None]:
# Create the data/ directory up front; exist_ok=True makes this a no-op
# when the directory is already present.
os.makedirs(name='data', exist_ok=True)

In [None]:
# Read the raw weather CSV into `df`. A missing file is reported in the
# cell output and the exception is re-raised so the run stops here.
input_file = 'data/raw_weather_data.csv'
print(f"Loading data from {input_file}")

try:
    df = pd.read_csv(input_file)
except FileNotFoundError:
    print(f"Error: Could not find {input_file}")
    raise
else:
    # Only reached when the read succeeded: show the size and a preview.
    print(f"Loaded {len(df)} rows of data")
    print(df.head())

In [None]:
# Process the data
def process_data(df):
    """Return a copy of ``df`` with calendar, rolling-average and outlier columns.

    Args:
        df: DataFrame with a ``date`` column that ``pd.to_datetime`` can
            parse, plus ``temperature``, ``humidity`` and ``pressure``
            columns.

    Returns:
        A new DataFrame containing the input columns (``date`` converted to
        datetime) followed by ``day_of_week``, ``month``,
        ``temperature_7day_avg``, ``humidity_7day_avg``,
        ``pressure_7day_avg`` and the boolean ``extreme_temp``. The frame
        passed in is left unmodified.

    Raises:
        KeyError: If one of the columns listed above is missing.
    """
    # Work on a copy so the caller's frame is not changed as a side effect.
    df = df.copy()

    # Convert date to datetime
    df['date'] = pd.to_datetime(df['date'])

    # Calendar features, as English names (e.g. "Monday", "January").
    df['day_of_week'] = df['date'].dt.day_name()
    df['month'] = df['date'].dt.month_name()

    # 7-row rolling means in the frame's current row order; min_periods=1
    # lets the first rows average over however many values exist so far.
    for column in ('temperature', 'humidity', 'pressure'):
        df[f'{column}_7day_avg'] = (
            df[column].rolling(window=7, min_periods=1).mean()
        )

    # Flag temperatures more than two standard deviations from the mean.
    # The statistics are computed once instead of once per comparison.
    temperature = df['temperature']
    center = temperature.mean()
    spread = 2 * temperature.std()
    df['extreme_temp'] = (
        (temperature > center + spread) | (temperature < center - spread)
    )

    return df

# Run the processing step on the loaded frame, then print a completion
# message followed by a preview of the first rows.
processed_df = process_data(df)
print("Data processing complete", processed_df.head(), sep="\n")

In [None]:
# Write the processed frame to CSV (index=False leaves the row index out
# of the file) and report where it went.
output_file = 'data/processed_weather_data.csv'
processed_df.to_csv(path_or_buf=output_file, index=False)
print(f"Processed data saved to {output_file}")

In [None]:
# Final marker in the cell output: every cell above ran without raising.
print("Data processing complete!")