In [6]:
%%time

import pandas as pd
import numpy as np

# Load the raw CSV; the file is expected in the notebook's working directory.
data = pd.read_csv('weatherHistory.csv')
print(f"Original rows: {len(data)}")

# Keep the last 100 rows as stored in the file. An explicit copy is taken so the
# column assignment below writes to an independent frame instead of a slice of
# the full table (assigning into a slice triggers SettingWithCopyWarning).
# NOTE(review): the slice happens before the chronological sort, so these are
# the last 100 file rows, not necessarily the 100 most recent timestamps --
# confirm the CSV is already date-ordered if the latter is intended.
data = data.iloc[-100:].copy()
print(f"Rows after selecting last 100: {len(data)}")

# Parse timestamps as timezone-aware UTC, then order the rows chronologically
data['Formatted Date'] = pd.to_datetime(data['Formatted Date'], utc=True)
data = data.sort_values('Formatted Date')

# Feature combinations: each one extends the previous list by exactly one column,
# so combo1 is temperature only and combo5 carries all five measurements.
combo1_features = ['Temperature (C)']
combo2_features = combo1_features + ['Humidity']
combo3_features = combo2_features + ['Wind Speed (km/h)']
combo4_features = combo3_features + ['Visibility (km)']
combo5_features = combo4_features + ['Pressure (millibars)']

# Keep only the columns used by at least one feature combination and drop every
# row that has a missing value in any of them. This selection also discards
# 'Formatted Date', which is not part of any combination.
# dict.fromkeys de-duplicates while preserving first-seen order; a plain set
# would make the column order of `data` depend on string hashing and therefore
# vary between kernel sessions.
all_features = list(dict.fromkeys(
    combo1_features + combo2_features + combo3_features + combo4_features + combo5_features
))
data = data[all_features].dropna()
print(f"Rows after dropna: {len(data)}")

# Prepare sliding-window data: every sample is `window_size` consecutive rows and
# its target is the temperature of the row immediately after the window.
# NOTE(review): the window is 7 *rows*; it only spans 7 days if the data has
# one row per day -- confirm the sampling interval of the CSV.
window_size = 7
n_windows = len(data) - window_size
print(f"Number of windows: {n_windows}")
if n_windows < 0:
    # Fail with an explicit message instead of numpy's "negative dimensions" error
    raise ValueError(
        f"Need at least {window_size} rows to build a window, got {len(data)}"
    )

# window_idx[i] holds the row positions i .. i + window_size - 1, so indexing a
# (rows, features) array with it yields (n_windows, window_size, features).
window_idx = np.arange(n_windows)[:, None] + np.arange(window_size)

# Convert each feature set to a float array once (instead of re-selecting the
# columns from the DataFrame on every iteration) and gather all windows in a
# single indexing step. Integer-array indexing always returns a new array.
X_combo1 = data[combo1_features].to_numpy(dtype=float)[window_idx]
X_combo2 = data[combo2_features].to_numpy(dtype=float)[window_idx]
X_combo3 = data[combo3_features].to_numpy(dtype=float)[window_idx]
X_combo4 = data[combo4_features].to_numpy(dtype=float)[window_idx]
X_combo5 = data[combo5_features].to_numpy(dtype=float)[window_idx]

# Target for window i is the temperature at row i + window_size; copy so that
# `y` never aliases the DataFrame's underlying buffer.
y = data['Temperature (C)'].iloc[window_size:].to_numpy(dtype=float, copy=True)

# Persist all five feature tensors together with the target vector in a single
# .npz archive; each dict key becomes the array's name inside the archive.
arrays_to_save = {
    'X_combo1': X_combo1,
    'X_combo2': X_combo2,
    'X_combo3': X_combo3,
    'X_combo4': X_combo4,
    'X_combo5': X_combo5,
    'y': y,
}
np.savez('weather_preprocessed_last100.npz', **arrays_to_save)
print("Preprocessed data saved to 'weather_preprocessed_last100.npz'")

Original rows: 96453
Rows after selecting last 100: 100
Rows after dropna: 100
Number of windows: 93
Preprocessed data saved to 'weather_preprocessed_last100.npz'
CPU times: user 142 ms, sys: 21.2 ms, total: 163 ms
Wall time: 165 ms
