In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Load the raw weather table from a hard-coded absolute path into a DataFrame.
# NOTE(review): the path only resolves on the original author's machine; a
# configurable data directory would make the notebook portable.
raw_data = pd.read_csv('/home/sina.tvk.1997/AI-weather-predictor/data/raw_data.csv')

In [3]:
def add_time_features(df):
    """
    Returns a copy of the dataframe enriched with calendar features derived
    from its 'time' column.

    Parameters:
    - df: pandas DataFrame with a 'time' column that pd.to_datetime can parse

    Returns:
    - A new DataFrame (the input is left untouched) in which 'time' is converted
      to datetime and the following columns are added:
        year, month, day, weekday (Monday=0 ... Sunday=6),
        is_weekend (1 when weekday is 5 or 6, else 0),
        dayofyear, dayofyear_sin, dayofyear_cos
    """
    # Work on a copy so the caller's frame (e.g. the freshly loaded raw data)
    # is never modified; re-running the pipeline cell then starts from the
    # same input every time.
    df = df.copy()

    df['time'] = pd.to_datetime(df['time'])

    timestamps = df['time'].dt
    df['year'] = timestamps.year
    df['month'] = timestamps.month
    df['day'] = timestamps.day
    df['weekday'] = timestamps.weekday
    df['is_weekend'] = df['weekday'].isin([5, 6]).astype(int)
    df['dayofyear'] = timestamps.dayofyear

    # Cyclical encoding of the day of year. A fixed 365-day period is used, so
    # day 366 of a leap year lands slightly past one full cycle.
    angle = 2 * np.pi * df['dayofyear'] / 365
    df['dayofyear_sin'] = np.sin(angle)
    df['dayofyear_cos'] = np.cos(angle)

    return df


In [4]:
def handle_missing_value(data):
    """
    Imputes missing weather measurements city by city and returns a NaN-free copy.

    Steps, in order:
    1. Print the number of missing values per measurement column.
    2. Add 0/1 indicator columns 'wspd_missing', 'prcp_missing', 'snow_missing'
       recording which values were missing before imputation.
    3. Drop the columns 'wdir', 'tsun', 'wpgt', 'pres' when present.
    4. 'snow': for cities listed in snowy_cities, fill gaps with a trailing
       7-row rolling median; every other gap becomes 0.0.
    5. 'prcp': fill gaps with a trailing 7-row rolling median, then 0.0.
    6. 'tavg', 'tmin', 'tmax', 'wspd': interpolate runs of up to 5 missing
       values in both directions, then forward/backward fill the rest.
    7. Drop any row that still contains a NaN in any column.

    Parameters:
    - data: pandas DataFrame with a 'city' column and the columns
      'tavg', 'tmin', 'tmax', 'wspd', 'prcp', 'snow'

    Returns:
    - A new DataFrame with a fresh 0..n-1 index; the input frame is not modified.

    Note: rolling medians and interpolation follow the existing row order
    within each city, so rows are assumed to be in chronological order
    (TODO confirm against the raw data, or sort upstream).
    """
    snowy_cities = [
        "Moscow", "Toronto", "Chicago", "Helsinki", "Oslo", "Stockholm", "Tallinn", "Montreal", "Halifax",
        "Winnipeg", "Fairbanks", "Yellowknife", "Barrow (Utqiaġvik)", "Tromsø", "Novosibirsk", "Irkutsk",
        "Anchorage", "Vladivostok", "Astana", "Bishkek"
    ]

    # Work on a copy: mutating the caller's frame would make a second run of
    # the pipeline compute every *_missing flag from already-imputed data.
    data = data.copy()

    for col in ['tavg', 'tmin', 'tmax', 'wspd', 'prcp', 'snow']:
        print(f"{col}: {data[col].isna().sum()} missing before filling")

    # Remember which values were originally missing before anything is filled.
    for col in ['wspd', 'prcp', 'snow']:
        data[f'{col}_missing'] = data[col].isna().astype(int)

    data = data.drop(columns=['wdir', 'tsun', 'wpgt', 'pres'], errors='ignore')

    def trailing_median_fill(series):
        # Median of the current and previous 6 rows, ignoring NaNs in the window.
        return series.fillna(series.rolling(window=7, min_periods=1).median())

    # groupby(...).transform hands each group to the function with .name set to
    # the group key, i.e. the city name.
    data['snow'] = data.groupby('city')['snow'].transform(
        lambda x: trailing_median_fill(x) if x.name in snowy_cities else x.fillna(0.0)
    )
    data['snow'] = data['snow'].fillna(0.0)

    data['prcp'] = data.groupby('city')['prcp'].transform(trailing_median_fill)
    data['prcp'] = data['prcp'].fillna(0.0)

    # Interpolate short gaps, then fall back to forward/backward fill for
    # anything left. .ffill()/.bfill() replace the deprecated
    # fillna(method=...) form that raised a FutureWarning.
    for col in ['tavg', 'tmin', 'tmax', 'wspd']:
        data[col] = data.groupby('city')[col].transform(
            lambda x: x.interpolate(limit=5, limit_direction='both').ffill().bfill()
        )

    # Rows that still hold a NaN (e.g. a city with no readings at all for a
    # column) cannot be repaired and are removed.
    data = data.dropna().reset_index(drop=True)

    return data


In [5]:
def add_derived_weather_features(df):
    """
    Returns a copy of the dataframe with engineered weather features added.

    Parameters:
    - df: pandas DataFrame with the columns
      'city', 'tavg', 'tmin', 'tmax', 'wspd', 'prcp', 'snow'

    Returns:
    - A new DataFrame (the input is left untouched) with these extra columns:
        temp_range          tmax - tmin
        snow_ratio          snow / prcp where prcp > 0, otherwise 0.0
        avg_temp_change_3d  per-city mean of the last (up to) 3 row-to-row
                            changes in tavg; NaN on each city's first row
        wind_chill_index    wind chill formula where tavg <= 10 and wspd > 4.8,
                            otherwise tavg; NaN when tavg or wspd is NaN
        is_freezing, heavy_precip, strong_wind, heatwave, cold_wave
                            0/1 threshold flags

    All computations are column-wise; no Python-level loop over rows is used.
    """
    df = df.copy()

    # Temperature range: difference between daily max and min temperature
    df['temp_range'] = df['tmax'] - df['tmin']

    # Snow ratio: how much of the precipitation was snow.
    # Rows without positive precipitation get 0.0; masking the divisor first
    # means a zero is never divided by.
    has_precip = df['prcp'] > 0
    df['snow_ratio'] = (df['snow'] / df['prcp'].where(has_precip)).where(has_precip, 0.0)

    # Average temperature change over last 3 days
    # Helps capture recent warming/cooling trends
    df['avg_temp_change_3d'] = df.groupby('city')['tavg'].transform(
        lambda x: x.diff().rolling(window=3, min_periods=1).mean()
    )

    # Wind chill index: "feels like" temperature in cold, windy conditions
    # Only computed for t <= 10°C and wind speed > 4.8 km/h
    temp = df['tavg']
    wind = df['wspd']
    # Clipping at 0 only protects the power from negative bases; such rows
    # never satisfy wind > 4.8, so their formula value is discarded anyway.
    wind_pow = wind.clip(lower=0) ** 0.16
    chill = 13.12 + 0.6215 * temp - 11.37 * wind_pow + 0.3965 * temp * wind_pow
    chill_applies = (temp <= 10) & (wind > 4.8)
    wind_chill = temp.where(~chill_applies, chill)  # plain temperature when no chill effect
    df['wind_chill_index'] = wind_chill.where(wind.notna())  # unknown wind -> NaN

    # Extreme event binary flags (used as extra indicators or classification targets)

    # Freezing conditions: daily minimum temperature at or below 0°C
    df['is_freezing'] = (df['tmin'] <= 0).astype(int)

    # Heavy precipitation event: 10 mm or more in a day
    df['heavy_precip'] = (df['prcp'] >= 10.0).astype(int)

    # Strong wind event: 40 km/h or more
    df['strong_wind'] = (df['wspd'] >= 40.0).astype(int)

    # Heatwave indicator: daily max temperature at or above 35°C
    df['heatwave'] = (df['tmax'] >= 35.0).astype(int)

    # Cold wave indicator: daily min temperature at or below -10°C
    df['cold_wave'] = (df['tmin'] <= -10.0).astype(int)

    return df


In [6]:
def create_sequences(df, input_len=30, output_len=7, features=None, targets=None):

    """
    Converts a time series dataframe into a list of (input_sequence, target_sequence) pairs
    for supervised learning with sequential models (e.g., LSTM, Transformer).

    Parameters:
    - df: Preprocessed pandas DataFrame containing all cities' weather data;
      must have 'city' and 'time' columns
    - input_len: Number of past rows used as input (default is 30)
    - output_len: Number of future rows to predict (default is 7)
    - features: List of feature columns to include in the input (X).
      Despite the None default, a column list must be supplied.
    - targets: List of target columns to include in the output (y).
      Despite the None default, a column list must be supplied.

    Returns:
    - sequences: A list of tuples, each containing:
        (input_array: shape [input_len, num_features],
         target_array: shape [output_len, num_targets])
      A city with fewer than input_len + output_len rows contributes nothing.

    Note: windows are taken over consecutive rows after sorting by 'time'.
    Rows are not checked for calendar gaps, so a window can span missing days
    if rows were dropped upstream.
    """

    sequences = []
    window_len = input_len + output_len

    # Process each city independently to preserve temporal order and locality
    for _, group in df.groupby('city'):
        # Sort data by time to maintain chronological order
        group = group.sort_values('time')

        # Convert the selected columns to NumPy once per city; slicing these
        # arrays per window is far cheaper than re-slicing the DataFrame and
        # re-selecting columns on every iteration.
        feature_values = group[features].values
        target_values = group[targets].values

        # Slide a moving window across the data
        for start in range(len(group) - window_len + 1):
            split = start + input_len

            # Copies keep every returned array independent of its neighbours,
            # even though consecutive windows overlap.
            input_slice = feature_values[start:split].copy()
            target_slice = target_values[split:split + output_len].copy()

            # Save the (X, y) pair
            sequences.append((input_slice, target_slice))

    return sequences


In [7]:
def prepare_numpy_batches(sequences):

    """
    Stacks (input_sequence, target_sequence) pairs into two NumPy arrays
    ready to be passed to a machine learning model.

    Parameters:
    - sequences: List of pairs; element 0 of each pair is an input array of
        shape [input_len, num_features] and element 1 is a target array of
        shape [output_len, num_targets]

    Returns:
    - X: NumPy array of shape [num_samples, input_len, num_features]
    - y: NumPy array of shape [num_samples, output_len, num_targets]
    """

    input_windows = []
    target_windows = []

    # Split the pairs into two parallel lists, keeping their order
    for pair in sequences:
        input_windows.append(pair[0])
        target_windows.append(pair[1])

    # np.array adds the leading sample axis to each list
    X = np.array(input_windows)
    y = np.array(target_windows)

    return X, y


In [8]:
def main(data, save_dir='/home/sina.tvk.1997/AI-weather-predictor/data/'):
    """
    Runs the full preprocessing pipeline and saves the model-ready arrays.

    Parameters:
    - data: Raw pandas DataFrame as loaded from the CSV
    - save_dir: Directory that receives "X.npy" and "y.npy"; created if it
      does not exist. Defaults to the original hard-coded location.

    Returns:
    - X: NumPy array of 30-row input windows over the selected features
    - y: NumPy array of 7-row target windows over the selected targets
    """

    # Step 1: Enrich with time features
    data = add_time_features(data)

    # Step 2: Fill missing values
    data = handle_missing_value(data)

    # Step 3: Add engineered features
    data = add_derived_weather_features(data)

    # Step 4: Select features and targets
    features = [
        'tavg', 'tmin', 'tmax', 'wspd', 'prcp', 'snow',
        'dayofyear_sin', 'dayofyear_cos',
        'month', 'weekday', 'is_weekend',
        'temp_range', 'snow_ratio', 'avg_temp_change_3d',
        'wind_chill_index', 'is_freezing', 'heavy_precip',
        'strong_wind', 'heatwave', 'cold_wave'
    ]

    targets = ['tavg', 'tmin', 'tmax', 'wspd', 'prcp', 'snow']

    # The engineered features reintroduce NaNs (avg_temp_change_3d has no value
    # on each city's first row), so incomplete rows are dropped once more.
    data = data.dropna().reset_index(drop=True)

    # Step 5: Generate sequences
    sequences = create_sequences(
        df=data,
        input_len=30,
        output_len=7,
        features=features,
        targets=targets
    )

    # Step 6: Convert to NumPy
    X, y = prepare_numpy_batches(sequences)

    # Step 7: Save to disk
    os.makedirs(save_dir, exist_ok=True)  # np.save does not create missing directories
    np.save(os.path.join(save_dir, "X.npy"), X)
    np.save(os.path.join(save_dir, "y.npy"), y)
    # No extra "/" is appended: the default save_dir already ends with one,
    # which used to print a doubled slash.
    print(f"✅ Saved X and y to {save_dir}")

    return X, y

In [9]:
# Run the whole pipeline on the loaded raw data; main also writes X.npy and
# y.npy to disk and returns the same two arrays.
x,y = main(raw_data)

tavg: 17114 missing before filling
tmin: 36320 missing before filling
tmax: 17988 missing before filling
wspd: 114564 missing before filling
prcp: 123715 missing before filling
snow: 381025 missing before filling


  lambda x: x.fillna(method='ffill').fillna(method='bfill')
  lambda x: x.fillna(method='ffill').fillna(method='bfill')
  lambda x: x.fillna(method='ffill').fillna(method='bfill')
  lambda x: x.fillna(method='ffill').fillna(method='bfill')


✅ Saved X and y to /home/sina.tvk.1997/AI-weather-predictor/data//
