In [None]:
import numpy as np
import pandas as pd 
from sklearn.linear_model import Ridge as ridge
from sklearn.metrics import mean_absolute_error as mae
from multiprocessing import Pool, Semaphore as sem, cpu_count
import time


In [2]:
#!/usr/bin/env python3

import numpy as np
import pandas as pd 
from sklearn.linear_model import Ridge as ridge
from sklearn.metrics import mean_absolute_error as mae
from multiprocessing import Pool, Semaphore, cpu_count
import time

global LOC, DAILY_FILEPATH, HOURLY_FILEPATH, headers, new_cols
new_cols = [
    'target', 'weather_code', 'temperature_2m', 'relative_humidity_2m', 'precipitation', 
    'rain', 'surface_pressure', 'cloud_cover', 'wind_speed_10m', 'wind_speed_100m', 
    'wind_direction_10m', 'wind_direction_100m', 'temperature_2m_K', 'surface_pressure_Pa', 
    'density', 'speed_of_sound'
]
LOC = '/home/bkelley/capstone'
DAILY_FILEPATH='/home/bkelley/capstone/data_collection/weather/data/daily_weather_with_temp_avg.csv'
HOURLY_FILEPATH = '/home/bkelley/capstone/data_collection/weather/data/cleaned_hourly_weather_data.csv'
# Semaphore to limit the number of parallel processes
sem = Semaphore(64)  # Limit to the number of CPU cores


# generates predictions for data except first 5 years
def backtest(data, model, predictors, start=365*5, step=70):
    all_pred = []
    for i in range(start, data.shape[0], step):
        train = data.iloc[:i, :]  # all rows up to i
        test = data.iloc[i:(i+step), :]  # takes next step days to make predictions on
        model.fit(train[predictors], train['target'])
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test['target'], preds], axis=1)
        combined.columns = ['actual', 'prediction']
        combined['diff'] = (combined['prediction'] - combined['actual']).abs()
        all_pred.append(combined)
    return pd.concat(all_pred)

def pct_diff(old, new):
    return (new - old) / old

def compute_rolling(data, horizon, col):
    label = f"rolling_{horizon}_{col}"
    data[label] = data[col].rolling(horizon).mean()
    data[f"{label}_pct"] = pct_diff(data[label], data[col])
    return data

def expand_mean(data):
    return data.expanding(1).mean()

# Function to process each header with alpha tuning
# Adjust the stopping criteria and step size
def process_header(current_value):
    with sem:
        result = []
        data = pd.read_csv(HOURLY_FILEPATH, index_col='date')
        # fill in missing data with last known value
        data = data.ffill()
        data.index = pd.to_datetime(data.index)

        data['target'] = data.shift(-1)[current_value]
        data = data.ffill()

        # Set initial alpha and other parameters
        alpha = 0.1
        step_size = 0.5  # Increase the step size
        min_delta = 0.01  # Allow larger changes in MAE
        max_iterations = 10_000  # Cap the iterations
        previous_mae = float('inf')
        best_alpha = alpha
        best_mae = previous_mae
        iteration = 0

        while iteration < max_iterations:
            rr = ridge(alpha=alpha)
            predictors = data.columns[~data.columns.isin([new_cols])]
            predictions = backtest(data, rr, predictors)
            mean_abs_error = mae(predictions['actual'], predictions['prediction'])

            mae_diff = abs(previous_mae - mean_abs_error)

            if mean_abs_error < best_mae:
                best_mae = mean_abs_error
                best_alpha = alpha

            # If MAE change is below threshold, continue but stop after a set number of iterations
            if mae_diff < min_delta and iteration > 100:
                break

            previous_mae = mean_abs_error
            alpha += step_size
            iteration += 1

        # Return only the best result for each header
        return f"Best MAE for {current_value}: {best_mae:.6f} with optimal alpha: {best_alpha}"

# Main code with multiprocessing pool
if __name__ == "__main__":
    start_time = time.time()

    data = pd.read_csv(DAILY_FILEPATH, index_col='date')
    null_pct = data.apply(pd.isnull).sum()/data.shape[0]
    valid_columns = data.columns[null_pct < 0.05]
    data = data[valid_columns].copy()
    data.columns = data.columns.str.lower()

    data = data.ffill()
    data.index = pd.to_datetime(data.index)
    
    headers = [col for col in data.columns if col not in ['weather_code']]

    results = []
    with Pool(64) as pool:
        for result in pool.map(process_header, headers):
            results.append(result)

    # Print the final result for each header
    print("\n".join(results))

    end_time = time.time()
    print(f"\nFinished processing all headers in {end_time - start_time:.2f} seconds.")
    print(data)


KeyError: 'temperature_2m_max'