In [1]:
import numpy as np
import pandas as pd
import itertools
from MIOpt import getMagDf, makeKernel, getCoordinateGrid, greedy_select
from makeModel import df_to_X_y, buildModel, trainModel

In [2]:
import numpy as np
import pandas as pd

energy = pd.read_csv('/Users/schristianson/Desktop/NY Wind Energy Model/historical_energy_24(test).csv')

def dateparser(date_str):
    # Adjust the timezone offset format to ISO 8601 (from -05 to -05:00)
    date_str = str(date_str).strip()
    date_str = date_str + ":00"  # Insert the colon in the timezone offset
    date = pd.to_datetime(date_str, format="%Y-%m-%dT%H%z").tz_convert("America/New_York")
    return date

energy['period'] =  energy["period"].astype(str).str.strip().apply(dateparser)

start_ny = pd.Timestamp('2024-01-01', tz='America/New_York')
end_ny = pd.Timestamp('2024-12-31', tz='America/New_York')
energy = energy[(energy['period'] >= start_ny) & (energy['period'] < end_ny)]
energy = energy.set_index('period')


In [3]:
x_km_from_turbines = 10
df_mag = getMagDf(x_km_from_turbines)
print(df_mag)
print(df_mag.index)

                     (42.942934562000126, -74.49950663544769)  \
2024-01-01 00:00:00                                  5.010549   
2024-01-01 01:00:00                                  5.830823   
2024-01-01 02:00:00                                  5.255778   
2024-01-01 03:00:00                                  5.526014   
2024-01-01 04:00:00                                  4.253234   
...                                                       ...   
2024-12-30 19:00:00                                 10.943729   
2024-12-30 20:00:00                                 11.093845   
2024-12-30 21:00:00                                  8.748582   
2024-12-30 22:00:00                                 10.072057   
2024-12-30 23:00:00                                  7.943422   

                     (42.3219345620001, -78.39350663544784)  \
2024-01-01 00:00:00                                2.028267   
2024-01-01 01:00:00                                1.230134   
2024-01-01 02:00:00           

In [4]:
print(df_mag.isna().any(axis=1).sum())

253


In [9]:
#square root to stabilize variance
df_mag_sqrt = np.sqrt(df_mag)
print(df_mag_sqrt.shape)
print(df_mag_sqrt.isna().any(axis=1).sum())
K = makeKernel(df_mag_sqrt)


(8760, 1516)
253


In [6]:
def getCoordinateGrid(df):
    """
    Takes dataframe with coordinates as the columns and returns ordered lat and lon dicts with coordinates
    columns are given as (lat, lon) strings
    """

    coords = df.columns.to_list()
    coords = [coord.strip("()") for coord in coords]
    lat, lon = zip(*[(float(lat), float(lon)) for lat, lon in (coord.split(", ") for coord in coords)])
    lat = sorted(set(lat))
    lon = sorted(set(lon))
    lat_dict=dict(zip(range(len(lat)),lat))
    lon_dict=dict(zip(range(len(lon)),lon))
    return lat_dict, lon_dict, coords

def mutual_info_gain(K_sub, sigma2):
    n = K_sub.shape[0]
    return 0.5 * np.linalg.slogdet(np.eye(n) + (1 / sigma2) * K_sub)[1]

def greedy_select(lat_dict, lon_dict, coords, K_full, k=10, sigma2=10**(-5)):
    """
    Greedy Mutual Information Optimization to select points of maximum information

    Args:
        lat_dict: dictionary of integer range as keys and lat coordinates as values
        lon_dict: dictionary of integer range as keys and lon coordinates as values
        coords: list of coords within x km from turbine as strings 'xxx, yyy'
        K_full: Covariance Kernel Matrix
        k: number of sensors to select
        sigma2: variance of observed values, not relevant to building distribution so set to small
        value for stability

    Returns:
        selected_indices: indices of selected coordinates
        selected_coords: selected coordinates
    """
    
    selected_indices = []
    selected_coords = []

    remaining_indices = list(itertools.product(list(lat_dict.keys()), list(lon_dict.keys())))

    remaining_indices = [i for i in remaining_indices if f"{lat_dict[i[0]]}, {lon_dict[i[1]]}" in coords]
    
    for _ in range(k):
        print(f'finding {_} out of {k}')
        best_gain = -np.inf
        best_idx = None
        
        for i in remaining_indices:
            candidate = selected_indices + [i]

            lats, lons = zip(*candidate)
            print(lats)
            K_full[np.ix_(lats, lons)]


            K_sub = K_full[np.ix_(lats, lons)]
            gain = mutual_info_gain(K_sub, sigma2)
            
            if gain > best_gain:
                best_gain = gain
                best_idx = i
        
        print(best_idx)
        selected_indices.append(best_idx)
        selected_coords.append((lat_dict[best_idx[0]], lon_dict[best_idx[1]]))
        remaining_indices.remove(best_idx)

    return selected_indices, selected_coords

In [10]:
print(K)

[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]


In [8]:
lat_dict, lon_dict, coords = getCoordinateGrid(df_mag_sqrt)
best_indices, best_coords = greedy_select(lat_dict, lon_dict, coords, K)

finding 0 out of 10
(0,)
(1,)
(2,)
(3,)
(4,)
(5,)
(5,)
(5,)
(6,)
(7,)
(8,)
(9,)
(10,)
(10,)
(11,)
(12,)
(12,)
(13,)
(14,)
(15,)
(16,)
(17,)
(18,)
(19,)
(20,)
(21,)
(22,)
(23,)
(24,)
(24,)
(25,)
(26,)
(27,)
(28,)
(29,)
(30,)
(31,)
(32,)
(33,)
(34,)
(35,)
(36,)
(37,)
(38,)
(39,)
(40,)
(41,)
(42,)
(43,)
(44,)
(45,)
(46,)
(47,)
(48,)
(49,)
(50,)
(51,)
(51,)
(51,)
(51,)
(51,)
(51,)
(51,)
(51,)
(51,)
(51,)
(51,)
(51,)
(51,)
(52,)
(52,)
(52,)
(52,)
(52,)
(52,)
(52,)
(52,)
(52,)
(52,)
(52,)
(52,)
(52,)
(52,)
(52,)
(53,)
(53,)
(53,)
(53,)
(53,)
(53,)
(53,)
(53,)
(53,)
(53,)
(53,)
(53,)
(53,)
(53,)
(53,)
(53,)
(54,)
(54,)
(54,)
(54,)
(54,)
(54,)
(54,)
(54,)
(54,)
(54,)
(54,)
(54,)
(54,)
(54,)
(54,)
(54,)
(54,)
(54,)
(54,)
(54,)
(55,)
(55,)
(55,)
(55,)
(55,)
(55,)
(55,)
(55,)
(55,)
(55,)
(55,)
(55,)
(55,)
(55,)
(55,)
(55,)
(55,)
(55,)
(55,)
(55,)
(56,)
(56,)
(56,)
(56,)
(56,)
(56,)
(56,)
(56,)
(56,)
(56,)
(56,)
(56,)
(56,)
(56,)
(56,)
(56,)
(56,)
(56,)
(56,)
(56,)
(57,)
(57,)
(57,)
(57,)
(57,)
(5

  sign, logdet = _umath_linalg.slogdet(a, signature=signature)


TypeError: 'NoneType' object is not subscriptable

In [None]:
best_coords_str = [str(i).strip('()') for i in best_coords]

#Select optimized sensor locations
X = df_mag_sqrt.loc[:, df_mag_sqrt.columns.str.contains('|'.join(best_coords_str))]

#get target variables
y = energy['Megawatthours']

In [None]:
#We are getting less than 8760 data points in the X data, looks like there were some points missing in the download,
#I checked the data cleaning steps to make sure we were not losing any there. Only missing ~250 hours so should be fine
#Added a function to the dataframe creation to make rows for missing hours and use NaN in columns
"""
Testing to make sure dates lined up and if there was anything missing

X_dates = set(X.index.astype(str).to_list())
y_dates = y.index.astype(str).to_list()

y_dates = set([i[:-6] for i in y_dates])

missing_x_dates = y_dates - X_dates
missing_x_dates = [str(x)+"-05:00" for x in missing_x_dates]
"""

'\nTesting to make sure dates lined up and if there was anything missing\n\nX_dates = set(X.index.astype(str).to_list())\ny_dates = y.index.astype(str).to_list()\n\ny_dates = set([i[:-6] for i in y_dates])\n\nmissing_x_dates = y_dates - X_dates\nmissing_x_dates = [str(x)+"-05:00" for x in missing_x_dates]\n'

In [None]:
X,y = df_to_X_y(X, y, window_size=24)
