# Test Preprocessing from SDU 

In [2]:
import pandas as pd
import numpy as np
import holidays

In [21]:
# Load the SDU measurement export; skipinitialspace skips the spaces that
# follow each delimiter in the file.
df = pd.read_csv('DumbCharging_2020_to_2032/Measurements.csv', skipinitialspace=True)

# Preview the first 20 rows, restricted to the aggregated charging columns.
print(
    df.loc[:, [
        'Timestamp',
        'Aggregated charging load',
        'Total number of EVs',
        'Number of charging EVs',
        'Number of driving EVs',
        'Overload duration [min]',
    ]].head(20)
)

                   Timestamp  Aggregated charging load  Total number of EVs  \
0   Jan 1, 2020, 12:00:07 AM                       0.0                  1.0   
1    Jan 1, 2020, 1:00:07 AM                       0.0                  1.0   
2    Jan 1, 2020, 2:00:07 AM                       0.0                  1.0   
3    Jan 1, 2020, 3:00:07 AM                       0.0                  1.0   
4    Jan 1, 2020, 4:00:07 AM                       0.0                  1.0   
5    Jan 1, 2020, 5:00:07 AM                       0.0                  1.0   
6    Jan 1, 2020, 6:00:07 AM                       0.0                  1.0   
7    Jan 1, 2020, 7:00:07 AM                       0.0                  1.0   
8    Jan 1, 2020, 8:00:07 AM                       0.0                  1.0   
9    Jan 1, 2020, 9:00:07 AM                       0.0                  1.0   
10  Jan 1, 2020, 10:00:07 AM                       0.0                  1.0   
11  Jan 1, 2020, 11:00:07 AM                       0

In [None]:
# Measurement columns that must be numeric; `numeric_cols` is reused by the
# aggregation further down in the notebook.
numeric_cols = [
    'Aggregated charging load',
    'Total number of EVs',
    'Number of charging EVs',
    'Number of driving EVs',
    'Overload duration [min]'
]
columns_to_keep = ['Timestamp', *numeric_cols]

# Take an explicit copy of the column subset: assigning into a bare
# `df[columns_to_keep]` selection raises pandas' SettingWithCopyWarning.
df = df[columns_to_keep].copy()

# Raw timestamps look like "Jan 1, 2020, 12:00:07 AM".
df['Timestamp'] = pd.to_datetime(df['Timestamp'], format="%b %d, %Y, %I:%M:%S %p")

# errors='coerce' turns values that cannot be parsed as numbers into NaN
# instead of raising.
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')


print(df.dtypes)

print(df.head(20))

Timestamp                   datetime64[ns]
Aggregated charging load           float64
Total number of EVs                float64
Number of charging EVs             float64
Number of driving EVs              float64
Overload duration [min]            float64
dtype: object
             Timestamp  Aggregated charging load  Total number of EVs  \
0  2020-01-01 00:00:07                       0.0                  1.0   
1  2020-01-01 01:00:07                       0.0                  1.0   
2  2020-01-01 02:00:07                       0.0                  1.0   
3  2020-01-01 03:00:07                       0.0                  1.0   
4  2020-01-01 04:00:07                       0.0                  1.0   
5  2020-01-01 05:00:07                       0.0                  1.0   
6  2020-01-01 06:00:07                       0.0                  1.0   
7  2020-01-01 07:00:07                       0.0                  1.0   
8  2020-01-01 08:00:07                       0.0                  1.0  

In [32]:
# Snap every timestamp down to the start of its hour (01:00:07 -> 01:00:00).
df['Timestamp'] = df['Timestamp'].dt.floor('h')

# Collapse rows that share the same hour by summing each numeric column.
# NOTE(review): this also sums the EV count columns when an hour holds more
# than one row — confirm that summing (rather than taking one reading) is intended.
df = df.groupby('Timestamp', as_index=False)[numeric_cols].sum()

# Preview result
print(df.head(50))

# Write the hourly frame to CSV (index column omitted)
df.to_csv('CleanedDataset.csv', index=False)



             Timestamp  Aggregated charging load  Total number of EVs  \
0  2020-01-01 00:00:00                       0.0                  1.0   
1  2020-01-01 01:00:00                       0.0                  1.0   
2  2020-01-01 02:00:00                       0.0                  1.0   
3  2020-01-01 03:00:00                       0.0                  1.0   
4  2020-01-01 04:00:00                       0.0                  1.0   
5  2020-01-01 05:00:00                       0.0                  1.0   
6  2020-01-01 06:00:00                       0.0                  1.0   
7  2020-01-01 07:00:00                       0.0                  1.0   
8  2020-01-01 08:00:00                       0.0                  1.0   
9  2020-01-01 09:00:00                       0.0                  1.0   
10 2020-01-01 10:00:00                       0.0                  1.0   
11 2020-01-01 11:00:00                       0.0                  1.0   
12 2020-01-01 12:00:00                       0.0   

In [None]:
# def convert_SDU_to_hourly(df):
#   df = df.set_index('Timestamp')

#   hourly = df.resample('h').agg({
#       'Total number of EVs':      'sum',
#       'Number of charging EVs':   'sum',
#       'Number of driving EVs':    'sum',
#       'Total grid load':          'sum',
#       'Aggregated base load':     'sum',
#       'Aggregated charging load': 'sum',
#       'Overload duration [min]':  'sum',
#   })

#   # remove nan values

#   return hourly

# def convert_SDU_to_hourly(df):
#     df = df.set_index('Timestamp')

#     hourly = df.resample('h').agg({
#         'Total number of EVs':      'first',
#         'Number of charging EVs':   'first',
#         'Number of driving EVs':    'first',
#         'Aggregated charging load': 'first',
#         'Overload duration [min]':  'first',
#     })

#     return hourly

def convert_SDU_to_hourly(df):
    """Return an hourly-indexed copy of the SDU measurements.

    Timestamps are floored to the hour, only the first row of each floored
    hour is kept (nothing is summed), and the frame is resampled hourly so
    that hours without any row appear as all-NaN rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Timestamp' column that ``pd.to_datetime`` can parse.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by the floored 'Timestamp' and restricted to the five
        measurement columns listed below, in that order. A listed column that
        is absent from the input comes back filled with NaN.
    """
    df = df.copy()

    # Lowercase 'h' is the hourly alias used throughout this notebook; the
    # uppercase 'H' spelling is deprecated in recent pandas releases.
    df['Timestamp'] = pd.to_datetime(df['Timestamp']).dt.floor('h')
    df = df.set_index('Timestamp')

    # Keep the first row of every floored hour as a whole; resample().first()
    # alone would pick the first non-null value per column instead.
    df = df[~df.index.duplicated(keep='first')]

    hourly = df.resample('h').first()

    # Fix the output schema: select/order the expected columns, creating
    # all-NaN columns for any that are missing.
    columns = ['Total number of EVs', 'Number of charging EVs', 'Number of driving EVs',
               'Aggregated charging load', 'Overload duration [min]']
    return hourly.reindex(columns=columns)


def add_featuresSDU(df):
    """Return a copy of `df` extended with calendar, cyclic, season and lag features.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a DatetimeIndex (all calendar features are read from
        ``df.index``) and an 'Aggregated charging load' column. The lag
        features are plain row shifts, so their "1h", "24h", ... names only
        hold when consecutive rows are exactly one hour apart.

    Returns
    -------
    pandas.DataFrame
        A new frame with the feature columns appended; the input is left
        untouched. The first rows of the lag/rolling columns are NaN because
        no earlier rows exist for them.
    """
    # Work on a copy so the caller's frame is not silently modified.
    df = df.copy()

    ####################### TIME BASED FEATURES  #######################
    df['Day_of_Week'] = df.index.dayofweek
    df['Hour_of_Day'] = df.index.hour
    df['Month_of_Year'] = df.index.month
    df['Year'] = df.index.year
    # Boolean flag: True for hours 6..18 inclusive.
    df['Day/Night'] = df['Hour_of_Day'].between(6, 18)

    # Danish public holidays for every year covered by the index.
    dk_hols = holidays.DK(years=range(
        df.index.year.min(), df.index.year.max() + 1))
    df['IsHoliday'] = df.index.to_series().dt.date.isin(dk_hols).astype(int)

    # dayofweek is 0 for Monday, so 5 and 6 are Saturday and Sunday.
    df['Weekend'] = (df['Day_of_Week'] >= 5).astype(int)

    ####################### CYCLIC FEATURES  #######################
    # Sine/cosine pair per calendar field so the wrap-around is continuous.
    for prefix, source, period in (('Hour', 'Hour_of_Day', 24),
                                   ('DayOfWeek', 'Day_of_Week', 7),
                                   ('MonthOfYear', 'Month_of_Year', 12)):
        angle = 2 * np.pi * df[source] / period
        df[f'{prefix}Sin'] = np.sin(angle)
        df[f'{prefix}Cos'] = np.cos(angle)

    ####################### SEASONAL FEATURES  #######################
    # 0 = Dec-Feb, 1 = Mar-May, 2 = Jun-Aug, 3 = Sep-Nov
    month_to_season = {1: 0, 2: 0, 3: 1, 4: 1, 5: 1, 6: 2,
                       7: 2, 8: 2, 9: 3, 10: 3, 11: 3, 12: 0}
    df['Season'] = df['Month_of_Year'].map(month_to_season)

    ####################### HISTORICAL CONSUMPTION FEATURES  #######################
    load = df['Aggregated charging load']
    for suffix, steps in (('1h', 1), ('6h', 6), ('12h', 12),
                          ('24h', 24), ('1w', 24 * 7)):
        df[f'Aggregated_charging_load_{suffix}'] = load.shift(steps)

    # Mean over the current row and the 23 rows before it.
    # NOTE(review): the window includes the current row; if 'Aggregated
    # charging load' is the prediction target this leaks it — confirm.
    df['Aggregated_charging_rolling'] = load.rolling(window=24).mean()

    return df


def filter_data(start_date, end_date, data):
    """Return a copy of the rows of `data` whose index falls within the window.

    Both `start_date` and `end_date` are inclusive bounds compared against
    ``data.index``.
    """
    in_window = (data.index >= start_date) & (data.index <= end_date)
    return data.loc[in_window].copy()

In [None]:
# Date window handed to filter_data (both bounds inclusive).
# NOTE(review): a bare date parses to midnight, so rows later on 2032-12-31
# than 00:00 fall outside the window — confirm that is intended.
start_date = pd.to_datetime('2024-12-31')
end_date = pd.to_datetime('2032-12-31')

# Relative, forward-slash path (the same one the first load cell uses): it
# works on every OS and keeps backslash escape sequences out of the literal.
df = pd.read_csv('DumbCharging_2020_to_2032/Measurements.csv', skipinitialspace=True)
df.columns = df.columns.str.strip()

# Fix timestamp format and floor
df['Timestamp'] = df['Timestamp'].astype(str).str.strip()
df['Timestamp'] = pd.to_datetime(df['Timestamp'], format="%b %d, %Y, %I:%M:%S %p")
df['Timestamp'] = df['Timestamp'].dt.floor('h')


# One row per hour, indexed by Timestamp.
df = convert_SDU_to_hourly(df)

# Features are built on the full history first so the lag columns of the
# earliest rows inside the window can look back past start_date.
feature_df = add_featuresSDU(df)

df = filter_data(start_date, end_date, feature_df)

# Drop every row that still has a NaN in any column.
df = df.dropna()

print(df.columns)
