# Short notebook

### Helper functions and classes

In [4]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
import pandas as pd
import os

#### Preprocessing

In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import PredefinedSplit
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

#General Preprocessing functions:
class pre:
    def clean_NaN(df):
        df = df.copy()
        df.dropna(subset=['target'], inplace=True)
        return df

    def remove_long_sequences(df, col_name, seq_len):
        df = df.copy()
        # Identify sequences of zeros
        df['group'] = (df[col_name] != 0).cumsum()
        df['group_count'] = df.groupby('group')[col_name].transform('count')
        
        # Create a mask to identify rows with sequences longer than seq_len and isshadow lower than 1
        mask = (df[col_name] == 0) & (df['group_count'] > seq_len) #& (df['is_in_shadow:idx'] < 1)
        
        # Remove rows with sequences longer than seq_len and isshadow lower than 1
        df_cleaned = df[~mask].drop(columns=['group', 'group_count'])
        return df_cleaned.copy()


    def remove_repeating_nonzero(df, col_name, repeat_count=5):
        df = df.copy()
        # create a mask to identify rows with repeating nonzero values in the target column
        mask = ((df[col_name] != 0) & (df[col_name].shift(1) == df[col_name]))
        # create a mask to identify rows with repeating nonzero values that occur more than repeat_count times
        repeat_mask = mask & (mask.groupby((~mask).cumsum()).cumcount() >= repeat_count)
        # create a mask to identify the complete sequence of repeating nonzero values
        seq_mask = repeat_mask | repeat_mask.shift(-5)
        # remove rows with repeating nonzero values that occur more than repeat_count times
        df = df[~seq_mask]
        return df

    def clean(df):
        df = df.copy()
        df=pre.clean_NaN(df)
        df=pre.remove_long_sequences(df, 'target', 60)
        df=pre.remove_repeating_nonzero(df, 'target')
        return df


    def encode(data, col, max_val):
        data = data.copy()
        data = data.copy()
        data[col + '_sin'] = np.sin(2 * np.pi * data[col]/max_val)
        data[col + '_cos'] = np.cos(2 * np.pi * data[col]/max_val)
        return data

    def create_time_features(df):
        df = df.copy()
        df["hour"]=df.index.hour
        df["dayofyear"]=df.index.dayofyear
        df["month"]=df.index.month
        df["week"] = df.index.isocalendar().week

        #zero indexing:
        df["dayofyear"]-=1
        df["month"]-=1
        df["week"]-=1


        #Cycling the time features:
        df = pre.encode(df, "hour", 24)
        df = pre.encode(df, "month", 12)
        df = pre.encode(df, "week", 53)
        df = pre.encode(df, "dayofyear", 366)

        df.drop(columns=["hour", "month", "week", "dayofyear"], inplace=True)




        df["mult1"]=(1-df["is_in_shadow:idx"])*df['direct_rad:W']
        df["mult2"]=(1-df["is_in_shadow:idx"])*df['clear_sky_rad:W']
        df["date_calc"]=pd.to_datetime(df["date_calc"])
        df.index=pd.to_datetime(df.index)
        df["uncertainty"]=(df.index-df["date_calc"]).apply(lambda x: x.total_seconds()/3600)
        df["uncertainty"].fillna(0, inplace=True)
        return df

    def create_features(df):
        df = df.copy()

        df.dropna(subset=['absolute_humidity_2m:gm3'], inplace=True)
        df["total_solar_rad"]=df["direct_rad:W"]+df["diffuse_rad:W"]
        #df["clear_sky_%"]=df["total_solar_rad"]/df["clear_sky_rad:W"]*100
        #df["clear_sky_%"].fillna(0, inplace=True)
        df["spec humid"]=df["absolute_humidity_2m:gm3"]/df["air_density_2m:kgm3"]
        df["temp*total_rad"]=df["t_1000hPa:K"]*df["total_solar_rad"]
        df["wind_angle"]=(np.arctan2(df["wind_speed_u_10m:ms"],df["wind_speed_v_10m:ms"]))*180/np.pi
        #df["total_snow_depth"] = df["snow_depth:cm"] + df["fresh_snow_1h:cm"]
        #df["total_precip_5min"] = df["precip_5min:mm"] + df["snow_melt_10min:mm"]
        #df["total_precip_type"] = df["precip_type_5min:idx"] + df["snow_water:kgm2"]
        df["total_pressure"] = df["pressure_50m:hPa"] + df["pressure_100m:hPa"]
        df["total_sun_angle"] = df["sun_azimuth:d"] + df["sun_elevation:d"]
        df["solar intensity"]=1361*np.cos(np.radians(90-df["sun_elevation:d"]))
        df["solar intensity"].clip(lower=0, inplace=True)
        return df

    def shift_target(df, target_col):
        df = df.copy()
        # Ensure the DataFrame is indexed by date
        df.index = pd.to_datetime(df.index)

        # Store the original indices
        original_indices = df.index

        # Reindex the DataFrame to include all 15-minute intervals
        all_intervals = pd.date_range(start=df.index.min(), end=df.index.max(), freq='15T')
        df = df.reindex(all_intervals)

        # Shift the target variable by 1 period (15 minutes) forward and backward
        df[target_col + '_shifted_forward'] = df[target_col].shift(-1)
        df[target_col + '_shifted_backward'] = df[target_col].shift(1)

        # Forward fill the missing values for the forward shift
        df[target_col + '_shifted_forward'].fillna(method='ffill', inplace=True)

        # Backward fill the missing values for the backward shift
        df[target_col + '_shifted_backward'].fillna(method='bfill', inplace=True)

        # Keep only the original indices
        df = df.loc[original_indices]

        return df


    def add_lagged_features(df):
        df = df.copy()
        features_to_lag = [ "total_solar_rad", "temptotal_rad", "clear_sky_radW", "diffuse_radW", "direct_radW",  "total_cloud_coverp", "solarintensity", "total_sun_angle", "pressure_100mhPa"]
        
        for feature in features_to_lag:
            df = pre.shift_target(df, feature)

        return df


    def general_read_flaml(letter):

        df = pd.read_parquet(f"{letter}/X_train_observed.parquet")
        df2=pd.read_parquet(f"{letter}/X_train_estimated.parquet")
        y = pd.read_parquet(f"{letter}/train_targets.parquet")
        # set the index to date_forecast and group by hourly frequency
        df.set_index("date_forecast", inplace=True)
        df2.set_index("date_forecast", inplace=True)
        y.set_index("time", inplace=True)

        df.index = pd.to_datetime(df.index)
        df2.index = pd.to_datetime(df2.index)
        y.index = pd.to_datetime(y.index) 
        
        df=pd.concat([df,df2],axis=0)

        # truncate y to match the index of df
        y = y.truncate(before=df.index[0], after=df.index[-1])
        latest_y_time = y.index[-1]
        latest_needed_df_time = latest_y_time + pd.Timedelta(minutes=45)
        # Truncate y based on df
        y = y.truncate(before=df.index[0], after=df.index[-1])
        # Ensure df has all needed entries from the start of y to 45 minutes after the end of y
        df = df.truncate(before=y.index[0], after=latest_needed_df_time)
        y.rename(columns={"pv_measurement":"target"},inplace=True)
        X = df.copy()
        Y = y.copy()
        #drop nan rows in Y
        Y = pre.clean(Y)
        X.index = pd.to_datetime(X.index)
        Y.index = pd.to_datetime(Y.index)

        X_filtered = X[X.index.floor('H').isin(Y.index)]

        # Step 2: Ensure there are exactly four 15-min intervals for each hour
        valid_indices = X_filtered.groupby(X_filtered.index.floor('H')).filter(lambda group: len(group) == 4).index

        # Final filtered X
        X_final = X[X.index.isin(valid_indices)]


        #Troubleshooting: Find and print the hours with a mismatch
        group_sizes = X_filtered.groupby(X_filtered.index.floor('H')).size()
        mismatch_hours = group_sizes[group_sizes != 4]

        #Additional troubleshooting: find hours in Y without four 15-min intervals in X
        missing_hours_in_x = Y.index[~Y.index.isin(X_filtered.index.floor('H'))]


        #Remove mismatched and missing hours from Y
        all_issues = mismatch_hours.index.union(missing_hours_in_x)
        Y_clean = Y[~Y.index.isin(all_issues)]

        #dropping nan columns:
        X_final = X_final.drop(columns=['cloud_base_agl:m'])
        X_final = X_final.drop(columns=['ceiling_height_agl:m'])
        X_final = X_final.drop(columns=['snow_density:kgm3'])

        X_final = pre.create_features(X_final)
        X_final = pre.create_time_features(X_final)
        X_final.drop(columns=['date_calc'], inplace=True)

        X_final = X_final.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
        Y_clean = Y_clean.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

        #X_final = add_lagged_features(X_final)

        # Split X_final into a list of 4-row DataFrames
        X_grouped = [group for _, group in X_final.groupby(X_final.index.floor('H')) if len(group) == 4]
        
        # Ensure we only take the groups of X corresponding to Y_clean
        X_list = [X_grouped[i] for i in range(len(Y_clean))]

        return X_list, Y_clean


    def general_read(letter):

        df = pd.read_parquet(f"{letter}/X_train_observed.parquet")
        df2=pd.read_parquet(f"{letter}/X_train_estimated.parquet")
        y = pd.read_parquet(f"{letter}/train_targets.parquet")
        # set the index to date_forecast and group by hourly frequency
        df.set_index("date_forecast", inplace=True)
        df2.set_index("date_forecast", inplace=True)
        y.set_index("time", inplace=True)

        df.index = pd.to_datetime(df.index)
        df2.index = pd.to_datetime(df2.index)
        y.index = pd.to_datetime(y.index) 
        
        df=pd.concat([df,df2],axis=0)

        # truncate y to match the index of df
        y = y.truncate(before=df.index[0], after=df.index[-1])
        latest_y_time = y.index[-1]
        latest_needed_df_time = latest_y_time + pd.Timedelta(minutes=45)
        # Truncate y based on df
        y = y.truncate(before=df.index[0], after=df.index[-1])
        # Ensure df has all needed entries from the start of y to 45 minutes after the end of y
        df = df.truncate(before=y.index[0], after=latest_needed_df_time)
        y.rename(columns={"pv_measurement":"target"},inplace=True)
        X = df.copy()
        Y = y.copy()
        #drop nan rows in Y
        Y = pre.clean(Y)
        X.index = pd.to_datetime(X.index)
        Y.index = pd.to_datetime(Y.index)

        X_filtered = X[X.index.floor('H').isin(Y.index)]

        # Step 2: Ensure there are exactly four 15-min intervals for each hour
        valid_indices = X_filtered.groupby(X_filtered.index.floor('H')).filter(lambda group: len(group) == 4).index

        # Final filtered X
        X_final = X[X.index.isin(valid_indices)]


        #Troubleshooting: Find and print the hours with a mismatch
        group_sizes = X_filtered.groupby(X_filtered.index.floor('H')).size()
        mismatch_hours = group_sizes[group_sizes != 4]

        #Additional troubleshooting: find hours in Y without four 15-min intervals in X
        missing_hours_in_x = Y.index[~Y.index.isin(X_filtered.index.floor('H'))]


        #Remove mismatched and missing hours from Y
        all_issues = mismatch_hours.index.union(missing_hours_in_x)
        Y_clean = Y[~Y.index.isin(all_issues)]

        #dropping nan columns:
        X_final = X_final.drop(columns=['cloud_base_agl:m'])
        X_final = X_final.drop(columns=['ceiling_height_agl:m'])
        X_final = X_final.drop(columns=['snow_density:kgm3'])

        X_final = pre.create_features(X_final)
        X_final = pre.create_time_features(X_final)
        X_final.drop(columns=['date_calc'], inplace=True)

        X_final = X_final.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
        Y_clean = Y_clean.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

        X_final = pre.add_lagged_features(X_final)

        # Split X_final into a list of 4-row DataFrames
        X_grouped = [group for _, group in X_final.groupby(X_final.index.floor('H')) if len(group) == 4]
        
        # Ensure we only take the groups of X corresponding to Y_clean
        X_list = [X_grouped[i] for i in range(len(Y_clean))]

        return X_list, Y_clean

    def readRawData(letter):
        # read X_train_observed.parquet file for the current letter
        df = pd.read_parquet(f"{letter}/X_train_observed.parquet")

        df2=pd.read_parquet(f"{letter}/X_train_estimated.parquet")

        # set the index to date_forecast and group by hourly frequency
        df.set_index("date_forecast", inplace=True)
        df.index = pd.to_datetime(df.index)



        df2.set_index("date_forecast", inplace=True)
        df2.index = pd.to_datetime(df2.index)

        # read train_targets.parquet file for the current letter
        y = pd.read_parquet(f"/Users/henrikhorpedal/Documents/Skolearbeid/Maskinlæring/Group Task/TDT4173_Machine_Learning/{letter}/train_targets.parquet")
        y.set_index("time", inplace=True)
        y.index = pd.to_datetime(y.index) 

        
        df=pd.concat([df,df2],axis=0)

        # truncate y to match the index of df
        y = y.truncate(before=df.index[0], after=df.index[-1])
        latest_y_time = y.index[-1]
        latest_needed_df_time = latest_y_time + pd.Timedelta(minutes=45)

        # Truncate y based on df
        y = y.truncate(before=df.index[0], after=df.index[-1])

        # Ensure df has all needed entries from the start of y to 45 minutes after the end of y
        df = df.truncate(before=y.index[0], after=latest_needed_df_time)

        y.rename(columns={"pv_measurement":"target"},inplace=True)


        X = df.copy()
        Y = y.copy()
        #drop nan rows in Y
        Y = pre.clean(Y)
        X.index = pd.to_datetime(X.index)
        Y.index = pd.to_datetime(Y.index)

        #removing november december and january
        #Y = Y[(Y.index.month != 11) & (Y.index.month != 12) & (Y.index.month != 1)] 

        # Step 1: Keep only rows in X that are within an hour present in Y
        X_filtered = X[X.index.floor('H').isin(Y.index)]

        # Step 2: Ensure there are exactly four 15-min intervals for each hour
        valid_indices = X_filtered.groupby(X_filtered.index.floor('H')).filter(lambda group: len(group) == 4).index

        # Final filtered X
        X_final = X[X.index.isin(valid_indices)]

        #Check length conditions
        # print(f"\nExpected length of X_final: {4 * len(Y)}")
        # print(f"Actual length of X_final: {len(X_final)}")

        #Troubleshooting: Find and print the hours with a mismatch
        group_sizes = X_filtered.groupby(X_filtered.index.floor('H')).size()
        mismatch_hours = group_sizes[group_sizes != 4]

        # print("\nHours with mismatched number of 15-min intervals:")
        # print(mismatch_hours)

        #Additional troubleshooting: find hours in Y without four 15-min intervals in X
        missing_hours_in_x = Y.index[~Y.index.isin(X_filtered.index.floor('H'))]
        # if not missing_hours_in_x.empty:
        #     print("\nAdditional hours in Y without four 15-min intervals in X:")
        #     print(missing_hours_in_x)

        #Remove mismatched and missing hours from Y
        all_issues = mismatch_hours.index.union(missing_hours_in_x)
        Y_clean = Y[~Y.index.isin(all_issues)]

        #Re-check length conditions
        # print(f"\nAdjusted expected length of X_final: {4 * len(Y_clean)}")
        # print(f"Actual length of X_final: {len(X_final)}")


        #dropping nan columns:
        X_final.drop(columns=['cloud_base_agl:m'], inplace=True)
        X_final.drop(columns=['ceiling_height_agl:m'], inplace=True)
        X_final.drop(columns=['snow_density:kgm3'], inplace=True)

        X_final = pre.create_features(X_final)
        X_final = pre.create_time_features(X_final)
        X_final.drop(columns=['date_calc'], inplace=True)

        X_final = X_final.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
        Y_clean = Y_clean.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))



        return X_final, Y_clean


    def general_read_lstm(letter):

        df = pd.read_parquet(f"{letter}/X_train_observed.parquet")
        df2=pd.read_parquet(f"{letter}/X_train_estimated.parquet")
        y = pd.read_parquet(f"{letter}/train_targets.parquet")
        # set the index to date_forecast and group by hourly frequency
        df.set_index("date_forecast", inplace=True)
        df2.set_index("date_forecast", inplace=True)
        y.set_index("time", inplace=True)

        df.index = pd.to_datetime(df.index)
        df2.index = pd.to_datetime(df2.index)
        y.index = pd.to_datetime(y.index) 
        
        df=pd.concat([df,df2],axis=0)

        # truncate y to match the index of df
        y = y.truncate(before=df.index[0], after=df.index[-1])
        latest_y_time = y.index[-1]
        latest_needed_df_time = latest_y_time + pd.Timedelta(minutes=45)
        # Truncate y based on df
        y = y.truncate(before=df.index[0], after=df.index[-1])
        # Ensure df has all needed entries from the start of y to 45 minutes after the end of y
        df = df.truncate(before=y.index[0], after=latest_needed_df_time)
        y.rename(columns={"pv_measurement":"target"},inplace=True)
        X = df.copy()
        Y = y.copy()
        #drop nan rows in Y
        Y = pre.clean(Y)
        X.index = pd.to_datetime(X.index)
        Y.index = pd.to_datetime(Y.index)

        X_filtered = X[X.index.floor('H').isin(Y.index)]

        # Step 2: Ensure there are exactly four 15-min intervals for each hour
        valid_indices = X_filtered.groupby(X_filtered.index.floor('H')).filter(lambda group: len(group) == 4).index

        # Final filtered X
        X_final = X[X.index.isin(valid_indices)]


        #Troubleshooting: Find and print the hours with a mismatch
        group_sizes = X_filtered.groupby(X_filtered.index.floor('H')).size()
        mismatch_hours = group_sizes[group_sizes != 4]


        #Additional troubleshooting: find hours in Y without four 15-min intervals in X
        missing_hours_in_x = Y.index[~Y.index.isin(X_filtered.index.floor('H'))]


        #Remove mismatched and missing hours from Y
        all_issues = mismatch_hours.index.union(missing_hours_in_x)
        Y_clean = Y[~Y.index.isin(all_issues)]

        #dropping nan columns:
        X_final = X_final.drop(columns=['cloud_base_agl:m'])
        X_final = X_final.drop(columns=['ceiling_height_agl:m'])
        X_final = X_final.drop(columns=['snow_density:kgm3'])

        X_final = pre.create_features(X_final)
        X_final = pre.create_time_features(X_final)
        X_final.drop(columns=['date_calc'], inplace=True)

        X_final = X_final.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
        Y_clean = Y_clean.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

        return X_final, Y_clean


    def concatenate_dfs(df_list):
        """
        Concatenates a list of DataFrames into a single DataFrame.

        Args:
        df_list (list of pd.DataFrame): List of DataFrame objects to concatenate.

        Returns:
        pd.DataFrame: A single DataFrame containing all rows from the input DataFrames in the order they appear in the list.
        """
        return pd.concat(df_list, ignore_index=False)


    class QuartersAsColumnsTransformer(BaseEstimator, TransformerMixin):
        def fit(self, X, y=None):
            return self
        
        def transform(self, X, y=None):
            # Ensure input is a DataFrame
            X = X.copy()
            assert isinstance(X, pd.DataFrame)
            #make sure index is datetime:
            X.index = pd.to_datetime(X.index)

            original_index = X.index


            X['hour'] = X.index.floor('H')
            X['minute'] = X.index.minute

            # Melt the DataFrame to long format
            df_melted = pd.melt(X, id_vars=['hour', 'minute'], value_vars=X.columns[:-2]).copy()  # excluding 'hour' and 'minute'

            # Create a multi-level column name combining variable and minute
            df_melted['variable_minute'] = df_melted['variable'] + '_' + df_melted['minute'].astype(str) + 'min'

            # Drop the 'variable_minute' column


            # Pivot the data to get one row per hour and columns for each variable and minute
            X = df_melted.pivot(index='hour', columns='variable_minute', values='value').copy()
            #rename index to date_forecast:
            X.index.rename("date_forecast", inplace=True)


            #drop irrelevant columns:
            #hour_sin	hour_cos	month_sin	month_cos	week_sin	week_cos	dayofyear_sin	dayofyear_cos	mult1	mult2	uncertainty


            irrelevant_cols = ["hour_sin", "hour_cos", "month_sin", "month_cos", "week_sin", "week_cos", "dayofyear_sin", "dayofyear_cos", "uncertainty"]
            variantes = ["_0min", "_15min", "_30min", "_45min"]
            for variant in variantes:
                for col in irrelevant_cols:
                    if variant == "_0min":
                        #remove _0min from column name;
                        X.rename(columns={col+variant:col}, inplace=True)
                    else:
                        X.drop(columns=[col+variant], inplace=True)
            

            reindex_map = original_index.floor('H').unique()
            X = X.reindex(reindex_map)
            X.index = reindex_map

            #drop hour_

            if "object" in X.dtypes.unique():
                print("waring: object in QuarterAsColumnsTransformer")
                print(X.dtypes.unique())
                for col in X.columns:
                    print(col)

            #X = X.select_dtypes(include=[np.number])
            return X

    class StatisticalFeaturesTransformer(BaseEstimator, TransformerMixin):
        def fit(self, X, y=None):
            return self

        def transform(self, X, y=None):
            X_copy = X.copy()
            X_copy.index = pd.to_datetime(X_copy.index)
            X_copy['hour'] = X_copy.index.floor('H')
            
            # Compute mean, std
            aggregated = X_copy.groupby('hour').agg(['mean', 'std'])
            
            # Filter hours with exactly 4 data points
            valid_hours = X_copy.groupby('hour').size()
            valid_hours = valid_hours[valid_hours == 4].index
            
            X_final = aggregated.loc[valid_hours]
            
            # Flatten the multi-index to form new column names
            X_final.columns = ['_'.join(col).strip() for col in X_final.columns.values]
            # for col in X_final.columns:
            #     print(col)
            #drop minute_mean and minute_std if they exist:
            if "minute_mean" in X_final.columns:
                X_final.drop(columns=["minute_mean", "minute_std"], inplace=True)
            
            X_final = X_final.select_dtypes(include=[np.number])
            # print(X_final.dtypes.unique())
            # for col in X_final.columns:
            #     print(col)


            return X_final
        
    class TrimmedMeanTransformer(BaseEstimator, TransformerMixin):
        def fit(self, X, y=None):
            return self

        def transform(self, X, y=None):
            # Ensure the dataframe's index is a datetime type
            X = X.copy()
            X.index = pd.to_datetime(X.index)
            
            original_index = X.index

            # Create a helper column 'hour_label' 
            X['hour_label'] = X.index.floor('H')
            
            # Compute the trimmed mean for each valid hour
            def compute_trimmed_mean(group):
                if group.shape[0] != 4:  # Only process groups of size 4
                    return np.nan

                # Exclude any datetime columns
                numeric_cols = group.select_dtypes(include=[np.number])
                
                min_val = np.min(numeric_cols, axis=0)
                max_val = np.max(numeric_cols, axis=0)
                total = np.sum(numeric_cols, axis=0)
                return (total - min_val - max_val) / 2  # Removing min and max

            # Group and apply the function
            X_trimmed_mean = X.groupby('hour_label').apply(compute_trimmed_mean)
            
            # Drop the helper column in the result as it's no longer needed
            if 'hour_label' in X_trimmed_mean.columns:
                X_trimmed_mean = X_trimmed_mean.drop(columns=['hour_label'])

            # Filter hours with exactly 4 data points
            valid_hours = X['hour_label'].value_counts()
            valid_hours = valid_hours[valid_hours == 4].index
            X_final = X_trimmed_mean[X_trimmed_mean.index.isin(valid_hours)]

            reindex_map = original_index.floor('H').unique()
            X_final = X_final.reindex(reindex_map)
            X_final.index = reindex_map

            X_final = X_final.select_dtypes(include=[np.number])
            
            return X_final

    class HourMonthTargetEncoder(BaseEstimator, TransformerMixin):
        def __init__(self):
            self.encoding_map = {}
            self.y_ = None  # To store y during fit

        def fit(self, X, y=None):
            # Ensure X's index is a datetime index
            if not isinstance(X.index, pd.DatetimeIndex):
                raise ValueError("Index of input X must be a pandas DatetimeIndex")

            if y is None:
                raise ValueError("y cannot be None for fitting the encoder")

            # Store the target values for encoding later
            self.y_ = y

            try:
                # Extract hour and month from the index and use y provided during fit
                df = pd.DataFrame({'target': self.y_, 'hour': X.index.hour, 'month': X.index.month})
            except Exception as e:
                raise e

            # Compute mean target value for each hour of each month
            self.encoding_map = df.groupby(['month', 'hour'])['target'].mean().to_dict()
            return self

        def transform(self, X):
            # Ensure X's index is a datetime index
            if not isinstance(X.index, pd.DatetimeIndex):
                raise ValueError("Index of input X must be a pandas DatetimeIndex")

            if self.y_ is None:
                raise ValueError("The encoder has not been fitted with target values")

            # Extract hour and month from the index
            X_transformed = X.copy()
            X_transformed['hour'] = X.index.hour
            X_transformed['month'] = X.index.month

            # Map the mean target values
            X_transformed['target_encoded'] = X_transformed.apply(
                lambda row: self.encoding_map.get((row['month'], row['hour']), np.nan), axis=1)

            # Optionally drop 'hour' and 'month' if they're not needed
            X_transformed.drop(['hour', 'month'], axis=1, inplace=True)

            # Check for object dtypes and print warning if any
            if "object" in X_transformed.dtypes.values:
                print("Warning: object dtype in HourMonthTargetEncoder")
                print(X_transformed.dtypes)

            # Ensure that only numeric types are returned
            X_transformed = X_transformed.select_dtypes(include=[np.number])

            return X_transformed




    def apply_preprocessor(data, preprocessor_name):
        data = data.copy()
        # Assuming `pre.choose_transformer` returns a callable object that can be used to transform the data
        preprocessor = pre.choose_transformer(preprocessor_name)
        return preprocessor.transform(data)

    #Preprocessing functions for the different models:


    #LSTM preprocessing:
    def train_val_split_diffrent_folds(X,y,letter,fold_number):
        X = X.copy()
        y = y.copy()
        if letter == "A":
                assert fold_number in [0,1,2,3]
                year = 2019 + fold_number
        elif letter == "B":
            assert fold_number in [0,1,2]
            year = 2019 + fold_number
        elif letter == "C":
            assert fold_number in [0,1]
            year = 2020 + fold_number

        
        # Define conditions to move May and June of split_date's year from train to test
        may_june_july_condition_X = ((X.index.month == 5) | (X.index.month == 6) | (X.index.month == 7)) & ((X.index.year == year))
        may_june_july_condition_y = ((y.index.month == 5) | (y.index.month == 6) | (y.index.month == 7)) & ((y.index.year == year))
        
        X_val = X[may_june_july_condition_X]
        y_val = y[may_june_july_condition_y]

        # Remove May and June data from training set
        X_train = X[~may_june_july_condition_X]
        y_train = y[~may_june_july_condition_y]

        return X_train, y_train, X_val, y_val





    def train_test_split_may_june_july(X, y,letter):
        """
        Splits the data based on a given date. Additionally, moves May, June and July data of split_date's year
        from training set to test set.
        
        Parameters:
        - X: Quarter-hourly input data with DateTime index.
        - y: Hourly target data with DateTime index.
        - split_date: Date (string or datetime object) to split the data on.
        
        Returns:
        X_train, y_train, X_test, y_test
        """

        if letter == "A":
            year = 2022
        elif letter == "B":
            year = 2019
        elif letter == "C":
            year = 2020
        
        # Define conditions to move May and June of split_date's year from train to test
        may_june_july_condition_X = ((X.index.month == 5) | (X.index.month == 6) | (X.index.month == 7)) & ((X.index.year == year))
        may_june_july_condition_y = ((y.index.month == 5) | (y.index.month == 6) | (y.index.month == 7)) & ((y.index.year == year))
        
        X_may_june_july = X[may_june_july_condition_X]
        y_may_june_july = y[may_june_july_condition_y]

        # Remove May and June data from training set
        X_train = X[~may_june_july_condition_X]
        y_train = y[~may_june_july_condition_y]

        return X_train, y_train, X_may_june_july, y_may_june_july

    def train_val_blend(X, y,letter):
        X = X.copy()
        y = y.copy()
        if letter == "A":
            year = 2022
        elif letter == "B":
            year = 2019
        elif letter == "C":
            year = 2021


        if letter == "A":
            blend_year = 2021
        elif letter == "B":
            blend_year = 2020
        elif letter == "C":
            blend_year = 2020
        
        if letter == "C":
            # Define conditions to move May and June of split_date's year from train to test
            may_june_july_condition_X = ((X.index.month == 6) | (X.index.month == 5)) & ((X.index.year == year))
            may_june_july_condition_y = ((y.index.month == 6) | (y.index.month == 5)) & ((y.index.year == year))

        else:
            # Define conditions to move May and June of split_date's year from train to test
            may_june_july_condition_X = ((X.index.month == 5) | (X.index.month == 6) | (X.index.month == 7)) & ((X.index.year == year))
            may_june_july_condition_y = ((y.index.month == 5) | (y.index.month == 6) | (y.index.month == 7)) & ((y.index.year == year))
            
        X_val = X[may_june_july_condition_X]
        y_val = y[may_june_july_condition_y]

        if letter == "C":
            X_blend_condition = ((X.index.month == 7) | (X.index.month == 8)) & ((X.index.year == blend_year))
            y_blend_condition = ((y.index.month == 7) | (y.index.month == 8)) & ((y.index.year == blend_year))
        else:
            X_blend_condition = ((X.index.month == 5) | (X.index.month == 6) | (X.index.month == 7)) & ((X.index.year == blend_year))
            y_blend_condition = ((y.index.month == 5) | (y.index.month == 6) | (y.index.month == 7)) & ((y.index.year == blend_year))


        X_blend = X[X_blend_condition]
        y_blend = y[y_blend_condition]

        # Remove the data from training set
        X_train = X[~may_june_july_condition_X & ~X_blend_condition]
        y_train = y[~may_june_july_condition_y & ~y_blend_condition]


        return X_train, y_train, X_val, y_val, X_blend, y_blend





    def train_test_split_on_specific_day_May_june(X, y, split_date):
        """
        Splits the data based on a given date. Additionally, moves May, June and July data of split_date's year
        from training set to test set.
        
        Parameters:
        - X: Quarter-hourly input data with DateTime index.
        - y: Hourly target data with DateTime index.
        - split_date: Date (string or datetime object) to split the data on.
        
        Returns:
        X_train, y_train, X_test, y_test
        """
        split_date = pd.Timestamp(split_date).normalize()

        # Ensure split_date is a datetime object
        if isinstance(split_date, str):
            split_date = pd.Timestamp(split_date)

        print(f"Split date: {split_date}")

        # Split the data based on the provided date
        X_train = X[X.index.normalize() < split_date]
        y_train = y[y.index.normalize() < split_date]

        X_test = X[X.index.normalize() >= split_date]
        y_test = y[y.index.normalize() >= split_date]

        # Define conditions to move May and June of split_date's year from train to test
        may_june_condition_X = ((X_train.index.month == 5) | (X_train.index.month == 6) | (X_train.index.month == 7)) & ((X_train.index.year == split_date.year))
        may_june_condition_y = ((y_train.index.month == 5) | (y_train.index.month == 6) | (y_train.index.month == 7)) & ((y_train.index.year == split_date.year))
        
        X_may_june = X_train[may_june_condition_X]
        y_may_june = y_train[may_june_condition_y]

        # Remove May and June data from training set
        X_train = X_train[~may_june_condition_X]
        y_train = y_train[~may_june_condition_y]

        # Append May and June data to test set
        X_test = pd.concat([X_may_june, X_test])
        y_test = pd.concat([y_may_june, y_test])

        return X_train, y_train, X_test, y_test








    def time_series_split(X, split_date = "2022-10-29"):
        
        if not isinstance(X.index, pd.DatetimeIndex):
            X.index = pd.to_datetime(X.index)
        
        split_date = pd.to_datetime(split_date)
        
        mask_val = (X.index >= split_date)
        
        split_year = split_date.year
        mask_may_june_july = (X.index.month.isin([5, 6, 7])) & (X.index.year == split_year)
        
        mask_val = mask_val | mask_may_june_july

        
        test_fold = np.where(mask_val, 0, -1)
        
        
        return PredefinedSplit(test_fold)

    def split_df_on_alternate_days(x_df, y_df):
        # Convert index to datetime if it's not already
        x_df.index = pd.to_datetime(x_df.index)
        y_df.index = pd.to_datetime(y_df.index)
        
        # Check if both dataframes are aligned
        assert all(x_df.index == y_df.index), "Indexes of x_df and y_df do not match!"

        # Extract day from the index
        days = x_df.index.day

        # Split into even and odd days
        x_even_days = x_df[days % 2 == 0]
        y_even_days = y_df[days % 2 == 0]

        x_odd_days = x_df[days % 2 != 0]
        y_odd_days = y_df[days % 2 != 0]

        return x_even_days, y_even_days, x_odd_days, y_odd_days

    def lstm_train_test_split(X, y,letter, split_date):
        """
        Splits the data based on a given date. Additionally, moves May, June and July data of split_date's year
        from training set to test set.
        
        Parameters:
        - X: Quarter-hourly input data with DateTime index.
        - y: Hourly target data with DateTime index.
        - split_date: Date (string or datetime object) to split the data on.
        
        Returns:
        X_train, y_train, X_test, y_test
        """
        split_date = pd.Timestamp(split_date).normalize()

        if isinstance(split_date, str):
            split_date = pd.Timestamp(split_date)
        if letter == "A":
            year = 2022
        elif letter == "B":
            year = 2019
        elif letter == "C":
            year = 2020

        X_train = X[X.index.normalize() < split_date]
        y_train = y[y.index.normalize() < split_date]

        X_test = X[X.index.normalize() >= split_date]
        y_test = y[y.index.normalize() >= split_date]
        
        # Define conditions to move May and June of split_date's year from train to test
        may_june_july_condition_X = ((X.index.month == 5) | (X.index.month == 6) | (X.index.month == 7)) & ((X.index.year == year))
        may_june_july_condition_y = ((y.index.month == 5) | (y.index.month == 6) | (y.index.month == 7)) & ((y.index.year == year))
        
        X_may_june_july = X[may_june_july_condition_X]
        y_may_june_july = y[may_june_july_condition_y]

        # Remove May and June data from training set
        X_train = X[~may_june_july_condition_X]
        y_train = y[~may_june_july_condition_y]

        # Append May and June data to test set
        X_test = pd.concat([X_may_june_july, X_test])
        y_test = pd.concat([y_may_june_july, y_test])

        return X_train, y_train, X_test, y_test


    def remove_winter_months(df):
        """
        Removes the winter months (December, January, February) from a DataFrame that
        has a DateTime index.

        Parameters:
        - df: DataFrame with DateTime index.

        Returns:
        - DataFrame with winter months removed.
        """
        # Ensure the index is a DateTimeIndex
        if not isinstance(df.index, pd.DatetimeIndex):
            raise ValueError("DataFrame index must be a DateTimeIndex.")

        # Define condition to filter out the winter months
        winter_condition = (df.index.month == 12) | (df.index.month == 1) | (df.index.month == 2) | (df.index.month == 11)

        # Filter out the winter months
        df_no_winter = df[~winter_condition]
        return df_no_winter


    def new_train_test_split(X, y,letter, split_date):
        """
        Splits the data based on a given date. Additionally, moves May, June and July data of split_date's year
        from training set to test set.
        
        Parameters:
        - X: Quarter-hourly input data with DateTime index.
        - y: Hourly target data with DateTime index.
        - split_date: Date (string or datetime object) to split the data on.
        
        Returns:
        X_train, y_train, X_test, y_test
        """
        split_date = pd.Timestamp(split_date).normalize()
        print(f"Split date: {split_date}")

        if isinstance(split_date, str):
            split_date = pd.Timestamp(split_date)
        if letter == "A":
            year = 2022
        elif letter == "B":
            year = 2019
        elif letter == "C":
            year = 2020

        X_train = X[X.index.normalize() < split_date]
        y_train = y[y.index.normalize() < split_date]

        X_test = X[X.index.normalize() >= split_date]
        y_test = y[y.index.normalize() >= split_date]
        
        # Define conditions to move May and June of split_date's year from train to test
        may_june_july_condition_X = ((X.index.month == 5) | (X.index.month == 6) | (X.index.month == 7)) & ((X.index.year == year))
        may_june_july_condition_y = ((y.index.month == 5) | (y.index.month == 6) | (y.index.month == 7)) & ((y.index.year == year))
        
        X_may_june_july = X[may_june_july_condition_X]
        y_may_june_july = y[may_june_july_condition_y]

        # Remove May and June data from training set
        X_train = X[~may_june_july_condition_X]
        y_train = y[~may_june_july_condition_y]

        # Append May and June data to test set
        X_test = pd.concat([X_may_june_july, X_test])
        y_test = pd.concat([y_may_june_july, y_test])

        return X_train, y_train, X_test, y_test


    def choose_scaler(scaler_string):
        if scaler_string == "minmax":
            return MinMaxScaler()
        elif scaler_string == "standard":
            return StandardScaler()
        elif scaler_string == "robust":
            return RobustScaler()

    def choose_transformer(transformer_string):
        if transformer_string == "quarters":
            return pre.QuartersAsColumnsTransformer()
        elif transformer_string == "statistical":
            return pre.StatisticalFeaturesTransformer()
        elif transformer_string == "trimmedMean":
            return pre.TrimmedMeanTransformer()

    def choose_encoder(encoder_boolian):
        if encoder_boolian == True:
            return pre.HourMonthTargetEncoder()
        else:
            return None

    def generate_predefined_split(X_train, X_val, y_train, y_val):
        """
        This function takes in separate training and validation datasets, combines them,
        and creates a PredefinedSplit object that can be used with sklearn's GridSearchCV
        or other model selection utilities. This allows for specifying which samples are
        used for training and which are used for validation.

        Parameters:
        X_train (array-like): Training features.
        X_val (array-like): Validation features.
        y_train (array-like): Training labels.
        y_val (array-like): Validation labels.

        Returns:
        X (array-like): The combined dataset of features.
        y (array-like): The combined dataset of labels.
        split_index (PredefinedSplit): An instance of PredefinedSplit with the indices set.
        """

        # Combine the training and validation sets
        X = np.concatenate((X_train, X_val), axis=0)
        y = np.concatenate((y_train, y_val), axis=0)

        # Generate the indices array where -1 indicates the sample is part of the training set,
        # and 0 indicates the sample is part of the validation set.
        train_indices = -1 * np.ones(len(X_train))
        val_indices = 0 * np.ones(len(X_val))
        test_fold = np.concatenate((train_indices, val_indices))

        # Create the PredefinedSplit object
        predefined_split = PredefinedSplit(test_fold)

        return X, y, predefined_split

    def printhei():
        print("hei")

#### Postprocessing

In [7]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error
import os
#import a mean signed error function

class post:
    def calculate_hourly_mae_and_plot(predictions, actuals):
        """
        Calculate the MAE for each hour of the day across multiple days, 
        and plot a histogram of these MAE values.
        Assumes each day contains 24 consecutive hourly observations in order.

        :param predictions: List or array of predictions.
        :param actuals: List or array of actual values.
        """
        if len(predictions) % 24 != 0 or len(actuals) % 24 != 0:
            raise ValueError("The length of predictions and actuals should be a multiple of 24.")

        hourly_mae = []
        num_days = len(predictions) // 24

        for hour in range(24):
            hourly_preds = predictions[hour::24]
            hourly_acts = actuals[hour::24]
            hourly_mae.append(mean_absolute_error(hourly_acts, hourly_preds))

        # Plotting the results
        plt.figure(figsize=(10, 6))
        plt.bar(range(24), hourly_mae, color='skyblue')
        plt.xlabel('Hour of Day')
        plt.ylabel('Mean Absolute Error')
        plt.title('Hourly MAE of Predictions')
        plt.xticks(range(24), [f"{hour:02d}:00" for hour in range(24)])
        plt.grid(axis='y', linestyle='--')
        plt.show()


    def calculate_hourly_me_and_plot(predictions, actuals):
        """
        Calculate the ME (Mean Error) for each hour of the day across multiple days, 
        and plot a histogram of these ME values.
        Assumes each day contains 24 consecutive hourly observations in order.

        :param predictions: List or array of predictions.
        :param actuals: List or array of actual values.
        """
        if len(predictions) % 24 != 0 or len(actuals) % 24 != 0:
            raise ValueError("The length of predictions and actuals should be a multiple of 24.")

        hourly_me = []
        num_days = len(predictions) // 24

        for hour in range(24):
            hourly_preds = predictions[hour::24]
            hourly_acts = actuals[hour::24]
            hourly_me.append(np.mean([a - p for a, p in zip(hourly_acts, hourly_preds)]))

        # Plotting the results
        plt.figure(figsize=(10, 6))
        plt.bar(range(24), hourly_me, color='skyblue')
        plt.xlabel('Hour of Day')
        plt.ylabel('Mean Error')
        plt.title('Hourly ME of Predictions')
        plt.xticks(range(24), [f"{hour:02d}:00" for hour in range(24)])
        plt.grid(axis='y', linestyle='--')
        plt.show()


    def readRawTest(letter):
        df = pd.read_parquet(f"/Users/henrikhorpedal/Documents/Skolearbeid/Maskinlæring/Group Task/TDT4173_Machine_Learning/{letter}/X_test_estimated.parquet")
        df.set_index("date_forecast", inplace=True)
        df.index = pd.to_datetime(df.index)
        return df

    def readAndBasicPreprocess(letter):
        X = post.readRawTest(letter)
        X.drop(columns=['cloud_base_agl:m'], inplace=True)
        X.drop(columns=['ceiling_height_agl:m'], inplace=True)
        X.drop(columns=['snow_density:kgm3'], inplace=True)
        X=pre.create_features(X)
        X=pre.create_time_features(X)
        X.drop(columns=['date_calc'], inplace=True)
        X = X.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
        X = pre.add_lagged_features(X)
        return X

    def makePrediction(A_model, B_model, C_model, filename):
        A_x_test = post.readAndBasicPreprocess("A")
        A_y_pred=A_model.predict(A_x_test)
        A_y_pred=pd.DataFrame(A_y_pred, index=range(0,720), columns=['prediction'])

        B_x_test= post.readAndBasicPreprocess("B")
        B_y_pred=B_model.predict(B_x_test)
        B_y_pred=pd.DataFrame(B_y_pred, index=range(720,1440), columns=['prediction'])

        C_x_test= post.readAndBasicPreprocess("C")
        C_y_pred=C_model.predict(C_x_test)
        C_y_pred=pd.DataFrame(C_y_pred, index=range(1440,2160), columns=['prediction'])

        combined_pred = pd.concat([A_y_pred, B_y_pred, C_y_pred], axis=0)
        combined_pred["prediction"] = combined_pred["prediction"].clip(lower=0)
        combined_pred.index.name = "id"
        combined_pred.to_csv(filename, index=True)

    def makeEnsemblePrediction(A_xgb_model, A_xgb_processing, A_dnn_model, A_dnn_preprocessing, A_dnn_target_preprocessing, B_xgb_model, B_xgb_processing, B_dnn_model, B_dnn_preprocessing, B_dnn_target_preprocessing, C_xgb_model, C_xgb_processing, C_dnn_model,C_dnn_preprocessing, C_dnn_target_preprocessing, filename):
        A_X_test = post.readAndBasicPreprocess("A")

        A_X_test_xgb = A_xgb_processing.transform(A_X_test)
        A_y_pred_xgb = A_xgb_model.predict(A_X_test_xgb)

        A_X_test_dnn = pd.DataFrame(A_dnn_preprocessing.transform(A_X_test))
        A_y_pred_dnn = A_dnn_model.predict(A_X_test_dnn)
        A_y_pred_dnn = A_dnn_target_preprocessing.inverse_transform(A_y_pred_dnn).reshape(-1)

        A_y_pred = (A_y_pred_xgb + A_y_pred_dnn) / 2
        A_y_pred = pd.DataFrame(A_y_pred, index=range(0,720), columns=['prediction'])

        B_X_test = post.readAndBasicPreprocess("B")

        B_X_test_xgb = B_xgb_processing.transform(B_X_test)
        B_y_pred_xgb = B_xgb_model.predict(B_X_test_xgb)

        B_X_test_dnn = pd.DataFrame(B_dnn_preprocessing.transform(B_X_test))
        B_y_pred_dnn = B_dnn_model.predict(B_X_test_dnn)
        B_y_pred_dnn = B_dnn_target_preprocessing.inverse_transform(B_y_pred_dnn).reshape(-1)

        B_y_pred = (B_y_pred_xgb + B_y_pred_dnn) / 2
        B_y_pred = pd.DataFrame(B_y_pred, index=range(720,1440), columns=['prediction'])

        C_X_test = post.readAndBasicPreprocess("C")

        C_X_test_xgb = B_xgb_processing.transform(C_X_test)
        C_y_pred_xgb = B_xgb_model.predict(C_X_test_xgb)

        C_X_test_dnn = pd.DataFrame(B_dnn_preprocessing.transform(C_X_test))
        C_y_pred_dnn = B_dnn_model.predict(C_X_test_dnn)
        C_y_pred_dnn = B_dnn_target_preprocessing.inverse_transform(C_y_pred_dnn).reshape(-1)

        C_y_pred = (C_y_pred_xgb + C_y_pred_dnn) / 2

        C_y_pred = pd.DataFrame(C_y_pred, index=range(1440,2160), columns=['prediction'])

        combined_pred = pd.concat([A_y_pred, B_y_pred, C_y_pred], axis=0)
        combined_pred["prediction"] = combined_pred["prediction"].clip(lower=0)
        combined_pred.index.name = "id"
        combined_pred.to_csv(filename, index=True)
        

    def make_dnn_prediction(A_model, A_preprocessing, A_target_scaling, B_model, B_preprocessing, B_target_scaling, C_model, C_preprocessing, C_target_scaling, filename):
        A_X_test = post.readAndBasicPreprocess("A")
        A_X_test_dnn = pd.DataFrame(A_preprocessing.transform(A_X_test))
        A_y_pred_dnn = A_model.predict(A_X_test_dnn)
        A_y_pred_dnn = A_target_scaling.inverse_transform(A_y_pred_dnn).reshape(-1)
        A_y_pred = pd.DataFrame(A_y_pred_dnn, index=range(0,720), columns=['prediction'])

        B_X_test = post.readAndBasicPreprocess("B")
        B_X_test_dnn = pd.DataFrame(B_preprocessing.transform(B_X_test))
        B_y_pred_dnn = B_model.predict(B_X_test_dnn)
        B_y_pred_dnn = B_target_scaling.inverse_transform(B_y_pred_dnn).reshape(-1)
        B_y_pred = pd.DataFrame(B_y_pred_dnn, index=range(720,1440), columns=['prediction'])

        C_X_test = post.readAndBasicPreprocess("C")
        C_X_test_dnn = pd.DataFrame(C_preprocessing.transform(C_X_test))
        C_y_pred_dnn = C_model.predict(C_X_test_dnn)
        C_y_pred_dnn = C_target_scaling.inverse_transform(C_y_pred_dnn).reshape(-1)
        C_y_pred = pd.DataFrame(C_y_pred_dnn, index=range(1440,2160), columns=['prediction'])

        combined_pred = pd.concat([A_y_pred, B_y_pred, C_y_pred], axis=0)
        combined_pred["prediction"] = combined_pred["prediction"].clip(lower=0)
        combined_pred.index.name = "id"
        combined_pred.to_csv(filename, index=True)


    def make_xgb_prediction(A_model, A_preprocessing, B_model, B_preprocessing, C_model, C_preprocessing, filename):
        A_X_test = post.readAndBasicPreprocess("A")
        A_X_test_xgb = A_preprocessing.transform(A_X_test)

        A_y_pred_xgb = A_model.predict(A_X_test_xgb)
        A_y_pred = pd.DataFrame(A_y_pred_xgb, index=range(0,720), columns=['prediction'])

        B_X_test = post.readAndBasicPreprocess("B")
        B_X_test_xgb = B_preprocessing.transform(B_X_test)

        B_y_pred_xgb = B_model.predict(B_X_test_xgb)
        B_y_pred = pd.DataFrame(B_y_pred_xgb, index=range(720,1440), columns=['prediction'])

        C_X_test = post.readAndBasicPreprocess("C")
        C_X_test_xgb = C_preprocessing.transform(C_X_test)

        C_y_pred_xgb = C_model.predict(C_X_test_xgb)
        C_y_pred = pd.DataFrame(C_y_pred_xgb, index=range(1440,2160), columns=['prediction'])

        combined_pred = pd.concat([A_y_pred, B_y_pred, C_y_pred], axis=0)
        combined_pred["prediction"] = combined_pred["prediction"].clip(lower=0)
        combined_pred.index.name = "id"
        combined_pred.to_csv(filename, index=True)

    def makeAutoMLPred(A_model, B_model, C_model, filename):
        """
        Assumes QuarterAsColumn

        """

        A_X_test = post.readAndBasicPreprocess("A")
        A_X_test = pre.QuartersAsColumnsTransformer().transform(A_X_test)
        A_y_pred = A_model.predict(A_X_test)
        A_y_pred = pd.DataFrame(A_y_pred, index=range(0,720), columns=['prediction'])

        B_X_test = post.readAndBasicPreprocess("B")
        B_X_test = pre.QuartersAsColumnsTransformer().transform(B_X_test)
        B_y_pred = B_model.predict(B_X_test)
        B_y_pred = pd.DataFrame(B_y_pred, index=range(720,1440), columns=['prediction'])

        C_X_test = post.readAndBasicPreprocess("C")
        C_X_test = pre.QuartersAsColumnsTransformer().transform(C_X_test)
        C_y_pred = C_model.predict(C_X_test)
        C_y_pred = pd.DataFrame(C_y_pred, index=range(1440,2160), columns=['prediction'])



        combined_pred = pd.concat([A_y_pred, B_y_pred, C_y_pred], axis=0)
        combined_pred["prediction"] = combined_pred["prediction"].clip(lower=0)
        combined_pred.loc[(combined_pred.index % 24).isin([22, 23, 0]), "prediction"] = 0
        combined_pred.index.name = "id"
        combined_pred.to_csv(filename, index=True)

    def compute_mae(ser1, ser2):
        """Compute Mean Absolute Error between two Series."""
        return np.abs(ser1 - ser2).mean()

    def plot_mae_grid(dataframes_dict):
        """Plot a grid of MAE values for a dictionary of DataFrames."""
        
        labels = list(dataframes_dict.keys())
        dataframes = list(dataframes_dict.values())
        n = len(dataframes)
        
        mae_grid = np.zeros((n, n))

        for i in range(n):
            for j in range(n):
                if i != j:
                    mae_grid[i][j] = post.compute_mae(dataframes[i]["prediction"], dataframes[j]["prediction"])

        fig, ax = plt.subplots(figsize=(10, 8))
        cax = ax.matshow(mae_grid, cmap="viridis")
        
        ax.grid(False)
        plt.xticks(range(n), labels, rotation=45)
        plt.yticks(range(n), labels)
        
        # Add annotations
        for i in range(n):
            for j in range(n):
                text = ax.text(j, i, f"{mae_grid[i, j]:.2f}",
                            ha="center", va="center", color="w" if mae_grid[i, j] > (mae_grid.max() / 2) else "black")
        
        plt.colorbar(cax)
        plt.title('MAE Between DataFrames on "prediction" Column', pad=20)
        plt.show()


    def test():
        print("hei")

    def makePredictionWithModelAndPreprocessor(A_model, B_model, C_model, preprocessor, filename):
        """
        Assumes same preprocessing for all locations

        """

        A_X_test = post.readAndBasicPreprocess("A")
        A_X_test = preprocessor.transform(A_X_test)
        A_y_pred = A_model.predict(A_X_test)
        A_y_pred = pd.DataFrame(A_y_pred, index=range(0,720), columns=['prediction'])

        B_X_test = post.readAndBasicPreprocess("B")
        B_X_test = preprocessor.transform(B_X_test)
        B_y_pred = B_model.predict(B_X_test)
        B_y_pred = pd.DataFrame(B_y_pred, index=range(720,1440), columns=['prediction'])

        C_X_test = post.readAndBasicPreprocess("C")
        C_X_test = preprocessor.transform(C_X_test)
        C_y_pred = C_model.predict(C_X_test)
        C_y_pred = pd.DataFrame(C_y_pred, index=range(1440,2160), columns=['prediction'])

        combined_pred = pd.concat([A_y_pred, B_y_pred, C_y_pred], axis=0)
        combined_pred["prediction"] = combined_pred["prediction"].clip(lower=0)
        combined_pred.index.name = "id"
        combined_pred.to_csv(filename, index=True)


    def submission_vs_best_submission(filepath):
        """
            mostly for debugging and testing. Checks if the submission is in the same ballpark as the best submission
        """

        refrence = pd.read_csv("/Users/henrikhorpedal/Documents/Skolearbeid/Maskinlæring/Group Task/TDT4173_Machine_Learning/Jallastacking/csvfiles/two_best_combined_zeroed_night_hours.csv")

        submission = pd.read_csv(filepath)

        print(f"MAE for location A: {mean_absolute_error(refrence['prediction'].iloc[0:720], submission['prediction'].iloc[0:720])}")
        print(f"MAE for location B: {mean_absolute_error(refrence['prediction'].iloc[720:1440], submission['prediction'].iloc[720:1440])}")
        print(f"MAE for location C: {mean_absolute_error(refrence['prediction'].iloc[1440:2160], submission['prediction'].iloc[1440:2160])}")


    def make_average_prediction(preds_dict,filename):
        """
        Generates a prediction by taking the average of the predictions in preds_dict.
        """
        lenght = len(preds_dict)
        data = 0
        for value in preds_dict.values():
            data += value["prediction"]
        data = data / lenght
        data = pd.DataFrame(data, columns=['prediction'])
        data.index.name = "id"
        data["prediction"] = data['prediction'].apply(lambda x: 0 if x < 0.05 else x)
        data.loc[(data.index % 24).isin([22, 23, 0]), "prediction"] = 0
        data.to_csv(filename, index=True)



    def make_lgbm_preprocessor_pred(A_model, A_scaler, A_preprocessor, B_model, B_scaler, B_preprocessor,C_model, C_scaler, C_preprocessor, filename):

        A_X_test = post.readAndBasicPreprocess("A")
        A_X_test = A_preprocessor.transform(A_X_test)
        A_X_test = A_scaler.transform(A_X_test)
        A_y_pred = A_model.predict(A_X_test)
        A_y_pred = pd.DataFrame(A_y_pred, index=range(0,720), columns=['prediction'])

        B_X_test = post.readAndBasicPreprocess("B")
        B_X_test = B_preprocessor.transform(B_X_test)
        B_X_test = B_scaler.transform(B_X_test)
        B_y_pred = B_model.predict(B_X_test)
        B_y_pred = pd.DataFrame(B_y_pred, index=range(720,1440), columns=['prediction'])

        C_X_test = post.readAndBasicPreprocess("C")
        C_X_test = C_preprocessor.transform(C_X_test)
        C_X_test = C_scaler.transform(C_X_test)
        C_y_pred = C_model.predict(C_X_test)
        C_y_pred = pd.DataFrame(C_y_pred, index=range(1440,2160), columns=['prediction'])

        combined_pred = pd.concat([A_y_pred, B_y_pred, C_y_pred], axis=0)
        combined_pred["prediction"] = combined_pred["prediction"].clip(lower=0)
        combined_pred.index.name = "id"
        combined_pred.to_csv(filename, index=True)

    def make_one_location_pred(model, letter, preprocessor, filename):
        if letter == "A":
            index_range = range(0,720)
        elif letter == "B":
            index_range = range(720,1440)
        elif letter == "C":
            index_range = range(1440,2160)
        

        X_test = post.readAndBasicPreprocess(letter)
        X_test = preprocessor.transform(X_test)
        y_pred = model.predict(X_test)
        y_pred = pd.DataFrame(y_pred, index=index_range, columns=['prediction'])
        y_pred["prediction"] = y_pred["prediction"].clip(lower=0)
        y_pred.index.name = "id"
        y_pred.to_csv(filename, index=True)

    def mean_diffrent_summers(filepath,folds_dict, predictionfilename):
        #FOLDS = {"A": 4, "B": 3, "C": 2}
        d_A = 0
        d_B = 0
        d_C = 0

        for letter in folds_dict.keys():
            preds = []
            folder_path = f"{filepath}/{letter}"
            for filename in os.listdir(folder_path):
                if filename.endswith('.csv'):  # Check if the file is a CSV
                    file_path = os.path.join(folder_path, filename)
                    df = pd.read_csv(file_path)
                    data = df["prediction"]
                    preds.append(data)
            #mean the predictions
            if letter == "A":
                for i,pred in enumerate(preds):
                    if i == 0:
                        d_A = pred
                    else:
                        d_A += pred
                d_A =d_A/len(preds)
                #d_A = pd.DataFrame(d_A, index=range(0,720), columns=['prediction'])

            
            elif letter == "B":
                for i,pred in enumerate(preds):
                    if i == 0:
                        d_B = pred
                    else:
                        d_B += pred
                #print(d_B)

                d_B = d_B/len(preds)
                #d_B = pd.DataFrame(d_B, index=range(720,1440), columns=['prediction'])

            
            elif letter == "C":
                for i,pred in enumerate(preds):
                    if i == 0:
                        d_C = pred
                    else:
                        d_C += pred
                d_C =d_C/len(preds)
                #d_C = pd.DataFrame(d_C, index=range(1440,2160), columns=['prediction'])

            
        combined_pred = pd.concat([d_A, d_B, d_C], axis=0)
        #name the column prediction
        combined_pred = pd.DataFrame(combined_pred, columns=['prediction'])
        combined_pred["prediction"] = combined_pred["prediction"].clip(lower=0)
        #reset index:
        combined_pred.reset_index(inplace=True, drop=True)
        combined_pred.index.name = "id"
        combined_pred.loc[(combined_pred.index % 24).isin([22, 23, 0]), "prediction"] = 0
        combined_pred.to_csv(predictionfilename, index=True)

In [5]:
RANDOM_STATE = 69

#np.random.seed(RANDOM_STATE)
FOLDER_NAME = "SHORT_NOTEBOOK1_CSV_FILES"
GLOBAL_VERBOSE = False
PREPROCESSORS = ["quarters","statistical", "trimmedMean"]
LETTERS = ["A", "B", "C"]
if not os.path.exists(FOLDER_NAME):
    os.makedirs(FOLDER_NAME)

## Deep Neural Network with tuned hyperparameters

#### Defining The DNN:

In [8]:
import torch
import torch.nn as nn
#impoting lightning:
import pytorch_lightning as pl
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
import joblib
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
import numpy as np
import pandas as pd
import torch.nn.functional as F
#import dataset
from torch.utils.data import DataLoader
from torch.utils.data import Dataset as DataSet
from sklearn.base import BaseEstimator, RegressorMixin
import numpy as np
import random
from pytorch_lightning import seed_everything

SEED = 69
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
torch.use_deterministic_algorithms(True)
g = torch.Generator()
g.manual_seed(SEED)



def seed_worker(worker_id):
    worker_seed = SEED
    np.random.seed(worker_seed)
    random.seed(worker_seed)

class SolarForecastingDataset(DataSet):
    def __init__(self, features_df, target_series):
        """
        Initializes the dataset with features and target labels.

        :param features_df: DataFrame containing the features.
        :param target_series: Series containing the target labels.
        """
        self.features = features_df
        self.targets = target_series

    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, index):
        # Extracting the features and the target label for the given index
        feature_vector = self.features.iloc[index].values
        target_label = self.targets.iloc[index]

        return {
            "feature_vector": torch.tensor(feature_vector, dtype=torch.float),
            "target_label": torch.tensor(target_label, dtype=torch.float)
        }

class SolarForecastingDatasetDataModule(pl.LightningDataModule):
    def __init__(self, train_features_df, train_targets_series, test_features_df, test_targets_series, batch_size=8):
        super().__init__()
        self.train_features_df = train_features_df
        self.train_targets_series = train_targets_series
        self.test_features_df = test_features_df
        self.test_targets_series = test_targets_series
        self.batch_size = batch_size

        
    def setup(self, stage=None):
        self.train_dataset = SolarForecastingDataset(self.train_features_df, self.train_targets_series)
        self.test_dataset = SolarForecastingDataset(self.test_features_df, self.test_targets_series)

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size,worker_init_fn=seed_worker, generator=g, shuffle=True,)

    def val_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=1, shuffle=False, worker_init_fn=seed_worker, generator=g)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=1, shuffle=False, worker_init_fn=seed_worker, generator=g)

class FullyConnectedDNN(nn.Module):
    def __init__(self, input_size, layer_sizes, output_size, dropout_prob=0.1):
        super(FullyConnectedDNN, self).__init__()
        # Create fully connected layers
        self.fc_layers = nn.ModuleList()
        for i in range(len(layer_sizes)):
            in_features = input_size if i == 0 else layer_sizes[i - 1]
            out_features = layer_sizes[i]
            self.fc_layers.append(nn.Linear(in_features, out_features))

        self.output_layer = nn.Linear(layer_sizes[-1], output_size)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, x):
        for layer in self.fc_layers:
            x = F.relu(layer(x))
            x = self.dropout(x)
        x = self.output_layer(x)
        return x

def weighted_mae_loss(input, target, exponent=1, constant=1):
    assert input.size() == target.size()

    # Calculate the absolute error
    absolute_errors = torch.abs(input - target)

    # Apply exponential scaling with a constant
    adjusted_target = target + constant
    weighted_errors = absolute_errors * (adjusted_target ** exponent)

    return weighted_errors.mean()


class SolarPowerProductionPredictor(pl.LightningModule):

    def __init__(self, input_size, layer_sizes, output_size, weight_decay=1e-5, dropout_prob=0.1, learning_rate=0.01, verbose=True, loss_exponent=1.0, loss_beta=1.0):
        super().__init__()
        self.model = FullyConnectedDNN(input_size, layer_sizes, output_size, dropout_prob=dropout_prob)
        self.criterion = self.criterion = lambda input, target: weighted_mae_loss(input, target, exponent=loss_exponent, constant=1)
        #self.criterion = CustomMAELoss(loss_alpha, loss_beta)

        self.weight_decay = weight_decay
        self.learning_rate = learning_rate
        self.verbose = verbose
    
    def forward(self, x, labels=None):
        output = self.model(x)
        loss = 0
        if labels is not None:
            loss = self.criterion(output, labels)
        return loss, output

    def training_step(self, batch, batch_idx):
        features, labels = batch["feature_vector"], batch["target_label"]
        loss, outputs = self(features, labels) 
        self.log("train_loss", loss, prog_bar=self.verbose, logger=False)
        return loss
    
    def validation_step(self, batch, batch_idx):
        features, labels = batch["feature_vector"], batch["target_label"]
        loss, outputs = self(features, labels) 
        self.log("val_loss", loss, prog_bar=self.verbose, logger=False)
        return loss
    
    def test_step(self, batch, batch_idx):
        features, labels = batch["feature_vector"], batch["target_label"]
        loss, outputs = self(features, labels) 
        self.log("test_loss", loss, prog_bar=self.verbose, logger=False)
        return loss
    
    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay)



class CustomModelCheckpoint(ModelCheckpoint):
    def on_save_checkpoint(self, trainer, pl_module, checkpoint):
        # Save the best model path to the pl_module
        pl_module.best_model_path = self.best_model_path

def get_predictions(model, dataloader):
    model.eval()  # set the model to evaluation mode
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            features, labels = batch["feature_vector"], batch["target_label"]
            predictions = model(features)[1]  
            if not isinstance(predictions, torch.Tensor):
                raise TypeError("Model output is not a tensor. Got type: {}".format(type(predictions)))
            
            all_predictions.append(predictions)
            all_labels.append(labels)

    # Check for tensor types before concatenation
    if not all(isinstance(p, torch.Tensor) for p in all_predictions):
        raise TypeError("Not all elements in predictions are tensors.")

    if not all(isinstance(l, torch.Tensor) for l in all_labels):
        raise TypeError("Not all elements in labels are tensors.")

    all_predictions_tensor = torch.cat(all_predictions, dim=0)
    all_labels_tensor = torch.cat(all_labels, dim=0)

    # Convert tensors to numpy arrays
    all_predictions_np = all_predictions_tensor.cpu().numpy()
    all_labels_np = all_labels_tensor.cpu().numpy()
    
    return all_predictions_np, all_labels_np

class HenrikDNN:

    def __init__(self,n_features = None, layer_sizes = [100,50], output_size = 1, drop_out_prob = 0.1, learning_rate = 0.01, weight_decay = 1e-5, max_epochs = 100, paitience = 5, batch_size = 16, val_chack_interval = 1, pruning_callback = None, verbose = True, loss_expontent = 1):

        self.n_features = n_features
        self.layer_sizes = layer_sizes
        self.verbose = verbose
        self.output_size = output_size
        self.drop_out_prob = drop_out_prob
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay
        self.loss_expontent = loss_expontent
        self.paitience = paitience
        self.batch_size = batch_size
        self.max_epochs = max_epochs
        self.val_chack_interval = val_chack_interval
        self.pruning_callback = pruning_callback
        SEED = 69
        print("hei")
        torch.manual_seed(SEED)
        np.random.seed(SEED)
        random.seed(SEED)
        torch.use_deterministic_algorithms(True)
        g = torch.Generator()
        g.manual_seed(SEED)


        self.pl_model = SolarPowerProductionPredictor(self.n_features, self.layer_sizes, self.output_size, weight_decay=self.weight_decay, dropout_prob=self.drop_out_prob, learning_rate=self.learning_rate, verbose=self.verbose, loss_exponent=self.loss_expontent)

        self.checkpoint_callback = CustomModelCheckpoint(
            dirpath='HenrikDNN_checkpoints',
            save_top_k=1,
            verbose=self.verbose,
            monitor='val_loss',
            mode='min',
            filename='model-{epoch:02d}-{val_loss:.2f}'
        )

        self.early_stopping_callback = EarlyStopping(
            monitor="val_loss",
            patience=self.paitience
        )


        self.callbacks = [self.early_stopping_callback, self.checkpoint_callback]
        if self.pruning_callback is not None:
            self.callbacks.append(self.pruning_callback)
        seed_everything(69, workers=True)

        self.trainer = pl.Trainer(
            max_epochs=self.max_epochs,
            callbacks=self.callbacks,
            enable_progress_bar=self.verbose,
            accelerator="cpu",
            check_val_every_n_epoch = 2,
            deterministic=True

            #val_check_interval=self.val_chack_interval
        )

    def train(self, X_train, y_train, X_val, y_val):
        """
        Args:
            X_train: Training df with datetime index. 
            y_train: Training df, with datetime index. Each row in y_train corresponds to four rows in X_train.
    
        """
        #print the seed:
        SEED = 69
        print("hei")
        torch.manual_seed(SEED)
        np.random.seed(SEED)
        random.seed(SEED)
        torch.use_deterministic_algorithms(True)
        g = torch.Generator()
        g.manual_seed(SEED)

        self.data_module = SolarForecastingDatasetDataModule(X_train, y_train, X_val, y_val, batch_size=self.batch_size)
        self.trainer.fit(self.pl_model, self.data_module)


    def predict(self, X):
        trained_model = SolarPowerProductionPredictor.load_from_checkpoint(
            self.pl_model.best_model_path,
            input_size=self.n_features,
            layer_sizes=self.layer_sizes,
            output_size=self.output_size,
            dropout_prob=self.drop_out_prob,
            learning_rate=self.learning_rate,
            weight_decay=self.weight_decay,
            verbose=self.verbose,
            loss_exponent=self.loss_expontent
        )

        X_dataloader = torch.utils.data.DataLoader(
            SolarForecastingDataset(X, pd.Series(np.zeros(X.shape[0]))),
            batch_size=1,
            shuffle=False,
            num_workers=0,
            worker_init_fn=seed_worker,
            generator=g
        )

        predictions, _ = get_predictions(trained_model, X_dataloader)

        return predictions



def train_test_split_on_specific_day_May_june(X, y, split_date):
    """
    Splits the data based on a given date. Additionally, moves May, June and July data of split_date's year
    from training set to test set.
    
    Parameters:
    - X: Quarter-hourly input data with DateTime index.
    - y: Hourly target data with DateTime index.
    - split_date: Date (string or datetime object) to split the data on.
    
    Returns:
    X_train, y_train, X_test, y_test
    """
    split_date = pd.Timestamp(split_date).normalize()

    # Ensure split_date is a datetime object
    if isinstance(split_date, str):
        split_date = pd.Timestamp(split_date)

    print(f"Split date: {split_date}")

    # Split the data based on the provided date
    X_train = X[X.index.normalize() < split_date]
    y_train = y[y.index.normalize() < split_date]

    X_test = X[X.index.normalize() >= split_date]
    y_test = y[y.index.normalize() >= split_date]

    # Define conditions to move May and June of split_date's year from train to test
    may_june_condition_X = ((X_train.index.month == 5) | (X_train.index.month == 6) | (X_train.index.month == 7)) & (X_train.index.year == split_date.year)
    may_june_condition_y = ((y_train.index.month == 5) | (y_train.index.month == 6) | (y_train.index.month == 7)) & (y_train.index.year == split_date.year)
    
    X_may_june = X_train[may_june_condition_X]
    y_may_june = y_train[may_june_condition_y]

    # Remove May and June data from training set
    X_train = X_train[~may_june_condition_X]
    y_train = y_train[~may_june_condition_y]

    # Append May and June data to test set
    X_test = pd.concat([X_may_june, X_test])
    y_test = pd.concat([y_may_june, y_test])

    return X_train, y_train, X_test, y_test


In [16]:
def trainDNN(letter):
    X, y = pre.general_read(letter)
    X = pre.concatenate_dfs(X)
    X_train, y_train,X_val, y_val = pre.train_test_split_may_june_july(X,y , letter)
    y_train = y_train["target"]
    y_val = y_val["target"]

    if letter == "A":

        dnn_params = {
            'layer_sizes': [119,101], #<----- from opta hyper parameter tuning
            'drop_out_prob': 0.03, #<----- from opta hyper parameter tuning
            'learning_rate': 0.0000969995972939842, #<----- from opta hyper parameter tuning
            'loss_expontent': 0.9702228408589507, #<----- from opta hyper parameter tuning
            'max_epochs': 300, #
            'paitience': 15, #
            'batch_size': 128, #
            'val_chack_interval': 0.5, #
            'verbose': True,
            'weight_decay': 2.499126185711371e-8 #<----- from opta hyper parameter tuning
        }

        dnn_feature_scaler = 'minmax' #<----- from opta hyper parameter tuning
        dnn_target_scaler = 'minmax' #<----- from opta hyper parameter tuning
        dnn_preprocessor = 'quarters' #<----- from opta hyper parameter tuning
        dnn_target_encoder = True #<----- from opta hyper parameter tuning
    
    elif letter == "B":
        
        dnn_params = {
            'layer_sizes': [200,180], #<----- from opta hyper parameter tuning
            'drop_out_prob': 0.03, #<----- from opta hyper parameter tuning
            'learning_rate': 0.000018846485346070986, #<----- from opta hyper parameter tuning
            'loss_expontent': 0.963895316469423, #<----- from opta hyper parameter tuning
            'max_epochs': 300, #
            'paitience': 15, #
            'batch_size': 128, #
            'val_chack_interval': 0.5, #
            'verbose': True,
            'weight_decay': 6.586949775384596e-7 #<----- from opta hyper parameter tuning
        }
        
        dnn_feature_scaler = 'minmax'
        dnn_target_scaler = 'minmax'
        dnn_preprocessor = 'statistical'
        dnn_target_encoder = False
    
    elif letter == "C":
        #52.31801357204807	quarters	true	144	131	0.000060363753946263044	-1.5104000124283146	2.335809622586189e-7
        dnn_params = {
            'layer_sizes': [144,131], #<----- from opta hyper parameter tuning
            'drop_out_prob': 0.03, #<----- from opta hyper parameter tuning
            'learning_rate': 0.000060363753946263044, #<----- from opta hyper parameter tuning
            'loss_expontent': -1.5104000124283146, #<----- from opta hyper parameter tuning
            'max_epochs': 300, #
            'paitience': 15, #
            'batch_size': 128, #
            'val_chack_interval': 0.5, #
            'verbose': True,
            'weight_decay': 2.335809622586189e-7 #<----- from opta hyper parameter tuning
        }

        dnn_feature_scaler = 'minmax'
        dnn_target_scaler = 'minmax'
        dnn_preprocessor = 'quarters'
        dnn_target_encoder = True

    dnn_feature_scaler = pre.choose_scaler(dnn_feature_scaler)
    dnn_target_scaler = pre.choose_scaler(dnn_target_scaler)
    dnn_preprocessor = pre.choose_transformer(dnn_preprocessor)
    dnn_target_encoder = pre.choose_encoder(dnn_target_encoder)
    
    dnn_preprocessing = Pipeline([
        ('custom_transformer', dnn_preprocessor),
        ('target_encoder', dnn_target_encoder), 
        ('feature_scaler', dnn_feature_scaler)
    ])

    dnn_target_preprocessing = Pipeline([
        ('target_scaler', dnn_target_scaler)
    ])

    #fit the preprocessing:
    dnn_preprocessing.fit(X_train, y_train)
    dnn_target_preprocessing.fit(pd.DataFrame(y_train))

    #transform the data:
    X_train_dnn = pd.DataFrame((dnn_preprocessing.transform(X_train)))
    y_train_dnn = pd.DataFrame(dnn_target_preprocessing.transform(pd.DataFrame(y_train)))
    X_val_dnn = pd.DataFrame(dnn_preprocessing.transform(X_val))
    y_val_dnn = pd.DataFrame(dnn_target_preprocessing.transform(pd.DataFrame(y_val)))

    #fit the model:
    dnn_model = HenrikDNN(n_features =X_train_dnn.shape[1] , **dnn_params)
    dnn_model.train(X_train_dnn, y_train_dnn, X_val_dnn, y_val_dnn)
    #predict:
    dnn_pred = dnn_model.predict(X_val_dnn)
    #scale back:
    dnn_pred = dnn_target_preprocessing.inverse_transform(dnn_pred).reshape(-1)

    print(f"MAE DNN location {letter}: {mean_absolute_error(y_val, dnn_pred)}")

    return dnn_model, dnn_preprocessing, dnn_target_preprocessing, mean_absolute_error(y_val, dnn_pred)


models = {"A": 
          {"best_score": 10000, "best_model": None, "best_preprocessor": None, "best_target_preprocessor": None},
          "B": 
          {"best_score": 10000, "best_model": None, "best_preprocessor": None, "best_target_preprocessor": None},
          "C": 
          {"best_score": 10000, "best_model": None, "best_preprocessor": None, "best_target_preprocessor": None}
}



num_iterations = 1


for letter in ["A", "B", "C"]:
    for i in range(num_iterations):
        print(f"training {letter}, iteration {i+1} of {num_iterations}")
        model, preprocessor, target_preprocessor, score = trainDNN(letter)
        if score < models[letter]["best_score"]:
            print(f"new best score for {letter}: {score}")
            models[letter]["best_score"] = score
            models[letter]["best_model"] = model
            models[letter]["best_preprocessor"] = preprocessor
            models[letter]["best_target_preprocessor"] = target_preprocessor

post.make_dnn_prediction(models["A"]["best_model"],
                         models["A"]["best_preprocessor"],
                         models["A"]["best_target_preprocessor"],
                         models["B"]["best_model"],
                         models["B"]["best_preprocessor"],
                         models["B"]["best_target_preprocessor"],
                         models["C"]["best_model"],
                         models["C"]["best_preprocessor"],
                         models["C"]["best_target_preprocessor"],
                         f"{FOLDER_NAME}/DNN.csv")


training A, iteration 1 of 1


Global seed set to 69
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")

  | Name  | Type              | Params
--------------------------------------------
0 | model | FullyConnectedDNN | 46.4 K
--------------------------------------------
46.4 K    Trainable params
0         Non-trainable params
46.4 K    Total params
0.185     Total estimated model params size (MB)


hei
hei


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 1, global step 498: 'val_loss' reached 0.07809 (best 0.07809), saving model to '/Users/henrikhorpedal/Documents/Skolearbeid/Maskinlæring/Group Task/submission notebooks/MLC/HenrikDNN_checkpoints/model-epoch=01-val_loss=0.08.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


MAE DNN location A: 329.5761192322475
new best score for A: 329.5761192322475
training B, iteration 1 of 1


Global seed set to 69
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")

  | Name  | Type              | Params
--------------------------------------------
0 | model | FullyConnectedDNN | 67.8 K
--------------------------------------------
67.8 K    Trainable params
0         Non-trainable params
67.8 K    Total params
0.271     Total estimated model params size (MB)


hei
hei


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 1, global step 386: 'val_loss' reached 0.12988 (best 0.12988), saving model to '/Users/henrikhorpedal/Documents/Skolearbeid/Maskinlæring/Group Task/submission notebooks/MLC/HenrikDNN_checkpoints/model-epoch=01-val_loss=0.13-v1.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 3, global step 772: 'val_loss' reached 0.09252 (best 0.09252), saving model to '/Users/henrikhorpedal/Documents/Skolearbeid/Maskinlæring/Group Task/submission notebooks/MLC/HenrikDNN_checkpoints/model-epoch=03-val_loss=0.09.ckpt' as top 1
  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


MAE DNN location B: 71.50019573666992
new best score for B: 71.50019573666992
training C, iteration 1 of 1


Global seed set to 69
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")

  | Name  | Type              | Params
--------------------------------------------
0 | model | FullyConnectedDNN | 60.5 K
--------------------------------------------
60.5 K    Trainable params
0         Non-trainable params
60.5 K    Total params
0.242     Total estimated model params size (MB)


hei
hei


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 1, global step 304: 'val_loss' reached 0.03928 (best 0.03928), saving model to '/Users/henrikhorpedal/Documents/Skolearbeid/Maskinlæring/Group Task/submission notebooks/MLC/HenrikDNN_checkpoints/model-epoch=01-val_loss=0.04-v1.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 3, global step 608: 'val_loss' reached 0.03810 (best 0.03810), saving model to '/Users/henrikhorpedal/Documents/Skolearbeid/Maskinlæring/Group Task/submission notebooks/MLC/HenrikDNN_checkpoints/model-epoch=03-val_loss=0.04.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 5, global step 912: 'val_loss' reached 0.03732 (best 0.03732), saving model to '/Users/henrikhorpedal/Documents/Skolearbeid/Maskinlæring/Group Task/submission notebooks/MLC/HenrikDNN_checkpoints/model-epoch=05-val_loss=0.04.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


MAE DNN location C: 57.72308643838329
new best score for C: 57.72308643838329


In [8]:
from flaml import AutoML

FLAML_TIME_BUDGET_A =30 #30*60
FLAML_TIME_BUDGET_B_C =30 #15*60

def trainFlamAutoML(letter, preprocessor, time_budget=60):
    X, y = pre.general_read_flaml(letter)
    X = pre.concatenate_dfs(X)
    X_train, y_train,X_test, y_test = pre.train_test_split_may_june_july(X,y,letter)
    X_train = preprocessor.transform(X_train)
    X_test = preprocessor.transform(X_test)
    y_train = y_train["target"]
    y_test = y_test["target"]
    automl = AutoML()

    automl_settings = {
        "time_budget": time_budget,  # in seconds
        "metric": 'mae',
        "task": 'regression',
        "log_file_name": f"flaml_{letter}.log",
        "seed": RANDOM_STATE
    }

    automl.fit(X_train=X_train, y_train=y_train, X_val=X_test, y_val=y_test, **automl_settings)
    return automl

PREPROCESSORS = ["quarters"]

for preprocessor in PREPROCESSORS:
    flaml_preprocessor = pre.choose_transformer(preprocessor)
    flaml_A = trainFlamAutoML("A", preprocessor=flaml_preprocessor, time_budget=FLAML_TIME_BUDGET_A)
    flaml_B = trainFlamAutoML("B", preprocessor=flaml_preprocessor, time_budget=FLAML_TIME_BUDGET_B_C)
    flaml_C = trainFlamAutoML("C", preprocessor=flaml_preprocessor, time_budget=FLAML_TIME_BUDGET_B_C)
    
post.makePredictionWithModelAndPreprocessor(flaml_A,flaml_B,flaml_C,flaml_preprocessor,f"{FOLDER_NAME}/flaml_{preprocessor}.csv")

[flaml.automl.logger: 11-12 12:55:34] {1679} INFO - task = regression
[flaml.automl.logger: 11-12 12:55:34] {1687} INFO - Data split method: uniform
[flaml.automl.logger: 11-12 12:55:34] {1690} INFO - Evaluation method: holdout
[flaml.automl.logger: 11-12 12:55:34] {1788} INFO - Minimizing error metric: mae
[flaml.automl.logger: 11-12 12:55:34] {1900} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree', 'xgb_limitdepth']
[flaml.automl.logger: 11-12 12:55:34] {2218} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 11-12 12:55:34] {2344} INFO - Estimated sufficient time budget=2170s. Estimated necessary time budget=19s.
[flaml.automl.logger: 11-12 12:55:34] {2391} INFO -  at 1.6s,	estimator lgbm's best error=721.2925,	best estimator lgbm's best error=721.2925
[flaml.automl.logger: 11-12 12:55:34] {2218} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 11-12 12:55:34] {2391} INFO -  at 1.9s,	estimator lgbm's best error=6

### Ensemble

## Jallastacking the csvfiles