# In [1]:
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from ipynb.fs.full.plotter_class import Plotter
from ipynb.fs.full.time_series_data import TimeSeriesData
from ipynb.fs.full.dataset_creator import SupervisedDatasetCreator
from ipynb.fs.full.dataset_scaler_manager import DataScalerManager
from ipynb.fs.full.lstm import LSTMModel

class WeatherPipeline:
    def __init__(self, config):
        """Set up the pipeline from a configuration mapping.

        Args:
            config: Mapping of pipeline settings. The constructor itself reads
                ``'data_path'``, ``'file_name'`` and ``'column_target'``; the
                other methods read further keys on demand.
        """
        self.config = config
        self.data_scaler_manager = DataScalerManager()

        # Data source handle; split_data() later asks it for the splits.
        path, name = config['data_path'], config['file_name']
        self.ts_data = TimeSeriesData(path, name)
        self.column_target = config['column_target']

        # State filled in by the individual pipeline steps.
        self.lstm_model = self.scaler = None
        self.data = self.dataset = None
        self.y_test_pred = None
                       
    def create_supervised_data(self):
        """Turn each split into supervised (inputs, targets) arrays.

        One ``SupervisedDatasetCreator`` is built from the configured
        ``'input_length'`` and ``'output_length'`` plus the positional index of
        the target column (looked up on the test split). It is applied to the
        values of the train, val and test splits, and the six resulting arrays
        are merged into ``self.data`` under the ``x_*`` / ``y_*`` keys.
        """
        target_idx = self.data['test'].columns.get_loc(self.config['column_target'])
        creator = SupervisedDatasetCreator(
            self.config['input_length'],
            self.config['output_length'],
            target_idx,
        )

        windows = {}
        for split, x_key, y_key in (
            ('train', 'x_train', 'y_train'),
            ('val', 'x_val', 'y_val'),
            ('test', 'x_test', 'y_test'),
        ):
            windows[x_key], windows[y_key] = creator.create_supervised_dataset(
                self.data[split].values
            )

        # Merge only after every split has been converted successfully.
        self.data.update(windows)
                                
    def build_model(self):
        """Instantiate the LSTM wrapper and keep its underlying model.

        The input shape is taken from axes 1 and 2 of ``x_train``; the number
        of units, the output size and the learning rate come from the
        configuration. Only the wrapper's ``model`` attribute is stored.
        """
        x_shape = self.data['x_train'].shape
        window_shape = (x_shape[1], x_shape[2])
        wrapper = LSTMModel(
            n_units=self.config['n_units'],
            input_shape=window_shape,
            output_shape=self.config['output_length'],
            learning_rate=self.config['learning_rate'],
        )
        self.lstm_model = wrapper.model
        
    def drop_column(self, column):
        return self.dataset.drop(column, axis=1)
    
    def drop_columns(self, columns):
        for column in columns:
            self.dataset = self.drop_column(column)
        
    def evaluate_model(self):
        rmse_train = self.lstm_model.evaluate(self.data['x_train'], self.data['y_train'], verbose=0)
        rmse_val = self.lstm_model.evaluate(self.data['x_val'], self.data['y_val'], verbose=0)
        rmse_test = self.lstm_model.evaluate(self.data['x_test'], self.data['y_test'], verbose=0)

        print('RMSE Comparison:')
        print(f'  RMSE Train: {rmse_train:.3f}')
        print(f'  RMSE Val: {rmse_val:.3f}')
        print(f'  RMSE Test: {rmse_test:.3f}')
    
    def feature_engineering(self):
        w_dir = self.dataset['wd (deg)'] * np.pi / 180    
        Wx = self.dataset['wv (m/s)'] * np.cos(w_dir)
        Wy = self.dataset['wv (m/s)'] * np.sin(w_dir)
        
        self.dataset = self.dataset.drop(columns=['wd (deg)', 'wv (m/s)', 'max. wv (m/s)'])    
        self.dataset['Wx'] = Wx
        self.dataset['Wy'] = Wy
        
    def plot_datasets(self):
        """
        Plots the target column of the train, val and test splits on one axes.

        The three series share a single 16x5 figure with a legend and a title
        built from the configured target column name. The figure is not shown
        explicitly by this method.
        """
        _, ax = plt.subplots(figsize=(16, 5))
        target = self.config['column_target']
        for split, label in (('train', 'Train'), ('val', 'Val'), ('test', 'Test')):
            ax.plot(self.data[split][target], label=label)
        ax.set_title(f'{self.config["column_target"]} over Time')
        ax.legend()
        
    def plot_predictions(self):
        """Overlay the test-split target column and the stored predictions.

        NOTE(review): the "True" series is the full target column of the test
        split, while ``self.y_test_pred`` comes from windowed inputs and is
        inverse-transformed in ``predict`` — the two series may differ in
        length, alignment or scale; confirm before reading the plot.
        """
        column_target = self.config['column_target']
        plotter = Plotter()

        series = (
            (self.data['test'][column_target].values, 'True'),
            (self.y_test_pred, 'Predicted'),
        )
        for values, label in series:
            plotter.plot(values, label=label)

        plotter.configure(
            title=f'True vs Predicted for {column_target}',
            xlabel='Time Steps',
            ylabel=column_target,
        )
        plotter.show()
        
    def plot_prediction_errors(self):
        """Plot the difference between ``y_test`` and the stored predictions.

        The x axis is the 1-based prediction index; the y axis is the
        flattened ``y_test`` minus ``self.y_test_pred``.

        NOTE(review): ``predict`` stores inverse-transformed values; confirm
        that ``self.data['y_test']`` is on the same scale at this point.
        """
        n_points = len(self.y_test_pred)
        positions = np.arange(1, n_points + 1, dtype=float)
        residuals = self.data['y_test'].flatten() - self.y_test_pred

        plt.plot(positions, residuals)
        plt.title('Prediction errors')
        plt.xlabel('Prediction index')
        plt.ylabel('Prediction error')
        plt.show()
        
    def plot_rmse_over_timesteps(self):
        """Scatter-plot the test-set RMSE for each predicted timestep.

        Predicts on ``x_test``, inverse-transforms the output with
        ``self.scaler`` and takes the root of the mean squared difference to
        the squeezed ``y_test`` along axis 0 (presumably the sample axis), so
        one value remains per forecast step.

        NOTE(review): the predictions are inverse-transformed while ``y_test``
        is used as stored in ``self.data`` — confirm both are on the same
        scale. The y-axis label hard-codes degrees Celsius.
        """
        scaled_forecast = self.lstm_model.predict(self.data['x_test'], verbose=0)
        forecast = self.scaler.inverse_transform(scaled_forecast)

        residuals = self.data['y_test'].squeeze() - forecast
        rmse_per_step = np.sqrt(np.mean(np.square(residuals), axis=0))

        # Forecast steps 1..output_length, used both as x values and tick labels.
        steps = np.linspace(1, self.config['output_length'], self.config['output_length'])

        _, ax = plt.subplots()
        ax.scatter(steps, rmse_per_step)
        ax.set(xlabel='Predicted Timestep', ylabel='RMSE Error (°C)')
        plt.xticks(ticks=steps, labels=steps)
        plt.grid()
        plt.show()
        
    def plot_training_history(self, history):
        """Plot the training and validation loss for each epoch.

        Args:
            history: Object whose ``history`` mapping holds the ``'loss'`` and
                ``'val_loss'`` sequences (``train_model`` passes the return
                value of the model's ``fit`` call).
        """
        plotter = Plotter()

        curves = (('loss', 'RMSE Train'), ('val_loss', 'RMSE Val'))
        for key, label in curves:
            plotter.plot(history.history[key], label=label)

        plotter.configure(title='Epochs vs RMSE', xlabel='Epochs', ylabel='RMSE')
        plotter.show()
        
    def predict(self, x):
        y_pred_scaled = self.lstm_model.predict(x, verbose=0)
        y_pred = self.scaler.inverse_transform(y_pred_scaled)
        y_pred = y_pred.flatten()   
        self.y_test_pred = y_pred
        
    def read_file(self):
        ts_data = TimeSeriesData(self.config['data_path'], self.config['file_name'])
        
        if 'columns_dataset' in self.config and self.config['columns_dataset']:
            series = ts_data.df[self.config['columns_dataset']]
        else:
            series = ts_data.df  
        
        self.dataset = series
    
    def split_data(self):
        train, val, test = self.ts_data.train_val_test_split(self.dataset)
        self.data = {'train': train, 'val': val, 'test': test}
        
    def scale_data(self):        
        col_ref = self.dataset.columns.get_loc(self.column_target)  
        scaled_data, self.scaler = self.data_scaler_manager.scale_data(self.data, col_ref)
        self.data.update(scaled_data)
        
    def train_model(self):
        """Fit the model on the training arrays and plot the loss curves.

        Training uses the configured ``'epochs'`` and ``'batch_size'``, the
        validation arrays as validation data, and two callbacks: early
        stopping (patience 6, restoring the best weights) and learning-rate
        reduction on plateau (factor 0.5, patience 3, floor 1e-7). Neither
        callback overrides its default monitored quantity. The returned
        history is handed to ``plot_training_history``.
        """
        callbacks = [
            EarlyStopping(patience=6, restore_best_weights=True),
            ReduceLROnPlateau(factor=0.5, patience=3, min_lr=1e-7),
        ]

        train_inputs = self.data['x_train']
        train_targets = self.data['y_train']
        fit_options = {
            'epochs': self.config['epochs'],
            'batch_size': self.config['batch_size'],
            'validation_data': (self.data['x_val'], self.data['y_val']),
            'callbacks': callbacks,
            'verbose': 2,
        }
        history = self.lstm_model.fit(train_inputs, train_targets, **fit_options)

        self.plot_training_history(history)
                
    def plot_violin_plots(self, x_train=None, x_val=None, x_test=None, y_train=None, y_val=None, y_test=None, feature_names=None, figsize=(12, 4)):
        fig, ax = plt.subplots(figsize=figsize)
        
        if x_train is not None and x_val is not None and x_test is not None and feature_names is not None:
            num_features = x_train.shape[2]
            
            for i in range(num_features):
                train_data = x_train[:, :, i].flatten()
                val_data = x_val[:, :, i].flatten()
                test_data = x_test[:, :, i].flatten()
                
                ax.violinplot(dataset=train_data, positions=[i], showmeans=True, showmedians=True)
                ax.violinplot(dataset=val_data, positions=[i], showmeans=True, showmedians=True)
                ax.violinplot(dataset=test_data, positions=[i], showmeans=True, showmedians=True)
            
            ax.set_xticks(list(range(num_features)))
            ax.set_xticklabels(feature_names, rotation=90)
        
        elif y_train is not None and y_val is not None and y_test is not None:
            ax.violinplot(dataset=y_train.flatten(), positions=[0], showmeans=True, showmedians=True)
            ax.violinplot(dataset=y_val.flatten(), positions=[1], showmeans=True, showmedians=True)
            ax.violinplot(dataset=y_test.flatten(), positions=[2], showmeans=True, showmedians=True)
            
            ax.set_xticks([0, 1, 2])
            ax.set_xticklabels(['Train', 'Val', 'Test'])
        
        else:
            raise ValueError("Invalid input: Provide either x_train, x_val, x_test, and feature_names or y_train, y_val, and y_test.")
        
        ax.autoscale()        
        plt.show()
                    
    def run(self):
        self.read_file()
        self.drop_columns(['datetime'])
        if self.config['feature_engineering']:
            self.feature_engineering
        self.split_data()
        if self.config['plot']:
            self.plot_datasets()        
        self.create_supervised_data()
        self.scale_data()
        if self.config['plot']:
            self.plot_violin_plots(x_train=self.data['x_train'], x_val=self.data['x_val'], x_test=self.data['x_test'], feature_names=self.dataset.keys())
            self.plot_violin_plots(y_train=self.data['y_train'], y_val=self.data['y_val'], y_test=self.data['y_test'])
        self.build_model()
        self.train_model()
        self.evaluate_model()
        self.predict(self.data['x_test'])
        if self.config['plot']:
            self.plot_predictions()
            if self.config['output_length'] > 1:
                self.plot_rmse_over_timesteps()