In [None]:
#data imputation to replace NaN values
import pandas as pd
from sklearn.impute import KNNImputer

# Input and output locations for the imputation step.
file_path = "/content/GLORIA.csv"
output_path = "/content/GLORIA_imputed.csv"

data = pd.read_csv(file_path)

# Every column except the first one takes part in the imputation.
# NOTE(review): this selection also covers the trailing columns that the training
# cell later slices off as targets, so missing target values are imputed too —
# confirm that this is intended.
cols_to_impute = data.columns[1:]

# Fill each missing value from the 5 nearest rows (5 is also scikit-learn's default).
imputer = KNNImputer(n_neighbors=5)
imputed_values = imputer.fit_transform(data[cols_to_impute])
data[cols_to_impute] = imputed_values

# Persist the completed table without the pandas index column.
data.to_csv(output_path, index=False)

print(f"Imputed data saved to {output_path}")


Imputed data saved to /content/GLORIA_imputed.csv


In [2]:
pip install keras_tuner



In [None]:
#ensembled mdn (gmdn)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2
from keras_tuner import RandomSearch
from keras_tuner import RandomSearch, HyperParameters

# Module 1: Data Preprocessing
def preprocess_data(file_path):
    """
    Load a CSV, drop incomplete rows, split it 80/20, and standardize both parts.

    Column layout expected by the slicing below: the first column is the sample
    ID (FID), the last three columns are the targets, and everything in between
    is a feature. Both scalers are fitted on the training portion only and then
    applied to the test portion.

    Returns:
        X_train, X_test, y_train, y_test: standardized arrays.
        fid_test: IDs of the rows that ended up in the test split.
        target_scaler: the fitted StandardScaler for the targets (needed to
            map predictions back to the original target scale).
    """
    frame = pd.read_csv(file_path).dropna()  # rows with any NaN are discarded

    sample_ids = frame.iloc[:, 0].values
    feature_matrix = frame.iloc[:, 1:-3].values
    # Per the original author's note: the targets are stored log-converted, so
    # apply an exponent to the final results to get back to linear values.
    target_matrix = frame.iloc[:, -3:].values

    X_train, X_test, y_train, y_test, _fid_train, fid_test = train_test_split(
        feature_matrix, target_matrix, sample_ids, test_size=0.2, random_state=42
    )

    # Fit on the training rows only so no test statistics leak into the scaling.
    feature_scaler = StandardScaler().fit(X_train)
    target_scaler = StandardScaler().fit(y_train)

    X_train = feature_scaler.transform(X_train)
    X_test = feature_scaler.transform(X_test)
    y_train = target_scaler.transform(y_train)
    y_test = target_scaler.transform(y_test)

    return X_train, X_test, y_train, y_test, fid_test, target_scaler

# Module 2: Model Builder for Keras Tuner
def build_complex_model(hp):
    """
    Build and compile a deep fully connected regressor with tunable hyperparameters.

    Architecture: one Dense entry layer, 7-9 Dense+Dropout hidden blocks, and a
    3-unit linear output. The L2 strength ('l2_reg') and the dropout rate
    ('dropout_rate') are single hyperparameters shared by every layer; the layer
    width is tuned per layer ('units_layer1', 'units_layer2', ...).

    The input width is deliberately not specified here. Keras creates the
    weights on the first call to fit/predict, so this function no longer
    depends on a notebook-level ``X_train`` existing (or still holding the
    training data) at the moment the tuner calls it.

    Args:
        hp: Keras Tuner HyperParameters object used to declare and sample values.

    Returns:
        A compiled Sequential model (MSE loss; MAE and MAPE metrics).
    """
    first_units = hp.Int('units_layer1', min_value=64, max_value=256, step=32)
    # Declared once: the same regularization strength applies to every Dense layer.
    l2_strength = hp.Choice('l2_reg', [0.001, 0.01, 0.1])

    model = Sequential()

    # Entry layer (input shape is inferred from the first batch).
    model.add(Dense(
        units=first_units,
        activation='relu',
        kernel_regularizer=l2(l2_strength)
    ))

    # Hidden layers: between 7 and 9 Dense+Dropout blocks.
    num_hidden_layers = hp.Int('num_hidden_layers', 7, 9)
    # Declared once: the same dropout rate applies after every hidden layer.
    dropout_rate = hp.Float('dropout_rate', 0.1, 0.5, step=0.1)
    for i in range(num_hidden_layers):
        model.add(Dense(
            units=hp.Int(f'units_layer{i+2}', min_value=32, max_value=128, step=16),
            activation='relu',
            kernel_regularizer=l2(l2_strength)
        ))
        model.add(Dropout(dropout_rate))

    # Output layer: 3 target variables, no activation.
    model.add(Dense(3, activation='linear'))

    model.compile(
        optimizer=Adam(hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])),
        loss='mean_squared_error',
        metrics=['mae', 'mape']
    )
    return model

# Module 3: Hyperparameter Tuning
def tune_hyperparameters(X_train, y_train):
    """
    Run a Keras Tuner random search over build_complex_model and return the tuner.

    The search tries 10 hyperparameter combinations (one execution each) and
    ranks them by validation loss, using the last 20% of the given data as the
    validation split. Tuner state is written with directory
    'hyperparameter_tuning' and project name 'complex_model_tuning'.
    """
    # Arguments forwarded to model.fit for every trial.
    fit_settings = {
        'epochs': 200,
        'validation_split': 0.2,
        'batch_size': 100,
    }

    tuner = RandomSearch(
        build_complex_model,
        objective='val_loss',
        max_trials=10,
        executions_per_trial=1,
        directory='hyperparameter_tuning',
        project_name='complex_model_tuning'
    )
    tuner.search(X_train, y_train, **fit_settings)
    return tuner

# Module 4: Training the Best Models for Ensemble
def train_ensemble_models(tuner, X_train, y_train, num_ensemble=10, top_k=3):
    """
    Train `num_ensemble` models with the tuner's best hyperparameters and keep the best `top_k`.

    Every member is built from the same (single best) hyperparameter set, so the
    members differ only through what varies between training runs (e.g. weight
    initialization). Each one trains for up to 1000 epochs with early stopping on
    validation loss (patience 20, best weights restored).

    Args:
        tuner: A Keras Tuner tuner that has already completed its search.
        X_train, y_train: Training data; the last 20% is used for validation.
        num_ensemble: Number of models to train.
        top_k: Number of models to keep, ranked by lowest minimum validation loss.

    Returns:
        A tuple (top_models, histories, best_hps, val_losses):
        - top_models: the `top_k` best models, best first.
        - histories, best_hps, val_losses: one entry per trained model, in
          training order (NOT restricted to, or ordered like, top_models).
    """
    # The best trial does not change between iterations, so look it up once.
    best_hp = tuner.get_best_hyperparameters(num_trials=1)[0]

    trained_models = []
    histories = []
    for _ in range(num_ensemble):
        model = tuner.hypermodel.build(best_hp)

        early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
        history = model.fit(
            X_train, y_train,
            validation_split=0.2,
            epochs=1000,
            batch_size=100,
            callbacks=[early_stopping]
        )

        trained_models.append(model)
        histories.append(history)

    # One entry per trained model, as callers expect.
    best_hps = [best_hp] * num_ensemble

    # Rank members by the best validation loss each one reached.
    val_losses = [min(history.history['val_loss']) for history in histories]
    best_indices = np.argsort(val_losses)[:top_k]

    top_models = [trained_models[i] for i in best_indices]
    return top_models, histories, best_hps, val_losses

# Module 5: Averaging Predictions from Ensemble
def average_predictions(top_3_models, X_test, target_scaler):
    """
    Predict with every model, undo the target scaling, and return the element-wise mean.

    Each model's output is passed through target_scaler.inverse_transform before
    averaging, so the mean is taken on the de-normalized predictions.
    """
    denormalized_per_model = [
        target_scaler.inverse_transform(member.predict(X_test))
        for member in top_3_models
    ]
    # axis=0 averages across models, keeping the (samples, targets) layout.
    return np.mean(denormalized_per_model, axis=0)

# Module 6: Saving Results and Plotting
def save_results_and_plot(top_3_models, histories, averaged_predictions, X_test, y_test, fid_test, target_scaler, output_dir='results'):
    """
    Persist the ensemble's models, test predictions, per-model loss histories, and a loss plot.

    Files written under `output_dir`:
      - best_model_<k>.h5           one per model in `top_3_models`
      - ensemble_test_results.csv   FID, true targets, averaged predictions
      - loss_data_model_<k>.csv     one per entry in `histories`
      - ensemble_loss_plot.png      all training/validation curves in one figure

    Args:
        top_3_models: Models to save.
        histories: Keras History objects whose 'loss'/'val_loss' curves are saved and plotted.
        averaged_predictions: De-normalized ensemble predictions, one column per target.
        X_test: Unused; kept so existing callers keep working.
        y_test: Standardized true targets; de-normalized here with `target_scaler`.
        fid_test: Sample IDs for the test rows.
        target_scaler: Fitted scaler used to de-normalize `y_test`.
        output_dir: Destination directory (created if missing).

    NOTE(review): `histories` is numbered independently of `top_3_models`. If the
    caller passes the histories of every trained model rather than only the saved
    ones, loss_data_model_<k> does not correspond to best_model_<k>.
    """
    import os

    # exist_ok avoids the check-then-create race of exists() followed by makedirs().
    os.makedirs(output_dir, exist_ok=True)

    # Save the models
    for i, model in enumerate(top_3_models, start=1):
        model.save(f'{output_dir}/best_model_{i}.h5')

    # Bring the true targets back to the original scale.
    y_test_denormalized = target_scaler.inverse_transform(y_test)

    # Column names follow the number of targets (True1..TrueN / Pred1..PredN),
    # which yields the same True1-3 / Pred1-3 headers for three targets.
    n_targets = y_test_denormalized.shape[1]
    true_columns = [f'True{k}' for k in range(1, n_targets + 1)]
    pred_columns = [f'Pred{k}' for k in range(1, n_targets + 1)]

    # Save test results
    results = pd.concat(
        [
            pd.DataFrame(fid_test, columns=['FID']),
            pd.DataFrame(y_test_denormalized, columns=true_columns),
            pd.DataFrame(averaged_predictions, columns=pred_columns),
        ],
        axis=1,
    )
    results.to_csv(f'{output_dir}/ensemble_test_results.csv', index=False)

    # Save loss data for each history
    for i, history in enumerate(histories, start=1):
        training_loss = history.history['loss']
        loss_df = pd.DataFrame({
            'Epochs': list(range(1, len(training_loss) + 1)),
            'Training Loss': training_loss,
            'Validation Loss': history.history['val_loss']
        })
        loss_df.to_csv(f'{output_dir}/loss_data_model_{i}.csv', index=False)

    # Plot and save the loss curves for each history
    plt.figure(figsize=(10, 6))
    for i, history in enumerate(histories, start=1):
        plt.plot(history.history['loss'], label=f'Model {i} Training Loss')
        plt.plot(history.history['val_loss'], label=f'Model {i} Validation Loss')

    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss for Each Model')
    plt.legend()
    plt.grid()
    # Save before show(): showing can release the figure, leaving nothing to write.
    plt.savefig(f'{output_dir}/ensemble_loss_plot.png')
    plt.show()

# Main Script
if __name__ == "__main__":
    # End-to-end training run: preprocess -> tune -> train ensemble -> predict -> save/report.
    # Nothing here is wrapped in a function, so every name assigned below lives at
    # notebook scope; renaming them can affect other cells that use the same names.
    file_path = 'GLORIA_imputed.csv'  # Adjust file path if necessary
    X_train, X_test, y_train, y_test, fid_test, target_scaler = preprocess_data(file_path)

    # Perform hyperparameter tuning
    tuner = tune_hyperparameters(X_train, y_train)

    # Train the ensemble of models
    # (histories, best_hps and val_losses cover every trained model, not only the top 3)
    top_3_models, histories, best_hps, val_losses = train_ensemble_models(tuner, X_train, y_train)

    # Average the predictions of the top 3 models
    averaged_predictions = average_predictions(top_3_models, X_test, target_scaler)

    # Save results and plot
    save_results_and_plot(top_3_models, histories, averaged_predictions, X_test, y_test, fid_test, target_scaler)

    # Print best hyperparameters for each model
    # NOTE(review): every ensemble member is built from the tuner's single best
    # trial, so these printed blocks are presumably identical — confirm whether
    # one summary would be enough.
    for i, best_hp in enumerate(best_hps):
        print(f"""
        Best Hyperparameters for Model {i+1}:
        - Units Layer 1: {best_hp.get('units_layer1')}
        - L2 Regularization: {best_hp.get('l2_reg')}
        - Dropout Rate: {best_hp.get('dropout_rate')}
        - Learning Rate: {best_hp.get('learning_rate')}
        - Number of Hidden Layers: {best_hp.get('num_hidden_layers')}
        """)


In [4]:
import zipfile
import os

# Everything that goes into the download archive: two directories and one file.
paths_to_zip = [
    '/content/hyperparameter_tuning',
    '/content/results',
    '/content/GLORIA_imputed.csv'
]

# Destination archive.
zip_filename = '/content/combined_files.zip'

with zipfile.ZipFile(zip_filename, mode='w', compression=zipfile.ZIP_DEFLATED) as zipf:
    for path in paths_to_zip:
        if not os.path.isdir(path):
            # Anything that is not a directory is stored at the archive root
            # under its own file name.
            zipf.write(path, arcname=os.path.basename(path))
            continue

        # Directory: add every file beneath it, with archive paths relative to /content.
        for foldername, _subfolders, filenames in os.walk(path):
            for filename in filenames:
                file_path = os.path.join(foldername, filename)
                zipf.write(file_path, arcname=os.path.relpath(file_path, '/content'))

print(f"Zip file created: {zip_filename}")


Zip file created: /content/combined_files.zip


In [None]:
#finetune the saved ensembled models for new data
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam

def load_best_models(model_dir):
    """
    Load the three saved ensemble models from `model_dir`.

    The files are expected to be named best_model_1.h5, best_model_2.h5 and
    best_model_3.h5; the returned list follows that order.
    """
    model_paths = [
        os.path.join(model_dir, f'best_model_{index}.h5')
        for index in (1, 2, 3)
    ]
    return [load_model(model_path) for model_path in model_paths]

def fine_tune_model(model, X_train, y_train, learning_rate=1e-4, fine_tune_epochs=50):
    """
    Continue training an already-trained model on new data.

    The model is re-compiled with a fresh Adam optimizer at `learning_rate`
    (same MSE loss and MAE/MAPE metrics), then fitted for at most
    `fine_tune_epochs` epochs with early stopping on validation loss
    (patience 10, best weights restored). The last 20% of the data is used for
    validation.

    Returns:
        The same model object that was passed in, after fine-tuning.
    """
    # A new optimizer instance means the fit below starts without any state
    # carried over from the model's earlier training.
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mae', 'mape'])

    stop_when_stalled = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    fit_settings = {
        'validation_split': 0.2,
        'epochs': fine_tune_epochs,
        'batch_size': 32,
        'callbacks': [stop_when_stalled],
    }
    model.fit(X_train, y_train, **fit_settings)

    return model

def average_predictions_from_saved_models(models, X_test, target_scaler):
    """
    Return the mean of the de-normalized predictions of all given models.

    Every model predicts on `X_test`; each prediction is mapped back through
    target_scaler.inverse_transform and the results are averaged across models.
    """
    stacked = np.asarray([
        target_scaler.inverse_transform(member.predict(X_test))
        for member in models
    ])
    # First axis indexes the models; averaging over it leaves (samples, targets).
    return np.mean(stacked, axis=0)

if __name__ == "__main__":
    # Fine-tune the saved ensemble on a new labelled dataset, then evaluate on its
    # held-out 20% split.
    # NOTE(review): preprocess_data fits fresh scalers on this dataset, so the
    # inputs are standardized with different statistics than the data the loaded
    # models were originally trained on — confirm that this is intended.
    file_path = '/content/data_target.csv'
    X_train, X_test, y_train, y_test, fid_test, target_scaler = preprocess_data(file_path)

    # Load the top 3 models
    model_dir = '/content/results'
    models = load_best_models(model_dir)

    # Fine-tune the models
    fine_tuned_models = []
    for i, model in enumerate(models):
        print(f"Fine-tuning Model {i+1}...")
        fine_tuned_model = fine_tune_model(model, X_train, y_train)
        fine_tuned_models.append(fine_tuned_model)

    # Save fine-tuned models and their weights
    output_dir = '/content/finetuned'
    os.makedirs(output_dir, exist_ok=True)

    # This loop only saves. Everything after it must run once, not once per
    # model, so it sits at the same indentation level as the loop itself.
    for i, fine_tuned_model in enumerate(fine_tuned_models):
        # Save full model
        fine_tuned_model.save(f'{output_dir}/fine_tuned_model_{i+1}.h5')

        # Save only weights with the correct extension
        fine_tuned_model.save_weights(f'{output_dir}/fine_tuned_model_{i+1}.weights.h5')

    # Generate averaged predictions
    averaged_predictions = average_predictions_from_saved_models(fine_tuned_models, X_test, target_scaler)

    # Denormalize ground truth
    y_test_denormalized = target_scaler.inverse_transform(y_test)

    # Save results to a CSV
    results = pd.DataFrame(fid_test, columns=['FID'])
    results = pd.concat([results, pd.DataFrame(y_test_denormalized, columns=['True1', 'True2', 'True3'])], axis=1)
    results = pd.concat([results, pd.DataFrame(averaged_predictions, columns=['Pred1', 'Pred2', 'Pred3'])], axis=1)
    results.to_csv(f'{output_dir}/final_predictions.csv', index=False)

    # Plot true vs predicted values (one figure per target; dashed red line is y = x)
    for i in range(3):
        plt.figure(figsize=(8, 6))
        plt.scatter(y_test_denormalized[:, i], averaged_predictions[:, i], alpha=0.6)
        plt.plot([y_test_denormalized[:, i].min(), y_test_denormalized[:, i].max()],
                 [y_test_denormalized[:, i].min(), y_test_denormalized[:, i].max()], 'r--')
        plt.xlabel(f'True Value {i+1}')
        plt.ylabel(f'Predicted Value {i+1}')
        plt.title(f'True vs Predicted for Target {i+1}')
        plt.grid()
        plt.savefig(f'{output_dir}/true_vs_pred_target_{i+1}.png')
        plt.show()

    print(f"Fine-tuned models, weights, and results saved to {output_dir}.")




In [16]:
#zip finetuned folder for download
import shutil

# Folder to archive and the archive file it should end up in.
folder_path = '/content/finetuned'
zip_file_path = '/content/finetuned.zip'

# make_archive takes the target name without the format extension, so the
# ".zip" is removed from the path before it is passed in.
archive_base = zip_file_path.replace('.zip', '')
shutil.make_archive(base_name=archive_base, format='zip', root_dir=folder_path)

print(f"Folder {folder_path} has been zipped to {zip_file_path}.")


Folder /content/finetuned has been zipped to /content/finetuned.zip.


In [None]:
#Use the finetuned models on unseen data with only predictor variables
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import load_model

from sklearn.model_selection import train_test_split

# Dataset the fine-tuned models were trained on (the file passed to
# preprocess_data in the fine-tuning cell). Its training split defines the
# scaling that the models expect.
FINETUNE_DATA_PATH = '/content/data_target.csv'
NEW_DATA_PATH = '/content/data.csv'
MODEL_DIR = '/content/finetuned'


def fit_training_scalers(train_csv, n_targets=3, test_size=0.2, random_state=42):
    """
    Rebuild the feature and target scalers that the models were trained with.

    Mirrors preprocess_data: rows with NaN are dropped, column 0 is the ID, the
    last `n_targets` columns are the targets, and the data is split with the same
    test_size/random_state so the same rows land in the training part. Both
    StandardScalers are fitted on that training part only.

    Returns:
        (feature_scaler, target_scaler), both fitted.
    """
    train_data = pd.read_csv(train_csv).dropna()
    features = train_data.iloc[:, 1:-n_targets].values
    targets = train_data.iloc[:, -n_targets:].values
    X_train, _X_test, y_train, _y_test = train_test_split(
        features, targets, test_size=test_size, random_state=random_state
    )
    return StandardScaler().fit(X_train), StandardScaler().fit(y_train)


# Step 1: Load the new dataset ('FID' in the first column, features in the rest)
new_data = pd.read_csv(NEW_DATA_PATH)
FID_new = new_data.iloc[:, 0].values
X_new = new_data.iloc[:, 1:].values

# Step 2: Standardize the features with the TRAINING statistics.
# Fitting a new scaler on the unseen data would standardize it with its own
# mean/variance, i.e. differently from what the models saw during training.
# Rebuilding both scalers here also removes the dependency on a `target_scaler`
# variable left over from another cell.
feature_scaler, target_scaler = fit_training_scalers(FINETUNE_DATA_PATH)
X_new_scaled = feature_scaler.transform(X_new)

# Step 3: Load the 3 fine-tuned models saved in /content/finetuned/
top_3_models = [
    load_model(f'{MODEL_DIR}/fine_tuned_model_{i}.h5')
    for i in range(1, 4)
]

# Step 4: Use the models to generate predictions (still in standardized target space)
predictions_list = [model.predict(X_new_scaled) for model in top_3_models]

# Step 5: Average the predictions from the 3 models
averaged_predictions = np.mean(predictions_list, axis=0)

# Step 6: Denormalize the predictions using the target scaler
averaged_predictions_denormalized = target_scaler.inverse_transform(averaged_predictions)

# Step 7: Combine FID and predictions into a final result DataFrame
results = pd.DataFrame(FID_new, columns=['FID'])
results[['Pred1', 'Pred2', 'Pred3']] = averaged_predictions_denormalized

# Save the results to a CSV file
results.to_csv('/content/ensemble_predictions.csv', index=False)

# Display the results
print(results.head())
