# Baseline Model

## Table of Contents
1. [Load dataset](#load-dataset)
2. [Feature Selection](#feature-selection)
3. [Implementation](#implementation)
4. [Evaluation](#evaluation)


In [None]:
# Install the openpyxl package into the notebook runtime (IPython shell escape).
# NOTE(review): none of the cells shown here read or write Excel files -
# confirm this dependency is still needed.
!pip install openpyxl

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import shutil
import random
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


## 1. Load dataset
Load preprocessed dataset and print basic details.

In [None]:
# Mount Google Drive at /content/drive; the dataset, the test archive and the
# saved models are all read from paths under /content/drive/MyDrive below.
# The google.colab module is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the dataset
# Reads the combined CSV from the mounted Drive into `df`. The feature-selection
# cell expects it to contain the columns 'Elevation', 'Slope_prev', 'Slope_next',
# 'Angle', 'Distance', 'Cumulative_Slope' and 'Speed'.
df = pd.read_csv('/content/drive/MyDrive/opencampus_all_files/combined_data_r1.csv')

## 2. Feature Selection

The selected features describe terrain characteristics and cumulative statistics of the current track.

In [None]:
# Feature selection
# Six predictor columns are used as model input; 'Speed' is the regression target.
feature_columns = ['Elevation', 'Slope_prev', 'Slope_next', 'Angle', 'Distance', 'Cumulative_Slope']
target_column = 'Speed'
X = df[feature_columns]
y = df[target_column]

# Hold out 10% of the rows as a test split; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)


In [None]:
# Report the dimensions of every split produced by train_test_split.
for split_name, split_data in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_name} shape:", split_data.shape)


##  3. Implementation
Two models are implemented in this notebook:
- Linear Regression
- Neural Network





#### Baseline model 1 - Linear Regression

In [None]:
# Create the linear regression estimator and fit it on the training split
# (fit() returns the estimator itself, so both steps can be chained).
lr_model = LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out rows
y_pred = lr_model.predict(X_test)

# Report the mean absolute error between held-out targets and predictions
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Absolute Error: {mae}")

In [None]:
# Save the model to a file
# The relative file name means the model is written to the current working
# directory of the notebook runtime.
# NOTE(review): the evaluation section loads
# '/content/drive/MyDrive/opencampus_all_files/models/r1_init_lr_model.joblib',
# i.e. a different name and location - presumably this file is renamed and
# copied to Drive by hand; confirm.
filename = 'lr_model.joblib'
joblib.dump(lr_model, filename)

#### Baseline model 2 - Simple Neural Network

In [None]:
import matplotlib.pyplot as plt

# Standardise the features: the scaling statistics are learned from the
# training split only and then applied unchanged to the test split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Network architecture: two ReLU hidden layers followed by a single linear
# output unit (no activation, because this is a regression task).
n_features = X_train.shape[1]
layers = [
    Dense(64, input_dim=n_features, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1),
]
model = Sequential(layers)

# Optimise mean squared error with Adam (default settings) and track MAE as
# an additional metric.
model.compile(
    optimizer=Adam(),
    loss='mean_squared_error',
    metrics=['mae'],
)

# Train for 10 epochs with mini-batches of 32 rows.
# Note: the test split is passed as validation data, so the validation curve
# and the final "Test" metrics below are computed on the same rows.
history = model.fit(
    X_train_scaled,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_test_scaled, y_test),
)

# Final loss (MSE) and MAE on the test split
loss, mae = model.evaluate(X_test_scaled, y_test)
print(f"Test Loss: {loss}, Test MAE: {mae}")

# Plot training and validation loss per epoch
for history_key, curve_label in (
    ('loss', 'Training Loss'),
    ('val_loss', 'Validation Loss'),
):
    plt.plot(history.history[history_key], label=curve_label)
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Save the entire model as a `.keras` zip archive.
# The relative file name means the archive is written to the current working
# directory of the notebook runtime.
# NOTE(review): the evaluation section loads a file with the same name from
# '/content/drive/MyDrive/opencampus_all_files/models/' - presumably it is
# copied there by hand; confirm. No cell shown here saves the fitted
# StandardScaler, so the evaluation section has to re-fit it on X_train.
model.save('r1_init_nn_model.keras')

## 4. Evaluation

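Both models are evaluated on separate test files using the mean absolute error (MAE) of the predicted speed and the percentage difference between the predicted and the recorded final time.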


#### Load test file

In [None]:
# unzip test files
# Extracts the Rider1 test archive from the mounted Drive into /content
# (IPython shell escape).
# NOTE(review): the evaluation cells read from '/content/content/Rider1_test/',
# which presumably means the archive stores its files under a
# 'content/Rider1_test/' prefix - confirm against the archive layout.
!unzip /content/drive/MyDrive/opencampus_all_files/Rider1_test.zip -d /content

#### Evaluation 1 - Linear Regression

In [None]:
# Load linear model
# Rebinds `lr_model` to the model stored on Drive, replacing any model that
# was trained earlier in this session.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code -
# only load model files from a trusted source.
lr_model = joblib.load('/content/drive/MyDrive/opencampus_all_files/models/r1_init_lr_model.joblib')

In [None]:
# Evaluate the linear regression model on every test CSV: predict the speed,
# report the per-file MAE, derive a predicted time axis and save the result.
input_folder_path = '/content/content/Rider1_test/'
output_folder_path = '/content/content/Rider1_test_LR/'
feature_columns = ['Elevation', 'Slope_prev', 'Slope_next', 'Angle', 'Distance', 'Cumulative_Slope']


def _add_predicted_time(track_df, first_row=2):
    """Clip negative predicted speeds and add a 'Time_pred' column in place.

    'Time_pred' starts at the first value of 'Time'. For every row ``i`` from
    ``first_row`` onwards:

    * a negative 'Speed_pred' is replaced by 0,
    * if 'Speed_pred' is non-zero,
      ``Time_pred[i] = Time_pred[i-1] + (Distance[i] - Distance[i-1]) / Speed_pred[i]``,
    * if 'Speed_pred' is zero, the previous predicted time is carried forward,
    * if 'Speed_pred' is NaN, the row keeps the initial time value.

    Rows before ``first_row`` keep the initial time and their 'Speed_pred' is
    left untouched.

    NOTE(review): with the default ``first_row=2`` the segment between rows 0
    and 1 never contributes to the predicted time. This mirrors the original
    loop; if that segment should count, pass ``first_row=1`` - confirm.

    Args:
        track_df: Non-empty DataFrame with a default RangeIndex and the
            columns 'Time', 'Distance' and 'Speed_pred'.
        first_row: Index of the first row for which a time is accumulated.
    """
    # Work on plain NumPy arrays: scalar access through df.loc inside a Python
    # loop is far slower and performs exactly the same arithmetic.
    speed = track_df['Speed_pred'].to_numpy(copy=True)  # dtype kept as-is
    distance = track_df['Distance'].to_numpy()
    time_pred = np.full(len(track_df), float(track_df['Time'].iloc[0]))

    for i in range(first_row, len(track_df)):
        if speed[i] < 0:
            speed[i] = 0
        if pd.isna(speed[i]):
            continue
        if speed[i] != 0:
            time_pred[i] = time_pred[i - 1] + (distance[i] - distance[i - 1]) / speed[i]
        else:
            time_pred[i] = time_pred[i - 1]

    track_df['Speed_pred'] = speed
    track_df['Time_pred'] = time_pred


# Create the output folder if it does not exist yet
os.makedirs(output_folder_path, exist_ok=True)

# List all CSV files in the input folder (sorted so the log order is stable)
csv_files = sorted(f for f in os.listdir(input_folder_path) if f.endswith('.csv'))

for file in csv_files:
    file_path = os.path.join(input_folder_path, file)

    # A dedicated name is used so the training DataFrame `df` (and X / y) of
    # the earlier cells is not overwritten by the evaluation loop.
    test_df = pd.read_csv(file_path)
    if test_df.empty:
        print(f"File: {file} | skipped (no rows)")
        continue

    # Predict the speed and store it next to the recorded values
    test_df['Speed_pred'] = lr_model.predict(test_df[feature_columns])

    # MAE is computed on the raw predictions, i.e. before negative speeds are clipped
    mae = mean_absolute_error(test_df['Speed'], test_df['Speed_pred'])
    print(f"File: {file} | Mean Absolute Error: {mae}")

    # Clip negative speeds and accumulate the predicted time
    _add_predicted_time(test_df)

    # Save the processed DataFrame with an 'lr_' prefix
    output_file_path = os.path.join(output_folder_path, f"lr_{file}")
    test_df.to_csv(output_file_path, index=False)


#### Evaluation 2 - Neural Network

In [None]:
# Load neural network model
# Binds `nn_model` to the Keras model stored as a `.keras` archive on Drive.
nn_model = tf.keras.models.load_model('/content/drive/MyDrive/opencampus_all_files/models/r1_init_nn_model.keras')

In [None]:
# Evaluate the neural network on every test CSV: predict the speed, report
# the per-file MAE, derive a predicted time axis and save the result.
input_folder_path = '/content/content/Rider1_test/'
output_folder_path = '/content/content/Rider1_test_NN/'
feature_columns = ['Elevation', 'Slope_prev', 'Slope_next', 'Angle', 'Distance', 'Cumulative_Slope']


def _add_predicted_time(track_df, first_row=2):
    """Clip negative predicted speeds and add a 'Time_pred' column in place.

    'Time_pred' starts at the first value of 'Time'. For every row ``i`` from
    ``first_row`` onwards:

    * a negative 'Speed_pred' is replaced by 0,
    * if 'Speed_pred' is non-zero,
      ``Time_pred[i] = Time_pred[i-1] + (Distance[i] - Distance[i-1]) / Speed_pred[i]``,
    * if 'Speed_pred' is zero, the previous predicted time is carried forward,
    * if 'Speed_pred' is NaN, the row keeps the initial time value.

    Rows before ``first_row`` keep the initial time and their 'Speed_pred' is
    left untouched.

    NOTE(review): with the default ``first_row=2`` the segment between rows 0
    and 1 never contributes to the predicted time. This mirrors the original
    loop; if that segment should count, pass ``first_row=1`` - confirm.

    Args:
        track_df: Non-empty DataFrame with a default RangeIndex and the
            columns 'Time', 'Distance' and 'Speed_pred'.
        first_row: Index of the first row for which a time is accumulated.
    """
    # Work on plain NumPy arrays: scalar access through df.loc inside a Python
    # loop is far slower and performs exactly the same arithmetic.
    speed = track_df['Speed_pred'].to_numpy(copy=True)  # dtype kept as-is
    distance = track_df['Distance'].to_numpy()
    time_pred = np.full(len(track_df), float(track_df['Time'].iloc[0]))

    for i in range(first_row, len(track_df)):
        if speed[i] < 0:
            speed[i] = 0
        if pd.isna(speed[i]):
            continue
        if speed[i] != 0:
            time_pred[i] = time_pred[i - 1] + (distance[i] - distance[i - 1]) / speed[i]
        else:
            time_pred[i] = time_pred[i - 1]

    track_df['Speed_pred'] = speed
    track_df['Time_pred'] = time_pred


# Create the output folder if it does not exist yet
os.makedirs(output_folder_path, exist_ok=True)

# The scaler depends only on the training split, so it is fitted once here
# instead of once per test file.
# NOTE(review): this reproduces the scaling used during training only if
# X_train is the same split the loaded model was trained on - confirm.
scaler = StandardScaler()
scaler.fit(X_train)

# List all CSV files in the input folder (sorted so the log order is stable)
csv_files = sorted(f for f in os.listdir(input_folder_path) if f.endswith('.csv'))

for file in csv_files:
    file_path = os.path.join(input_folder_path, file)

    # A dedicated name is used so the training DataFrame `df` (and X / y) of
    # the earlier cells is not overwritten by the evaluation loop.
    test_df = pd.read_csv(file_path)
    if test_df.empty:
        print(f"File: {file} | skipped (no rows)")
        continue

    # Scale the features with the training statistics and predict the speed.
    # The prediction array is flattened to one dimension before it is stored
    # as a column.
    X_new_scaled = scaler.transform(test_df[feature_columns])
    y_pred = nn_model.predict(X_new_scaled).ravel()
    test_df['Speed_pred'] = y_pred

    # MAE is computed on the raw predictions, i.e. before negative speeds are clipped
    mae = mean_absolute_error(test_df['Speed'], test_df['Speed_pred'])
    print(f"File: {file} | Mean Absolute Error: {mae}")

    # Clip negative speeds and accumulate the predicted time
    _add_predicted_time(test_df)

    # Save the processed DataFrame with an 'nn_' prefix
    output_file_path = os.path.join(output_folder_path, f"nn_{file}")
    test_df.to_csv(output_file_path, index=False)


### Evaluation summary

In [None]:
# Summary statistics for the files written by the linear regression evaluation:
# one record per file with the final recorded time, the final predicted time
# and their relative difference in percent.
output_folder_path = '/content/content/Rider1_test_LR/'
summary_lr = []
processed_files = [f for f in os.listdir(output_folder_path) if f.endswith('.csv')]

for file in processed_files:
    file_path = os.path.join(output_folder_path, file)

    # A dedicated name is used so the training DataFrame `df` of the earlier
    # cells is not overwritten.
    result_df = pd.read_csv(file_path)
    if result_df.empty:
        # Nothing to summarise; .iloc[-1] would raise on an empty frame
        print(f"File: {file} | skipped (no rows)")
        continue

    # Last value of the recorded and of the predicted time
    last_time = result_df['Time'].iloc[-1]
    last_time_pred = result_df['Time_pred'].iloc[-1]

    # Relative difference in percent. NaN (instead of None) marks the
    # undefined case so the column stays numeric and .abs()/.mean() keep
    # working even if every entry is undefined.
    if last_time != 0:
        percentage_diff = ((last_time_pred - last_time) / last_time) * 100
    else:
        percentage_diff = np.nan

    summary_lr.append({
        'file': file,
        'last_time': last_time,
        'last_time_pred': last_time_pred,
        'percentage_diff': percentage_diff,
    })

In [None]:
# Summary statistics for the files written by the neural network evaluation:
# one record per file with the final recorded time, the final predicted time
# and their relative difference in percent.
output_folder_path = '/content/content/Rider1_test_NN/'
summary_nn = []
processed_files = [f for f in os.listdir(output_folder_path) if f.endswith('.csv')]

for file in processed_files:
    file_path = os.path.join(output_folder_path, file)

    # A dedicated name is used so the training DataFrame `df` of the earlier
    # cells is not overwritten.
    result_df = pd.read_csv(file_path)
    if result_df.empty:
        # Nothing to summarise; .iloc[-1] would raise on an empty frame
        print(f"File: {file} | skipped (no rows)")
        continue

    # Last value of the recorded and of the predicted time
    last_time = result_df['Time'].iloc[-1]
    last_time_pred = result_df['Time_pred'].iloc[-1]

    # Relative difference in percent. NaN (instead of None) marks the
    # undefined case so the column stays numeric and .abs()/.mean() keep
    # working even if every entry is undefined.
    if last_time != 0:
        percentage_diff = ((last_time_pred - last_time) / last_time) * 100
    else:
        percentage_diff = np.nan

    summary_nn.append({
        'file': file,
        'last_time': last_time,
        'last_time_pred': last_time_pred,
        'percentage_diff': percentage_diff,
    })

In [None]:
# Show the per-file summary of each model, ordered by file name, followed by
# the mean of the absolute percentage differences.
summary_lr_df = pd.DataFrame(summary_lr).sort_values(by=['file'])
lr_mean_abs_diff = summary_lr_df['percentage_diff'].abs().mean()
print(summary_lr_df)
print(f"Percentage difference in LR model: {lr_mean_abs_diff}\n")

summary_nn_df = pd.DataFrame(summary_nn).sort_values(by=['file'])
nn_mean_abs_diff = summary_nn_df['percentage_diff'].abs().mean()
print(summary_nn_df)
print(f"Percentage difference in NN model: {nn_mean_abs_diff}")