# Rider 1 - Split into train and test

## Table of Contents
1. [Libraries](#libraries)
2. [Load dataset](#load-dataset)
3. [Feature engineering](#feature-engineering)
4. [Data cleaning](#data-cleaning)
5. [Export train and test](#export-train-and-test)


## 1. Libraries


In [None]:
# openpyxl is imported in the next cell; install it in case the runtime does not ship it
!pip install openpyxl

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import shutil
import random
import os
import openpyxl
import math
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error


## 2. Load dataset
Load preprocessed dataset and print basic details.

### Directory preparation

In [None]:
# Mount Google Drive at /content/drive so the zipped dataset stored under
# /content/drive/MyDrive can be read by the following cells
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import zipfile

# Archive with the preprocessed rides on the mounted Google Drive
zip_path = '/content/drive/MyDrive/opencampus_all_files/Rider1_preprocessed.zip'

# Unpack the whole archive into the Colab working directory; the context
# manager closes the archive again once extraction is done
with zipfile.ZipFile(zip_path) as archive:
    archive.extractall(path='/content/Rider1_preprocessed')

print(f"Successfully unzipped {zip_path} to /content/Rider1_preprocessed")


In [None]:
source_dir = '/content/Rider1_preprocessed/content/content/Sport_xlsx/Rider1_preprocessed'
destination_dir = '/content/Rider1_preprocessed'

# Flatten the extracted archive: lift every regular file from the nested
# source folder up into the destination folder
for filename in os.listdir(source_dir):
    source_path = os.path.join(source_dir, filename)

    # Sub-directories are left where they are
    if not os.path.isfile(source_path):
        continue

    shutil.move(source_path, os.path.join(destination_dir, filename))

In [None]:
# Folder that should end up containing files only
directory_to_clean = '/content/Rider1_preprocessed'

if not os.path.exists(directory_to_clean):
    print(f"Directory '{directory_to_clean}' not found.")
else:
    for filename in os.listdir(directory_to_clean):
        filepath = os.path.join(directory_to_clean, filename)

        # Regular files are kept; only sub-directories are deleted
        if not os.path.isdir(filepath):
            continue

        try:
            shutil.rmtree(filepath)  # Deletes the folder and everything below it
            print(f"Removed directory: {filepath}")
        except OSError as e:
            # Report the failure and carry on with the remaining entries
            print(f"Error removing {filepath}: {e}")

## 3. Feature engineering

### Prepare pipeline

In [None]:
def calculate_angle(lat, lon, lat_prev, lon_prev, lat_next, lon_next):
    """Return the angle in degrees at a track point between its two neighbours.

    The three (latitude, longitude) pairs, given in degrees, are projected
    onto the unit sphere. The angle is measured at the current point between
    the chord to the previous point and the chord to the next point, so
    continuing straight ahead yields a value close to 180 and turning back
    onto the previous point yields a value close to 0.

    Args:
        lat, lon: Current point in degrees.
        lat_prev, lon_prev: Previous point in degrees.
        lat_next, lon_next: Next point in degrees.

    Returns:
        The angle in degrees within [0, 180], or NaN when any input is NaN
        or when the current point coincides with one of its neighbours.
    """
    # NaN has to be rejected explicitly: the min()/max() clamp further down
    # silently discards NaN and would report an angle of 0 degrees instead.
    if any(math.isnan(value) for value in
           (lat, lon, lat_prev, lon_prev, lat_next, lon_next)):
        return float('nan')

    def to_cartesian(lat_deg, lon_deg):
        """Map a (latitude, longitude) pair in degrees to the unit sphere."""
        lat_rad = math.radians(lat_deg)
        lon_rad = math.radians(lon_deg)
        return (
            math.cos(lat_rad) * math.cos(lon_rad),
            math.cos(lat_rad) * math.sin(lon_rad),
            math.sin(lat_rad),
        )

    p_prev = to_cartesian(lat_prev, lon_prev)
    p_curr = to_cartesian(lat, lon)
    p_next = to_cartesian(lat_next, lon_next)

    # Chord vectors pointing from the current point to each neighbour
    v1 = tuple(a - b for a, b in zip(p_prev, p_curr))
    v2 = tuple(a - b for a, b in zip(p_next, p_curr))

    mag_v1 = math.sqrt(sum(c * c for c in v1))
    mag_v2 = math.sqrt(sum(c * c for c in v2))

    # The angle is undefined when a neighbour coincides with the current point
    if mag_v1 == 0 or mag_v2 == 0:
        return float('nan')

    dot_product = sum(a * b for a, b in zip(v1, v2))
    cos_theta = dot_product / (mag_v1 * mag_v2)

    # Rounding can push the cosine marginally outside [-1, 1]; clamp it so
    # that acos() does not raise a domain error
    cos_theta = max(-1.0, min(1.0, cos_theta))

    return math.degrees(math.acos(cos_theta))

In [None]:
# Try the feature pipeline on a single ride first
df = pd.read_excel('/content/Rider1_preprocessed/f1.xlsx')

# Neighbouring samples: value one row earlier (_prev) and one row later (_next)
for base in ('Latitude', 'Longitude', 'Elevation', 'Time', 'Distance'):
    df[f'{base}_prev'] = df[base].shift(1)
    df[f'{base}_next'] = df[base].shift(-1)

# Backward and forward differences of time and travelled distance
for base in ('Time', 'Distance'):
    df[f'{base}_diff_prev'] = df[base] - df[f'{base}_prev']
    df[f'{base}_diff_next'] = df[f'{base}_next'] - df[base]

# Speed as a central difference: distance covered between the previous and
# the next sample divided by the time that took (units follow the Distance
# and Time columns)
distance_covered = df['Distance_diff_next'] + df['Distance_diff_prev']
time_elapsed = df['Time_diff_next'] + df['Time_diff_prev']
df['Speed'] = distance_covered / time_elapsed

# Slope towards the previous and the next sample (elevation change per
# distance). NaN (first/last row, 0/0) and +/-inf (zero distance step) are
# replaced by 0.
infinities = [float('inf'), -float('inf')]
rise_prev = df['Elevation'] - df['Elevation_prev']
rise_next = df['Elevation_next'] - df['Elevation']
df['Slope_prev'] = (rise_prev / df['Distance_diff_prev']).fillna(0).replace(infinities, 0)
df['Slope_next'] = (rise_next / df['Distance_diff_next']).fillna(0).replace(infinities, 0)


def _row_angle(row):
    """Angle at this row's point between its previous and next point."""
    return calculate_angle(
        row['Latitude'], row['Longitude'],
        row['Latitude_prev'], row['Longitude_prev'],
        row['Latitude_next'], row['Longitude_next']
    )


df['Angle'] = df.apply(_row_angle, axis=1)

# Running total of the backward slopes up to and including each row
df['Cumulative_Slope'] = df['Slope_prev'].cumsum()

# Keep only the model features, in their final order
feature_order = ['Elevation', 'Slope_prev', 'Slope_next',  'Angle', 'Distance', 'Cumulative_Slope', 'Speed', 'Time']
df = df[feature_order]

print(df.head())

### Divide into train/test

In [None]:
# Directory containing the Excel files
directory = '/content/Rider1_preprocessed'
destination_directory = '/content/Rider1_test'

# Number of clean rides that are held out as individual test files
test_files_wanted = 10

# Final feature columns, in the order they are written out
feature_columns = ['Elevation', 'Slope_prev', 'Slope_next', 'Angle',
                   'Distance', 'Cumulative_Slope', 'Speed', 'Time']


def _engineer_features(raw_df):
    """Return a frame holding the engineered features of one ride.

    Reads the columns 'Latitude', 'Longitude', 'Elevation', 'Time' and
    'Distance' of ``raw_df``. The input frame is not modified. Rows at the
    start/end of the ride keep NaN in the columns that need a neighbour.
    """
    df = raw_df.copy()

    # Neighbouring samples: one row earlier (_prev) and one row later (_next)
    for base in ('Latitude', 'Longitude', 'Elevation', 'Time', 'Distance'):
        df[f'{base}_prev'] = df[base].shift(1)
        df[f'{base}_next'] = df[base].shift(-1)

    # Backward and forward differences of time and travelled distance
    for base in ('Time', 'Distance'):
        df[f'{base}_diff_prev'] = df[base] - df[f'{base}_prev']
        df[f'{base}_diff_next'] = df[f'{base}_next'] - df[base]

    # Speed as a central difference over the previous and the next sample
    df['Speed'] = (
        (df['Distance_diff_next'] + df['Distance_diff_prev'])
        / (df['Time_diff_next'] + df['Time_diff_prev'])
    )

    # Slopes; NaN (no neighbour, 0/0) and +/-inf (zero distance step) become 0
    infinities = [float('inf'), -float('inf')]
    slope_prev = (df['Elevation'] - df['Elevation_prev']) / df['Distance_diff_prev']
    slope_next = (df['Elevation_next'] - df['Elevation']) / df['Distance_diff_next']
    df['Slope_prev'] = slope_prev.fillna(0).replace(infinities, 0)
    df['Slope_next'] = slope_next.fillna(0).replace(infinities, 0)

    # Angle at each point between its previous and next point
    df['Angle'] = df.apply(
        lambda row: calculate_angle(
            row['Latitude'], row['Longitude'],
            row['Latitude_prev'], row['Longitude_prev'],
            row['Latitude_next'], row['Longitude_next']
        ), axis=1
    )

    # Running total of the backward slopes up to and including each row
    df['Cumulative_Slope'] = df['Slope_prev'].cumsum()

    return df[feature_columns]


def _is_corrupted(df):
    """Return True if any row has negative elevation, |slope| > 1.0 or speed > 25."""
    return bool(
        (df['Elevation'] < 0).any()
        or (df['Slope_prev'].abs() > 1.0).any()
        or (df['Slope_next'].abs() > 1.0).any()
        or (df['Speed'] > 25).any()
    )


# Processed rides that go into the combined training frame
dataframes = []

# Counters
all_files = 0
corrupted_files = 0
files_count = 0

# Create destination directory if it does not exist
os.makedirs(destination_directory, exist_ok=True)

# os.listdir() returns entries in arbitrary order; sorting the names makes the
# selection of the held-out test rides reproducible between runs.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith('.xlsx'):  # Process only .xlsx files
        continue

    filepath = os.path.join(directory, filename)
    df = _engineer_features(pd.read_excel(filepath)).dropna()
    all_files += 1

    if _is_corrupted(df):
        corrupted_files += 1
        # A ride with implausible rows never becomes a test file. Its rows
        # stay in the training pool; the offending rows are removed row by
        # row in the data-cleaning section.
        dataframes.append(df)
        continue

    if files_count < test_files_wanted:
        # Write the clean ride as <name>.csv into the test folder
        test_csv_path = os.path.join(
            destination_directory, filename[:-len('.xlsx')] + ".csv"
        )
        try:
            df.to_csv(test_csv_path, index=False)
        except OSError as e:
            # Best effort: report the failed write and go on with the next ride
            print(f"Error moving {filename}: {e}")
        else:
            print(f"Converted {filename} to {test_csv_path}")
            files_count += 1
    else:
        dataframes.append(df)

# Combine all DataFrames into a single DataFrame. pd.concat() raises a
# ValueError for an empty list, so fall back to an empty frame with the
# expected columns when no ride ended up in the training pool.
if dataframes:
    combined_df = pd.concat(dataframes, ignore_index=True)
else:
    combined_df = pd.DataFrame(columns=feature_columns)

# Display the first few rows of the combined DataFrame
print(combined_df.head())

# Print results
print(f"Corrupted files: {corrupted_files}")
print(f"All files: {all_files}")

### Explore dataset

In [None]:
# Distribution of every feature column of the combined training data,
# one figure per column
hist_style = {'bins': 50, 'edgecolor': 'k', 'alpha': 0.7}
grid_style = {'axis': 'y', 'linestyle': '--', 'alpha': 0.7}

for column in combined_df.columns:
    plt.figure(figsize=(10, 6))
    plt.hist(combined_df[column], **hist_style)
    plt.xlabel(column)
    plt.ylabel("Frequency")
    plt.title(f"Histogram of {column}")
    plt.grid(**grid_style)
    plt.show()

## 4. Data cleaning

In [None]:
# Row-wise cleaning of combined_df. A row is kept only if
#   - its elevation is not negative,
#   - both slopes lie within [-1.0, 1.0],
#   - its speed is at most 25.
# Afterwards a short summary of deleted and remaining rows is printed.
original_row_count = len(combined_df)

keep_row = (
    (combined_df['Elevation'] >= 0)
    & (combined_df['Slope_prev'].abs() <= 1.0)
    & (combined_df['Slope_next'].abs() <= 1.0)
    & (combined_df['Speed'] <= 25)
)
combined_df = combined_df[keep_row]

# Number of rows removed by the filter above
deleted_rows = original_row_count - len(combined_df)

# Print summary
print(f"Original number of rows: {original_row_count}")
print(f"Number of deleted rows: {deleted_rows}")
print(f"Remaining number of rows: {len(combined_df)}")

In [None]:
def _show_histogram(values, label):
    """Draw one 50-bin histogram of ``values`` and label it with ``label``."""
    plt.figure(figsize=(10, 6))
    plt.hist(values, bins=50, edgecolor='k', alpha=0.7)
    plt.title(f"Histogram of {label}")
    plt.xlabel(label)
    plt.ylabel("Frequency")
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.show()


# Re-plot every column of combined_df, one figure per column
for column in combined_df.columns:
    _show_histogram(combined_df[column], column)

## 5. Export train and test
Export a CSV file containing all the training rows, and export the zipped test files.

In [None]:
# Write the training rows to a single CSV file; the row index is not written
train_csv_path = '/content/combined_data_r1.csv'
combined_df.to_csv(train_csv_path, index=False)

In [None]:
# Pack the test folder /content/Rider1_test into /content/Rider1_test.zip
# (IPython shell escape; -r makes zip recurse into the directory)
!zip -r /content/Rider1_test.zip /content/Rider1_test