# Exploratory Data Analysis (EDA)

## Table of Contents
1. [Libraries](#libraries)
2. [Load data](#load-data)
3. [Dataset overview](#dataset-overview)
4. [Data format conversion](#data-format-conversion)
5. [Dataset overview (single rider)](#dataset-overview-single-rider)


## 1. Libraries
Install and load necessary libraries.

In [None]:
# Install the third-party packages used below: the GPX parser (gpxpy),
# the TCX parser (tcxreader) and the Excel writer engine (openpyxl)
!pip install gpxpy
!pip install tcxreader
!pip install openpyxl

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gpxpy
import gpxpy.gpx
import tcxreader
import tcxreader.tcxreader
import openpyxl
import os
import shutil


## 2. Load data
Load the whole dataset, which contains all tracks of the 9 riders.

In [None]:
# Load data from Google Drive: mount the drive under /content/drive so the
# archives stored there can be read by the cells below
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Or upload the dataset from the local machine instead of using Drive.
# `uploaded` keeps whatever files.upload() returns; none of the cells shown
# here read it again.
from google.colab import files
uploaded = files.upload()

In [None]:
# Extract the dataset archive from the mounted drive into /content.
# NOTE(review): later cells read from /content/Sport, so the archive presumably
# holds a top-level `Sport` folder — confirm.
!unzip /content/drive/MyDrive/opencampus_all_files/Sport1.zip -d /content

## 3. Dataset Overview
Basic exploration of the entire dataset.

In [None]:
# Load one example GPX file.
gpx_path = '/content/Sport/Rider1/f1.gpx'
with open(gpx_path) as f:
    gpx = gpxpy.parse(f)

# Convert to a dataframe, one row per track point.
# Every track and segment is walked (not only the first track) so that the
# sample count is right for multi-track files and a file without tracks
# yields an empty table instead of an IndexError.
point_columns = ['time', 'latitude', 'longitude', 'elevation']
points = []
for track in gpx.tracks:
    for segment in track.segments:
        for p in segment.points:
            points.append({
                'time': p.time,
                'latitude': p.latitude,
                'longitude': p.longitude,
                'elevation': p.elevation,
            })

# Naming the columns explicitly keeps the feature count stable even when the
# file contains no points.
df = pd.DataFrame.from_records(points, columns=point_columns)

# Number of samples (rows) and number of features (columns)
num_samples, num_features = df.shape

# Display these dataset characteristics
print(f"Number of samples: {num_samples}")
print(f"Number of features: {num_features}")

# Display the first few rows of the dataframe to show the structure
print("Example data:")
print(df.head())



In [None]:
# Root folder that holds one sub-folder per rider
main_path = '/content/Sport'

# Walk the rider sub-folders in name order and report how many GPX and TCX
# recordings each one holds; plain files directly under the root are ignored
for rider_folder in sorted(os.listdir(main_path)):
    folder_path = os.path.join(main_path, rider_folder)
    if not os.path.isdir(folder_path):
        continue

    gpx_files = list(filter(lambda name: name.endswith('.gpx'), os.listdir(folder_path)))
    tcx_files = list(filter(lambda name: name.endswith('.tcx'), os.listdir(folder_path)))
    print(f"Folder '{rider_folder}' contains {len(gpx_files)} .gpx files and {len(tcx_files)} .tcx files.")


## 4. Data format conversion
Convert the GPX and TCX files to XLSX format. To analyse all the data correctly, the files must first be brought into a single, unified format.

In [None]:
def convert_gpx_to_excel(gpx_file_path, output_file_path):
    """
    Converts a GPX file to an Excel file with one row per track point.

    All points of all segments of all tracks are written, in file order, with
    the columns Latitude, Longitude, Elevation and Time.

    Parameters:
    - gpx_file_path: str, path to the input GPX file
    - output_file_path: str, path to save the output Excel file
    """
    # Parse the GPX file; the handle is only needed while parsing
    with open(gpx_file_path, 'r') as source:
        parsed = gpxpy.parse(source)

    rows = []
    for track in parsed.tracks:
        for segment in track.segments:
            for point in segment.points:
                # Drop the timezone information from the timestamp
                # (presumably because the Excel writer cannot store
                # timezone-aware values); points without a time stay None
                timestamp = point.time
                if timestamp:
                    timestamp = timestamp.replace(tzinfo=None)
                else:
                    timestamp = None

                rows.append(dict(
                    Latitude=point.latitude,
                    Longitude=point.longitude,
                    Elevation=point.elevation,
                    Time=timestamp,
                ))

    # Build the table and write it without the index column
    pd.DataFrame(rows).to_excel(output_file_path, index=False)


In [None]:
# TCX to XLSX
def convert_tcx_to_excel(tcx_file_path, output_file_path):
    """
    Converts a TCX file to an Excel file with trackpoint data.

    Each trackpoint becomes one row with the columns Time, Latitude,
    Longitude, Elevation, Distance, Heartrate, Cadence and Speed. Speed is
    taken from the trackpoint's TPX extension and is left empty when the
    recording does not provide it.

    Parameters:
    - tcx_file_path: str, path to the input TCX file
    - output_file_path: str, path to save the output Excel file
    """
    # Imported here because the notebook-level imports only bind the
    # `tcxreader` package name, which leaves the bare name `TCXReader`
    # undefined at call time.
    from tcxreader.tcxreader import TCXReader

    # Read the TCX file
    tcx_reader = TCXReader()
    data = tcx_reader.read(tcx_file_path)

    columns = ['Time', 'Latitude', 'Longitude', 'Elevation',
               'Distance', 'Heartrate', 'Cadence', 'Speed']

    # Extract the relevant fields of every trackpoint
    trackpoint_data = []
    for trackpoint in data.trackpoints:
        # A trackpoint without a Speed extension must not abort the whole
        # conversion, so look the key up defensively instead of indexing.
        extensions = trackpoint.tpx_ext or {}
        trackpoint_data.append({
            'Time': trackpoint.time,
            'Latitude': trackpoint.latitude,
            'Longitude': trackpoint.longitude,
            'Elevation': trackpoint.elevation,
            'Distance': trackpoint.distance,
            'Heartrate': trackpoint.hr_value,
            'Cadence': trackpoint.cadence,
            'Speed': extensions.get('Speed'),
        })

    # Naming the columns keeps the header row even for a file without
    # trackpoints
    df = pd.DataFrame(trackpoint_data, columns=columns)

    # Save the DataFrame to an Excel file
    df.to_excel(output_file_path, index=False, engine='openpyxl')


In [None]:
# Define the directories
sport_dir = "/content/Sport"
sport_xlsx_dir = "/content/Sport_xlsx"

# Create Sport_xlsx directory if it doesn't exist
os.makedirs(sport_xlsx_dir, exist_ok=True)


# Function to convert files in a folder
def convert_files_in_folder(rider_folder):
    """
    Converts every .gpx and .tcx file of one rider to .xlsx.

    Input files are read from `<sport_dir>/<rider_folder>` and the workbooks
    are written to `<sport_xlsx_dir>/<rider_folder>` (created if missing),
    keeping the file name and swapping only the extension.

    NOTE: a .gpx and a .tcx file sharing the same stem map to the same output
    name; the .tcx result is written last and therefore wins.

    Parameters:
    - rider_folder: str, name of the rider sub-folder (e.g. 'Rider1')
    """
    rider_folder_path = os.path.join(sport_dir, rider_folder)
    rider_xlsx_folder = os.path.join(sport_xlsx_dir, rider_folder)

    # Create the rider folder in Sport_xlsx directory if it doesn't exist
    os.makedirs(rider_xlsx_folder, exist_ok=True)

    # List the folder once; .gpx files are processed first, then .tcx files
    file_names = os.listdir(rider_folder_path)
    converters = (
        ('.gpx', convert_gpx_to_excel),
        ('.tcx', convert_tcx_to_excel),
    )
    for suffix, convert in converters:
        for file_name in file_names:
            if not file_name.endswith(suffix):
                continue
            # Swap only the trailing extension; str.replace would also rewrite
            # a '.gpx'/'.tcx' that happens to occur earlier in the name.
            xlsx_name = file_name[:-len(suffix)] + '.xlsx'
            convert(os.path.join(rider_folder_path, file_name),
                    os.path.join(rider_xlsx_folder, xlsx_name))


In [None]:
# Convert the recordings of all nine riders (folders Rider1 ... Rider9)
Riders = [f'Rider{number}' for number in range(1, 10)]
for rider in Riders:
    convert_files_in_folder(rider)

In [None]:
# Bundle the converted Rider1 workbooks into a single archive.
# NOTE(review): the folder is passed as an absolute path, so the archive
# entries presumably carry the `content/Sport_xlsx/Rider1/` prefix — that would
# explain the `/content/content/...` directory read by later cells. Confirm.
!zip -r /content/Sport_xlsx/Rider1.zip /content/Sport_xlsx/Rider1

## 5. Dataset overview (single rider)
Load and explore a single rider's files after successful conversion to XLSX format.

In [None]:
# Extract the Rider1 archive from the mounted drive into /content.
# NOTE(review): presumably a copy of /content/Sport_xlsx/Rider1.zip that was
# placed on Drive by hand — confirm.
!unzip /content/drive/MyDrive/opencampus_all_files/Rider1.zip -d /content

In [None]:
directory = '/content/content/Sport_xlsx/Rider1/'

# Per-file row counts, and per-column totals of missing cells over all files
row_counts = []
missing_values = pd.Series(dtype='float64')

# Loop through all files in the directory
for file in os.listdir(directory):
    if not file.endswith(".xlsx"):
        continue
    file_path = os.path.join(directory, file)

    # Read the Excel file
    df = pd.read_excel(file_path)

    # Count rows and add to list
    row_counts.append(df.shape[0])

    # Count missing values per column. Workbooks do not all share the same
    # columns (GPX-derived ones lack Distance/Heartrate/Cadence/Speed), and a
    # plain `+` would turn every column missing on one side into NaN;
    # fill_value=0 treats an absent column as contributing zero instead.
    missing_values = missing_values.add(df.isnull().sum(), fill_value=0)

# The fill above promotes the totals to float; they are counts, so store
# them as integers.
missing_values = missing_values.astype(int)

In [None]:
# Print the missing values summary accumulated over all Rider1 workbooks
print(missing_values)

In [None]:
# Histogram of the number of rows per workbook (30 bins)
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(row_counts, bins=30, color='blue', alpha=0.7)
ax.set_title("Histogram of Row Counts")
ax.set_xlabel("Number of Rows")
ax.set_ylabel("Frequency")
ax.grid(True)
plt.show()

In [None]:
# Boxplot of the number of rows per workbook
fig, ax = plt.subplots(figsize=(10, 6))
ax.boxplot(row_counts)
ax.set_title("Boxplot of Row Counts")
ax.grid(True)
plt.show()

In [None]:
# List the per-file row counts in ascending order
print(sorted(row_counts))

In [None]:
# Total number of rows over all files
print(sum(row_counts))

In [None]:
directory = '/content/content/Sport_xlsx/Rider1/'

# Duration of every track in seconds, with the matching file names kept in
# the same order
total_seconds = []
file_names = []

# Loop through all files in the directory
for file in os.listdir(directory):
    if not file.endswith(".xlsx"):
        continue
    file_path = os.path.join(directory, file)

    # Read the Excel file
    df = pd.read_excel(file_path)

    # Parse the timestamps and ignore points that carry none, so a missing
    # first or last timestamp cannot turn the whole duration into NaN
    timestamps = pd.to_datetime(df['Time']).dropna()
    if timestamps.empty:
        # Nothing to measure (empty workbook or no usable timestamps)
        print(f"Skipping {file}: no valid timestamps")
        continue

    # Seconds elapsed since the first recorded point; positional access
    # (.iloc) is used because it does not depend on the index labels
    elapsed = (timestamps - timestamps.iloc[0]).dt.total_seconds()

    # The track duration is the elapsed time at the last recorded point
    total_seconds.append(elapsed.iloc[-1])
    file_names.append(file)  # Store the filename

# Create a DataFrame for analysis
results_df = pd.DataFrame({'File': file_names, 'TrackLength': total_seconds})

In [None]:
# Plot the histogram of track durations.
# `total_seconds` holds one duration in seconds per file, so the x-axis is
# labelled as a duration rather than as a row count.
plt.figure(figsize=(10, 6))
plt.hist(total_seconds, bins=30, color='blue', alpha=0.7)
plt.title("Histogram of Track Length")
plt.xlabel("Track Length (seconds)")
plt.ylabel("Frequency")
plt.grid(True)
plt.show()

In [None]:
# Plot the boxplot of track durations (one value in seconds per file)
plt.figure(figsize=(10, 6))
plt.boxplot(total_seconds)
plt.title("Boxplot of Track Length")
plt.ylabel("Track Length (seconds)")
plt.grid(True)
plt.show()

In [None]:
# List the track durations (seconds) in ascending order
print(sorted(total_seconds))

In [None]:
# Locate the outlier, taken here to be the longest track; every file that
# reaches that maximum duration is reported
outlier_value = results_df['TrackLength'].max()
is_outlier = results_df['TrackLength'].eq(outlier_value)
outlier_file = results_df.loc[is_outlier]
print("Outlier file(s):")
print(outlier_file)

In [None]:
# Folder holding the raw Rider1 recordings
folder_path = '/content/Sport/Rider1/'

# Names of all GPX files in that folder
gpx_files = [file for file in os.listdir(folder_path) if file.endswith('.gpx')]

# One (latitudes, longitudes) pair per track segment, over all files
all_tracks = []
for gpx_file in gpx_files:
    file_path = os.path.join(folder_path, gpx_file)
    with open(file_path, 'r') as f:
        gpx = gpxpy.parse(f)

    for track in gpx.tracks:
        for segment in track.segments:
            all_tracks.append((
                [point.latitude for point in segment.points],
                [point.longitude for point in segment.points],
            ))

# Draw every segment as a thin black line, longitude on x and latitude on y
fig, ax = plt.subplots(figsize=(10, 8))
for segment_lats, segment_lons in all_tracks:
    ax.plot(segment_lons, segment_lats, color='black', linewidth=0.5)

ax.set_title("Tracks from GPX Files")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
ax.grid(True)
plt.show()

In [None]:
def find_tracks_with_low_coords(folder_path, max_longitude=10, max_latitude=40):
    """
    Identifies GPX files with tracks extending below the specified longitude or latitude.

    A file is flagged as soon as one of its track points has a longitude
    below `max_longitude` OR a latitude below `max_latitude`. Files that
    cannot be parsed or scanned are reported on stdout and not flagged.

    Parameters:
        folder_path (str): Path to the folder containing GPX files.
        max_longitude (float): Longitude threshold to check (files with points below this are flagged).
        max_latitude (float): Latitude threshold to check (files with points below this are flagged).

    Returns:
        List of filenames matching the criteria, each listed once, in
        directory-listing order.
    """
    flagged_files = []

    # Loop through all GPX files in the folder
    for gpx_file in os.listdir(folder_path):
        if not gpx_file.endswith('.gpx'):
            continue

        file_path = os.path.join(folder_path, gpx_file)
        with open(file_path, 'r') as f:
            try:
                gpx = gpxpy.parse(f)

                # any() stops at the first offending point, which replaces
                # the former `raise StopIteration` trick for leaving the
                # nested loops early.
                has_low_point = any(
                    point.longitude < max_longitude or point.latitude < max_latitude
                    for track in gpx.tracks
                    for segment in track.segments
                    for point in segment.points
                )
            except Exception as e:
                # Best effort: one unreadable file must not abort the scan
                print(f"Error processing {gpx_file}: {e}")
                continue

        if has_low_point:
            flagged_files.append(gpx_file)

    return flagged_files

# Example usage: scan the raw Rider1 recordings with the default thresholds
folder_path = '/content/Sport/Rider1/'
files_with_low_coords = find_tracks_with_low_coords(folder_path)

print("Files with tracks extending below 10 Longitude or 40 Latitude:")
# One flagged file name per line; nothing is printed when the list is empty
if files_with_low_coords:
    print('\n'.join(files_with_low_coords))