<a href="https://colab.research.google.com/github/shashaaankk/GradientAscent/blob/main/GradientAscent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# imports
import pandas as pd
import numpy as np
import gpxpy
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
import math


In [3]:
# Environment switch read by the setup cells below:
#   True  -> local run: the dataset archive is fetched directly into ./data
#   False -> Colab run: kaggle.json is uploaded and the dataset is fetched through kagglehub
LOCAL = True

In [4]:
# Colab-only setup: install the Kaggle client libraries and ask the user to upload their API token.
if not LOCAL:
    # `!` is an IPython shell escape, so this line only works inside a notebook kernel.
    !pip install --quiet kaggle kagglehub[pandas-datasets]
    from google.colab import files
    uploaded = files.upload()   # click to select your kaggle.json
    # The next cell moves a file literally named "kaggle.json", so stop here if it was not uploaded
    # (presumably `uploaded` is keyed by file name -- confirm against the Colab API).
    if 'kaggle.json' not in uploaded:
        raise FileNotFoundError("You must upload the kaggle.json you downloaded from Kaggle.")


In [5]:
# Colab-only: install the uploaded Kaggle token where the Kaggle client looks for it.
if not LOCAL:
    import os, shutil

    # make sure ~/.kaggle exists
    kaggle_dir = os.path.expanduser("~/.kaggle")
    os.makedirs(kaggle_dir, exist_ok=True)
    kaggle_token = os.path.join(kaggle_dir, "kaggle.json")

    # Move the upload into place. On a re-run the upload has already been moved,
    # so only fail when the token is in neither location.
    if os.path.exists("kaggle.json"):
        shutil.move("kaggle.json", kaggle_token)
    elif not os.path.exists(kaggle_token):
        raise FileNotFoundError(
            "kaggle.json was found neither in the working directory nor in ~/.kaggle; "
            "re-run the upload cell first."
        )

    # Owner read/write only, so the credentials are not readable by other users.
    os.chmod(kaggle_token, 0o600)

    # sometimes needed:
    os.environ['KAGGLE_CONFIG_DIR'] = kaggle_dir


In [6]:
if not LOCAL:
    import kagglehub
    path = kagglehub.dataset_download("roccoli/gpx-hike-tracks")
else:
    !mkdir data
    !curl -L -o ./data/gpx-hike-tracks.zip https://www.kaggle.com/api/v1/datasets/download/roccoli/gpx-hike-tracks
    !unzip -o data/gpx-hike-tracks.zip -d data/

A subdirectory or file data already exists.
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0

  0 57.4M    0 79664    0     0   108k      0  0:09:02 --:--:--  0:09:02  108k
 22 57.4M   22 12.9M    0     0  7748k      0  0:00:07  0:00:01  0:00:06 12.9M
 60 57.4M   60 34.5M    0     0  12.7M      0  0:00:04  0:00:02  0:00:02 17.3M
 96 57.4M   96 55.6M    0     0  14.9M      0  0:00:03  0:00:03 --:--:-- 18.5M
100 57.4M  100 57.4M    0     0  15.1M      0  0:00:03  0:00:03 --:--:-- 18.7M
'unzip' is not recognized as an internal or external command,
operable program or batch file.


In [7]:
# 1. Install gpxpy (to parse .gpx files)
import gpxpy
import pandas as pd
if not LOCAL:
    !pip install --quiet gpxpy

    # 2. Import libs
    import os, glob


    csv_files = glob.glob(os.path.join(path, "**", "*.csv"), recursive=True)
    csv_path = csv_files[0]
    print("Loading:", csv_path)

else:
    csv_path = "data/gpx-tracks-from-hikr.org.csv"

# Read and inspect
df = pd.read_csv(csv_path)
# print("Shape:", df.shape)
# print("Columns:", df.columns.tolist())
# print(df.head())



In [8]:
# Getting distance between start and end points of GPX tracks

def haversine(lat1, lon1, lat2, lon2):
    """
    Great-circle distance between two points on a sphere of radius 6,371,000 m.

    Parameters are latitudes/longitudes in decimal degrees.
    Returns the distance in meters as a float.
    """
    R = 6371000  # radius in meters
    phi1, phi2 = math.radians(lat1), math.radians(lat2)
    dphi = math.radians(lat2 - lat1)
    dlambda = math.radians(lon2 - lon1)
    a = math.sin(dphi / 2)**2 + math.cos(phi1) * math.cos(phi2) * math.sin(dlambda / 2)**2
    # Mathematically 0 <= a <= 1, but rounding can push it a hair outside that range
    # (e.g. for near-antipodal points), which would make sqrt/asin raise ValueError.
    a = min(1.0, max(0.0, a))
    return R * 2 * math.asin(math.sqrt(a))


def start_end_distance_from_gpx_string(gpx_string):
    """
    Parse a GPX XML string, extract the first and last trkpt coordinates,
    and return the Haversine distance between them in meters
    (haversine() works with an Earth radius expressed in meters).

    Returns None when the input is not text, the XML is malformed,
    a lat/lon attribute is not numeric, or fewer than 2 track points
    carry both attributes.
    """
    # Missing values (None / NaN) would make the XML parser raise TypeError.
    if not isinstance(gpx_string, (str, bytes)):
        return None
    try:
        root = ET.fromstring(gpx_string)
        # endswith() matches the tag with or without an XML namespace prefix.
        trkpts = [
            (float(elem.attrib['lat']), float(elem.attrib['lon']))
            for elem in root.iter()
            if elem.tag.endswith('trkpt') and 'lat' in elem.attrib and 'lon' in elem.attrib
        ]
    except (ET.ParseError, ValueError):
        # ParseError: malformed XML; ValueError: non-numeric coordinate text.
        return None
    if len(trkpts) < 2:
        return None
    (lat1, lon1), (lat2, lon2) = trkpts[0], trkpts[-1]
    return haversine(lat1, lon1, lat2, lon2)

In [9]:
#Pre-processing 1

# Drop incomplete rows, then derive every engineered column in a single chain.
# assign() evaluates its keyword arguments in order, so the later lambdas see the
# already-parsed timestamps.
df = df.dropna().assign(
    # Timestamps that do not match the expected layout become NaT instead of raising.
    start_time=lambda frame: pd.to_datetime(
        frame["start_time"], format="%Y-%m-%d %H:%M:%S", errors='coerce'
    ),
    end_time=lambda frame: pd.to_datetime(
        frame["end_time"], format="%Y-%m-%d %H:%M:%S", errors='coerce'
    ),
    # NOTE: `duration` is a copy of `moving_time`, not the wall-clock span.
    duration=lambda frame: frame["moving_time"],
    # Wall-clock span in seconds minus moving_time.
    break_time=lambda frame: (
        (frame["end_time"] - frame["start_time"]).dt.total_seconds() - frame["moving_time"]
    ),
    # Distance between the first and last track point of each GPX document.
    start_end_distance=lambda frame: frame["gpx"].apply(start_end_distance_from_gpx_string),
)


In [None]:
import pandas as pd
import xml.etree.ElementTree as ET
import math

# Haversine distance for horizontal distance calculation
def haversine(lat1, lon1, lat2, lon2):
    """
    Great-circle distance between two points on a sphere of radius 6,371,000 m.

    Parameters are latitudes/longitudes in decimal degrees.
    Returns the distance in meters as a float.
    """
    R = 6371000  # radius in meters
    phi1, phi2 = math.radians(lat1), math.radians(lat2)
    dphi = math.radians(lat2 - lat1)
    dlambda = math.radians(lon2 - lon1)
    a = math.sin(dphi / 2)**2 + math.cos(phi1) * math.cos(phi2) * math.sin(dlambda / 2)**2
    # Mathematically 0 <= a <= 1, but rounding can push it a hair outside that range
    # (e.g. for near-antipodal points), which would make sqrt/asin raise ValueError.
    a = min(1.0, max(0.0, a))
    return R * 2 * math.asin(math.sqrt(a))

def _summarise_slopes(slopes, direction):
    """Build the mean/max/median/count entries for one direction ('ascent' or 'descent')."""
    if not slopes:
        # No segment in this direction: statistics are undefined, the count is zero.
        return {
            f'mean_{direction}_slope_pct': None,
            f'max_{direction}_slope_pct': None,
            f'median_{direction}_slope_pct': None,
            f'count_{direction}_segments': 0,
        }
    return {
        f'mean_{direction}_slope_pct': sum(slopes) / len(slopes),
        f'max_{direction}_slope_pct': max(slopes),
        f'median_{direction}_slope_pct': pd.Series(slopes).median(),
        f'count_{direction}_segments': len(slopes),
    }


def slope_stats_from_gpx_string(gpx_string):
    """
    Parse GPX XML string, extract track points, compute slopes between consecutive points.

    Only track points that carry numeric lat, lon and <ele> values are used; any
    point with a missing or non-numeric value is skipped. Segments with no
    horizontal extent are ignored, as are segments without elevation change.

    Returns a dict with mean / max / median slope (in percent, as positive
    numbers) and the segment count, separately for ascent and descent; the
    statistics of a direction without segments are None and its count is 0.
    Returns None when the input is not text, the XML is malformed, or fewer
    than 2 usable track points remain.
    """
    # Missing values (None / NaN) would make the XML parser raise TypeError.
    if not isinstance(gpx_string, (str, bytes)):
        return None
    try:
        root = ET.fromstring(gpx_string)
    except ET.ParseError:
        return None

    pts = []
    for elem in root.iter():
        # endswith() matches the tag with or without an XML namespace prefix.
        if not elem.tag.endswith('trkpt'):
            continue
        ele_elem = elem.find('{*}ele')
        if ele_elem is None:
            continue
        try:
            pts.append((float(elem.attrib['lat']), float(elem.attrib['lon']), float(ele_elem.text)))
        except (KeyError, TypeError, ValueError):
            # Missing attribute, empty <ele>, or non-numeric text: drop this point only.
            continue
    if len(pts) < 2:
        return None  # insufficient points

    ascent_slopes = []
    descent_slopes = []
    for (lat1, lon1, ele1), (lat2, lon2, ele2) in zip(pts, pts[1:]):
        horiz = haversine(lat1, lon1, lat2, lon2)
        if horiz <= 0:
            continue  # repeated position: the grade is undefined
        ele_diff = ele2 - ele1
        slope_pct = (ele_diff / horiz) * 100  # percent grade
        if ele_diff > 0:
            ascent_slopes.append(slope_pct)
        elif ele_diff < 0:
            descent_slopes.append(abs(slope_pct))

    # Ascent keys first, then descent keys, so the resulting column order is stable.
    return {
        **_summarise_slopes(ascent_slopes, 'ascent'),
        **_summarise_slopes(descent_slopes, 'descent'),
    }

# Example: sample GPX snippet
gpx_snippet = """<?xml version="1.0" encoding="UTF-8"?>
<gpx xmlns="http://www.topografix.com/GPX/1/1">
  <trk><trkseg>
    <trkpt lat="47.231143" lon="13.227007"><ele>1322.96</ele></trkpt>
    <trkpt lat="47.230543" lon="13.227488"><ele>1330.89</ele></trkpt>
    <trkpt lat="47.23103" lon="13.226896"><ele>1326.17</ele></trkpt>
  </trkseg></trk>
</gpx>"""

# One-row frame holding the raw GPX document
df_example = pd.DataFrame({'gpx': [gpx_snippet]})
# Each stats dict is expanded into one column per key
df_stats = df_example['gpx'].apply(slope_stats_from_gpx_string).apply(pd.Series)
# Both frames share the same index, so an index join lines the stats up next to their source row
df_result = df_example.join(df_stats)
print("Example slope stats:", df_result, sep="\n")


KeyError: 'gpx'

In [None]:
#Pre-processing 0 (get rid of values with equal start and end time)

# Only the lookup of `df` is guarded, so genuine errors further down are not
# misreported as "df is not defined".
try:
    df
except NameError:
    print("DataFrame 'df' is not defined. Please ensure your DataFrame is named 'df' and has 'start_time' and 'end_time' columns.")
else:
    # Convert to datetime if not already
    df['start_time'] = pd.to_datetime(df['start_time'], errors='coerce')
    df['end_time'] = pd.to_datetime(df['end_time'], errors='coerce')

    # Drop rows where start_time equals end_time
    initial_count = len(df)
    df = df[df['start_time'] != df['end_time']].copy()
    dropped_count = initial_count - len(df)

    # Display result summary and first few rows.
    # `display` is provided by the notebook kernel; it replaces the `ace_tools`
    # helper, which is not an installable package and made this cell fail.
    print(f"Dropped {dropped_count} rows where start_time == end_time.")
    display(df.head())


In [None]:

# Select relevant features
selected = df.loc[:, [
    "duration", "length_3d", "min_elevation", "max_elevation",
    "break_time", "uphill", "downhill",
]]
X = selected

# Target: the second character of the difficulty label, cast to int
y = df['difficulty'].str.get(1).astype(int)

# Standardise every feature column (the scaler is fitted on all rows of X)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Keep only the rows where every feature and the target are present
mask = y.notna() & X.notna().all(axis="columns")
X_clean = X_scaled[mask]
y_clean = y[mask]


In [None]:
# Train test split
# Hold out 20% of the cleaned rows for evaluation. Stratifying on the target keeps the
# class proportions alike in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_clean,
    y_clean,
    stratify=y_clean,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Model
# Logistic regression over the difficulty classes.
# `multi_class` is deliberately not passed: the argument is deprecated in current
# scikit-learn releases, and with the lbfgs solver the default already fits a
# multinomial model whenever the target has more than two classes
# (assumes `y_train` holds more than two difficulty levels -- confirm).
model = LogisticRegression(solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out split
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# PCA on the standardised feature matrix.
# X_clean (built in the feature-selection cell) is already scaled and filtered with `mask`.
# Re-fitting a scaler on the unfiltered X here used to overwrite X_clean with a matrix that
# no longer lined up with y_clean and could contain NaN rows, which PCA does not accept.
pca = PCA()
X_pca = pca.fit_transform(X_clean)
# explained = pca.explained_variance_ratio_