In [1]:
import pandas as pd
import numpy as np
import os
import time
import datetime
import requests
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from haversine import haversine, Unit
from geopy.geocoders import Nominatim
from tenacity import retry, wait_fixed, stop_after_attempt
import requests_cache
import openmeteo_requests
from openmeteo_sdk import Variable

from sklearn.model_selection import train_test_split

In [2]:
def filter_csvs(folder_path, heart_rate_col, cadence_col):
    valid_files = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".csv"):
            file_path = os.path.join(folder_path, filename)
            df = pd.read_csv(file_path)
            if heart_rate_col in df.columns and cadence_col in df.columns:
                if not df[heart_rate_col].isnull().any() and not df[cadence_col].isnull().any():
                    valid_files.append(folder_path + '/' + filename)
        else:
          pass

    return valid_files

In [3]:
folder_path = "./CSVFiles/CSVFiles"
heart_rate_column = "Heart Rate"
cadence_column = "Cadence"

csvs = filter_csvs(folder_path, heart_rate_column, cadence_column)

In [4]:
print(csvs)

['./CSVFiles/CSVFiles/10261151.csv', './CSVFiles/CSVFiles/10555425.csv', './CSVFiles/CSVFiles/11136327.csv', './CSVFiles/CSVFiles/11174241.csv', './CSVFiles/CSVFiles/11174244.csv', './CSVFiles/CSVFiles/11197455.csv', './CSVFiles/CSVFiles/11198898.csv', './CSVFiles/CSVFiles/11198901.csv', './CSVFiles/CSVFiles/11198906.csv', './CSVFiles/CSVFiles/11198921.csv', './CSVFiles/CSVFiles/11233593.csv', './CSVFiles/CSVFiles/11247820.csv', './CSVFiles/CSVFiles/11289745.csv', './CSVFiles/CSVFiles/11296851.csv', './CSVFiles/CSVFiles/11302592.csv', './CSVFiles/CSVFiles/11303691.csv', './CSVFiles/CSVFiles/11318464.csv', './CSVFiles/CSVFiles/11332063.csv', './CSVFiles/CSVFiles/11361304.csv', './CSVFiles/CSVFiles/11365167.csv', './CSVFiles/CSVFiles/11365170.csv', './CSVFiles/CSVFiles/11381199.csv', './CSVFiles/CSVFiles/11416438.csv', './CSVFiles/CSVFiles/11417703.csv', './CSVFiles/CSVFiles/11420658.csv', './CSVFiles/CSVFiles/11421011.csv', './CSVFiles/CSVFiles/11441259.csv', './CSVFiles/CSVFiles/114554

In [5]:
dfs = [pd.read_csv(csv_file) for csv_file in csvs]

In [6]:
from openmeteo_sdk.Variable import Variable

om = openmeteo_requests.Client()
params = {
    "latitude": 52.54,
    "longitude": 13.41,
    "hourly": ["temperature_2m", "precipitation", "wind_speed_10m"],
    "current": ["temperature_2m", "relative_humidity_2m"]
}

responses = om.weather_api("https://api.open-meteo.com/v1/forecast", params=params)
response = responses[0]
print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
print(f"Elevation {response.Elevation()} m asl")
print(f"Timezone {response.Timezone()} {response.TimezoneAbbreviation()}")
print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")

# Current values
current = response.Current()
current_variables = list(map(lambda i: current.Variables(i), range(0, current.VariablesLength())))
current_temperature_2m = next(filter(lambda x: x.Variable() == Variable.temperature and x.Altitude() == 2, current_variables))
current_relative_humidity_2m = next(filter(lambda x: x.Variable() == Variable.relative_humidity and x.Altitude() == 2, current_variables))

print(f"Current time {current.Time()}")
print(f"Current temperature_2m {current_temperature_2m.Value()}")
print(f"Current relative_humidity_2m {current_relative_humidity_2m.Value()}")

Coordinates 52.540000915527344°N 13.419998168945312°E
Elevation 60.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
Current time 1743104700
Current temperature_2m 6.650000095367432
Current relative_humidity_2m 83.0


In [7]:
def get_weather_at_timestamp(lat, lon, date_time):
    # Initialize Open-Meteo client
    om = openmeteo_requests.Client()
    
    # Define API parameters
    params = {
        "latitude": lat,
        "longitude": lon,
        "hourly": ["temperature_2m", "precipitation", "wind_speed_10m", "relative_humidity_2m"],  # Add more variables here if needed
        "timezone": "auto"
    }

    # Fetch data from Open-Meteo API
    responses = om.weather_api("https://api.open-meteo.com/v1/forecast", params=params)
    response = responses[0]  # Extract first response

    # Extract metadata
    weather_info = {
        "latitude": response.Latitude(),
        "longitude": response.Longitude(),
        "elevation": response.Elevation(),
        "timezone": response.Timezone(),
        "timezone_abbreviation": response.TimezoneAbbreviation(),
        "utc_offset_seconds": response.UtcOffsetSeconds(),
    }

    # Extract hourly weather data
    hourly = response.Hourly()
    times = pd.date_range(
        start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
        end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
        freq=pd.Timedelta(seconds=hourly.Interval()),
        inclusive="left"
    )

    # Retrieve values for temperature and precipitation
    temperature_values = hourly.Variables(0).ValuesAsNumpy()
    precipitation_values = hourly.Variables(1).ValuesAsNumpy()
    wind_speed_values = hourly.Variables(2).ValuesAsNumpy()
    relative_humidity_values = hourly.Variables(3).ValuesAsNumpy()

    # Find the closest timestamp to the requested time
    closest_idx = min(range(len(times)), key=lambda i: abs(times[i] - date_time))

    weather_info.update({
        "time": times[closest_idx],
        "temperature_2m": temperature_values[closest_idx],
        "precipitation": precipitation_values[closest_idx],
        "wind_speed": wind_speed_values[closest_idx],
        "relative_humidity": relative_humidity_values[closest_idx]
    })

    return weather_info


In [8]:
pd.to_datetime(dfs[0]['Time'].iloc[0])

Timestamp('2023-09-16 08:05:40+0000', tz='UTC')

In [9]:
lat, lon = dfs[0]['Latitude'].iloc[0], dfs[0]['Longitude'].iloc[0]
timestamp = dfs[0]['Time'].iloc[0]

weather_data = get_weather_at_timestamp(lat, lon, pd.to_datetime(timestamp))
print(weather_data)

{'latitude': 51.44000244140625, 'longitude': 0.17999982833862305, 'elevation': 36.0, 'timezone': b'Europe/London', 'timezone_abbreviation': None, 'utc_offset_seconds': 0, 'time': Timestamp('2025-03-27 00:00:00+0000', tz='UTC'), 'temperature_2m': 7.8785, 'precipitation': 0.0, 'wind_speed': 5.95906, 'relative_humidity': 74.0}


In [None]:
from retrying import retry
def convert_time_to_seconds(df):
    df['Time'] = pd.to_datetime(df['Time'])

    time_diff = df['Time'] - df['Time'].iloc[0]

    df['Time'] = time_diff.dt.total_seconds()

    return df

@retry(stop_max_attempt_number=3, wait_fixed=5000)  # Retry 3 times with a 5-second wait between retries
def fetch_hourly_weather(lat, lon):
    """
    Fetch hourly weather data for a given latitude and longitude.
    Returns a DataFrame with weather values for each hour.
    """
    # Initialize Open-Meteo client
    om = openmeteo_requests.Client()
    
    # Define API parameters
    params = {
        "latitude": lat,
        "longitude": lon,
        "hourly": ["temperature_2m", "precipitation", "wind_speed_10m", "relative_humidity_2m"],
        "timezone": "auto"
    }

    try:
        # Fetch data from Open-Meteo API with a longer timeout (e.g., 60 seconds)
        responses = om.weather_api("https://api.open-meteo.com/v1/forecast", params=params)
        response = responses[0]  # Extract first response

        # Extract hourly weather data
        hourly = response.Hourly()
        times = pd.date_range(
            start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
            end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
            freq=pd.Timedelta(seconds=hourly.Interval()),
            inclusive="left"
        )

        # Retrieve values for each weather variable
        weather_data = pd.DataFrame({
            "time": times,
            "temperature_2m": hourly.Variables(0).ValuesAsNumpy(),
            "precipitation": hourly.Variables(1).ValuesAsNumpy(),
            "wind_speed_10m": hourly.Variables(2).ValuesAsNumpy(),
            "relative_humidity_2m": hourly.Variables(3).ValuesAsNumpy()
        })

        return weather_data

    except requests.exceptions.Timeout:
        print("Request timed out. Please try again later.")
        return pd.DataFrame()  # Return an empty DataFrame in case of timeout


def get_weather_for_dataframe(df, lat, lon, time_column='Time'):
    # Fetch weather data for the given latitude and longitude
    weather_data = fetch_hourly_weather(lat, lon)

    # Ensure both dataframes' time columns are in the same time zone (UTC) and in the same format
    df[time_column] = pd.to_datetime(df[time_column])  # Ensure the column is in datetime format

    if df[time_column].dt.tz is None:  # If timezone is not set
        df[time_column] = df[time_column].dt.tz_localize('UTC')  # Localize to UTC
    else:  # If timezone is already set
        df[time_column] = df[time_column].dt.tz_convert('UTC')  # Convert to UTC

    # Check the column names in weather_data to confirm the correct time column name
    print(weather_data.columns)  # Debugging step to check the column names

    # Adjust this line if the time column is named differently
    if 'time' not in weather_data.columns:
        raise KeyError("'time' column is missing in the weather data")

    # Ensure weather data time is in UTC
    weather_data['time'] = pd.to_datetime(weather_data['time'], utc=True)  # Ensure time is in UTC

    # Merge weather data with the existing dataframe based on time
    merged_df = pd.merge_asof(df.sort_values(by=time_column),
                              weather_data.sort_values(by='time'),
                              left_on=time_column, right_on='time',
                              direction='nearest')

    # Return the dataframe with the weather features included
    return merged_df



In [38]:
og_df = dfs[0]

In [41]:
og_df.head()

Unnamed: 0,Latitude,Longitude,Elevation,Time,Cadence,Heart Rate
0,51.434847,0.189615,35.9,2023-09-16 08:05:40+00:00,0,91
1,51.434849,0.189616,35.9,2023-09-16 08:05:41+00:00,0,90
2,51.434849,0.189617,35.9,2023-09-16 08:05:42+00:00,0,90
3,51.434841,0.189612,35.9,2023-09-16 08:05:43+00:00,0,90
4,51.43483,0.189598,35.9,2023-09-16 08:05:44+00:00,0,90


In [39]:
dfs[0] = og_df

In [11]:
i = 1
for index in range(len(dfs)):
    dfs[index]['Time'] = pd.to_datetime(dfs[index]['Time'])

    # Update the dataframe in the list
    dfs[index] = get_weather_for_dataframe(
        dfs[index], 
        dfs[index]["Latitude"].iloc[0], 
        dfs[index]['Longitude'].iloc[0], 
        time_column='Time'
    )

    convert_time_to_seconds(dfs[index])
    dfs[index]['Altitude_diff'] = dfs[index]['Elevation'].diff()

    print(i)
    i += 1

dfs[0].head()



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
Request timed out. Please try again later.


KeyError: 'time'

In [44]:
dfs[2].head()

Unnamed: 0,Latitude,Longitude,Elevation,Time,Cadence,Heart Rate
0,43.694719,-73.34271,115.6,2023-10-21 20:50:32+00:00,73,146
1,43.694743,-73.342724,115.6,2023-10-21 20:50:33+00:00,74,148
2,43.694766,-73.342737,115.6,2023-10-21 20:50:34+00:00,74,148
3,43.694792,-73.342747,115.6,2023-10-21 20:50:35+00:00,74,150
4,43.694827,-73.342759,115.6,2023-10-21 20:50:36+00:00,74,150


In [19]:
dfs[45].tail()

Unnamed: 0,Latitude,Longitude,Elevation,Time,Cadence,Heart Rate,Hour,Temperature,Precipitation,Wind Speed,Relative Humidity,Dewpoint
1505,51.903335,-0.177427,117.400002,2024-11-23 09:26:03+00:00,83,150,2024-11-23 09:00:00+00:00,,,,,
1506,51.903298,-0.177405,117.400002,2024-11-23 09:26:04+00:00,83,149,2024-11-23 09:00:00+00:00,,,,,
1507,51.903265,-0.177393,117.599998,2024-11-23 09:26:05+00:00,83,150,2024-11-23 09:00:00+00:00,,,,,
1508,51.903232,-0.177379,117.599998,2024-11-23 09:26:06+00:00,83,150,2024-11-23 09:00:00+00:00,,,,,
1509,51.9032,-0.177366,117.800003,2024-11-23 09:26:07+00:00,83,150,2024-11-23 09:00:00+00:00,,,,,
