In [27]:
from datetime import timedelta, datetime, timezone
import bs4 as bs
import os, shutil
import requests
import pandas as pd
from glob import glob
import numpy as np
import random

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_colwidth', None)

root_dir = os.path.dirname(os.getcwd()) + "/data/"

In [28]:
def random_time_window(start_date: datetime, end_date: datetime, minutes: int):
    """
    Returns a random (start, end) datetime tuple within [start_date, end_date],
    where the difference between start and end is exactly `minutes`.
    """
    delta = end_date - start_date
    max_start = delta.total_seconds() - minutes * 60
    if max_start < 0:
        raise ValueError("Time window is too large for the given range.")
    random_offset = random.uniform(0, max_start)
    random_start = start_date + timedelta(seconds=random_offset)
    random_end = random_start + timedelta(minutes=minutes)
    return random_start, random_end

In [29]:
def download_files(folder_url, destination_folder, start_time, end_time, granularity_minutes=2):
    """
    Downloads files from folder_url to destination_folder within the time window,
    only keeping files at intervals of granularity_minutes.
    """
    file_type = "npy"

    try:
        # Create the destination folder if it doesn't exist
        if not os.path.exists(destination_folder):
            os.makedirs(destination_folder)

        # Delete old files
        for filename in os.listdir(destination_folder):
            file_path = os.path.join(destination_folder, filename)
            try:
                if os.path.isfile(file_path) or os.path.islink(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception as e:
                print('Failed to delete %s. Reason: %s' % (file_path, e))

        # Get the list of files in the folder
        response = requests.get(folder_url)
        data = bs.BeautifulSoup(response.text, "html.parser")

        csv_files = data.find_all("a", href=lambda href: href and href.endswith(f'.{file_type}'))

        # Filter files based on time range and granularity
        filtered_files = []
        last_selected_time = None
        for file_name in csv_files:
            file_type_filename = file_name['href'].split('/')[-1]
            try:
                # Parse the filename to extract the creation or modification time
                file_time_str = file_type_filename.split('.')[0]  # Extract '20250605-053644'
                file_time = datetime.strptime(file_time_str, '%Y%m%d-%H%M%S').replace(tzinfo=timezone.utc)
                # Ignore seconds and microseconds for granularity comparison
                file_time_rounded = file_time.replace(second=0, microsecond=0)
                if start_time <= file_time <= end_time:
                    if (last_selected_time is None or 
                        (file_time_rounded - last_selected_time).total_seconds() >= granularity_minutes * 60):
                        filtered_files.append(file_name)
                        last_selected_time = file_time_rounded
            except ValueError as e:
                # Handle parsing errors (e.g., invalid filename format)
                print(f"Error parsing filename: {file_type_filename}, {e.args[0]}")
        if len(filtered_files) == 0:
            print("No files matching that criteria")

        for file_name in filtered_files:
            file_type_url = file_name['href']  # Get the file URL
            file_type_filename = file_type_url.split('/')[-1]  # Extract the filename
            link = folder_url + file_type_filename
            destination_link = os.path.join(destination_folder, file_type_filename)

            response = requests.get(link)
            with open(destination_link, 'wb') as f:
                f.write(response.content)

        print("Downloading finished, outcome unknown.")

    except requests.exceptions.RequestException as e:
        print(f"Error downloading files: {e}")


In [30]:
data_start_time = datetime(2025, 6, 3, 16, 38, tzinfo=timezone.utc)
data_end_time = datetime.now(timezone.utc)

train_data_start_time, train_data_end_time = random_time_window(data_start_time, data_end_time, 240)
validation_data_start_time, validation_data_end_time = random_time_window(data_start_time, data_end_time, 120)
test_data_start_time, test_data_end_time = random_time_window(data_start_time, data_end_time, 120)

print(f"Train data time window: {train_data_start_time} to {train_data_end_time}")
print(f"Validation data time window: {validation_data_start_time} to {validation_data_end_time}")
print(f"Test data time window: {test_data_start_time} to {test_data_end_time}")


Train data time window: 2025-06-04 07:24:05.083714+00:00 to 2025-06-04 11:24:05.083714+00:00
Validation data time window: 2025-06-05 08:11:06.763740+00:00 to 2025-06-05 10:11:06.763740+00:00
Test data time window: 2025-06-04 15:21:48.635249+00:00 to 2025-06-04 17:21:48.635249+00:00


In [31]:
# Initiate download of files
folder_url = f'http://66.213.177.43/v2_npy_files/'
granularity_minutes = 10

train_destination_folder = root_dir + f'train_npy_files'
download_files(folder_url, train_destination_folder, train_data_start_time, train_data_end_time, granularity_minutes)

validation_destination_folder = root_dir + f'validation_npy_files'
download_files(folder_url, validation_destination_folder, validation_data_start_time, validation_data_end_time, granularity_minutes)

test_destination_folder = root_dir + f'test_npy_files'
download_files(folder_url, test_destination_folder, test_data_start_time, test_data_end_time, granularity_minutes)

Downloading finished, outcome unknown.
Downloading finished, outcome unknown.
Downloading finished, outcome unknown.


In [32]:
# Load the .npy files into a list
npy_files = sorted(glob(root_dir + 'train_npy_files/*.npy'))
npy_files += sorted(glob(root_dir + 'validation_npy_files/*.npy'))
npy_files += sorted(glob(root_dir + 'test_npy_files/*.npy'))

for i in enumerate(npy_files):
    array = np.load(i[1])
    
    array[array < 20] = 0
    array[array > 80] = 80

    np.save(i[1], array)

In [33]:
import subprocess

print("Finished downloading and data processing at " + datetime.now().strftime('%d/%m/%y %H:%M:%S.%f'))

Finished downloading and data processing at 05/06/25 09:04:56.650420
