In [3]:
import os, shutil
import bs4 as bs
import requests
import datetime
import pandas as pd
import glob

def download_csv_files(folder_url, destination_folder, start_time, end_time):

  try:

    # Create the destination folder if it doesn't exist
    if not os.path.exists(destination_folder):
      os.makedirs(destination_folder)

    # Delete old files
    for filename in os.listdir(destination_folder):
      file_path = os.path.join(destination_folder, filename)
      try:
          if os.path.isfile(file_path) or os.path.islink(file_path):
              os.unlink(file_path)
          elif os.path.isdir(file_path):
              shutil.rmtree(file_path)
      except Exception as e:
          print('Failed to delete %s. Reason: %s' % (file_path, e))


    # Get the list of files in the folder
    response = requests.get(folder_url)
    data = bs.BeautifulSoup(response.text, "html.parser")

    csv_files = data.find_all("a", href=lambda href: href and href.endswith(".csv"))

    # Filter CSV files based on time range
    filtered_files = []
    for file_name in csv_files:
        csv_filename = file_name['href'].split('/')[-1]
        try:
            # Parse the filename to extract the creation or modification time
            file_time_str = csv_filename.split('_')[0]  # Assuming the time is in the first part
            file_time = datetime.datetime.strptime(file_time_str, '%d-%m-%Y-%H%M')
            if start_time <= file_time <= end_time:
                filtered_files.append(file_name)
        except ValueError as e:
            # Handle parsing errors (e.g., invalid filename format)
            print(f"Error parsing filename: {csv_filename}, {e.args[0]}")

    for file_name in filtered_files:
      csv_url = file_name['href']  # Get the CSV file URL
      csv_filename = csv_url.split('/')[-1]  # Extract the filename
      link = folder_url + csv_filename
      destination_link = os.path.join(destination_folder, csv_filename)

      response = requests.get(link)
      with open(destination_link, 'wb') as f:
         f.write(response.content)

    print("Downloading finished, outcome unknown.")

  except requests.exceptions.RequestException as e:
    print(f"Error downloading files: {e}")



# Initiate download of files
folder_url = "http://69.48.179.226/csv_files/"  # Replace with the actual folder URL
destination_folder = "csv_files"  # Replace with the desired destination folder
start_time = datetime.datetime(2024, 9, 13, 1, 10)
end_time = datetime.datetime(2024, 9, 13, 1, 40)

download_csv_files(folder_url, destination_folder, start_time, end_time)

Downloading finished, outcome unknown.


In [4]:
# Combines multiple CSV files into one.
def combine_csv_files(input_folder, output_file):
  
    # Get a list of all CSV files in the input folder
    csv_files = glob.glob(input_folder + "/*.csv")

    # Initialize an empty list to store DataFrames
    dataframes = []

    # Iterate through each CSV file
    for file in csv_files:
        # Read the CSV file into a DataFrame
        df = pd.read_csv(file)

        df = df.rename(columns={'time': 'dateTime'})
        df = df.rename(columns={'unknown': 'reflectivity'})

        # Select the desired columns
        df = df[['dateTime', 'latitude', 'longitude', 'reflectivity']]

        # Append the DataFrame to the list
        dataframes.append(df)

    # Concatenate all DataFrames into one
    combined_df = pd.concat(dataframes, ignore_index=True)

    # Save the combined DataFrame to a CSV file
    combined_df.to_csv(output_file, index=False)

input_folder = "csv_files"
output_file = "combined_data.csv"

combine_csv_files(input_folder, output_file)

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn

pd.set_option('display.max_columns', None)
np.set_printoptions(precision=15)

data = pd.read_csv('combined_data.csv')

In [6]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [7]:
from copy import deepcopy as dc

# prepare data frame to be dateTime | latitude | longitude | reflectivity | reflectivity_1 to _15
def prepare_dataframe_for_lstm(df, n_steps):
  df = dc(df)

  df['datetime'] = pd.to_datetime(df['dateTime'])

  # Encode 'datetime' as cyclical features (excluding day encoding)
  df['minute_sin'] = np.sin(2 * np.pi * df['datetime'].dt.minute / 60)
  df['minute_cos'] = np.cos(2 * np.pi * df['datetime'].dt.minute / 60)
  df['hour_sin'] = np.sin(2 * np.pi * df['datetime'].dt.hour / 24)
  df['hour_cos'] = np.cos(2 * np.pi * df['datetime'].dt.hour / 24)
  df['month_sin'] = np.sin(2 * np.pi * df['datetime'].dt.month / 12)
  df['month_cos'] = np.cos(2 * np.pi * df['datetime'].dt.month / 12)

  # Move new columns to the front
  new_columns = ['minute_sin', 'minute_cos', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos']
  remaining_columns = [col for col in df.columns if col not in new_columns]
  df = df[new_columns + remaining_columns]

  df = df.drop(columns=['dateTime'])
  df = df.drop(columns=['datetime'])

  for i in range(1, n_steps+1):
    df[f'reflectivity_{i}'] = df['reflectivity'].shift(i)

  df.dropna(inplace=True)

  return df

lookback = 15
df = prepare_dataframe_for_lstm(data, lookback)
df

Unnamed: 0,minute_sin,minute_cos,hour_sin,hour_cos,month_sin,month_cos,latitude,longitude,reflectivity,reflectivity_1,reflectivity_2,reflectivity_3,reflectivity_4,reflectivity_5,reflectivity_6,reflectivity_7,reflectivity_8,reflectivity_9,reflectivity_10,reflectivity_11,reflectivity_12,reflectivity_13,reflectivity_14,reflectivity_15
15,0.207912,-0.978148,0.258819,0.965926,-1.0,-1.836970e-16,46.745,256.704999,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0
16,0.207912,-0.978148,0.258819,0.965926,-1.0,-1.836970e-16,46.745,256.714999,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0
17,0.207912,-0.978148,0.258819,0.965926,-1.0,-1.836970e-16,46.745,256.724999,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0
18,0.207912,-0.978148,0.258819,0.965926,-1.0,-1.836970e-16,46.745,256.734999,17.5,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0
19,0.207912,-0.978148,0.258819,0.965926,-1.0,-1.836970e-16,46.745,256.744999,17.5,17.5,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013755,0.866025,-0.500000,0.258819,0.965926,-1.0,-1.836970e-16,49.635,260.844999,2.3,2.8,-99.0,-99.0,-99.0,1.5,1.7,1.8,-99.0,-99.0,-99.0,9.5,11.8,12.3,3.5,7.8
2013756,0.866025,-0.500000,0.258819,0.965926,-1.0,-1.836970e-16,49.635,260.854999,1.0,2.3,2.8,-99.0,-99.0,-99.0,1.5,1.7,1.8,-99.0,-99.0,-99.0,9.5,11.8,12.3,3.5
2013757,0.866025,-0.500000,0.258819,0.965926,-1.0,-1.836970e-16,49.635,260.864999,-99.0,1.0,2.3,2.8,-99.0,-99.0,-99.0,1.5,1.7,1.8,-99.0,-99.0,-99.0,9.5,11.8,12.3
2013758,0.866025,-0.500000,0.258819,0.965926,-1.0,-1.836970e-16,49.635,260.874999,-99.0,-99.0,1.0,2.3,2.8,-99.0,-99.0,-99.0,1.5,1.7,1.8,-99.0,-99.0,-99.0,9.5,11.8


In [35]:
shifted_df_as_np = df.to_numpy()

In [36]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(shifted_df_as_np)

In [37]:
X = scaled_data[:, 9:]
y = scaled_data[:, :9]

X = dc(np.flip(X, axis=1))

In [38]:
X_sequences = []
y_targets = []

for i in range(lookback, len(X)):
    # Collect the last `lookback` points as the input sequence
    X_seq = X[i-lookback:i, :]
    X_sequences.append(X_seq)
    
    # The next reflectivity point to predict
    y_targets.append(y[i, :])  # Here, you might want to change `y` to represent the next point correctly

X_sequences = np.array(X_sequences)
y_targets = np.array(y_targets)

X_sequences[-1:], y_targets[-1:]

(array([[[0.616472034419176, 0.61339889366933 , 0.61339889366933 ,
          0.620774431468961, 0.               , 0.               ,
          0.               , 0.686539643515673, 0.71419791026429 ,
          0.722188076213891, 0.685310387215735, 0.656422864167179,
          0.6299938537185  , 0.684081130915796, 0.68100799016595 ],
         [0.61339889366933 , 0.61339889366933 , 0.620774431468961,
          0.               , 0.               , 0.               ,
          0.686539643515673, 0.71419791026429 , 0.722188076213891,
          0.685310387215735, 0.656422864167179, 0.6299938537185  ,
          0.684081130915796, 0.68100799016595 , 0.666871542716656],
         [0.61339889366933 , 0.620774431468961, 0.               ,
          0.               , 0.               , 0.686539643515673,
          0.71419791026429 , 0.722188076213891, 0.685310387215735,
          0.656422864167179, 0.6299938537185  , 0.684081130915796,
          0.68100799016595 , 0.666871542716656, 0.          

In [11]:
split_index = int(len(X_sequences) * 0.95)

In [12]:
# Split into train and test sets
X_train, X_test = X_sequences[:split_index], X_sequences[split_index:]
y_train, y_test = y_targets[:split_index], y_targets[split_index:]

# Convert to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

In [13]:
from torch.utils.data import Dataset

class TimeSeriesDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y


    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


train_dataset = TimeSeriesDataset(X_train, y_train)
test_dataset = TimeSeriesDataset(X_test, y_test)
train_dataset

<__main__.TimeSeriesDataset at 0x147e16d20>

In [15]:
from torch.utils.data import DataLoader

batch_size = 512

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [16]:
for _, batch in enumerate(train_dataloader):
  x_batch, y_batch = batch[0].to(device), batch[1].to(device)
  print(x_batch.shape, y_batch.shape)
  break

torch.Size([512, 15, 15]) torch.Size([512, 9])


In [22]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_stacked_layers, output_size):
      super(LSTM, self).__init__()
      self.hidden_size = hidden_size
      self.num_stacked_layers = num_stacked_layers

      self.lstm = nn.LSTM(input_size, hidden_size, num_stacked_layers, batch_first=True)

      self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
      batch_size = x.size(0)
      h0 = torch.zeros(self.num_stacked_layers, batch_size, self.hidden_size).to(device)
      c0 = torch.zeros(self.num_stacked_layers, batch_size, self.hidden_size).to(device)
      out, _ = self.lstm(x)
      out = self.fc(out[:, -1, :])
      return out

input_size = X_train.shape[2]
hidden_size = 64
num_stack_layers = 2
output_size = y_train.shape[1]
model = LSTM(input_size=input_size, hidden_size=hidden_size, num_stacked_layers=num_stack_layers, output_size=output_size).to(device)
model

LSTM(
  (lstm): LSTM(15, 64, num_layers=2, batch_first=True)
  (fc): Linear(in_features=64, out_features=9, bias=True)
)

In [26]:
def train_one_epoch():
  model.train(True)
  print(f'Epoch: {epoch+1}')
  running_loss = 0.0

  for batch_index, batch in enumerate(train_dataloader):
    x_batch, y_batch = batch[0].to(device), batch[1].to(device)

    output = model(x_batch)
    loss = loss_function(output, y_batch)

    running_loss += loss.item()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if batch_index % 100 == 99:
      avg_loss_across_batch = running_loss / 100
      print(f'running loss {running_loss}')
      print(f'Batch: {batch_index+1}, Loss: {avg_loss_across_batch}')
      running_loss = 0.0

  print()

In [27]:
def validate_one_epoch():
  model.train(False)
  running_loss = 0.0

  for batch_index, batch in enumerate(test_dataloader):
    x_batch, y_batch = batch[0].to(device), batch[1].to(device)

    with torch.no_grad():
      output = model(x_batch)
      loss = loss_function(output, y_batch)

      running_loss += loss.item()

  avg_loss_across_batch = running_loss / len(test_dataloader)
  print(f'Running loss {running_loss} / test_dataloader length {len(test_dataloader)}')
  print(f'Validation Loss: {avg_loss_across_batch}')
  print()

In [28]:
learning_rate = 0.001
num_epochs = 1

loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
  train_one_epoch()
  validate_one_epoch()

Epoch: 1
running loss 4.319975692778826
Batch: 100, Loss: 0.043199756927788255
running loss 4.31719970330596
Batch: 200, Loss: 0.0431719970330596
running loss 4.31133820861578
Batch: 300, Loss: 0.0431133820861578
running loss 4.304677050560713
Batch: 400, Loss: 0.04304677050560713
running loss 4.2929847575724125
Batch: 500, Loss: 0.04292984757572413
running loss 4.286273013800383
Batch: 600, Loss: 0.042862730138003825
running loss 4.282859731465578
Batch: 700, Loss: 0.042828597314655784
running loss 4.280798465013504
Batch: 800, Loss: 0.04280798465013504
running loss 4.301626067608595
Batch: 900, Loss: 0.04301626067608595
running loss 4.295467708259821
Batch: 1000, Loss: 0.04295467708259821
running loss 4.278264492750168
Batch: 1100, Loss: 0.04278264492750168
running loss 4.286611258983612
Batch: 1200, Loss: 0.04286611258983612
running loss 4.283159833401442
Batch: 1300, Loss: 0.04283159833401442
running loss 4.299402695149183
Batch: 1400, Loss: 0.042994026951491834
running loss 4.2625

KeyboardInterrupt: 