In [141]:
#!/usr/bin/env python
# coding: utf-8

In[121]:

In [142]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

Load the data

In[122]:

In [143]:
!mkdir data && wget https://github.com/sdip15fa/weather-predict/raw/master/data/processed_data.csv -O data/processed_data.csv

input_file = 'data/processed_data.csv'
df = pd.read_csv('data/processed_data.csv',
                 parse_dates=['DateTime'], index_col='DateTime')
df = df.resample('60T').mean()
df = df[(df['Temperature'] >= df['Temperature'].quantile(0.02)) &
        (df['Temperature'] <= df['Temperature'].quantile(0.98))]
# df["DateTime"] = pd.to_datetime(df.index)
df = df.reset_index()

mkdir: cannot create directory ‘data’: File exists


Rename the columns for better readability

In[123]:

df.columns = ['DateTime', 'Year', 'Month', 'Date', 'Time', 'Minute', 'Temperature', 'Previous Day Average', 'Two Days Before Average',<br>
'Three Days Before average', 'Last 7 Days Average', 'Previous Day Wind Speed', 'Previous Day Rainfall']

Convert the 'Date' and 'Time' columns to integers

In[124]:

In [144]:
# Cast both calendar columns to integers with a single astype call.
df = df.astype({'Date': int, 'Time': int})

Fill leading zeros for the 'Time' column

In[125]:

In [145]:
# Render every 'Time' value as text, left-padded with zeros to four characters.
df['Time'] = [str(value).zfill(4) for value in df['Time']]

Combine the 'Date' and 'Time' columns into a single 'DateTime' column

In[126]:

df['DateTime'] = pd.to_datetime(df["DateTime"], format="%Y-%m-%d %H:%M:%S")

In[ ]:

Remove rows whose 'Temperature' equals the invalid-reading marker 32767

In[127]:

In [146]:
# Keep only the rows whose temperature differs from the 32767 marker value.
df = df.loc[df['Temperature'].ne(32767)]

Calculate the mean and standard deviation of the temperature (Y)

In[128]:

In [147]:
# Number of standard deviations a temperature may deviate from the mean.
threshold = 5
# Mean and population (ddof=0) standard deviation of the temperature column.
mean_Y = df['Temperature'].mean()
std_Y = df['Temperature'].std(ddof=0)

Define the range of acceptable temperature values (mean ± threshold × standard deviation)

In[129]:

In [148]:
# Acceptance interval: the mean plus/minus `threshold` standard deviations.
lower_bound, upper_bound = (
    mean_Y - threshold * std_Y,
    mean_Y + threshold * std_Y,
)

Filter out rows with temperature values outside the acceptable range

In[130]:

In [149]:
# Keep rows whose temperature lies inside [lower_bound, upper_bound], both
# ends included.
df = df[df['Temperature'].between(lower_bound, upper_bound)]

Prepare the data for LSTM

In[131]:

In [150]:
# Scaler that maps every feature into the [0, 1] interval.
scaler = MinMaxScaler(feature_range=(0, 1))
# Length (in rows) of each input window fed to the LSTM.
time_steps = 60

Create sequences of input data and corresponding target values

In[133]:

Filter out the outliers and invalid values for the new features<br>
Replace the specific invalid values (e.g., 32767) with np.nan

In [151]:
# Swap the 32767 marker for NaN in the three weather columns.
df = df.assign(**{
    column: df[column].replace(32767, np.nan)
    for column in ('Wind Speed', 'Rainfall', 'Wind Direction')
})
# Keep wind directions inside [0, 360]; NaN rows fail the test and are dropped.
df = df[df['Wind Direction'].between(0, 360)]
# Trim the top 1% of wind speed, then of rainfall. Each quantile is computed
# on the frame as it stands after the preceding filter.
df = df[df['Wind Speed'] <= df['Wind Speed'].quantile(0.99)]
df = df[df['Rainfall'] <= df['Rainfall'].quantile(0.99)]


Remove rows with missing values

In [152]:
# Discard every row that still contains at least one missing value.
df = df.dropna(axis=0, how='any')

In [153]:
# Show the DataFrame as this cell's output for a visual check of the cleaning.
df

Unnamed: 0,DateTime,Year,Month,Date,Time,Minute,Temperature,Wind Speed,Wind Direction,Rainfall
0,2008-01-01 12:00:00,2008.0,1.0,1,0012,1.0,5.1,44.0,39.0,0.0
1,2008-01-01 13:00:00,2008.0,1.0,1,0013,1.0,5.8,31.0,49.0,0.0
2,2008-01-01 14:00:00,2008.0,1.0,1,0014,1.0,6.6,32.0,36.0,0.0
3,2008-01-01 15:00:00,2008.0,1.0,1,0015,1.0,7.2,37.0,33.0,0.0
4,2008-01-01 16:00:00,2008.0,1.0,1,0016,1.0,6.7,60.0,6.0,0.0
...,...,...,...,...,...,...,...,...,...,...
119698,2023-06-29 17:00:00,2023.0,6.0,29,0017,1.0,23.4,19.0,178.0,0.0
119699,2023-06-29 18:00:00,2023.0,6.0,29,0018,1.0,22.8,18.0,165.0,0.0
119700,2023-06-29 19:00:00,2023.0,6.0,29,0019,1.0,22.7,50.0,129.0,0.0
119701,2023-06-29 20:00:00,2023.0,6.0,29,0011,1.0,22.2,62.0,113.5,0.0


In [154]:
# Columns fed to the scaler and the model, in this exact order.
features_keys = [
    'Temperature',
    'Wind Speed',
    'Wind Direction',
    'Rainfall',
]

Scale the temperature, wind speed, rainfall, and wind direction values

In [155]:
# Take an independent copy of the model columns, then fit the scaler on it and
# transform it in one step. The scaler sees every row, i.e. it is fitted
# before the train/validation split that happens later in the notebook.
features = df.loc[:, features_keys].copy()
scaled_features = scaler.fit_transform(features)

In [156]:
# Show the scaled feature array as this cell's output for a quick look.
scaled_features

array([[0.        , 0.24571429, 0.10584958, 0.        ],
       [0.03414634, 0.17142857, 0.13370474, 0.        ],
       [0.07317073, 0.17714286, 0.09749304, 0.        ],
       ...,
       [0.85853659, 0.28      , 0.35654596, 0.        ],
       [0.83414634, 0.34857143, 0.31337047, 0.        ],
       [0.83902439, 0.16571429, 0.42896936, 0.30769231]])

Create sequences of input data and corresponding target values

In [157]:
# Build supervised samples with a sliding window. Each input is `time_steps`
# consecutive rows of scaled features; its target is the full feature row that
# comes immediately after the window, which is what the rolling forecast at
# the end of the notebook assumes.
data = []
target = []
for start in range(len(scaled_features) - time_steps):
    end = start + time_steps
    # Rows start .. end-1 form the input window (the target row is excluded).
    data.append(scaled_features[start:end])
    # Row `end` is the next step. The previous indexing used the row after
    # that, leaving one unused row between the window and its target.
    target.append(scaled_features[end])

In [158]:
# Stack the Python lists of windows and targets into NumPy arrays.
data, target = np.array(data), np.array(target)


In[135]:

Define exclude date (year, month, and day)

In[136]:

Split the dataset into training and validation sets

In[137]:

In [159]:
# Fraction of the samples (taken from the end) held out for validation.
split = 0.1
# Index of the first validation sample; everything before it is training data.
exclude_index = int(len(data) * (1 - split))
train_data, val_data = data[:exclude_index], data[exclude_index:]
train_target, val_target = target[:exclude_index], target[exclude_index:]


Build the LSTM model architecture

In[138]:

In [160]:
# Two stacked LSTM layers followed by a small dense head. The network reads
# windows of shape (time_steps, number of features) and predicts one value per
# feature, so both sizes are derived from `features_keys` rather than a
# hard-coded 4.
# (A deeper variant -- four LSTM(256) layers with BatchNormalization and
# Dropout(0.2), a Dense(128) layer and a single output unit -- used to be kept
# here as a disabled string literal; it has been dropped as dead code.
# TensorFlow is already imported in the notebook's import cell.)
model = tf.keras.Sequential([
    # First LSTM returns the full sequence so the second LSTM can consume it.
    tf.keras.layers.LSTM(64, return_sequences=True,
                         input_shape=(time_steps, len(features_keys))),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dense(len(features_keys))
])

2023-06-30 19:07:04.792193: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-06-30 19:07:04.793937: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-06-30 19:07:04.796031: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Compile the model

In[139]:

In [161]:
# Adam with a learning rate of 1e-3, minimising the mean absolute error.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(
    optimizer=optimizer,
    loss='mean_absolute_error',
)

Define early stopping and learning rate scheduler

In[140]:

In [162]:
# Stop after 10 epochs without a better validation loss and restore the best
# weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)
# Multiply the learning rate by 0.2 after 5 epochs without a better validation
# loss, never going below 1e-6.
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=5,
    min_lr=1e-6,
)

Train the LSTM model

In[141]:

In [163]:
# Training hyper-parameters.
batch_size = 64
epochs = 100
# Fit on the training split, tracking the validation split every epoch; both
# callbacks watch the validation loss.
history = model.fit(
    train_data,
    train_target,
    validation_data=(val_data, val_target),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[early_stopping, lr_scheduler],
)

Epoch 1/100


2023-06-30 19:07:05.574194: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-06-30 19:07:05.575664: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-06-30 19:07:05.578332: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Make predictions using the trained LSTM model

In[142]:

In [None]:
# Run the trained network over every validation window.
predictions = model.predict(x=val_data)

Rescale the predictions back to the original range

In[143]:

In [None]:
# Keep the raw (still scaled) network output as a labelled frame ...
scaled_predictions = pd.DataFrame(predictions, columns=features_keys)
# ... and replace `predictions` with the same values mapped back through the
# scaler, one column per feature.
unscaled_values = scaler.inverse_transform(predictions)
predictions = pd.DataFrame(unscaled_values, columns=features_keys)
predicted_temperature = predictions["Temperature"]


Calculate MSE and MAE

In[144]:

In [None]:
# `val_target` is a NumPy array, so the temperature column has to be selected
# by position (string indexing raises IndexError on an ndarray).
temperature_column = features_keys.index("Temperature")
actual_scaled_temperature = val_target[:, temperature_column]
# Both metrics are computed on the scaled values, matching `scaled_predictions`.
mse = mean_squared_error(actual_scaled_temperature, scaled_predictions["Temperature"])
mae = mean_absolute_error(actual_scaled_temperature, scaled_predictions["Temperature"])

In[145]:

In [None]:
# Report both validation metrics, one per line.
for label, value in (
    ("Mean Squared Error (MSE):", mse),
    ("Mean Absolute Error (MAE):", mae),
):
    print(label, value)

Visualize the actual vs. predicted temperatures for the validation set

In[146]:

In [None]:
# The scaler was fitted on all feature columns, so the whole target array is
# inverse-transformed first and the temperature column is picked afterwards.
# `val_target` is a NumPy array and cannot be indexed by column name.
temperature_column = features_keys.index("Temperature")
actual_temperature = scaler.inverse_transform(val_target)[:, temperature_column]

plt.figure(figsize=(12, 6))
plt.plot(range(len(val_target)), actual_temperature, label='Actual')
plt.plot(range(len(val_target)), predicted_temperature, label='Predicted')
plt.xlabel('Time')
plt.ylabel('Temperature')
plt.title('Actual vs. Predicted Temperatures (Validation Set)')
plt.legend()
plt.show()

Visualize the actual vs. predicted temperatures for the training set

In[147]:

In [None]:
# Targets and predictions both hold one column per feature in scaled units.
# Map them back through the scaler and keep only the temperature column, so
# the figure shows what its title and y-axis label say.
temperature_column = features_keys.index("Temperature")
train_actual_temperature = scaler.inverse_transform(
    train_target)[:, temperature_column]
train_predicted_temperature = scaler.inverse_transform(
    model.predict(train_data))[:, temperature_column]

plt.figure(figsize=(12, 6))
plt.plot(range(len(train_target)), train_actual_temperature, label='Actual')
plt.plot(range(len(train_target)), train_predicted_temperature,
         label='Predicted')
plt.xlabel('Time')
plt.ylabel('Temperature')
plt.title('Actual vs. Predicted Temperatures (Training Set)')
plt.legend()
plt.show()

Save the trained model

In[148]:

In [None]:
# Persist the trained network to disk under the name "lstm.keras".
model.save(filepath="lstm.keras")

In [None]:
# Reload the saved network and roll it forward 30 steps, feeding each
# prediction back in as the newest row of the input window.
model = tf.keras.models.load_model("lstm.keras")

# Most recent `time_steps` rows of scaled features, shape (time_steps, n_features).
# A separate name is used so the training array `data` is not overwritten.
forecast_window = scaled_features[-time_steps:].copy()

for i in range(30):
    # predict() expects a leading batch axis: (1, time_steps, n_features).
    prediction = model.predict(forecast_window[np.newaxis, ...], verbose=0)
    # Steps are labelled from 1, since the first prediction is already one step ahead.
    print(f"{i + 1} hour(s) prediction:", scaler.inverse_transform(prediction.reshape(-1, 4)))
    # Slide the window: drop the oldest row and append the new prediction.
    # NumPy arrays have no append()/roll() methods, so a new array is built.
    forecast_window = np.vstack([forecast_window[1:], prediction.reshape(1, -1)])