# Temperature and Rainfall Analysis using PyCaret - Regression

### Step 1: Merging the dataset

In [1]:
# import pandas as pd
#
# temperature_df = pd.read_excel('dataset/Temp_SRS_1Jan1964_to_31Oct2020.xlsx')
# rainfall_df = pd.read_csv('dataset/Rainfall_at_SRS_30Nov1960_to_18Nov2020.csv')
#
# rainfall_df = rainfall_df.fillna(0)
#
# temperature_df['DATE'] = pd.to_datetime(temperature_df['DATE'], format='%Y-%m-%d')
# rainfall_df['DATE'] = pd.to_datetime(rainfall_df['DATE'], format='%m/%d/%Y')
#
# temperature_df['YEAR'] = pd.to_datetime(temperature_df.DATE).dt.year
# rainfall_df['YEAR'] = pd.to_datetime(rainfall_df.DATE).dt.year
#
# temperature_df_grouped = temperature_df.groupby('YEAR').agg({'LOW TEMP': 'mean', 'HIGH TEMP': 'mean'})
# temperature_df_grouped['RAINFALL'] = rainfall_df.groupby('YEAR').agg({'200-F Rainfall (inches/day)': 'mean'})
# rain_temp_df = pd.merge(temperature_df, rainfall_df, on='DATE')
# temperature_df_grouped.reset_index(inplace=True)
# temperature_df_grouped.head()
# # print(rain_temp_df.head(10))

## Predicting rainfall from average yearly maximum and minimum temperatures using LSTM

In [2]:
# from pycaret.time_series import *
#
# train_df = temperature_df_grouped.query('YEAR < 2010')
# test_df = temperature_df_grouped.query('YEAR >= 2010')
#
# reg = setup(data=train_df, target='RAINFALL')
#
# model = models()


# LSTM Implementation

In [3]:
import pandas as pd

# Load the rainfall CSV, treat missing readings as zero, give the
# measurement column a short name and discard the DATE column so that only
# the rainfall values remain.
rainfall_df = (
    pd.read_csv('dataset/Rainfall_at_SRS_30Nov1960_to_18Nov2020.csv')
    .fillna(0)
    .rename(columns={'200-F Rainfall (inches/day)': 'RAINFALL'})
    .drop(columns=['DATE'])
)

rainfall_df.head()

Unnamed: 0,RAINFALL
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


## Data preparation for passing to LSTM

In [4]:
# import numpy as np
#
# rainfall_data = rainfall_df.values
# sequence_length = 80
# no_of_features = 1
#
# samples = list()
# for i in range(0, len(rainfall_data), sequence_length):
#     sample = rainfall_data[i:i+sequence_length]
#     if len(sample) == sequence_length:
#         samples.append(sample)
#
# samples = np.array(samples)
# samples = samples.reshape(len(samples), sequence_length, no_of_features)
#
# train_size = 80 # int(len(samples) * 0.8)
# train, val = samples[:train_size], samples[train_size:]
#
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()
# train = scaler.fit_transform(train.reshape(-1, no_of_features)).reshape(train.shape)
# val = scaler.transform(val.reshape(-1, no_of_features)).reshape(val.shape)

# LSTM Implementation

In [5]:
# import torch.nn as nn
# import torch
#
# lstm = nn.LSTM(input_size=1, hidden_size=10) # input size is 1 because the input is rainfall level only
#
# input_data = train[:, :, 0] # input shape is (80, 60), sequence length is 60, batch size is 80, input size is 1
# input_data = torch.from_numpy(input_data)
# input_data = input_data.to(torch.float32) # casted to float32 because the extra dimension step encounters data type error
# output_data = train[:, -1, 0] # output shape is (80), output size is 1
#
# hidden = torch.zeros(1, 80, 10) # hidden state shape is (1, 80, 10), num_layers is 1, num_directions is 1, batch size is 80, hidden size is 10
# cell = torch.zeros(1, 80, 10) # cell state shape is (1, 80, 10), same as hidden state
#
# out, (hidden, cell) = lstm(input_data.unsqueeze(-1), (hidden, cell)) # adds an extra dimension for the input size
#
# linear = nn.Linear(10, 1)
# prediction = linear(out[-1]) # take the last output of the sequence and pass it to the linear layer
#
# print(prediction)
# print(output_data)

In [6]:
import torch
import numpy as np
from tqdm.auto import tqdm
from torch.utils.tensorboard import SummaryWriter

# Hyperparameters: model architecture
input_size = 1    # features per time step fed to the LSTM
hidden_size = 32  # hidden units in each LSTM layer
num_layers = 2    # stacked LSTM layers
output_size = 1   # features produced per prediction

# Hyperparameters: optimisation
batch_size = 16       # samples per mini-batch
num_epochs = 20       # number of epochs to train the model
learning_rate = 1e-4  # step size for the optimizer

# Convert the dataframe to a float numpy array.
data = rainfall_df.values.astype(float)

# Chronological split: the first 80% of the rows train the model, the
# remaining 20% are held out for testing.
train_size = int(len(data) * 0.8)
test_size = len(data) - train_size

# Min-max scale with statistics taken from the training slice only, so that
# nothing about the held-out period leaks into training. Test values may
# therefore fall slightly outside [0, 1].
train_min = data[:train_size].min()
train_max = data[:train_size].max()
value_range = train_max - train_min
if value_range == 0:
    # A constant training series would otherwise divide by zero and fill
    # the array with NaNs.
    value_range = 1.0
data = (data - train_min) / value_range

train_data = data[:train_size]
test_data = data[train_size:]

# Define a function to create sequences of data with a given window size
def create_sequences(data, window_size):
    """Slice ``data`` into overlapping windows of ``window_size + 1`` items.

    Consecutive windows start one position apart. Each window holds
    ``window_size`` items followed by one extra item (the value that comes
    right after them).

    Args:
        data: Sliceable sequence supporting ``len()`` (list or numpy array).
        window_size: Number of leading items in each window.

    Returns:
        A numpy array stacking ``len(data) - window_size`` windows; an empty
        array when ``data`` has no more than ``window_size`` items.
    """
    span = window_size + 1
    window_count = len(data) - window_size
    return np.array([data[start:start + span] for start in range(window_count)])

# Build overlapping windows for both splits: each window holds 12 consecutive
# readings followed by the reading that comes right after them.
window_size = 12
train_sequences, test_sequences = (
    create_sequences(split, window_size) for split in (train_data, test_data)
)

# Define a custom dataset class to load the sequences into PyTorch tensors
class RainfallDataset(torch.utils.data.Dataset):
    """Dataset that serves (input window, next value) pairs as float32 tensors.

    Each stored sequence is split into all items but the last (the model
    input) and the last item (the value to predict).
    """

    def __init__(self, sequences):
        # Indexable collection of windows; tensors are built lazily per item.
        self.sequences = sequences

    def __len__(self):
        """Return the number of stored windows."""
        return len(self.sequences)

    def __getitem__(self, index):
        """Return ``(x, y)`` for the window at ``index``.

        ``x`` holds every item of the window except the last one and ``y``
        holds the last item; both keep the shape of the underlying slice.
        """
        window = self.sequences[index]
        history, target = window[:-1], window[-1]
        x = torch.tensor(history, dtype=torch.float32)
        y = torch.tensor(target, dtype=torch.float32)
        return x, y

# Wrap the windows of each split in a RainfallDataset, keyed by split name.
datasets = {
    split: RainfallDataset(windows)
    for split, windows in (('train', train_sequences), ('test', test_sequences))
}

# Batch each dataset. Only the training split is shuffled; the test split
# keeps its original order.
dataloaders = {
    split: torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=(split == 'train'),
    )
    for split, dataset in datasets.items()
}

# Define the LSTM model class
class LSTMModel(torch.nn.Module):
    """Stacked LSTM followed by a linear layer applied to the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Number of hidden units in each LSTM layer.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of features in each prediction.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # batch_first=True: inputs are laid out as (batch, seq_len, input_size).
        self.lstm = torch.nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = torch.nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Predict one output vector per input sequence.

        Args:
            x: Tensor of shape (batch, seq_len, input_size).

        Returns:
            Tensor of shape (batch, output_size).
        """
        # Omitting the initial state makes nn.LSTM start from zero hidden and
        # cell states created on the input's device and dtype. Building them
        # by hand with torch.zeros pins them to the default device/dtype and
        # fails once the model is moved to a GPU or cast to another dtype.
        out, _ = self.lstm(x)

        # Only the final time step of the top layer feeds the linear head.
        return self.fc(out[:, -1, :])

def train_model(model, criterion, optimizer, tensorboard, num_epochs = 3):
    """Train ``model`` and evaluate it on the test split after every epoch.

    Batches come from the module-level ``dataloaders`` dict and dataset sizes
    from the module-level ``datasets`` dict (keys ``'train'`` and ``'test'``).
    For each epoch and phase two scalars are written to ``tensorboard``: the
    sample-weighted mean of ``criterion`` and the root mean squared error of
    the predictions.

    Args:
        model: Module mapping an input batch to predictions.
        criterion: Loss called as ``criterion(y_pred, y_batch)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        tensorboard: Writer exposing ``add_scalar`` and ``close``. It is
            closed before this function returns.
        num_epochs: Number of passes over the training split.
    """
    for epoch in tqdm(range(num_epochs)):
        for phase in ['train', 'test']:
            is_training = phase == 'train'
            # train(False) is the same as eval().
            model.train(is_training)

            running_loss = 0.0
            running_sq_error = 0.0
            value_count = 0

            for x_batch, y_batch in dataloaders[phase]:
                # No autograd graph is needed while evaluating; skipping it
                # saves memory and time on the test phase.
                with torch.set_grad_enabled(is_training):
                    y_pred = model(x_batch)
                    loss = criterion(y_pred, y_batch)

                if is_training:
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                # Weight the batch mean by batch size so a smaller final
                # batch does not skew the epoch average.
                running_loss += loss.item() * x_batch.size(0)

                # The model predicts a continuous value, so arg-max
                # "accuracy" is meaningless here; track squared error
                # instead and report RMSE.
                sq_error = (y_pred.detach() - y_batch) ** 2
                running_sq_error += sq_error.sum().item()
                value_count += sq_error.numel()

            sample_count = len(datasets[phase])
            if sample_count == 0 or value_count == 0:
                # Nothing was processed for this phase (e.g. a split shorter
                # than one window), so there is nothing to log.
                continue

            epoch_loss = running_loss / sample_count
            epoch_rmse = (running_sq_error / value_count) ** 0.5

            tensorboard.add_scalar(f"Loss during {phase}", epoch_loss, epoch)
            tensorboard.add_scalar(f"RMSE during {phase}", epoch_rmse, epoch)
    tensorboard.close()

# Build the network, its loss and its optimizer, then train for 40 epochs
# while logging to a fresh TensorBoard run. The epoch count is passed
# explicitly here; it is not taken from the num_epochs hyperparameter.
model = LSTMModel(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    output_size=output_size,
)
tensorboard = SummaryWriter()
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
train_model(model, criterion, optimizer, tensorboard, num_epochs=40)

# # Train the model
# for epoch in tqdm(range(num_epochs)):
#     # Set the model to training mode
#     model.train()
#     # Loop over the batches of data in the train dataloader
#     for x_batch, y_batch in train_dataloader:
#         optimizer.zero_grad()
#         y_pred = model(x_batch)
#         loss = criterion(y_pred, y_batch)
#         loss.backward()
#         optimizer.step()
#
#     # Print the loss every 10 epochs
#     if (epoch + 1) % 10 == 0:
#         print(f'Epoch {epoch + 1}, Loss: {loss.item():.4f}')
#
# # Evaluate the model
# # Set the model to evaluation mode
# model.eval()
#
# # Initialize the lists to store the predictions and the actual values
# predictions = []
# actuals = []
#
# # Loop over the batches of data in the test dataloader
# with torch.no_grad():
#     for x_batch, y_batch in test_dataloader:
#         # Forward pass
#         y_pred = model(x_batch)
#
#         # Append the predictions and the actual values to the lists
#         predictions.extend(y_pred.squeeze().tolist())
#         actuals.extend(y_batch.squeeze().tolist())
#
# # Compute the root mean squared error (RMSE) between the predictions and the actual values
# rmse = np.sqrt(np.mean((np.array(predictions) - np.array(actuals))**2))
# print(f'RMSE: {rmse:.4f}')


  0%|          | 0/40 [00:00<?, ?it/s]