# Temperature and Rainfall Analysis using PyCaret - Regression

### Step 1: Merging the dataset

In [37]:
# import pandas as pd
#
# temperature_df = pd.read_excel('dataset/Temp_SRS_1Jan1964_to_31Oct2020.xlsx')
# rainfall_df = pd.read_csv('dataset/Rainfall_at_SRS_30Nov1960_to_18Nov2020.csv')
#
# rainfall_df = rainfall_df.fillna(0)
#
# temperature_df['DATE'] = pd.to_datetime(temperature_df['DATE'], format='%Y-%m-%d')
# rainfall_df['DATE'] = pd.to_datetime(rainfall_df['DATE'], format='%m/%d/%Y')
#
# temperature_df['YEAR'] = pd.to_datetime(temperature_df.DATE).dt.year
# rainfall_df['YEAR'] = pd.to_datetime(rainfall_df.DATE).dt.year
#
# temperature_df_grouped = temperature_df.groupby('YEAR').agg({'LOW TEMP': 'mean', 'HIGH TEMP': 'mean'})
# temperature_df_grouped['RAINFALL'] = rainfall_df.groupby('YEAR').agg({'200-F Rainfall (inches/day)': 'mean'})
# rain_temp_df = pd.merge(temperature_df, rainfall_df, on='DATE')
# temperature_df_grouped.reset_index(inplace=True)
# temperature_df_grouped.head()
# # print(rain_temp_df.head(10))

## Predicting rainfall using average yearly maximum and minimum temperatures using LSTM

In [38]:
# from pycaret.time_series import *
#
# train_df = temperature_df_grouped.query('YEAR < 2010')
# test_df = temperature_df_grouped.query('YEAR >= 2010')
#
# reg = setup(data=train_df, target='RAINFALL')
#
# model = models()


# LSTM Implementation

In [4]:
import pandas as pd

temperature_df = pd.read_excel('dataset/Temp_SRS_1Jan1964_to_31Oct2020.xlsx')
rainfall_df = pd.read_csv('dataset/Rainfall_at_SRS_30Nov1960_to_18Nov2020.csv')
salamander_df = pd.read_excel('dataset/RB-GBdataForAImodel.xlsx')

salamander_df['Date'] = salamander_df['Date'].astype(str)
salamander_df.loc[salamander_df['Date'].str.startswith('1'), 'Date'] = salamander_df['Date'].str.replace('1', '20', 1)
salamander_df.loc[~salamander_df['Date'].str.startswith('20'), 'Date'] = '19' + salamander_df['Date']
salamander_df = salamander_df.rename(columns={'Date': 'DATE'})
salamander_df = salamander_df.query("Site == 'RB'")
salamander_df.head()

rainfall_df = rainfall_df.fillna(0)

temperature_df['DATE'] = pd.to_datetime(temperature_df['DATE'], format='%Y-%m-%d')
rainfall_df['DATE'] = pd.to_datetime(rainfall_df['DATE'], format='%m/%d/%Y')
salamander_df['DATE'] = pd.to_datetime(salamander_df['DATE'], format='%Y%m%d')

total_salamander_pop_df = salamander_df.groupby(salamander_df['DATE'].dt.date)["Number"].sum().reset_index()
total_salamander_pop_df.DATE = pd.to_datetime(total_salamander_pop_df.DATE, format='%Y-%m-%d')
total_salamander_pop_df.head()

rain_temp_df = pd.merge(temperature_df, rainfall_df, on='DATE', how="outer")
rain_temp_salamander_df = pd.merge(rain_temp_df, total_salamander_pop_df, on='DATE', how="outer")
rain_temp_salamander_df = rain_temp_salamander_df.rename(columns={'LOW TEMP': 'MIN_TEMP', 'HIGH TEMP': 'MAX_TEMP', '200-F Rainfall (inches/day)': 'RAINFALL', 'Number': 'POPULATION'})
start_date = '1979-09-17'
end_date = '2020-10-31'
rain_temp_salamander_df = rain_temp_salamander_df.query("DATE >= @start_date and DATE <= @end_date")
# the dates above are selected because salamander data are available only from 1979-09-17 and the temperature data are available only till 2020-10-31
rain_temp_salamander_df['POPULATION'].fillna(0, inplace=True)
rain_temp_salamander_df['RAINFALL'].fillna(0, inplace=True)

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
x = rain_temp_salamander_df[["MIN_TEMP", "MAX_TEMP", "RAINFALL", "POPULATION"]].values[:-1]
X = scaler.fit_transform(rain_temp_salamander_df[["MIN_TEMP", "MAX_TEMP", "RAINFALL", "POPULATION"]].values[:-1])
y = rain_temp_salamander_df["POPULATION"].values[1:]

print(x)

[[6.1e+01 7.0e+01 5.0e-02 1.0e+00]
 [6.4e+01 7.4e+01 7.0e-02 0.0e+00]
 [6.4e+01 8.3e+01 3.0e-02 0.0e+00]
 ...
 [6.9e+01 8.5e+01 0.0e+00 0.0e+00]
 [6.6e+01 8.4e+01 1.0e-02 0.0e+00]
 [5.0e+01 7.3e+01 3.0e-02 0.0e+00]]


## Data preparation for passing to LSTM

In [40]:
# import numpy as np
#
# rainfall_data = rainfall_df.values
# sequence_length = 80
# no_of_features = 1
#
# samples = list()
# for i in range(0, len(rainfall_data), sequence_length):
#     sample = rainfall_data[i:i+sequence_length]
#     if len(sample) == sequence_length:
#         samples.append(sample)
#
# samples = np.array(samples)
# samples = samples.reshape(len(samples), sequence_length, no_of_features)
#
# train_size = 80 # int(len(samples) * 0.8)
# train, val = samples[:train_size], samples[train_size:]
#
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()
# train = scaler.fit_transform(train.reshape(-1, no_of_features)).reshape(train.shape)
# val = scaler.transform(val.reshape(-1, no_of_features)).reshape(val.shape)

# LSTM Implementation

In [41]:
# import torch.nn as nn
# import torch
#
# lstm = nn.LSTM(input_size=1, hidden_size=10) # input size is 1 because the input is rainfall level only
#
# input_data = train[:, :, 0] # input shape is (80, 60), sequence length is 60, batch size is 80, input size is 1
# input_data = torch.from_numpy(input_data)
# input_data = input_data.to(torch.float32) # casted to float32 because the extra dimension step encounters data type error
# output_data = train[:, -1, 0] # output shape is (80), output size is 1
#
# hidden = torch.zeros(1, 80, 10) # hidden state shape is (1, 80, 10), num_layers is 1, num_directions is 1, batch size is 80, hidden size is 10
# cell = torch.zeros(1, 80, 10) # cell state shape is (1, 80, 10), same as hidden state
#
# out, (hidden, cell) = lstm(input_data.unsqueeze(-1), (hidden, cell)) # adds an extra dimension for the input size
#
# linear = nn.Linear(10, 1)
# prediction = linear(out[-1]) # take the last output of the sequence and pass it to the linear layer
#
# print(prediction)
# print(output_data)

In [6]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import torch
import numpy as np
import torch.nn.functional as F
from tqdm.auto import tqdm
from torch.utils.tensorboard import SummaryWriter

# Define the hyperparameters
input_size = 4 # The number of features in the input
hidden_size = 32 # The number of hidden units in the LSTM
num_layers = 1 # The number of LSTM layers
output_size = 1 # The number of features in the output
batch_size = 32 # The size of each batch of data
num_epochs = 100 # The number of epochs to train the model
learning_rate = 0.0001 # The learning rate for the optimizer

# Define a custom dataset class to load the sequences into PyTorch tensors
class RainTempSalamanderDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        df = df.drop("DATE", axis=1)
        scaler = MinMaxScaler()
        X = scaler.fit_transform(df[["MIN_TEMP", "MAX_TEMP", "RAINFALL", "POPULATION"]].values[:-1])
        y = df["POPULATION"].values[1:]
        self.X = torch.tensor(X, dtype=torch.float)
        self.y = torch.tensor(y, dtype=torch.float)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        return self.X[index], self.y[index]

train_df, test_df = train_test_split(rain_temp_salamander_df, test_size=0.2, random_state=42)

train_dataset = RainTempSalamanderDataset(train_df)
test_dataset = RainTempSalamanderDataset(test_df)

# Create train and test datasets using the custom dataset class
datasets = {
    'train': train_dataset,
    'test': test_dataset
}

# Create train and test dataloaders using the datasets
dataloaders = {
    'train': torch.utils.data.DataLoader(datasets['train'], batch_size=batch_size, shuffle=False),
    'test': torch.utils.data.DataLoader(datasets['test'], batch_size=batch_size, shuffle=False)
}

# Define the LSTM model class
class LSTMModel(torch.nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = torch.nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc1 = torch.nn.Linear(hidden_size, 16)
        self.fc2 = torch.nn.Linear(16, output_size)
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        # Initialize the hidden and cell states with zeros
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)

        out, (hn, cn) = self.lstm(x, (h0, c0))
        hn = hn.view(-1, self.hidden_size)

        out = self.relu(hn)
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        return out

def train_model(model, criterion, optimizer, tensorboard, num_epochs = 3):
    for epoch in tqdm(range(num_epochs)):
        for phase in ['train', 'test']:
            if phase == 'train':
                model.train()
            else:
                model.eval()

            running_loss = 0.0
            running_mae = 0.0
            running_rmse = 0.0

            for i, data in enumerate(dataloaders[phase]):
                X_batch, y_batch = data
                print(X_batch.size())
                y_pred = model(X_batch.unsqueeze(1))
                print(X_batch.unsqueeze(1).size())

                print(f"*" * 10)
                print(f"Target: {y_batch}")
                print(f"Prediction: {y_pred}")
                print(f"\n")
                loss = criterion(y_pred.squeeze(), y_batch)

                if phase == 'train':
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                running_loss += loss.item() * X_batch.size(0)
                running_mae += torch.abs(y_pred.squeeze() - y_batch).sum().item()
                running_rmse += torch.sqrt(torch.pow(y_pred.squeeze() - y_batch, 2).sum()).item()
            epoch_loss = running_loss / len(datasets[phase])
            epoch_mae = running_mae / len(datasets[phase])
            epoch_rmse = running_rmse / len(datasets[phase])

            tensorboard.add_scalar(f"Loss during {phase}", epoch_loss, epoch)
            tensorboard.add_scalar(f"MAE during {phase}", epoch_mae, epoch)
            tensorboard.add_scalar(f"RMSE during {phase}", epoch_rmse, epoch)
    tensorboard.close()

model = LSTMModel(input_size, hidden_size, num_layers, output_size)
tensorboard = SummaryWriter()
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
train_model(model, criterion, optimizer, tensorboard, num_epochs)

  0%|          | 0/100 [00:00<?, ?it/s]

torch.Size([32, 4])
torch.Size([32, 1, 4])
**********
Target: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
Prediction: tensor([[0.1661],
        [0.1667],
        [0.1655],
        [0.1654],
        [0.1658],
        [0.1650],
        [0.1652],
        [0.1653],
        [0.1654],
        [0.1672],
        [0.1656],
        [0.1671],
        [0.1648],
        [0.1676],
        [0.1656],
        [0.1655],
        [0.1657],
        [0.1661],
        [0.1655],
        [0.1668],
        [0.1652],
        [0.1660],
        [0.1646],
        [0.1664],
        [0.1659],
        [0.1665],
        [0.1653],
        [0.1668],
        [0.1664],
        [0.1672],
        [0.1654],
        [0.1650]], grad_fn=<AddmmBackward0>)


torch.Size([32, 4])
torch.Size([32, 1, 4])
**********
Target: tensor([0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.

KeyboardInterrupt: 