In [2]:
import pandas as pd 

# Load your cleaned data
df = pd.read_csv('processedData_cleaned.csv')
# Assuming your DataFrame is named df
# First, convert 'Full date' to datetime if it's not already
df['Full date'] = pd.to_datetime(df['Full date'])

# Extract day and month for grouping
df['Day'] = df['Full date'].dt.day
df['Month'] = df['Full date'].dt.month

# Group by Day and Month, and get the max temperature for each day in each year
max_temp_per_day = df.groupby(['Month', 'Day', 'Year'])['Temp Max'].max().reset_index()

# Pivot the DataFrame to have years as columns and days as rows
pivot_df = max_temp_per_day.pivot_table(index=['Month', 'Day'], columns='Year', values='Temp Max')

# Reset index to make it easier to work with
pivot_df = pivot_df.reset_index()

# Optionally, create a new column for the date
pivot_df['Date'] = pd.to_datetime(pivot_df[['Month', 'Day']].assign(Year=2000))  # Temporary Year for Date column

# Rearranging the DataFrame to have Date as the first column and clean it up
pivot_df = pivot_df[['Date'] + [col for col in pivot_df.columns if col not in ['Month', 'Day', 'Date']]]

# Rename columns to a more understandable format (optional)
pivot_df.columns.name = None  # Remove the name of the columns

# Display the final DataFrame
pivot_df

Unnamed: 0,Date,1951,1952,1953,1954,1955,1956,1957,1958,1959,...,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009
0,2000-01-01,28.940001,28.889999,28.610001,26.209999,28.969999,28.889999,30.670000,28.719999,27.980000,...,29.150000,29.309999,27.870001,29.290001,29.400000,31.010000,28.320000,29.920000,31.350000,30.559999
1,2000-01-02,27.889999,28.780001,28.480000,26.389999,28.270000,28.780001,30.200001,29.840000,27.330000,...,29.030001,27.650000,28.230000,28.430000,30.290001,30.990000,28.209999,29.420000,30.930000,29.799999
2,2000-01-03,27.320000,28.600000,29.190001,26.620001,28.940001,28.600000,29.510000,30.379999,27.940001,...,28.490000,24.920000,27.719999,28.670000,29.170000,28.709999,28.280001,29.410000,29.770000,29.389999
3,2000-01-04,27.670000,28.719999,28.930000,27.260000,28.309999,28.719999,28.559999,29.440001,28.370001,...,28.660000,27.080000,26.940001,28.540001,28.889999,29.639999,28.459999,29.410000,30.670000,29.740000
4,2000-01-05,28.320000,28.900000,28.510000,28.049999,27.700001,28.900000,29.910000,29.830000,29.540001,...,28.510000,28.780001,27.309999,29.780001,29.010000,31.049999,28.690001,29.150000,30.629999,30.219999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,2000-12-27,28.570000,28.830000,28.549999,29.730000,26.600000,28.270000,29.670000,28.680000,27.370001,...,29.510000,29.500000,30.580000,25.820000,29.910000,26.910000,29.809999,33.020000,31.020000,28.860001
361,2000-12-28,28.799999,29.559999,29.100000,31.080000,27.650000,28.920000,29.990000,29.590000,27.290001,...,29.490000,29.150000,30.860001,26.280001,30.150000,29.260000,30.950001,33.070000,30.940001,28.580000
362,2000-12-29,28.770000,29.920000,29.520000,30.820000,27.830000,29.580000,29.200001,27.860001,28.209999,...,28.910000,29.020000,30.790001,29.350000,29.740000,28.469999,31.120001,33.220001,31.040001,28.240000
363,2000-12-30,28.930000,29.270000,28.860001,30.290001,27.590000,30.110001,28.930000,29.040001,30.750000,...,28.920000,28.450001,30.350000,28.469999,30.129999,28.809999,30.270000,32.419998,30.389999,28.120001


In [3]:
# Convert the pivot DataFrame to the desired two-column format
two_column_df = pivot_df.melt(id_vars=['Date'], var_name='Year', value_name='Temp Max')

# Group by Date and aggregate values into lists
two_column_df = two_column_df.groupby('Date')['Temp Max'].apply(list).reset_index()

# Rename the columns for clarity
two_column_df.columns = ['Date', 'Temperature List']

# Display the final two-column DataFrame
print(two_column_df)


          Date                                   Temperature List
0   2000-01-01  [28.94000053, 28.88999939, 28.61000061, 26.209...
1   2000-01-02  [27.88999939, 28.78000069, 28.47999954, 26.389...
2   2000-01-03  [27.31999969, 28.60000038, 29.19000053, 26.620...
3   2000-01-04  [27.67000008, 28.71999931, 28.93000031, 27.260...
4   2000-01-05  [28.31999969, 28.89999962, 28.51000023, 28.049...
..         ...                                                ...
360 2000-12-27  [28.56999969, 28.82999992, 28.54999924, 29.729...
361 2000-12-28  [28.79999924, 29.55999947, 29.10000038, 31.079...
362 2000-12-29  [28.77000046, 29.92000008, 29.52000046, 30.819...
363 2000-12-30  [28.93000031, 29.27000046, 28.86000061, 30.290...
364 2000-12-31  [28.95000076, 28.53000069, 28.51000023, 29.450...

[365 rows x 2 columns]


In [4]:
lists = list(two_column_df['Temperature List'])

sizeOfArray = len(lists[0])
X = [x[:sizeOfArray-1] for x in lists]  # Extracting all but the last element
Y = [x[sizeOfArray-1] for x in lists]    # Extracting the last element

print(X)


[[28.94000053, 28.88999939, 28.61000061, 26.20999908, 28.96999931, 28.88999939, 30.67000008, 28.71999931, 27.97999954, 28.64999962, 29.78000069, 27.46999931, 27.82999992, 31.70000076, 28.54999924, 28.10000038, 28.37999916, 28.69000053, 26.70999908, 27.5, 29.32999992, 27.95999908, 31.34000015, 27.70000076, 29.76000023, 27.90999985, 30.28000069, 29.23999977, 28.11000061, 30.75, 29.02000046, 28.38999939, 29.69000053, 27.56999969, 29.51000023, 30.25, 27.79000092, 27.29999924, 28.69000053, 26.70000076, 29.64999962, 29.55999947, 29.28000069, 29.27000046, 28.56999969, 29.63999939, 27.62999916, 29.92000008, 28.55999947, 29.14999962, 29.30999947, 27.87000084, 29.29000092, 29.39999962, 31.01000023, 28.31999969, 29.92000008, 31.35000038], [27.88999939, 28.78000069, 28.47999954, 26.38999939, 28.27000046, 28.78000069, 30.20000076, 29.84000015, 27.32999992, 27.12999916, 30.61000061, 27.01000023, 28.04999924, 32.34999847, 28.62000084, 28.36000061, 28.12999916, 28.53000069, 28.30999947, 28.45999908, 2

In [5]:
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
# Convert to numpy arrays
X = np.array(X)
Y = np.array(Y)
# Reshape X to be (samples, sequence length, features)
X = X.reshape(X.shape[0], X.shape[1], 1)  # 1 feature for LSTM

print(X,Y)

[[[28.94000053]
  [28.88999939]
  [28.61000061]
  ...
  [28.31999969]
  [29.92000008]
  [31.35000038]]

 [[27.88999939]
  [28.78000069]
  [28.47999954]
  ...
  [28.20999908]
  [29.42000008]
  [30.93000031]]

 [[27.31999969]
  [28.60000038]
  [29.19000053]
  ...
  [28.28000069]
  [29.40999985]
  [29.77000046]]

 ...

 [[28.77000046]
  [29.92000008]
  [29.52000046]
  ...
  [31.12000084]
  [33.22000122]
  [31.04000092]]

 [[28.93000031]
  [29.27000046]
  [28.86000061]
  ...
  [30.27000046]
  [32.41999817]
  [30.38999939]]

 [[28.95000076]
  [28.53000069]
  [28.51000023]
  ...
  [29.89999962]
  [32.06000137]
  [30.57999992]]] [30.55999947 29.79999924 29.38999939 29.73999977 30.21999931 29.93000031
 30.19000053 29.45000076 29.69000053 30.19000053 30.01000023 30.64999962
 29.81999969 29.98999977 29.29000092 30.64999962 31.25       30.75
 30.86000061 31.29000092 31.60000038 32.38000107 32.52999878 33.61000061
 33.72999954 34.06999969 33.93000031 34.06000137 34.49000168 34.45999908
 34.7000007

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [6]:

X=np.reshape(X,(X.shape[0], X.shape[1],1))
len(X)



365

In [19]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

class LSTMRegressor(nn.Module):
    def __init__(self):
        super(LSTMRegressor, self).__init__()
        self.lstm1 = nn.LSTM(input_size=1, hidden_size=50, batch_first=True)
        self.dropout1 = nn.Dropout(0.2)
        self.lstm2 = nn.LSTM(input_size=50, hidden_size=50, batch_first=True)
        self.dropout2 = nn.Dropout(0.2)
        self.lstm3 = nn.LSTM(input_size=50, hidden_size=50, batch_first=True)
        self.dropout3 = nn.Dropout(0.2)
        self.lstm4 = nn.LSTM(input_size=50, hidden_size=50, batch_first=True)
        self.dropout4 = nn.Dropout(0.2)
        self.fc = nn.Linear(in_features=50, out_features=1)  # Output layer

    def forward(self, x):
        x, _ = self.lstm1(x)  # Forward pass through the first LSTM layer
        x = self.dropout1(x)  # Apply dropout
        x, _ = self.lstm2(x)  # Forward pass through the second LSTM layer
        x = self.dropout2(x)  # Apply dropout
        x, _ = self.lstm3(x)  # Forward pass through the third LSTM layer
        x = self.dropout3(x)  # Apply dropout
        x, _ = self.lstm4(x)  # Forward pass through the fourth LSTM layer
        x = self.dropout4(x)  # Apply dropout
        x = x[:, -1, :]  # Get the last time step
        x = self.fc(x)  # Output layer
        return x

In [20]:
# Initialize model, loss function, and optimizer
model = LSTMRegressor().to('cuda') if torch.cuda.is_available() else LSTMRegressor()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
X_tensor = torch.from_numpy(X).float().to('cuda') if torch.cuda.is_available() else torch.from_numpy(X).float()
Y_tensor = torch.from_numpy(Y).float().to('cuda') if torch.cuda.is_available() else torch.from_numpy(Y).float()
dataset = TensorDataset(X_tensor, Y_tensor)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
# Training loop
epochs = 300
for epoch in range(epochs):
    model.train()  # Set the model to training mode
    for inputs, targets in dataloader:
        optimizer.zero_grad()  # Zero the gradients
        outputs = model(inputs)  # Forward pass
        loss = criterion(outputs.squeeze(), targets)  # Compute loss
        loss.backward()  # Backward pass
        optimizer.step()  # Update weights

    if (epoch + 1) % 10 == 0:  # Print every 10 epochs
        print(f'Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}')


Epoch [10/300], Loss: 544.2665
Epoch [20/300], Loss: 456.0600
Epoch [30/300], Loss: 200.9499
Epoch [40/300], Loss: 156.7540
Epoch [50/300], Loss: 85.5098
Epoch [60/300], Loss: 78.7120
Epoch [70/300], Loss: 55.3952
Epoch [80/300], Loss: 11.4354
Epoch [90/300], Loss: 27.5140
Epoch [100/300], Loss: 11.5311
Epoch [110/300], Loss: 12.9033
Epoch [120/300], Loss: 12.8996
Epoch [130/300], Loss: 29.9881
Epoch [140/300], Loss: 10.3531
Epoch [150/300], Loss: 22.0991
Epoch [160/300], Loss: 22.7489
Epoch [170/300], Loss: 28.9875
Epoch [180/300], Loss: 18.4435
Epoch [190/300], Loss: 16.1491
Epoch [200/300], Loss: 22.5499
Epoch [210/300], Loss: 26.3361
Epoch [220/300], Loss: 14.1477
Epoch [230/300], Loss: 34.0240
Epoch [240/300], Loss: 19.8755
Epoch [250/300], Loss: 45.0602
Epoch [260/300], Loss: 12.5134
Epoch [270/300], Loss: 17.3365
Epoch [280/300], Loss: 24.1621
Epoch [290/300], Loss: 15.7739
Epoch [300/300], Loss: 9.2122


In [21]:
future_years = 10

# Initialize answers as a list of lists to store predictions for each sequence
answers = [[] for _ in range(len(X))]  # Create a list for each sequence
answers_array = []
with torch.no_grad():  # No need to compute gradients during inference
    current_sequences = [torch.tensor(seq).float().unsqueeze(0).to('cuda') for seq in X]  # Move to GPU if available

    for _ in range(future_years):
        print("Current year:", _)
        for i, sequence in enumerate(current_sequences):
            print(f"Index: {i}, Sequence before prediction: {sequence.shape}")

            # Check the length of the sequence
            if sequence.size(1) != 58:  # Assuming your sequence length is 58
                print(f"Error: Sequence length is {sequence.size(1)} instead of 58.")
                continue

            # Make the prediction
            pred = model(sequence)  # pred shape: (1, 1, 1)

            # Append the prediction to the answers
            answers[i].append(pred.item())  # Use .item() to get the scalar value

            # Update the sequence for the next prediction
            sequence = sequence.squeeze(0)  # Remove the batch dimension; shape: (58,)
            sequence = sequence[1:]  # Remove the oldest value; shape: (57,)

            # Squeeze the prediction to ensure it's a scalar
            new_value = pred.squeeze()  # This gives a shape of (1,)
            new_value = new_value.unsqueeze(0)  # Now new_value has shape (1,)

            # Concatenate the tensors
            
            sequence = sequence.flatten() 
            sequence = torch.cat((sequence, new_value))  # sequence is (57,) + new_value (1,) -> shape (58,)

            sequence = sequence.unsqueeze(0)  # Add the batch dimension back; shape: (1, 58)

# Convert answers to a numpy array for easier handling
answers_array = np.array(answers)

# Print the predicted temperatures for future years
print(answers_array)


Current year: 0
Index: 0, Sequence before prediction: torch.Size([1, 58, 1])
Index: 1, Sequence before prediction: torch.Size([1, 58, 1])
Index: 2, Sequence before prediction: torch.Size([1, 58, 1])
Index: 3, Sequence before prediction: torch.Size([1, 58, 1])
Index: 4, Sequence before prediction: torch.Size([1, 58, 1])
Index: 5, Sequence before prediction: torch.Size([1, 58, 1])
Index: 6, Sequence before prediction: torch.Size([1, 58, 1])
Index: 7, Sequence before prediction: torch.Size([1, 58, 1])
Index: 8, Sequence before prediction: torch.Size([1, 58, 1])
Index: 9, Sequence before prediction: torch.Size([1, 58, 1])
Index: 10, Sequence before prediction: torch.Size([1, 58, 1])
Index: 11, Sequence before prediction: torch.Size([1, 58, 1])
Index: 12, Sequence before prediction: torch.Size([1, 58, 1])
Index: 13, Sequence before prediction: torch.Size([1, 58, 1])
Index: 14, Sequence before prediction: torch.Size([1, 58, 1])
Index: 15, Sequence before prediction: torch.Size([1, 58, 1])
In

In [22]:
column_names = [f'Prediction_{i+1}' for i in range(future_years)]
answers_df = pd.DataFrame(answers_array, columns=column_names)

# Print the DataFrame
print(answers_df)
answers_df.to_csv("testing.csv")

     Prediction_1  Prediction_2  Prediction_3  Prediction_4  Prediction_5  \
0       35.052746     36.630577     35.017693     34.192799     34.992867   
1       34.838684     34.104015     30.729359     34.825161     33.324871   
2       37.380367     31.545399     33.089275     34.858410     35.792072   
3       31.431585     34.953934     34.764900     33.092918     28.958958   
4       35.517902     31.567177     34.901669     38.379635     37.526020   
..            ...           ...           ...           ...           ...   
360     34.834999     29.071709     33.315567     31.370361     36.505798   
361     36.409580     32.979710     32.357952     35.005680     30.659534   
362     34.859081     35.695332     34.286831     31.721733     38.171333   
363     29.932579     36.509663     36.556732     39.008392     34.749393   
364     32.491989     35.701015     33.084942     36.366367     35.771694   

     Prediction_6  Prediction_7  Prediction_8  Prediction_9  Prediction_10 