<a href="https://colab.research.google.com/github/sajabdoli/Temp_predict/blob/main/Simple_temp_pred.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Time series prediction


This notebook demonstrates a simple prediction model based on an LSTM. The input data is a CSV file consisting of recordings of room temperature from both inside and outside.

For more information on the data and downloading please refer to: https://www.kaggle.com/datasets/atulanandjha/temperature-readings-iot-devices

For this project, we consider the temperature from inside the room. The data is recorded every minute. A sequence of 10 recordings is used to predict the immediate future. This number is adjustable.

These features are used for prediction:



> 'year', 'month', 'weekday', 'hour', 'minute', 'season', 'timing'



It is a naive feature selection. More features might be extracted from the raw data or we may reduce this set. A good approach might be using the ANOVA table for the best features selection.

In [1]:
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
def month2seasons(x):
    if x in [12, 1, 2]:
        season = 'Winter'
    elif x in [3, 4, 5]:
        season = 'Summer'
    elif x in [6, 7, 8, 9]:
        season = 'Monsoon'
    elif x in [10, 11]:
        season = 'Post_Monsoon'
    return season

In [3]:
def hours2timing(x):
    if x in [22,23,0,1,2,3]:
        timing = 'Night'
    elif x in range(4, 12):
        timing = 'Morning'
    elif x in range(12, 17):
        timing = 'Afternoon'
    elif x in range(17, 22):
        timing = 'Evening'
    else:
        timing = 'X'
    return timing

In [5]:
# assume that we have the data in google drive
data_raw = pd.read_csv('/content/sample_data/IOT-temp.csv')
data_raw.drop('room_id/id', axis=1, inplace=True)

# rename column names and adding some extra columns
data_raw.rename(columns={'noted_date':'date', 'out/in':'place'}, inplace=True)
data_raw['date'] = pd.to_datetime(data_raw['date'], format='%d-%m-%Y %H:%M')
data_raw['year'] = data_raw['date'].apply(lambda x : x.year)
data_raw['month'] = data_raw['date'].apply(lambda x : x.month)
data_raw['day'] = data_raw['date'].apply(lambda x : x.day)
data_raw['weekday'] = data_raw['date'].apply(lambda x : x.day_name())
data_raw['weekofyear'] = data_raw['date'].apply(lambda x : x.weekofyear)
data_raw['hour'] = data_raw['date'].apply(lambda x : x.hour)
data_raw['minute'] = data_raw['date'].apply(lambda x : x.minute)
data_raw['season'] = data_raw['month'].apply(month2seasons)
data_raw['timing'] = data_raw['hour'].apply(hours2timing)
data_raw.head(3)

Unnamed: 0,id,date,temp,place,year,month,day,weekday,weekofyear,hour,minute,season,timing
0,__export__.temp_log_196134_bd201015,2018-12-08 09:30:00,29,In,2018,12,8,Saturday,49,9,30,Winter,Morning
1,__export__.temp_log_196131_7bca51bc,2018-12-08 09:30:00,29,In,2018,12,8,Saturday,49,9,30,Winter,Morning
2,__export__.temp_log_196127_522915e3,2018-12-08 09:29:00,41,Out,2018,12,8,Saturday,49,9,29,Winter,Morning


In [6]:
data_first_ver = data_raw[["date", "temp", "place", "year", "month", "weekday", "season", "timing", "hour", "minute"]].copy() # Use a list to select multiple columns
data_first_ver

Unnamed: 0,date,temp,place,year,month,weekday,season,timing,hour,minute
0,2018-12-08 09:30:00,29,In,2018,12,Saturday,Winter,Morning,9,30
1,2018-12-08 09:30:00,29,In,2018,12,Saturday,Winter,Morning,9,30
2,2018-12-08 09:29:00,41,Out,2018,12,Saturday,Winter,Morning,9,29
3,2018-12-08 09:29:00,41,Out,2018,12,Saturday,Winter,Morning,9,29
4,2018-12-08 09:29:00,31,In,2018,12,Saturday,Winter,Morning,9,29
...,...,...,...,...,...,...,...,...,...,...
97601,2018-07-28 07:07:00,31,In,2018,7,Saturday,Monsoon,Morning,7,7
97602,2018-07-28 07:07:00,31,In,2018,7,Saturday,Monsoon,Morning,7,7
97603,2018-07-28 07:06:00,31,In,2018,7,Saturday,Monsoon,Morning,7,6
97604,2018-07-28 07:06:00,31,In,2018,7,Saturday,Monsoon,Morning,7,6


In [7]:
#remove duplicates based on date
data_first_ver.drop_duplicates(subset=['date'], inplace=True)
data_first_ver

Unnamed: 0,date,temp,place,year,month,weekday,season,timing,hour,minute
0,2018-12-08 09:30:00,29,In,2018,12,Saturday,Winter,Morning,9,30
2,2018-12-08 09:29:00,41,Out,2018,12,Saturday,Winter,Morning,9,29
6,2018-12-08 09:28:00,29,In,2018,12,Saturday,Winter,Morning,9,28
8,2018-12-08 09:26:00,29,In,2018,12,Saturday,Winter,Morning,9,26
10,2018-12-08 09:25:00,42,Out,2018,12,Saturday,Winter,Morning,9,25
...,...,...,...,...,...,...,...,...,...,...
97568,2018-07-28 07:10:00,31,In,2018,7,Saturday,Monsoon,Morning,7,10
97569,2018-07-28 07:09:00,32,Out,2018,7,Saturday,Monsoon,Morning,7,9
97571,2018-07-28 07:08:00,31,In,2018,7,Saturday,Monsoon,Morning,7,8
97576,2018-07-28 07:07:00,31,In,2018,7,Saturday,Monsoon,Morning,7,7


In [8]:
#sort data_first_ver based on date
data_first_ver.sort_values(by=['date'], inplace=True)

In [9]:
# Data Preparation

import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split

# SORT all the data according to date of recording
data_first_ver['date'] = pd.to_datetime(data_first_ver['date'])
data = data_first_ver.sort_values('date')


# Extract features and target
features = ['year', 'month', 'weekday', 'hour', 'minute', 'season', 'timing']
X = data[features].copy()
y = data['temp'].values

# MAPPING
# convert weekday to a neumerical value
weekday_mapping = {'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3, 'Friday': 4, 'Saturday': 5, 'Sunday': 6}
X['weekday'] = X['weekday'].map(weekday_mapping)

# create One_HOT vector
# One-hot encode categorical variables (e.g., season, timing)
X = pd.get_dummies(X, columns=['season', 'timing'])


# Prepare sequences for time-series forecasting
sequence_length = 10
X_seq, y_seq = [], []

In [10]:
#make SEQUENCE FROM DATA X and y with one item overlapping
for i in range(len(X) - sequence_length):
    X_seq.append(X[i:i + sequence_length])
    y_seq.append(y[i + sequence_length])

In [11]:
# Split into train, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X_seq, y_seq, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [12]:
# make the DATASET
from torch.utils.data import Dataset

class TimeSeriesDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(np.array([df.values.astype(np.float32) for df in X]), dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Create dataset objects
train_dataset = TimeSeriesDataset(X_train, y_train)
val_dataset = TimeSeriesDataset(X_val, y_val)
test_dataset = TimeSeriesDataset(X_test, y_test)


In [13]:
from torch.utils.data import DataLoader

batch_size = 64

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [14]:
# We'll define a simple LSTM model for our time-series prediction.
import torch.nn as nn

class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=2):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # h_0 : hidden state and c_0: cell state just for initialization of hidden and memory states at the begining of the seq
        # could also be rnds
        #h_0 = torch.zeros(num_layers, batch_size, hidden_size).to(device)
        h_0 = torch.zeros(2, x.size(0), 50).to(x.device)
        c_0 = torch.zeros(2, x.size(0), 50).to(x.device)

        out, _ = self.lstm(x, (h_0, c_0))
        out = out[:, -1, :]  # Take the output of the last time step
        out = self.fc(out)
        return out

input_size = X_train[0].shape[1]
hidden_size = 50
output_size = 1

model = LSTMModel(input_size, hidden_size, output_size)


In [None]:
#Training the Model

import torch.optim as optim
import tqdm as tqdm

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 100
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# iterate through the epochs
for epoch in tqdm.tqdm(range(num_epochs)):
    model.train()
    running_loss = 0.0
    # iterate through the batches and doe the magic!
    for i, (inputs, targets) in enumerate(train_loader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets.unsqueeze(1))
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")

In [None]:
# Validation loop
model.eval()
val_loss = 0.0
with torch.no_grad():
    for inputs, targets in val_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        val_loss += loss.item()

print(f"Validation Loss: {val_loss/len(val_loader):.4f}")

# Testing loop
test_loss = 0.0
with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        loss = criterion(outputs, targets.unsqueeze(1))
        test_loss += loss.item()

print(f"Test Loss: {test_loss/len(test_loader):.4f}")


In [None]:
# predict for a single sequence
X=torch.tensor(X_test[0].values.astype(np.float32), dtype=torch.float32).unsqueeze(0).to(device)
print (X.shape)
model.eval()
with torch.no_grad():
    pred = model(X)
    print (pred)