# IoT Weather Station Code 

This notebook covers data preprocessing and initial machine-learning (ML) model development for the IoT weather station.

In [5]:
# Install the third-party packages this notebook imports. `%pip` (rather than
# `!pip`) installs into the environment of the running kernel.
# NOTE(review): no versions are pinned here, so a fresh install may resolve to
# different releases than the ones this notebook was developed against.
%pip install pandas numpy matplotlib seaborn scikit-learn plotly

Collecting scikit-learn
  Using cached scikit_learn-1.7.2-cp310-cp310-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting plotly
  Using cached plotly-6.5.2-py3-none-any.whl.metadata (8.5 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.5.3-py3-none-any.whl.metadata (5.5 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.7.2-cp310-cp310-macosx_12_0_arm64.whl (8.7 MB)
Using cached plotly-6.5.2-py3-none-any.whl (9.9 MB)
Using cached joblib-1.5.3-py3-none-any.whl (309 kB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, plotly, joblib, scikit-learn
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4/4[0m [scikit-learn][0m [scikit-learn]
[1A[2KSuccessfully installed joblib-1.5.3 plotly-6.5.2 scikit-learn-1.7.2 threadpoolctl-3.6.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of p

In [11]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report, mean_squared_error, r2_score
from sklearn.ensemble import IsolationForest, GradientBoostingRegressor, StackingRegressor
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import plotly.express as px
import plotly.figure_factory as ff 
import math
import pathlib
from pathlib import Path
import os

In [9]:
# Record the directory the kernel is currently running in, then report it.
cwd = os.getcwd()

cwd_message = f"Current Working Directory: {cwd}"
print(cwd_message)

Current Working Directory: /Users/saikeerthan/Coding/NYP/IOTA/IoT_Weather_project/model-training/code


In [14]:
# Location of the raw weather-station CSV.
#
# The path is resolved relative to the kernel's working directory instead of
# being hard-coded to one user's home directory, so the notebook runs on any
# checkout. The default assumes the notebook is started from
# <project>/model-training/code and the data lives in the sibling
# <project>/model-training/datasets directory.
# Set the WEATHER_DATA_CSV environment variable to point at a different file.
DATA_ROOT = Path(
    os.environ.get(
        "WEATHER_DATA_CSV",
        Path.cwd().parent / "datasets" / "official_data.csv",
    )
)

# Quick sanity check that the dataset is where we expect it to be.
print(DATA_ROOT.exists())

True


## Data Processing 

In [15]:
# Load the raw readings from the CSV file located at DATA_ROOT.
df = pd.read_csv(filepath_or_buffer=DATA_ROOT)

# Bare last expression: let the notebook render the loaded frame.
df


Unnamed: 0,time,cidx,cattr,temp,humi,pres,windspeed,winddirection,rainfall,uvindex
0,2026-01-24 20:23:02,1,7,28.6,74,1016.746582,5.5,54,,
1,2026-01-24 20:28:02,2,7,28.5,74,1016.802490,5.3,53,,
2,2026-01-24 20:33:02,3,7,28.6,74,1016.859131,5.0,53,,
3,2026-01-24 20:38:02,4,7,28.5,74,1016.934326,4.9,50,,
4,2026-01-24 20:43:02,5,7,28.6,74,1016.971436,5.4,50,,
...,...,...,...,...,...,...,...,...,...,...
2271,2026-02-02 10:26:15,1045,7,38.5,47,1018.157959,6.9,75,,2.0
2272,2026-02-02 10:31:15,1046,7,38.5,47,1018.206055,8.2,73,,2.0
2273,2026-02-02 10:36:15,1047,7,38.5,47,1018.149414,8.2,66,,2.0
2274,2026-02-02 10:41:15,1048,7,38.5,47,1018.127930,8.1,47,,2.0


In [17]:
# Count rows that are exact repeats of an earlier row (every column equal).
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()

print(f"Duplicates present in DF: {duplicates}")

Duplicates present in DF: 0


In [None]:
# Total number of null cells across the whole frame:
# per-column null counts first, then summed into a single figure.
nulls_per_column = df.isna().sum()
missing = nulls_per_column.sum()

print(f"Missing values in df: {missing}")

# Per the original note, the missing values come from the last two columns.

Missing values in df: 3768


In [25]:
# Print a numbered (1-based) list of the column labels in df.
columns = df.columns

print("Columns in Dataset:\n")
for position, label in enumerate(columns, start=1):
    print(f"{position}. {label}")

Columns in Dataset:

1. time
2. cidx
3. cattr
4. temp
5. humi
6. pres
7. windspeed
8. winddirection
9. rainfall
10. uvindex


In [27]:
# Keep only the rows whose cattr value is at least 7; everything below is dropped.
df = df.loc[df["cattr"] >= 7]

# Bare last expression: show the filtered frame.
df

Unnamed: 0,time,cidx,cattr,temp,humi,pres,windspeed,winddirection,rainfall,uvindex
0,2026-01-24 20:23:02,1,7,28.6,74,1016.746582,5.5,54,,
1,2026-01-24 20:28:02,2,7,28.5,74,1016.802490,5.3,53,,
2,2026-01-24 20:33:02,3,7,28.6,74,1016.859131,5.0,53,,
3,2026-01-24 20:38:02,4,7,28.5,74,1016.934326,4.9,50,,
4,2026-01-24 20:43:02,5,7,28.6,74,1016.971436,5.4,50,,
...,...,...,...,...,...,...,...,...,...,...
2271,2026-02-02 10:26:15,1045,7,38.5,47,1018.157959,6.9,75,,2.0
2272,2026-02-02 10:31:15,1046,7,38.5,47,1018.206055,8.2,73,,2.0
2273,2026-02-02 10:36:15,1047,7,38.5,47,1018.149414,8.2,66,,2.0
2274,2026-02-02 10:41:15,1048,7,38.5,47,1018.127930,8.1,47,,2.0


In [28]:
# Columns that are removed from the frame before any further processing.
columns_to_delete = [
    "cattr",
    "windspeed",
    "winddirection",
    "rainfall",
    "uvindex",
    "cidx",
]

# drop() returns a new frame without the listed column labels.
df = df.drop(labels=columns_to_delete, axis="columns")

columns = df.columns
print(f"Remaining Columns:{columns}")

Remaining Columns:Index(['time', 'temp', 'humi', 'pres'], dtype='object')


In [29]:
# Show the dtype pandas currently holds for each remaining column.
dtype_summary = df.dtypes
print(dtype_summary)

time     object
temp    float64
humi      int64
pres    float64
dtype: object


In [30]:
# Parse the "time" column into pandas datetimes and store it back on df.
parsed_time = pd.to_datetime(df["time"])
df["time"] = parsed_time

# Print the dtypes again to confirm the conversion.
print(f"New Dtypes for columns in df: {df.dtypes}")

New Dtypes for columns in df: time    datetime64[ns]
temp           float64
humi             int64
pres           float64
dtype: object


In [None]:

# Feature engineering for next-reading forecasting.
# Adds calendar features, lagged sensor readings and next-step targets to df,
# then keeps only the fully populated rows in df_ml.
# (pandas/numpy are re-imported here as in the original cell; numpy is not
# used by this cell.)
import pandas as pd
import numpy as np

# Detach from whatever df was derived from before adding columns.
df = df.copy()

# The three sensor readings that get cleaned, lagged and forecast.
sensor_cols = ["temp", "humi", "pres"]

# --- 1) Clean dtypes and put rows in chronological order -------------------
# Unparseable timestamps become NaT and those rows are discarded.
df["time"] = pd.to_datetime(df["time"], errors="coerce")
df = df.dropna(subset=["time"])
df = df.sort_values("time")
df = df.reset_index(drop=True)

# Non-numeric sensor values become NaN and those rows are discarded.
for sensor in sensor_cols:
    df[sensor] = pd.to_numeric(df[sensor], errors="coerce")
df = df.dropna(subset=sensor_cols)
df = df.reset_index(drop=True)

# --- 2) Calendar features derived from the timestamp -----------------------
df = df.assign(
    hour=df["time"].dt.hour,
    dayofweek=df["time"].dt.dayofweek,
    month=df["time"].dt.month,
)

# --- 3) Lag features: the previous 1, 2 and 3 rows of each sensor ----------
# NOTE(review): shift() is positional, so "previous reading" means "previous
# row"; this presumes rows are evenly spaced in time - confirm there are no
# gaps in the series.
lags = [1, 2, 3]
for step in lags:
    for sensor in sensor_cols:
        df[f"{sensor}_lag{step}"] = df[sensor].shift(step)

# --- 4) Targets: the value of each sensor in the following row -------------
for sensor in sensor_cols:
    df[f"{sensor}_next"] = df[sensor].shift(-1)

# --- 5) Modelling frame: drop the edge rows left incomplete by the shifts --
complete_rows = df.dropna(how="any")
df_ml = complete_rows.copy()