# IoT Weather Station Code 

Involves Data Preprocessing & Initial ML development

In [1]:
%pip install pandas numpy matplotlib seaborn scikit-learn plotly pyarrow fastparquet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.3[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report, mean_squared_error, r2_score
from sklearn.ensemble import IsolationForest, GradientBoostingRegressor, StackingRegressor
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import plotly.express as px
import plotly.figure_factory as ff 
import math
import pathlib
from pathlib import Path
import os

In [3]:
#getting the directory of notebook 

cwd = os.getcwd()

print(f"Current Working Directory: {cwd}")

Current Working Directory: /Users/saikeerthan/Coding/NYP/IOTA/IoT_Weather_project/model-training/code


In [4]:
DATA_ROOT = Path("/Users/saikeerthan/Coding/NYP/IOTA/IoT_Weather_project/model-training/datasets/official_data.csv")

print(DATA_ROOT.exists())

True


## Data Processing 

In [5]:
# retrieving the dataset

df = pd.read_csv(DATA_ROOT) 

df


Unnamed: 0,time,cidx,cattr,temp,humi,pres,windspeed,winddirection,rainfall,uvindex
0,2026-01-24 20:23:02,1,7,28.6,74,1016.746582,5.5,54,,
1,2026-01-24 20:28:02,2,7,28.5,74,1016.802490,5.3,53,,
2,2026-01-24 20:33:02,3,7,28.6,74,1016.859131,5.0,53,,
3,2026-01-24 20:38:02,4,7,28.5,74,1016.934326,4.9,50,,
4,2026-01-24 20:43:02,5,7,28.6,74,1016.971436,5.4,50,,
...,...,...,...,...,...,...,...,...,...,...
2271,2026-02-02 10:26:15,1045,7,38.5,47,1018.157959,6.9,75,,2.0
2272,2026-02-02 10:31:15,1046,7,38.5,47,1018.206055,8.2,73,,2.0
2273,2026-02-02 10:36:15,1047,7,38.5,47,1018.149414,8.2,66,,2.0
2274,2026-02-02 10:41:15,1048,7,38.5,47,1018.127930,8.1,47,,2.0


In [6]:
#checking for duplicates 

duplicates = df.duplicated().sum()

print(f"Duplicates present in DF: {duplicates}")

Duplicates present in DF: 0


In [7]:
# checking for missing values 

missing = df.isnull().sum().sum()

print(f"Missing values in df: {missing}")

# missing values is because of last two columns

Missing values in df: 3768


In [8]:
#columns in df 

columns = df.columns

print("Columns in Dataset:\n")
for i, col in enumerate(df.columns, 1):
    print(f"{i}. {col}")

Columns in Dataset:

1. time
2. cidx
3. cattr
4. temp
5. humi
6. pres
7. windspeed
8. winddirection
9. rainfall
10. uvindex


In [9]:
# drop entries where cattr is below 7

df = df[df["cattr"] >=7]

df

Unnamed: 0,time,cidx,cattr,temp,humi,pres,windspeed,winddirection,rainfall,uvindex
0,2026-01-24 20:23:02,1,7,28.6,74,1016.746582,5.5,54,,
1,2026-01-24 20:28:02,2,7,28.5,74,1016.802490,5.3,53,,
2,2026-01-24 20:33:02,3,7,28.6,74,1016.859131,5.0,53,,
3,2026-01-24 20:38:02,4,7,28.5,74,1016.934326,4.9,50,,
4,2026-01-24 20:43:02,5,7,28.6,74,1016.971436,5.4,50,,
...,...,...,...,...,...,...,...,...,...,...
2271,2026-02-02 10:26:15,1045,7,38.5,47,1018.157959,6.9,75,,2.0
2272,2026-02-02 10:31:15,1046,7,38.5,47,1018.206055,8.2,73,,2.0
2273,2026-02-02 10:36:15,1047,7,38.5,47,1018.149414,8.2,66,,2.0
2274,2026-02-02 10:41:15,1048,7,38.5,47,1018.127930,8.1,47,,2.0


In [10]:
columns_to_delete = ["cattr", "windspeed", "winddirection", "rainfall", "uvindex", "cidx"]

df = df.drop(columns=columns_to_delete)

columns = df.columns
print(f"Remaining Columns:{columns}")

Remaining Columns:Index(['time', 'temp', 'humi', 'pres'], dtype='object')


In [11]:
# check the dtype of every column in the df 

print(df.dtypes)

time     object
temp    float64
humi      int64
pres    float64
dtype: object


In [12]:
df["time"] = pd.to_datetime(df["time"])

print(f"New Dtypes for columns in df: {df.dtypes}")

New Dtypes for columns in df: time    datetime64[ns]
temp           float64
humi             int64
pres           float64
dtype: object


In [13]:

# Convert to datetime safely
df["time"] = pd.to_datetime(df["time"], errors="coerce")

# Drop invalid timestamps
df = df.dropna(subset=["time"])

# Sort chronologically (CRITICAL for time-series ML)
df = df.sort_values("time").reset_index(drop=True)

# (Recommended) Set as index if doing time-based operations later
df = df.set_index("time")

# Extract time features (daily patterns help weather prediction)
df["hour"] = df.index.hour
df["dayofweek"] = df.index.dayofweek

# Cyclic encoding (prevents 23 → 0 discontinuity)
df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24)
df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)


In [14]:
df

Unnamed: 0_level_0,temp,humi,pres,hour,dayofweek,hour_sin,hour_cos
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2026-01-24 20:23:02,28.6,74,1016.746582,20,5,-0.866025,0.500000
2026-01-24 20:28:02,28.5,74,1016.802490,20,5,-0.866025,0.500000
2026-01-24 20:33:02,28.6,74,1016.859131,20,5,-0.866025,0.500000
2026-01-24 20:38:02,28.5,74,1016.934326,20,5,-0.866025,0.500000
2026-01-24 20:43:02,28.6,74,1016.971436,20,5,-0.866025,0.500000
...,...,...,...,...,...,...,...
2026-02-02 10:26:15,38.5,47,1018.157959,10,0,0.500000,-0.866025
2026-02-02 10:31:15,38.5,47,1018.206055,10,0,0.500000,-0.866025
2026-02-02 10:36:15,38.5,47,1018.149414,10,0,0.500000,-0.866025
2026-02-02 10:41:15,38.5,47,1018.127930,10,0,0.500000,-0.866025


In [15]:
# save the new df 

df.to_csv("df_preprocessed.csv")

In [16]:
%pip install -U pyarrow
%pip instlal -U fastparquet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.3[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
ERROR: unknown command "instlal" - maybe you meant "install"
Note: you may need to restart the kernel to use updated packages.


In [17]:
WEATHER_COLS = ["temp", "humi", "pres"]

for col in WEATHER_COLS:
    df[f"{col}_lag1"] = df[col].shift(1)
    df[f"{col}_lag2"] = df[col].shift(2)
    df[f"{col}_lag3"] = df[col].shift(3)

# -----------------------------
# 5️⃣ CREATE NEXT-HOUR TARGETS
# -----------------------------
# ⚠ Adjust this based on sampling rate
# If data every 5 min → 12 steps = 1 hour
# If data every 10 min → 6 steps
# If data hourly → 1 step

SHIFT_STEPS = 12   # ← CHANGE IF NEEDED

for col in WEATHER_COLS:
    df[f"{col}_next1h"] = df[col].shift(-SHIFT_STEPS)

# -----------------------------
# 6️⃣ REMOVE NAN ROWS FROM SHIFTS
# -----------------------------
df = df.dropna()

# -----------------------------
# 7️⃣ OPTIONAL: DROP RAW HOUR IF USING SIN/COS
# -----------------------------
# df = df.drop(columns=["hour"])

# -----------------------------
# 8️⃣ SAVE ML-READY DATASET
# -----------------------------
df.to_parquet("weather_ml_ready.parquet")
df.to_csv("weather_ml_ready.csv")

print("✅ ML-ready dataset created")
print(df.shape)

✅ ML-ready dataset created
(2182, 19)
