# Sensor data prediction

Several DHT22 sensors were used to collect air temperature and relative humidity at different locations: one in the living room (LR), three in the bedroom (BR1–BR3), and one outside (O). Unfortunately, one of the sensors (BR3) produced an error and did not record any more data after about 25 h. A strong correlation between the sensors is assumed.

All temperatures (T) are given in °C, humidity (H) always refers to relative humidity in %.

In [None]:
# Import (standard) dependencies
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
import matplotlib.dates as mdates


In [None]:
# Load the sensor readings and get a first overview of the data
df = pd.read_csv("dht_data.csv")
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would only append a stray "None" line.
df.info()
df.head()

In [None]:
# Parse the "datetime" column into real datetime objects and re-check the dtypes
df["datetime"] = pd.to_datetime(df["datetime"])
df.info()

In [None]:
# Separate the rows by availability of the T_BR3 reading:
#   df_nonan -> rows where T_BR3 was recorded (complete part)
#   df_nan   -> rows where T_BR3 is missing (incomplete part)
# Both parts are independent copies so they can be modified without touching df.
has_t_br3 = df["T_BR3"].notna()
df_nonan = df[has_t_br3].copy()
df_nan = df[~has_t_br3].copy()
df_nan.info()

In [None]:
# Look for correlations between the sensors on the complete rows.
# Alternative overview (pairwise scatter plots):
# sns.pairplot(data=df_nonan)
corr_matrix = df_nonan.corr(numeric_only=True)  # numeric columns only
sns.heatmap(data=corr_matrix, cmap="vlag", annot=True)

### Predict missing values

Train and test LinearRegression and RandomForestRegressor models on the complete part of the dataset. Validate the models using their R² scores (to check for over- or underfitting) and compare the two approaches.

In [None]:
# Linear regression: estimate T_BR3 (the column with missing values) from the
# other three indoor temperature sensors, which are assumed to correlate linearly.
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline

# features / target, taken from the rows where T_BR3 is known
X = df_nonan[["T_LR", "T_BR1", "T_BR2"]]
y = df_nonan["T_BR3"]

# hold back 20 % of the rows as test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# scaling and regression bundled in one estimator
lr_pipeline = Pipeline(steps=[("scaler", StandardScaler()), ("model", LinearRegression())])

# 5-fold cross validation on the training portion only; the printed value is
# the mean R2 over the five folds
scores = cross_val_score(lr_pipeline, X_train, y_train, cv=5, scoring="r2")
print("R2 Training Set:", scores.mean())

# fit on the full training portion, then predict the held-back test rows
y_pred = lr_pipeline.fit(X_train, y_train).predict(X_test)

# R2 of the linear regression on the test set
r2 = r2_score(y_test, y_pred)
print("R2 test set:", r2)



In [None]:
# Random forest regression: estimate T_BR3 (the column with missing values)
# from the other three indoor temperature sensors.
from sklearn.ensemble import RandomForestRegressor
# cross_val_score is imported here as well, so this cell does not rely on the
# linear-regression cell having been executed first.
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score

# features / target, taken from the rows where T_BR3 is known
X = df_nonan[["T_LR", "T_BR1", "T_BR2"]]
y = df_nonan["T_BR3"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,      # 20% test set
    random_state=42     # reproducibility
)

# Model definition (training happens further below)
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42
)

# 5-fold cross validation on the training portion only; the printed value is
# the mean R2 over the five folds
scores = cross_val_score(
    rf_model,
    X_train,
    y_train,
    cv=5,
    scoring='r2'
)

print("R2 Training Set:", scores.mean())

# Model training on the full training portion
rf_model.fit(X_train, y_train)

# prediction on test set
y_pred = rf_model.predict(X_test)


# evaluation metric for the random forest regression
r2 = r2_score(y_test, y_pred)

print("R2 test set:", r2)

Play around with some hyperparameters and see how the values change. How does the result behave if you take other data fields into consideration?

Use the RF Regression to predict the missing temperature values.

In [None]:
# Fill the missing T_BR3 values with the predictions of the trained random forest
feature_columns = ["T_LR", "T_BR1", "T_BR2"]  # same features the model was trained on
X_predict = df_nan[feature_columns]

df_nan["T_BR3"] = rf_model.predict(X_predict)
df_nan.info()



Now repeat this approach for humidity (without the cross-validation) and see which model performs better, judged by the R² on the test set.

Recombine and visualise the data.

In [None]:
# Stack the measured rows and the rows with predicted values back into one DataFrame
frames = [df_nonan, df_nan]
df_pred = pd.concat(frames, axis="index")
df_pred.info()

In [None]:
# One DataFrame per quantity: temperature (T_*) and humidity (H_*), each with the timestamp
temperature_columns = ["datetime", "T_LR", "T_O", "T_BR1", "T_BR2", "T_BR3"]
humidity_columns = ["datetime", "H_LR", "H_O", "H_BR1", "H_BR2", "H_BR3"]
df_T = df_pred[temperature_columns]
df_H = df_pred[humidity_columns]

# Long format for plotting: one row per (datetime, variable, value)
df_T_long = df_T.melt(id_vars=["datetime"])
df_H_long = df_H.melt(id_vars=["datetime"])

In [None]:
# Temperature over time, one line per sensor.
# The commented-out lines are settings optimised for the measured data only.

fig, T_axes = plt.subplots(figsize=(12, 5), dpi=300)
T_plot = sns.lineplot(data=df_T_long, x="datetime", y="value", hue="variable", ax=T_axes)
T_plot.set_ylim(bottom=-1, top=25)
# T_plot.set_xlim([datetime.datetime(2026, 1, 29, 9,30,0), datetime.datetime(2026,1,30,12,0,0)])

# x axis: rotated two-line date/time labels
T_plot.tick_params(axis="x", rotation=45)
T_plot.xaxis.set_major_locator(mdates.HourLocator(interval=8))  # tick every 8 hours
# T_plot.xaxis.set_major_locator(mdates.HourLocator(interval=2))  # tick every 2 hours
T_plot.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d\n%H:%M'))

# axis titles
T_plot.xaxis.set_label_text("time")
T_plot.yaxis.set_label_text("temperature / °C")
# plt.legend(loc=(0.91,0.16))

plt.tight_layout()
plt.show()

In [None]:
# Relative humidity over time, one line per sensor.
# The commented-out lines are settings optimised for the measured data only.

fig, H_axes = plt.subplots(figsize=(12, 5), dpi=300)
H_plot = sns.lineplot(data=df_H_long, x="datetime", y="value", hue="variable", ax=H_axes)
# H_plot.set_xlim([datetime.datetime(2026,1,29,9,30,0,0), datetime.datetime(2026,1,30,12,30,0,0)])

# x axis: rotated two-line date/time labels
H_plot.tick_params(axis="x", rotation=45)
H_plot.xaxis.set_major_locator(mdates.HourLocator(interval=8))  # every 8 hours
# H_plot.xaxis.set_major_locator(mdates.HourLocator(interval=2))  # every 2 hours
H_plot.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d\n%H:%M'))

# axis titles and legend
H_plot.xaxis.set_label_text("time")
H_plot.yaxis.set_label_text("rel. humidity / %")
H_plot.legend(loc="upper right")

plt.tight_layout()
plt.show()

In [None]:
# Alternative: temperature (top) and humidity (bottom) as two stacked subplots
# that share one time axis.

fig, (ax1, ax2) = plt.subplots(
    2, 1,
    sharex=True,
    figsize=(10, 6)
)

# top panel: temperature
T_plot = sns.lineplot(data=df_T_long, x="datetime", y="value", hue="variable", ax=ax1)
T_plot.set_ylim(-1, 25)
T_plot.yaxis.set_label_text("temperature / °C")
# The x axis is shared, so only the bottom panel carries the axis title;
# keep the top panel free of an x label.
ax1.set_xlabel("")

# bottom panel: relative humidity
H_plot = sns.lineplot(data=df_H_long, x="datetime", y="value", hue="variable", ax=ax2)
H_plot.yaxis.set_label_text("rel. humidity / %")

# The "time" title belongs on the bottom axes: this is where the shared tick
# labels are drawn, and otherwise seaborn's default label ("datetime") shows.
ax2.set_xlabel("time")
ax2.tick_params("x", rotation=45)
ax2.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d\n%H:%M'))
ax2.xaxis.set_major_locator(mdates.HourLocator(interval=8))  # tick every 8 hours

plt.tight_layout()
plt.show()