# Regression Models on Clean Data

### Check correlation of the target variable "Temperature_F" with the other variables

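Since the dataset is already clean, no further preprocessing is needed: we first rank the features by their correlation with the target and then train the regression models.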

In [None]:
import pandas as pd

# Read the cleaned dataset from CSV; the modelling cells further down reuse `df`.
df = pd.read_csv("pirvision_office_dataset_clean.csv")

# Build the correlation matrix over the numeric columns only, keep the
# 'Temperature_F' column, and order it from the most positive correlation
# down to the most negative one.
correlations_with_label = (
    df.corr(numeric_only=True)['Temperature_F']
    .sort_values(ascending=False)
)

# Print the ranked correlations (Temperature_F itself comes first at 1.0).
print(correlations_with_label)


Temperature_F    1.000000
PIR_47           0.748802
PIR_46           0.746528
PIR_48           0.739017
PIR_49           0.719185
PIR_45           0.706532
PIR_4            0.697772
PIR_50           0.694808
PIR_51           0.662454
PIR_3            0.650395
PIR_5            0.635102
PIR_52           0.609599
PIR_44           0.535233
PIR_53           0.522956
PIR_6            0.467950
PIR_2            0.430311
PIR_28           0.412462
PIR_27           0.412116
PIR_29           0.399365
PIR_54           0.396627
PIR_26           0.394778
PIR_30           0.379659
PIR_25           0.373766
PIR_31           0.368027
PIR_24           0.335798
PIR_32           0.334034
PIR_33           0.316666
PIR_34           0.308991
PIR_35           0.280624
PIR_23           0.271516
PIR_36           0.258536
PIR_55           0.199889
PIR_1            0.171217
PIR_22           0.158690
PIR_7            0.086809
PIR_43           0.051545
PIR_21          -0.006377
PIR_37          -0.050680
PIR_20      

### Linear Regression (single variable)

In [None]:
import pandas as pd
import plotly.express as px
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error


# Univariate setup: PIR_47 is the only predictor, Temperature_F is the target.
# The double brackets keep X as a one-column DataFrame rather than a Series.
y = df['Temperature_F']
X = df[['PIR_47']]

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit the linear model on the training rows (fit() returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows and report the fit quality on them.
y_pred = model.predict(X_test)
print("R² Score:", r2_score(y_test, y_pred))
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))

# Plotting frame: the test feature values next to the actual and predicted target.
# .values drops y_test's index so the column is filled positionally.
plot_df = X_test.assign(**{
    'Actual Temperature': y_test.values,
    'Predicted Temperature': y_pred,
})

# Scatter of the actual test points with the fitted line drawn on top.
fig = px.scatter(
    plot_df,
    x='PIR_47',
    y='Actual Temperature',
    title='Simple Linear Regression: PIR_47 vs Temperature_F',
    labels={'PIR_47': 'PIR_47 Sensor Reading', 'Actual Temperature': 'Actual Temperature'},
)
fig.add_scatter(
    x=plot_df['PIR_47'],
    y=plot_df['Predicted Temperature'],
    mode='lines',
    name='Regression Line',
)
fig.update_layout(width=800, height=500)
fig.show()


R² Score: 0.5623213930060393
Mean Squared Error: 0.686216058579025


### Linear Regression (Multivariable)

In [None]:
import pandas as pd
import plotly.express as px
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# Re-read the cleaned dataset so this cell does not depend on an earlier `df`.
df = pd.read_csv("pirvision_office_dataset_clean.csv")

# Predictors: the seven columns ranked highest against Temperature_F in the
# correlation listing printed at the top of the notebook.
top_features = ['PIR_47', 'PIR_46', 'PIR_48', 'PIR_49', 'PIR_45', 'PIR_4', 'PIR_50']
y = df['Temperature_F']
X = df[top_features]

# 70/30 train/test split with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit the multivariable linear model (fit() returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows and report the fit quality on them.
y_pred = model.predict(X_test)
print("R² Score:", r2_score(y_test, y_pred))
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))

# Actual-vs-predicted scatter; points on the dashed diagonal are exact predictions.
result_df = pd.DataFrame({'Actual Temperature': y_test, 'Predicted Temperature': y_pred})
fig = px.scatter(
    result_df,
    x='Actual Temperature',
    y='Predicted Temperature',
    title='Multivariable Linear Regression: Actual vs Predicted Temperature',
    labels={'Actual Temperature': 'Actual Temperature', 'Predicted Temperature': 'Predicted Temperature'},
)

# Diagonal y = x spanning the range of the actual test values.
diag_min = result_df['Actual Temperature'].min()
diag_max = result_df['Actual Temperature'].max()
fig.add_shape(
    type="line",
    line=dict(dash="dash", color="red"),
    x0=diag_min, y0=diag_min,
    x1=diag_max, y1=diag_max,
)
fig.update_layout(width=800, height=500)
fig.show()


R² Score: 0.5738004993087263
Mean Squared Error: 0.6682184983666561


### Polynomial Regression (Single Variable)

In [None]:
import pandas as pd
import plotly.express as px
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# Feature and target: PIR_47 as a one-column DataFrame, Temperature_F as the label.
X_single = df[['PIR_47']]
y = df['Temperature_F']

# Split the dataset: 30% held out for testing, fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X_single, y, test_size=0.3, random_state=42)

# Expand PIR_47 into polynomial terms up to degree 3. The transformer is fitted
# on the training rows only and then reused unchanged on the test rows.
poly = PolynomialFeatures(degree=3)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Train polynomial regression model (a linear model on the expanded features)
poly_model = LinearRegression()
poly_model.fit(X_train_poly, y_train)

# Predict
y_pred = poly_model.predict(X_test_poly)

# Evaluate on the held-out rows
print("Polynomial Regression (Single Variable)")
print("R2 Score:", r2_score(y_test, y_pred))
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))

# Prepare data for plotting; .values drops y_test's index so the column is
# filled positionally alongside X_test.
plot_df = X_test.copy()
plot_df['Actual Temperature_F'] = y_test.values
plot_df['Predicted Temperature_F'] = y_pred

# Sort by feature value so the fitted curve is drawn left to right as one
# smooth line instead of zig-zagging between unordered points.
plot_df = plot_df.sort_values(by='PIR_47')

# Plot with Plotly. The title reads the degree from the transformer so the
# label always matches the model that was actually fitted.
fig = px.scatter(plot_df, x='PIR_47', y='Actual Temperature_F',
                 title=f'Polynomial Regression (Degree {poly.degree}) - Single Variable: PIR_47',
                 labels={'PIR_47': 'PIR_47', 'Actual Temperature_F': 'Actual Temperature_F'})

fig.add_scatter(x=plot_df['PIR_47'], y=plot_df['Predicted Temperature_F'],
                mode='lines', name='Polynomial Regression Line')

fig.update_layout(width=800, height=500)
fig.show()


Polynomial Regression (Single Variable)
R2 Score: 0.6355938938005641
Mean Squared Error: 0.5713354912084085


### Polynomial Regression (Multivariable)

In [None]:
import pandas as pd
import plotly.express as px
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# Predictors (seven PIR columns) and the Temperature_F target.
feature_cols = ['PIR_47', 'PIR_46', 'PIR_48', 'PIR_49', 'PIR_45', 'PIR_4', 'PIR_50']
y = df['Temperature_F']
X = df[feature_cols]

# 70/30 train/test split with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Degree-2 polynomial expansion of the predictors. The transformer is fitted
# on the training rows only and then applied unchanged to the test rows.
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Linear model on the expanded features (fit() returns the estimator itself).
model = LinearRegression().fit(X_train_poly, y_train)

# Predict the held-out rows and report the fit quality on them.
y_pred = model.predict(X_test_poly)
print("Multivariable Polynomial Regression (Degree 2)")
print("R2 Score:", r2_score(y_test, y_pred))
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))

# Actual-vs-predicted frame, re-indexed from 0 for plotting.
plot_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred}).reset_index(drop=True)

fig = px.scatter(
    plot_df,
    x='Actual',
    y='Predicted',
    title='Actual vs Predicted Temperature_F (Multivariable Polynomial Regression)',
    labels={'Actual': 'Actual Temperature_F', 'Predicted': 'Predicted Temperature_F'},
)

# Dashed diagonal y = x across the range of actual values; points on it are
# exact predictions.
diag_min = plot_df['Actual'].min()
diag_max = plot_df['Actual'].max()
fig.add_shape(
    type='line',
    x0=diag_min, y0=diag_min,
    x1=diag_max, y1=diag_max,
    line=dict(color='red', dash='dash'),
)

fig.update_layout(width=700, height=500)
fig.show()


Multivariable Polynomial Regression (Degree 2)
R2 Score: 0.6672463325517946
Mean Squared Error: 0.5217090954531711


### Decision Tree Regressor

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import plotly.express as px
import plotly.graph_objects as go

# Columns used as predictors, and the column to predict.
target_col = 'Temperature_F'
feature_cols = ['PIR_47', 'PIR_46', 'PIR_48', 'PIR_49', 'PIR_45', 'PIR_4', 'PIR_50']

# Feature matrix and label vector taken from the already-loaded frame.
y = df[target_col]
X = df[feature_cols]

# 70/30 train/test split with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Decision tree with a fixed random_state, fitted on the training rows
# (fit() returns the estimator itself).
regressor = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Predictions for the held-out rows; the plotting cell below reuses
# y_test and y_pred.
y_pred = regressor.predict(X_test)

# Held-out error metrics, reported to two decimals.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R^2 Score: {r2:.2f}")


Mean Squared Error: 0.91
R^2 Score: 0.42


In [None]:
# Plotly scatter of actual vs predicted values for the decision tree.
# Relies on y_test / y_pred (and the pd, px, go imports) from the previous cell.
results_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

fig = px.scatter(results_df, x='Actual', y='Predicted',
                 title='Decision Tree Regression: Actual vs Predicted',
                 labels={'Actual': 'Actual Values', 'Predicted': 'Predicted Values'})

# Reference line y = x ("perfect prediction"). Two endpoints spanning the range
# of actual values are enough; passing every test point in row order would make
# the line trace retrace the diagonal back and forth with one vertex per row.
actual_min = results_df['Actual'].min()
actual_max = results_df['Actual'].max()
fig.add_trace(go.Scatter(x=[actual_min, actual_max], y=[actual_min, actual_max],
                         mode='lines', name='Perfect Prediction', line=dict(color='red')))
fig.show()