In [37]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Read the raw readings and discard every row that has a missing value
# in any column (dropna's default behaviour), in a single chained expression.
df = pd.read_csv("data.csv").dropna()

# Preview the first rows of the cleaned frame
df.head()

Unnamed: 0,Equipment SNO,Asset Number,Reading Name,Answer Value,Timestamp,Is Error Set?,Asset,Company
0,68B6B34180C8-3,FSCHN-E-00001,activeenergydla,76646.43,20/02/24 15:58,False,AHU DB,Chennai – Bayline
1,68B6B34180C8-3,FSCHN-E-00001,activeenergydla,76641.62,20/02/24 15:51,False,AHU DB,Chennai – Bayline
2,68B6B34180C8-3,FSCHN-E-00001,activeenergydla,76636.78,20/02/24 15:44,False,AHU DB,Chennai – Bayline
3,68B6B34180C8-3,FSCHN-E-00001,activeenergydla,76633.33,20/02/24 15:39,False,AHU DB,Chennai – Bayline
4,68B6B34180C8-3,FSCHN-E-00001,activeenergydla,76628.46,20/02/24 15:32,False,AHU DB,Chennai – Bayline


In [38]:
# Ensure 'Timestamp' column is in datetime format.
# The raw strings are day-first with a two-digit year (e.g. "20/02/24 15:58"),
# so the format is spelled out explicitly: without it pandas has to guess,
# emits a parsing warning, and ambiguous values such as "05/02/24" may be
# read month-first. An explicit format also makes any row that does not
# match fail loudly instead of being silently mis-parsed.
df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d/%m/%y %H:%M')

  df['Timestamp'] = pd.to_datetime(df['Timestamp'])


In [39]:
# Normalization: rescale 'Answer Value' into the [0, 1] range.
# The scaled frame deliberately reuses df's index. df.dropna() leaves gaps in
# the index when rows are removed, and pandas aligns on index labels when
# columns from df are later assigned into df_scaled; a fresh 0..n-1 index
# would pair scaled values with the wrong rows (or with NaN).
# NOTE(review): the scaler is fitted on the full dataset before the later
# train/test split, so test-set range information reaches training — confirm
# this is acceptable for the intended evaluation.
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(
    scaler.fit_transform(df[['Answer Value']]),
    columns=['Answer Value'],
    index=df.index,
)


In [40]:
# Carry the non-scaled descriptive columns over from df, next to the scaled value.
# NOTE: this assignment aligns rows by index label, so df_scaled has to share
# df's index for the values to land on the right rows.
carried_columns = [
    'Equipment SNO', 'Asset Number', 'Reading Name', 'Timestamp',
    'Is Error Set?', 'Asset', 'Company',
]
df_scaled[carried_columns] = df[carried_columns]

# Show the first rows of the combined frame
print(df_scaled.head())


   Answer Value   Equipment SNO   Asset Number     Reading Name   
0      1.000000  68B6B34180C8-3  FSCHN-E-00001  activeenergydla  \
1      0.999786  68B6B34180C8-3  FSCHN-E-00001  activeenergydla   
2      0.999570  68B6B34180C8-3  FSCHN-E-00001  activeenergydla   
3      0.999416  68B6B34180C8-3  FSCHN-E-00001  activeenergydla   
4      0.999199  68B6B34180C8-3  FSCHN-E-00001  activeenergydla   

            Timestamp  Is Error Set?   Asset            Company  
0 2024-02-20 15:58:00          False  AHU DB  Chennai – Bayline  
1 2024-02-20 15:51:00          False  AHU DB  Chennai – Bayline  
2 2024-02-20 15:44:00          False  AHU DB  Chennai – Bayline  
3 2024-02-20 15:39:00          False  AHU DB  Chennai – Bayline  
4 2024-02-20 15:32:00          False  AHU DB  Chennai – Bayline  


In [41]:
# Calendar features read from the parsed timestamp via the .dt accessor
timestamps = df_scaled['Timestamp'].dt
df_scaled['hour'] = timestamps.hour
df_scaled['day_of_week'] = timestamps.dayofweek  # pandas convention: Monday=0 ... Sunday=6
df_scaled['month'] = timestamps.month

# Binary day-type flags (1 when the condition holds, 0 otherwise), computed
# with vectorised comparisons instead of a per-row lambda.
weekday_index = df_scaled['day_of_week']
df_scaled['is_weekend'] = (weekday_index >= 5).astype('int64')
df_scaled['is_sunday'] = (weekday_index == 6).astype('int64')
df_scaled['is_weekday'] = (weekday_index < 5).astype('int64')

# Show the first rows including the new feature columns
print(df_scaled.head())


   Answer Value   Equipment SNO   Asset Number     Reading Name   
0      1.000000  68B6B34180C8-3  FSCHN-E-00001  activeenergydla  \
1      0.999786  68B6B34180C8-3  FSCHN-E-00001  activeenergydla   
2      0.999570  68B6B34180C8-3  FSCHN-E-00001  activeenergydla   
3      0.999416  68B6B34180C8-3  FSCHN-E-00001  activeenergydla   
4      0.999199  68B6B34180C8-3  FSCHN-E-00001  activeenergydla   

            Timestamp  Is Error Set?   Asset            Company  hour   
0 2024-02-20 15:58:00          False  AHU DB  Chennai – Bayline    15  \
1 2024-02-20 15:51:00          False  AHU DB  Chennai – Bayline    15   
2 2024-02-20 15:44:00          False  AHU DB  Chennai – Bayline    15   
3 2024-02-20 15:39:00          False  AHU DB  Chennai – Bayline    15   
4 2024-02-20 15:32:00          False  AHU DB  Chennai – Bayline    15   

   day_of_week  month  is_weekend  is_sunday  is_weekday  
0            1      2           0          0           1  
1            1      2           0       

In [45]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Model inputs are the engineered calendar columns; the target is the
# min-max-scaled 'Answer Value'.
feature_columns = ['hour', 'day_of_week', 'month', 'is_weekend', 'is_sunday', 'is_weekday']
X = df_scaled[feature_columns]
y = df_scaled['Answer Value']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create the linear regression estimator and train it on the training split.
model = LinearRegression()
model.fit(X_train, y_train)


In [43]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Predict the held-out rows with the fitted model
y_pred = model.predict(X_test)

# Error metrics on the test split. The target is the min-max-scaled
# 'Answer Value', so both numbers are in scaled units, not original units.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("Mean Absolute Error (MAE):", mae)
print("Root Mean Squared Error (RMSE):", rmse)


Mean Absolute Error (MAE): 0.2461536706111882
Root Mean Squared Error (RMSE): 0.28526935163350825


In [44]:
import joblib

# Persist the fitted estimator to a pickle file in the working directory.
# NOTE(review): only the regression model is written; the fitted `scaler` is
# not saved, so predictions loaded from this file stay in scaled units —
# confirm whether the scaler should be stored alongside it.
model_path = 'linear_regression_model.pkl'
joblib.dump(model, model_path)

print("Model saved successfully!")

Model saved successfully!
