In [1]:
# Import libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the electricity-bill CSV into a DataFrame.
# NOTE: the location below is an absolute, machine-specific path; adjust it when running elsewhere.
dataset_path = r"C:\Users\cease\Documents\Datasets\End to End Projects\New Data\electricity_bill_dataset.csv"
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,Fan,Refrigerator,AirConditioner,Television,Monitor,MotorPump,Month,City,Company,MonthlyHours,TariffRate,ElectricityBill
0,16,23.0,2.0,6.0,1.0,0,10,Hyderabad,Tata Power Company Ltd.,384,8.4,3225.6
1,19,22.0,2.0,3.0,1.0,0,5,Vadodara,NHPC,488,7.8,3806.4
2,7,20.0,2.0,6.0,7.0,0,7,Shimla,Jyoti Structure,416,7.7,3203.2
3,7,22.0,3.0,21.0,1.0,0,6,Mumbai,Power Grid Corp,475,9.2,4370.0
4,11,23.0,2.0,11.0,1.0,0,2,Mumbai,Ratnagiri Gas and Power Pvt. Ltd. (RGPPL),457,9.2,4204.4


In [3]:
# Print each column's dtype and non-null count, plus the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45345 entries, 0 to 45344
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Fan              45345 non-null  int64  
 1   Refrigerator     45345 non-null  float64
 2   AirConditioner   45345 non-null  float64
 3   Television       45345 non-null  float64
 4   Monitor          45345 non-null  float64
 5   MotorPump        45345 non-null  int64  
 6   Month            45345 non-null  int64  
 7   City             45345 non-null  object 
 8   Company          45345 non-null  object 
 9   MonthlyHours     45345 non-null  int64  
 10  TariffRate       45345 non-null  float64
 11  ElectricityBill  45345 non-null  float64
dtypes: float64(6), int64(4), object(2)
memory usage: 4.2+ MB


In [4]:
# Pair each run-together header with its space-separated, human-readable label
column_names = dict(
    zip(
        ['AirConditioner', 'MotorPump', 'MonthlyHours', 'TariffRate', 'ElectricityBill'],
        ['Air Conditioner', 'Motor Pump', 'Monthly Hours', 'Tariff Rate', 'Electricity Bill'],
    )
)

# Apply the mapping along the column axis; headers not listed are left as they are
df = df.rename(column_names, axis='columns')

# Preview the first rows with the new headers
df.head()

Unnamed: 0,Fan,Refrigerator,Air Conditioner,Television,Monitor,Motor Pump,Month,City,Company,Monthly Hours,Tariff Rate,Electricity Bill
0,16,23.0,2.0,6.0,1.0,0,10,Hyderabad,Tata Power Company Ltd.,384,8.4,3225.6
1,19,22.0,2.0,3.0,1.0,0,5,Vadodara,NHPC,488,7.8,3806.4
2,7,20.0,2.0,6.0,7.0,0,7,Shimla,Jyoti Structure,416,7.7,3203.2
3,7,22.0,3.0,21.0,1.0,0,6,Mumbai,Power Grid Corp,475,9.2,4370.0
4,11,23.0,2.0,11.0,1.0,0,2,Mumbai,Ratnagiri Gas and Power Pvt. Ltd. (RGPPL),457,9.2,4204.4


In [5]:
# df.to_csv('Home Electricity Bill Prediction.csv', index=False)

In [6]:
# For every object-typed (categorical) column, print how often each distinct value occurs
for col in df.columns:
    if df[col].dtype != 'object':
        continue  # not an object column: nothing to tabulate

    frequencies = df[col].value_counts()
    print(f"Column '{col}' categorical values count:")
    print(frequencies)
    print('===========================================')

Column 'City' categorical values count:
City
Hyderabad      2914
Navi Mumbai    2914
Ratnagiri      2892
Faridabad      2891
Gurgaon        2878
Ahmedabad      2868
New Delhi      2848
Mumbai         2846
Chennai        2834
Dahej          2821
Nagpur         2816
Noida          2793
Pune           2788
Shimla         2780
Kolkata        2740
Vadodara       2722
Name: count, dtype: int64
Column 'Company' categorical values count:
Company
Ringfeder Power Transmission India Pvt. Ltd.                          1486
JSW Energy Ltd.                                                       1477
Guj Ind Power                                                         1476
SJVN Ltd.                                                             1471
Maha Transco – Maharashtra State Electricity Transmission Co, Ltd.    1470
NTPC Pvt. Ltd.                                                        1466
Optibelt Power Transmission India Private Limited                     1463
Kalpataru Power                  

### One-Hot Encoding

In [7]:
from sklearn.preprocessing import OneHotEncoder

# Columns whose string values are replaced by one-hot indicator columns
categorical_columns = ['City', 'Company']

# Work on a copy so `df` is only rebound once the encoding has completed
data = df.copy()

# One fitted encoder per column, kept so the same mapping can be re-applied later
encoders = {}
encoded_frames = []

for column in categorical_columns:
    # sparse_output=False produces a dense array; drop='first' drops the first
    # category of the variable. A fresh encoder is created per column so that an
    # earlier column's fit is not overwritten by the next one.
    encoder = OneHotEncoder(sparse_output=False, drop='first')
    one_hot_encoded = encoder.fit_transform(data[[column]])

    # Reuse data's index: pd.concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex here would misalign rows (and introduce NaNs) whenever `data`
    # does not carry a default 0..n-1 index.
    one_hot_df = pd.DataFrame(
        one_hot_encoded,
        columns=encoder.get_feature_names_out([column]),
        index=data.index,
    )

    encoders[column] = encoder
    encoded_frames.append(one_hot_df)

# Drop the original categorical columns and append all indicator columns in a
# single concat; the resulting column order is: remaining columns, then the
# indicator columns of each encoded column in `categorical_columns` order.
data = pd.concat([data.drop(columns=categorical_columns)] + encoded_frames, axis=1)

# Rebind df to the encoded frame and preview it
df = data.copy()

df.head()

Unnamed: 0,Fan,Refrigerator,Air Conditioner,Television,Monitor,Motor Pump,Month,Monthly Hours,Tariff Rate,Electricity Bill,...,Company_Reliance Power,Company_Ringfeder Power Transmission India Pvt. Ltd.,Company_SJVN Ltd.,Company_Sterlite Power Transmission Ltd,Company_Sunil Hitech Eng,Company_Tata Power Company Ltd.,Company_Torrent Power Ltd.,Company_Toshiba Transmission & Distribution Systems (India) Pvt. Ltd.,Company_TransRail Lighting,Company_Unitech Power Transmission Ltd.
0,16,23.0,2.0,6.0,1.0,0,10,384,8.4,3225.6,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,19,22.0,2.0,3.0,1.0,0,5,488,7.8,3806.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7,20.0,2.0,6.0,7.0,0,7,416,7.7,3203.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,7,22.0,3.0,21.0,1.0,0,6,475,9.2,4370.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,11,23.0,2.0,11.0,1.0,0,2,457,9.2,4204.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Print the column dtypes and non-null counts of the frame after one-hot encoding
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45345 entries, 0 to 45344
Data columns (total 56 columns):
 #   Column                                                                      Non-Null Count  Dtype  
---  ------                                                                      --------------  -----  
 0   Fan                                                                         45345 non-null  int64  
 1   Refrigerator                                                                45345 non-null  float64
 2   Air Conditioner                                                             45345 non-null  float64
 3   Television                                                                  45345 non-null  float64
 4   Monitor                                                                     45345 non-null  float64
 5   Motor Pump                                                                  45345 non-null  int64  
 6   Month                                         

## Import necessary libraries for building the model

In [9]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from catboost import CatBoostRegressor
from xgboost.sklearn import XGBRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# Evaluation 
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [10]:
# Split the frame into predictors (X) and the value to predict (y)

# Discard every row that contains a missing value
df = df.dropna()

target_column = 'Electricity Bill'

# Predictors: all columns except the target
X = df.drop(columns=[target_column])

# Target: the column being predicted
y = df[target_column]

In [11]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [12]:
# Learn the scaling statistics from the training split only
scaler = StandardScaler()
scaler.fit(X_train)

# Apply those training statistics to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### XGBRegressor

In [13]:

def model(X_train_scaled, y_train, X_test_scaled, y_test):
    """Fit a default XGBRegressor and score it on the held-out data.

    Parameters
    ----------
    X_train_scaled, y_train
        Training features and targets passed to ``fit``.
    X_test_scaled, y_test
        Held-out features and targets used for scoring.

    Returns
    -------
    tuple
        ``(mae, r2, mape, mse, rmse)``; ``mape`` is expressed in percent.
    """
    regressor = XGBRegressor()
    regressor.fit(X_train_scaled, y_train)

    predictions = regressor.predict(X_test_scaled)

    # Squared-error metrics: RMSE is the square root of MSE
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)

    # Absolute-error metrics: MAPE is the mean absolute relative error, in percent
    mae = mean_absolute_error(y_test, predictions)
    relative_errors = (y_test - predictions) / y_test
    mape = np.mean(np.abs(relative_errors)) * 100

    r2 = r2_score(y_test, predictions)

    return mae, r2, mape, mse, rmse

# Train and score using the scaled splits prepared in the earlier cells
mae_test, r2_test, mape_test, mse_test, rmse_test = model(
    X_train_scaled, y_train, X_test_scaled, y_test
)

# Report each metric on its own line
for label, value in (
    ("MAE:", mae_test),
    ("R2:", r2_test),
    ("MAPE:", mape_test),
    ("MSE:", mse_test),
    ("RMSE:", rmse_test),
):
    print(label, value)

MAE: 8.862415354353605
R2: 0.9996626012816993
MAPE: 0.24498406860118285
MSE: 384.43850651625604
RMSE: 19.607103470840766


### CatBoostRegressor

In [14]:
def model(X_train_scaled, y_train, X_test_scaled, y_test):
    """Fit a CatBoostRegressor (``verbose=0``) and score it on the held-out data.

    Parameters
    ----------
    X_train_scaled, y_train
        Training features and targets passed to ``fit``.
    X_test_scaled, y_test
        Held-out features and targets used for scoring.

    Returns
    -------
    tuple
        ``(mae, r2, mape, mse, rmse)``; ``mape`` is expressed in percent.
    """
    regressor = CatBoostRegressor(verbose=0)
    regressor.fit(X_train_scaled, y_train)

    predictions = regressor.predict(X_test_scaled)

    # Squared-error metrics: RMSE is the square root of MSE
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)

    # Absolute-error metrics: MAPE is the mean absolute relative error, in percent
    mae = mean_absolute_error(y_test, predictions)
    relative_errors = (y_test - predictions) / y_test
    mape = np.mean(np.abs(relative_errors)) * 100

    r2 = r2_score(y_test, predictions)

    return mae, r2, mape, mse, rmse

# Train and score using the scaled splits prepared in the earlier cells
mae_test, r2_test, mape_test, mse_test, rmse_test = model(
    X_train_scaled, y_train, X_test_scaled, y_test
)

# Report each metric on its own line
for label, value in (
    ("MAE:", mae_test),
    ("R2:", r2_test),
    ("MAPE:", mape_test),
    ("MSE:", mse_test),
    ("RMSE:", rmse_test),
):
    print(label, value)

MAE: 7.55330228523507
R2: 0.9997555699306967
MAPE: 0.20738048954744792
MSE: 278.5082624612946
RMSE: 16.688566818672435


### RandomForestRegressor

In [15]:
def model(X_train_scaled, y_train, X_test_scaled, y_test, random_state=42):
    """Fit a RandomForestRegressor and score it on the held-out data.

    Parameters
    ----------
    X_train_scaled, y_train
        Training features and targets passed to ``fit``.
    X_test_scaled, y_test
        Held-out features and targets used for scoring.
    random_state : int or None, default 42
        Seed handed to the forest. A random forest bootstraps rows and samples
        features, so without a fixed seed the metrics change on every run even
        though the train/test split itself is seeded. Pass ``None`` to get the
        previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(mae, r2, mape, mse, rmse)``; ``mape`` is expressed in percent.
    """
    rf_model = RandomForestRegressor(random_state=random_state)

    # Fit the model on the entire training data
    rf_model.fit(X_train_scaled, y_train)

    # Make predictions on the testing data
    y_pred = rf_model.predict(X_test_scaled)

    # Evaluate the model on the testing data
    mae_test = mean_absolute_error(y_test, y_pred)
    r2_test = r2_score(y_test, y_pred)
    # Mean absolute relative error in percent; divides by y_test, so it assumes
    # no target value is exactly zero
    mape_test = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    mse_test = mean_squared_error(y_test, y_pred)
    rmse_test = np.sqrt(mse_test)

    # Return the evaluation metrics
    return mae_test, r2_test, mape_test, mse_test, rmse_test


# Train and score using the scaled splits prepared in the earlier cells
mae_test, r2_test, mape_test, mse_test, rmse_test = model(
    X_train_scaled, y_train, X_test_scaled, y_test
)

# Report each metric on its own line
for label, value in (
    ("MAE:", mae_test),
    ("R2:", r2_test),
    ("MAPE:", mape_test),
    ("MSE:", mse_test),
    ("RMSE:", rmse_test),
):
    print(label, value)

### DecisionTreeRegressor

In [16]:

def model(X_train_scaled, y_train, X_test_scaled, y_test, random_state=42):
    """Fit a DecisionTreeRegressor and score it on the held-out data.

    Parameters
    ----------
    X_train_scaled, y_train
        Training features and targets passed to ``fit``.
    X_test_scaled, y_test
        Held-out features and targets used for scoring.
    random_state : int or None, default 42
        Seed handed to the tree. scikit-learn permutes the features at every
        split, so ties between equally good splits are broken randomly; without
        a fixed seed the fitted tree (and the metrics) can vary between runs.
        Pass ``None`` to get the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(mae, r2, mape, mse, rmse)``; ``mape`` is expressed in percent.
    """
    dt_model = DecisionTreeRegressor(random_state=random_state)

    # Fit the model on the entire training data
    dt_model.fit(X_train_scaled, y_train)

    # Make predictions on the testing data
    y_pred = dt_model.predict(X_test_scaled)

    # Evaluate the model on the testing data
    mae_test = mean_absolute_error(y_test, y_pred)
    r2_test = r2_score(y_test, y_pred)
    # Mean absolute relative error in percent; divides by y_test, so it assumes
    # no target value is exactly zero
    mape_test = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    mse_test = mean_squared_error(y_test, y_pred)
    rmse_test = np.sqrt(mse_test)

    # Return the evaluation metrics
    return mae_test, r2_test, mape_test, mse_test, rmse_test

# Train and score using the scaled splits prepared in the earlier cells
mae_test, r2_test, mape_test, mse_test, rmse_test = model(
    X_train_scaled, y_train, X_test_scaled, y_test
)

# Report each metric on its own line
for label, value in (
    ("MAE:", mae_test),
    ("R2:", r2_test),
    ("MAPE:", mape_test),
    ("MSE:", mse_test),
    ("RMSE:", rmse_test),
):
    print(label, value)

MAE: 1.8348439739773659
R2: 0.999916788015119
MAPE: 0.05409534699520234
MSE: 94.81331569081516
RMSE: 9.737212932395757


### LGBMRegressor

In [17]:

def model(X_train_scaled, y_train, X_test_scaled, y_test):
    """Fit a default LGBMRegressor and score it on the held-out data.

    Parameters
    ----------
    X_train_scaled, y_train
        Training features and targets passed to ``fit``.
    X_test_scaled, y_test
        Held-out features and targets used for scoring.

    Returns
    -------
    tuple
        ``(mae, r2, mape, mse, rmse)``; ``mape`` is expressed in percent.
    """
    regressor = LGBMRegressor()
    regressor.fit(X_train_scaled, y_train)

    predictions = regressor.predict(X_test_scaled)

    # Squared-error metrics: RMSE is the square root of MSE
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)

    # Absolute-error metrics: MAPE is the mean absolute relative error, in percent
    mae = mean_absolute_error(y_test, predictions)
    relative_errors = (y_test - predictions) / y_test
    mape = np.mean(np.abs(relative_errors)) * 100

    r2 = r2_score(y_test, predictions)

    return mae, r2, mape, mse, rmse


# Train and score using the scaled splits prepared in the earlier cells
mae_test, r2_test, mape_test, mse_test, rmse_test = model(
    X_train_scaled, y_train, X_test_scaled, y_test
)

# Report each metric on its own line
for label, value in (
    ("MAE:", mae_test),
    ("R2:", r2_test),
    ("MAPE:", mape_test),
    ("MSE:", mse_test),
    ("RMSE:", rmse_test),
):
    print(label, value)