In [246]:
import numpy as np
import pandas as pd

In [247]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# NOTE(review): the CSV loaded in the next cell is read from /content/, not from
# under /content/drive — confirm whether this mount is actually required.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [248]:
# Load the energy dataset into the working DataFrame used by the cells that follow.
raw_csv_path = '/content/energy_iter11.csv'
df = pd.read_csv(raw_csv_path)

In [249]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Demand(MW),Generation(MW),Temp(C),Year,Month,Season,IsHoliday,DemandGenGap(MW)
count,1636.0,1636.0,1636.0,1636.0,1636.0,1636.0,1636.0,1636.0
mean,12290.974328,14130.438875,27.262592,2022.163203,6.863081,0.678484,0.146699,1839.464548
std,1977.969132,1526.740541,3.787678,1.333048,3.394665,0.467201,0.353914,1580.16916
min,7800.0,9482.0,14.0,2020.0,1.0,0.0,0.0,-2435.0
25%,10700.0,13182.0,25.0,2021.0,4.0,0.0,0.0,626.0
50%,12400.0,14448.0,28.6,2022.0,7.0,1.0,0.0,1772.0
75%,13900.0,15202.0,30.025,2023.0,10.0,1.0,0.0,3017.25
max,17300.0,17638.0,33.8,2024.0,12.0,1.0,1.0,6587.0


# **Outlier Removal Using Z-Score**

In [250]:
# Flag temperature outliers via a z-score computed on log-transformed values.
# NOTE(review): df.describe() above reports a Temp(C) median (28.6) above its
# mean (27.26), and in the recorded output every flagged row is on the low
# side. A log transform stretches the low tail — confirm it is intended here.

# Work on log(x + 1) of the temperature and standardise it
temp_log = np.log(df['Temp(C)'] + 1)
df['Temp_Log'] = temp_log
df['Z_Score'] = (temp_log - temp_log.mean()) / temp_log.std()

# Rows further than this many standard deviations from the mean are outliers
z_threshold = 3

# Select the flagged rows and report them
temp_is_outlier = df['Z_Score'].abs() > z_threshold
outliers = df[temp_is_outlier]
print('No Of outliers present:', len(outliers))
print(outliers[['Temp(C)', 'Temp_Log', 'Z_Score']])




No Of outliers present: 17
      Temp(C)  Temp_Log   Z_Score
17       16.8  2.879198 -3.102278
326      16.9  2.884801 -3.063854
327      15.8  2.821379 -3.498841
328      14.0  2.708050 -4.276122
329      17.0  2.890372 -3.025644
332      16.7  2.873565 -3.140918
333      14.8  2.760010 -3.919749
334      15.5  2.803360 -3.622424
336      15.9  2.827314 -3.458137
337      15.9  2.827314 -3.458137
338      15.7  2.815409 -3.539788
695      16.5  2.862201 -3.218858
696      16.5  2.862201 -3.218858
697      16.2  2.844909 -3.337454
698      16.5  2.862201 -3.218858
699      17.0  2.890372 -3.025644
1392     17.0  2.890372 -3.025644


In [260]:
# Replace temperature outliers with the mean temperature of the non-outlier rows.
# Outliers are rows with |z| > z_threshold, where z is computed on log(x + 1)
# of Temp(C); z_threshold is defined in the outlier-check cell.
# NOTE(review): this cell recomputes the z-scores from the current Temp(C)
# values, so running it again after a replacement can flag and replace a
# further set of rows.
temp_log = np.log(df['Temp(C)'] + 1)
df['Temp_Log'] = temp_log
df['Z_Score'] = (temp_log - temp_log.mean()) / temp_log.std()

abs_z = df['Z_Score'].abs()

# Mean of the rows that fall within the threshold
mean_temp = df.loc[abs_z <= z_threshold, 'Temp(C)'].mean()

# Overwrite only the flagged rows
df.loc[abs_z > z_threshold, 'Temp(C)'] = mean_temp

# The helper columns are no longer needed
df.drop(columns=['Temp_Log', 'Z_Score'], inplace=True)
print("Done")
print("Done")



Done


In [262]:
# Write the current state of df to CSV, leaving out the index column.
# NOTE(review): later cells write to this same file name again (after the
# generation clean-up and after scaling), so this snapshot gets overwritten.
cleaned_csv_path = 'energy_iter13b.csv'
df.to_csv(cleaned_csv_path, index=False)
print("saved with clean data")

saved with clean data


In [263]:
# Flag generation outliers via a z-score computed on log-transformed values.

# Work on log(x + 1) of the generation figures and standardise it
gen_log = np.log(df['Generation(MW)'] + 1)
df['Gen_Log'] = gen_log
df['Z_Score'] = (gen_log - gen_log.mean()) / gen_log.std()

# Rows further than this many standard deviations from the mean are outliers
z_threshold = 3

# Select the flagged rows and report them
gen_is_outlier = df['Z_Score'].abs() > z_threshold
outliers = df[gen_is_outlier]
print('No Of outliers present:', len(outliers))
print(outliers[['Generation(MW)', 'Gen_Log', 'Z_Score']])


No Of outliers present: 11
     Generation(MW)   Gen_Log   Z_Score
680            9932  9.203618 -3.062391
681            9675  9.177404 -3.294172
682            9837  9.194008 -3.147363
683            9707  9.180706 -3.264979
685            9629  9.172639 -3.336307
686            9565  9.165970 -3.395266
687            9640  9.173780 -3.326213
688            9673  9.177197 -3.296000
689            9917  9.202107 -3.075754
694            9482  9.157256 -3.472318
695            9879  9.198268 -3.109696


In [273]:
# Replace generation outliers with the mean generation of the non-outlier rows.
#
# Outliers are rows with |z| > z_threshold, where z is computed on log(x + 1)
# of Generation(MW); z_threshold is defined in the outlier-check cell.
# NOTE(review): DemandGenGap(MW) is left untouched here; if it is derived from
# Generation(MW), the replaced rows will no longer agree with it — confirm.
# NOTE(review): this cell recomputes the z-scores from the current values, so
# running it again after a replacement can flag and replace further rows.

# The replacement value is a fractional mean, so make the column float up
# front. Writing a float into an integer column through .loc relies on silent
# upcasting, which newer pandas versions warn about and plan to reject.
df['Generation(MW)'] = df['Generation(MW)'].astype('float64')

gen_log = np.log(df['Generation(MW)'] + 1)
df['Gen_Log'] = gen_log
df['Z_Score'] = (gen_log - gen_log.mean()) / gen_log.std()

abs_z = df['Z_Score'].abs()

# Mean of the rows that fall within the threshold
mean_Gen = df.loc[abs_z <= z_threshold, 'Generation(MW)'].mean()

# Overwrite only the flagged rows
df.loc[abs_z > z_threshold, 'Generation(MW)'] = mean_Gen

# Gen_Log and Z_Score stay on df here; the following cell drops them.
print("Done")

Done


In [274]:
# Remove the temporary Gen_Log / Z_Score helper columns added by the generation outlier cells.
df.drop(columns=['Gen_Log','Z_Score'],inplace=True)

In [275]:
# Write the current state of df to CSV, leaving out the index column.
# NOTE(review): this reuses the file name written after the temperature
# clean-up, and the scaling section writes to it once more.
cleaned_csv_path = 'energy_iter13b.csv'
df.to_csv(cleaned_csv_path, index=False)
print("saved with clean data")

saved with clean data


# **Min-Max Scaling**

In [276]:
from sklearn.preprocessing import MinMaxScaler

In [277]:
# Rescale the selected numeric columns so every value lies between 0 and 1.
# NOTE(review): df is overwritten with the scaled values, so running this cell
# a second time would fit the scaler on already-scaled data.

features = ["Demand(MW)", "Generation(MW)", "Temp(C)", "DemandGenGap(MW)"]
scaler = MinMaxScaler(feature_range=(0, 1))

# Learn the per-column range, then apply the transform to the same columns
scaler.fit(df[features])
df[features] = scaler.transform(df[features])

print("scaled done")

scaled done


In [278]:
# Persist the fitted scaler so scaled values can later be converted back to
# their original values.

import joblib

scaler_path = "minmax_scaler2b.pkl"
joblib.dump(scaler, scaler_path)

['minmax_scaler2b.pkl']

In [281]:
# Show the first two rows to check the result of the scaling step.
df.head(2)

Unnamed: 0,Date,Demand(MW),Generation(MW),Temp(C),Year,Month,Season,IsHoliday,DemandGenGap(MW)
0,2024-12-31,0.294737,0.636327,0.032051,2024,12,0,0,0.75194
1,2024-12-30,0.315789,0.707466,0.230769,2024,12,0,0,0.788074


In [282]:
# Write the current state of df to CSV, leaving out the index column.
# NOTE(review): by this point the scaling cell has replaced four columns with
# 0-1 values, and this reuses the file name written by the earlier save cells,
# so the file ends up holding scaled data.
scaled_csv_path = 'energy_iter13b.csv'
df.to_csv(scaled_csv_path, index=False)
print("saved with clean data")

saved with clean data
