In [92]:
import pandas as pd
import numpy as np
# Small hand-written dataset: one row per energy source. np.nan marks the
# readings that are missing so the cleaning steps below have something to fix.
data = {
    "Energy Source": [
        "Solar",
        "Wind",
        "Hydropower",
        "Geothermal",
        "Biomass",
        "Nuclear",
    ],
    "Energy Consumption (MWh)": [
        1200,
        np.nan,
        2900,
        np.nan,
        2500,
        3200,
    ],
    "Cost (Million $)": [
        200,
        400,
        np.nan,
        150,
        250,
        np.nan,
    ],
}
energy_df = pd.DataFrame(data=data)

# Show positional rows 1 and 2 (Wind, Hydropower); each has one missing value.
rows_with_gaps = energy_df.iloc[1:3]
print(rows_with_gaps)

  Energy Source  Energy Consumption (MWh)  Cost (Million $)
1          Wind                       NaN             400.0
2    Hydropower                    2900.0               NaN


In [93]:
# Preview the first five rows of the frame, missing values included.
print("original energy data with missing values")
energy_df.head(n=5)

original energy data with missing values


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,1200.0,200.0
1,Wind,,400.0
2,Hydropower,2900.0,
3,Geothermal,,150.0
4,Biomass,2500.0,250.0


In [94]:
# Show the last five rows of the frame.
energy_df.tail(n=5)

Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
1,Wind,,400.0
2,Hydropower,2900.0,
3,Geothermal,,150.0
4,Biomass,2500.0,250.0
5,Nuclear,3200.0,


In [95]:
# Listwise deletion: keep only the rows in which every column has a value.
# dropna() returns a new frame, so energy_df itself is not modified here.
cleaned_df = energy_df.dropna(axis="index", how="any")
print("data after removing the rows with missing values: ")
cleaned_df.head()

data after removing the rows with missing values: 


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,1200.0,200.0
4,Biomass,2500.0,250.0


In [96]:
# Mean of the consumption column; Series.mean() skips NaN entries by default,
# so only the observed readings contribute.
consumption_col = energy_df["Energy Consumption (MWh)"]
ec_mean = consumption_col.mean()
ec_mean

2450.0

In [97]:
# Mean of the cost column; Series.mean() skips NaN entries by default,
# so only the observed costs contribute.
cost_col = energy_df["Cost (Million $)"]
cost_mean = cost_col.mean()
cost_mean

250.0

In [98]:
# Mean imputation: replace each column's missing entries with that column's
# own mean (ec_mean / cost_mean, computed in the cells above).
#
# The filled Series is assigned back to the column rather than calling
# fillna(..., inplace=True) on energy_df[col]. That chained in-place call acts
# on an intermediate object, emits a FutureWarning, and under pandas 3.0
# (Copy-on-Write) leaves energy_df unchanged.
energy_df["Energy Consumption (MWh)"] = energy_df["Energy Consumption (MWh)"].fillna(ec_mean)
energy_df["Cost (Million $)"] = energy_df["Cost (Million $)"].fillna(cost_mean)
print("\nDataFrame with Imputed Values:")
energy_df.head()


DataFrame with Imputed Values:


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  energy_df["Energy Consumption (MWh)"].fillna(ec_mean,inplace=True)


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,1200.0,200.0
1,Wind,2450.0,400.0
2,Hydropower,2900.0,250.0
3,Geothermal,2450.0,150.0
4,Biomass,2500.0,250.0


In [99]:
# Forward fill: propagate the last valid observation downwards into any NaN
# that follows it. DataFrame.ffill() is the supported spelling; the older
# fillna(method='ffill') form is deprecated and emits a FutureWarning.
#
# NOTE(review): energy_df was already mean-imputed by the fillna cell above,
# so it holds no NaN at this point and the "before" and "after" frames are
# identical. To actually demonstrate forward filling, apply it to a frame
# that still contains the missing values.
forward_filled_df = energy_df.ffill()
print("\nData Before Forward Filling:")
print(energy_df)
print("\nData After Forward Filling:")
forward_filled_df.head()

  forward_filled_df = energy_df.fillna(method='ffill')



Data Before Forward Filling:
  Energy Source  Energy Consumption (MWh)  Cost (Million $)
0         Solar                    1200.0             200.0
1          Wind                    2450.0             400.0
2    Hydropower                    2900.0             250.0
3    Geothermal                    2450.0             150.0
4       Biomass                    2500.0             250.0
5       Nuclear                    3200.0             250.0

Data After Forward Filling:


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,1200.0,200.0
1,Wind,2450.0,400.0
2,Hydropower,2900.0,250.0
3,Geothermal,2450.0,150.0
4,Biomass,2500.0,250.0


In [100]:
# flag values
# Normalisation (min-max scaling) ---> used to reduce the time required for computation:
#     X' = (X - Xmin) / (Xmax - Xmin)
# Standard scaler / z-score:
#     z = (x - mean) / standard deviation
from sklearn.preprocessing import MinMaxScaler

# The two numeric feature columns that get rescaled.
numeric_cols = ['Energy Consumption (MWh)', 'Cost (Million $)']

scaler = MinMaxScaler()
minmax_values = scaler.fit_transform(energy_df[numeric_cols])
# Write the scaled values back over the original columns of energy_df.
energy_df[numeric_cols] = minmax_values
print("\nData after Normalized Data(Min-Max Scaling):")
print(energy_df)


Data after Normalized Data(Min-Max Scaling):
  Energy Source  Energy Consumption (MWh)  Cost (Million $)
0         Solar                     0.000               0.2
1          Wind                     0.625               1.0
2    Hydropower                     0.850               0.4
3    Geothermal                     0.625               0.0
4       Biomass                     0.650               0.4
5       Nuclear                     1.000               0.4


In [101]:
from sklearn.preprocessing import StandardScaler

# Z-score scaling: z = (x - mean) / standard deviation, computed per column.
# The input is whatever energy_df currently holds for these two columns, and
# the result is written back over them.
zscore_cols = ['Energy Consumption (MWh)', 'Cost (Million $)']

scaler = StandardScaler()
zscore_values = scaler.fit_transform(energy_df[zscore_cols])
energy_df[zscore_cols] = zscore_values
print("\nData after Normalized Data(Z-Score Scaling):")
print(energy_df)


Data after Normalized Data(Z-Score Scaling):
  Energy Source  Energy Consumption (MWh)  Cost (Million $)
0         Solar             -2.005893e+00     -6.546537e-01
1          Wind              3.563181e-16      1.963961e+00
2    Hydropower              7.221213e-01      1.817029e-16
3    Geothermal              3.563181e-16     -1.309307e+00
4       Biomass              8.023570e-02      1.817029e-16
5       Nuclear              1.203536e+00      1.817029e-16


In [120]:
# Encoding categorical values: one-hot encode the energy source so that each
# distinct source becomes its own indicator column. get_dummies returns a new
# frame; energy_df is left as it is.
categorical_cols = ["Energy Source"]
energy_encoded_df = pd.get_dummies(energy_df, columns=categorical_cols)
print("Data after One-Hot Encoding Categorical variables: ")
energy_encoded_df.head()

Data after One-Hot Encoding Categorical variables: 


Unnamed: 0,Energy Consumption (MWh),Cost (Million $),Energy Source_Biomass,Energy Source_Geothermal,Energy Source_Hydropower,Energy Source_Nuclear,Energy Source_Solar,Energy Source_Wind
0,-2.005893,-0.6546537,False,False,False,False,True,False
1,3.563181e-16,1.963961,False,False,False,False,False,True
2,0.7221213,1.817029e-16,False,False,True,False,False,False
3,3.563181e-16,-1.309307,False,True,False,False,False,False
4,0.0802357,1.817029e-16,True,False,False,False,False,False
