### D

In [5]:
import pandas as pd
import numpy as np

# Toy energy dataset; np.nan marks the entries that are deliberately missing.
data = {}
data["Energy Source"] = ["Solar", "Wind", "Hydropower", "Geothermal", "Biomass", "Nuclear"]
data["Energy Consumption(MWh)"] = [1200, np.nan, 2900, np.nan, 2500, 3200]
data["Cost (Million $)"] = [200, 400, np.nan, 150, 250, np.nan]


In [2]:
# Show the raw dictionary before it is turned into a DataFrame.
print(data)

{'Energy Source': ['Solar', 'Wind', 'Hydropower', 'Geothermal', 'Biomass', 'Nuclear'], 'Energy Consumption(MWh)': [1200, nan, 2900, nan, 2500, 3200], 'Cost (Million $)': [200, 400, nan, 150, 250, nan]}


In [4]:
# Build the working DataFrame from the raw dictionary and show it before any
# cleaning step is applied.
energy_df = pd.DataFrame(data)
print("Original Energy Data with Missing Values: ", energy_df, sep="\n")

Original Energy Data with Missing Values: 
  Energy Source  Energy Consumption(MWh)  Cost (Million $)
0         Solar                   1200.0             200.0
1          Wind                      NaN             400.0
2    Hydropower                   2900.0               NaN
3    Geothermal                      NaN             150.0
4       Biomass                   2500.0             250.0
5       Nuclear                   3200.0               NaN


In [6]:
# Number of missing entries in each column.
energy_df.isna().sum()

Energy Source              0
Energy Consumption(MWh)    2
Cost (Million $)           2
dtype: int64

In [7]:
# Total number of missing cells across the whole frame
# (per-column counts, then summed).
energy_df.isna().sum(axis=0).sum()

4

In [8]:
# dropna() discards every row that contains at least one NaN (axis=0 and
# how="any" are its defaults, spelled out here); energy_df itself is untouched.
cleaned_df = energy_df.dropna(axis=0, how="any")
print("\n DAta After Removing Rows With Missing Values : ", cleaned_df, sep="\n")
 


 DAta After Removing Rows With Missing Values : 
  Energy Source  Energy Consumption(MWh)  Cost (Million $)
0         Solar                   1200.0             200.0
4       Biomass                   2500.0             250.0


## Impute Missing Values with the Mean

In [21]:
# Mean imputation: every NaN in a numeric column is replaced by the mean of
# that column's observed values. energy_df is updated so the following cells
# work on a frame without gaps.
#
# The filled column is assigned back explicitly. Calling
# fillna(..., inplace=True) on energy_df[col] operates on an intermediate
# Series (chained assignment): pandas warns about it, and with Copy-on-Write
# the parent frame is left unchanged. The assignment form raises no warning,
# so the blanket warnings filter that used to hide it is not needed here.
for column in ("Energy Consumption(MWh)", "Cost (Million $)"):
    energy_df[column] = energy_df[column].fillna(energy_df[column].mean())

print("\nData After Imputing Missing Values with Mean:")
print(energy_df)



Data After Imputing Missing Values with Mean:
  Energy Source  Energy Consumption(MWh)  Cost (Million $)  Missing Values
0         Solar                   1200.0             200.0               0
1          Wind                   2450.0             400.0               1
2    Hydropower                   2900.0             250.0               0
3    Geothermal                   2450.0             150.0               1
4       Biomass                   2500.0             250.0               0
5       Nuclear                   3200.0             250.0               0


In [14]:
import warnings
warnings.filterwarnings('ignore')

In [16]:
# Forward fill: each missing entry takes the last valid value above it in the
# same column. DataFrame.ffill() is the supported spelling;
# fillna(method="ffill") is deprecated in recent pandas releases.
# energy_df is not modified; the result is a new frame.
forward_filled_df = energy_df.ffill()
print("\n Data After Forward Filling Missing Values :")
print(forward_filled_df)


 Data After Forward Filling Missing Values :
  Energy Source  Energy Consumption(MWh)  Cost (Million $)
0         Solar                   1200.0             200.0
1          Wind                   1200.0             400.0
2    Hydropower                   2900.0             400.0
3    Geothermal                   2900.0             150.0
4       Biomass                   2500.0             250.0
5       Nuclear                   3200.0             250.0


In [20]:
# Indicator column: 1 where the energy-consumption reading is missing in the
# raw data, 0 otherwise.
# The flag is derived from the raw `data` dictionary rather than from
# energy_df, because energy_df may already have been mean-imputed by the time
# this cell runs (in top-to-bottom order it has), in which case
# energy_df[...].isna() would be all zeros.
raw_consumption = pd.Series(data["Energy Consumption(MWh)"], index=energy_df.index)
energy_df["Missing Values"] = raw_consumption.isna().astype(int)
print("\n Data After Adding Missing Values Column :")
print(energy_df)


 Data After Adding Missing Values Column :
  Energy Source  Energy Consumption(MWh)  Cost (Million $)  Missing Values
0         Solar                   1200.0             200.0               0
1          Wind                      NaN             400.0               1
2    Hydropower                   2900.0               NaN               0
3    Geothermal                      NaN             150.0               1
4       Biomass                   2500.0             250.0               0
5       Nuclear                   3200.0               NaN               0


In [24]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scaling: each numeric column is mapped linearly onto the [0, 1]
# range. The scaled values overwrite the originals in energy_df.
numeric_cols = ["Energy Consumption(MWh)", "Cost (Million $)"]
scaler = MinMaxScaler()
energy_df[numeric_cols] = scaler.fit_transform(energy_df[numeric_cols])
print("\n Data After Scaling :", energy_df, sep="\n")


 Data After Scaling :
  Energy Source  Energy Consumption(MWh)  Cost (Million $)  Missing Values
0         Solar                    0.000               0.2               0
1          Wind                    0.625               1.0               1
2    Hydropower                    0.850               0.4               0
3    Geothermal                    0.625               0.0               1
4       Biomass                    0.650               0.4               0
5       Nuclear                    1.000               0.4               0


In [25]:
from sklearn.preprocessing import StandardScaler

# Standardization: each numeric column is shifted to zero mean and scaled to
# unit variance. The result overwrites the same columns in energy_df.
numeric_cols = ["Energy Consumption(MWh)", "Cost (Million $)"]
scaler = StandardScaler()
energy_df[numeric_cols] = scaler.fit_transform(energy_df[numeric_cols])
print("\n Data After Scaling :", energy_df, sep="\n")


 Data After Scaling :
  Energy Source  Energy Consumption(MWh)  Cost (Million $)  Missing Values
0         Solar            -2.005893e+00     -6.546537e-01               0
1          Wind             3.563181e-16      1.963961e+00               1
2    Hydropower             7.221213e-01      1.817029e-16               0
3    Geothermal             3.563181e-16     -1.309307e+00               1
4       Biomass             8.023570e-02      1.817029e-16               0
5       Nuclear             1.203536e+00      1.817029e-16               0


In [26]:
# One-hot encode the categorical "Energy Source" column: it is replaced by one
# indicator column per energy source, while the other columns pass through.
energy_encode_df = pd.get_dummies(data=energy_df, columns=["Energy Source"])
print("\n Data After One Hot Encoding :", energy_encode_df, sep="\n")


 Data After One Hot Encoding :
   Energy Consumption(MWh)  Cost (Million $)  Missing Values  \
0            -2.005893e+00     -6.546537e-01               0   
1             3.563181e-16      1.963961e+00               1   
2             7.221213e-01      1.817029e-16               0   
3             3.563181e-16     -1.309307e+00               1   
4             8.023570e-02      1.817029e-16               0   
5             1.203536e+00      1.817029e-16               0   

   Energy Source_Biomass  Energy Source_Geothermal  Energy Source_Hydropower  \
0                  False                     False                     False   
1                  False                     False                     False   
2                  False                     False                      True   
3                  False                      True                     False   
4                   True                     False                     False   
5                  False               

### Feature Engineering

In [27]:
# Energy delivered per million dollars spent.
# The ratio is built from the unscaled figures: at this point the numeric
# columns of energy_encode_df are standardized (zero-mean), so dividing them
# would mix signs and divide by values that are practically zero (~1.8e-16),
# which says nothing about cost efficiency.
# Gaps in the raw data are filled with the column mean, the same strategy the
# imputation cell applies to energy_df.
raw_numeric_df = pd.DataFrame(data, index=energy_encode_df.index)[
    ["Energy Consumption(MWh)", "Cost (Million $)"]
]
raw_numeric_df = raw_numeric_df.fillna(raw_numeric_df.mean())
energy_encode_df["Consumption per $Million"] = (
    raw_numeric_df["Energy Consumption(MWh)"] / raw_numeric_df["Cost (Million $)"]
)
print("\n Data After Adding New Column :")
print(energy_encode_df)


 Data After Adding New Column :
   Energy Consumption(MWh)  Cost (Million $)  Missing Values  \
0            -2.005893e+00     -6.546537e-01               0   
1             3.563181e-16      1.963961e+00               1   
2             7.221213e-01      1.817029e-16               0   
3             3.563181e-16     -1.309307e+00               1   
4             8.023570e-02      1.817029e-16               0   
5             1.203536e+00      1.817029e-16               0   

   Energy Source_Biomass  Energy Source_Geothermal  Energy Source_Hydropower  \
0                  False                     False                     False   
1                  False                     False                     False   
2                  False                     False                      True   
3                  False                      True                     False   
4                   True                     False                     False   
5                  False              