In [None]:
import pandas as pd
import numpy as np
# Toy renewable-energy dataset used throughout the notebook.
# Each key becomes a DataFrame column; np.nan marks a missing measurement.
data = {
    "Energy Source": [
        "Solar",
        "Wind",
        "Hydropower",
        "Geothermal",
        "Biomass",
        "Nuclear",
    ],
    # Missing for Wind and Geothermal.
    "Energy Consumption (MWh)": [1200, np.nan, 2900, np.nan, 2500, 3200],
    # Missing for Hydropower and Nuclear.
    "Cost (Million $)": [200, 400, np.nan, 150, 250, np.nan],
}

In [None]:
# Preview the raw dataset, missing values included, before any cleaning.
# The frame is the cell's last expression so the notebook renders it as a table.
print("Original data")
pd.DataFrame(data=data)

Original data


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,1200.0,200.0
1,Wind,,400.0
2,Hydropower,2900.0,
3,Geothermal,,150.0
4,Biomass,2500.0,250.0
5,Nuclear,3200.0,


In [None]:
# Listwise deletion: build the frame from the raw dictionary and drop every
# row that has a missing value in any column.
df = pd.DataFrame(data).dropna()

# Heading and frame are emitted on separate lines, exactly as two prints would.
print("\nData after removing rows with missing values:", df, sep="\n")


Data after removing rows with missing values:
  Energy Source  Energy Consumption (MWh)  Cost (Million $)
0         Solar                    1200.0             200.0
4       Biomass                    2500.0             250.0


In [None]:
# Column means of `df`, computed first and reported afterwards.
# NOTE(review): `df` is whatever the preceding cells left it as; if the
# dropna() cell has run, these are means over the complete rows only.
cost_mean = df["Cost (Million $)"].mean()
ec_mean = df["Energy Consumption (MWh)"].mean()

print("Mean Energy Consumption (MWh):", ec_mean)
print("Mean Cost (Million $):", cost_mean)

Mean Energy Consumption (MWh): 1850.0
Mean Cost (Million $): 225.0


In [None]:
# Forward fill: propagate the last valid observation down each column.
# DataFrame.ffill() is the supported spelling; fillna(method="ffill") is
# deprecated in recent pandas and emits a FutureWarning.
# NOTE(review): `df` was passed through dropna() in an earlier cell, so when
# the notebook is run top to bottom there is nothing left to fill here and
# the result equals `df`. Forward-fill the raw frame instead if the goal is
# to demonstrate imputation.
forward_filled_df = df.ffill()
print("\nData after forward-filling missing values:")
print(forward_filled_df)


Data after forward-filling missing values:
  Energy Source  Energy Consumption (MWh)  Cost (Million $)
0         Solar                    1200.0             200.0
4       Biomass                    2500.0             250.0


  forward_filled_df = df.fillna(method="ffill")


In [None]:
# Forward fill with a before/after comparison.
# DataFrame.ffill() replaces the deprecated fillna(method="ffill") call,
# which emits a FutureWarning in recent pandas.
# NOTE(review): `df` has already had its incomplete rows removed by the
# earlier dropna() cell, so "before" and "after" are identical when the
# notebook is run in order.
forward_filled_df = df.ffill()
print("\nData before forward filling:")
print(df)
print("\nData after forward-filling:")
forward_filled_df.head()


Data before forward filling:
  Energy Source  Energy Consumption (MWh)  Cost (Million $)
0         Solar                    1200.0             200.0
4       Biomass                    2500.0             250.0

Data after forward-filling:


  forward_filled_df = df.fillna(method="ffill")


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,1200.0,200.0
4,Biomass,2500.0,250.0


In [None]:
# Forward-fill missing values on the raw data.
# Build a fresh DataFrame from the `data` dictionary so the missing values
# are still present (unlike `df`, which had its incomplete rows dropped).
data_df = pd.DataFrame(data)

# DataFrame.ffill() copies the last valid value in each column downwards.
# It replaces the deprecated fillna(method="ffill") spelling, which emits a
# FutureWarning in recent pandas.
forward_fill = data_df.ffill()
print("Data before forward fill:")
print(data_df)
print("\nData after forward fill:")
# head() shows the first five rows only.
forward_fill.head()

Data before forward fill:
  Energy Source  Energy Consumption (MWh)  Cost (Million $)
0         Solar                    1200.0             200.0
1          Wind                       NaN             400.0
2    Hydropower                    2900.0               NaN
3    Geothermal                       NaN             150.0
4       Biomass                    2500.0             250.0
5       Nuclear                    3200.0               NaN

Data after forward fill:


  forward_fill = data_df.fillna(method="ffill")


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,1200.0,200.0
1,Wind,1200.0,400.0
2,Hydropower,2900.0,400.0
3,Geothermal,2900.0,150.0
4,Biomass,2500.0,250.0


In [None]:
# Add a 0/1 indicator column for each numeric column: 1 where the value is
# missing, 0 where it is present.
# NOTE(review): `df` was passed through dropna() in an earlier cell, so when
# the notebook is run top to bottom both flags are 0 for every row. Confirm
# whether the raw frame was meant to be flagged instead.
for flag_col, value_col in (
    ("Missing Consumption", "Energy Consumption (MWh)"),
    ("Missing Cost", "Cost (Million $)"),
):
    df[flag_col] = df[value_col].isna().astype(int)

print("\nData with Missing Values Flagged:")
df.head()


Data with Missing Values Flagged:


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $),Missing Consumption,Missing Cost
0,Solar,1200.0,200.0,0,0
4,Biomass,2500.0,250.0,0,0


In [None]:
# Forward-fill the raw data, then flag which values were originally missing.

# Build a fresh DataFrame from the `data` dictionary (missing values intact).
data_df = pd.DataFrame(data)

# DataFrame.ffill() replaces the deprecated fillna(method="ffill") spelling,
# which emits a FutureWarning in recent pandas.
forward_fill = data_df.ffill()
print("Data before forward fill:")
print(data_df)
print("\nData after forward fill:")
# Only a cell's final expression is rendered automatically, so a bare
# `forward_fill.head()` in the middle of the cell shows nothing; print it.
print(forward_fill.head())

# Work on a copy so the flag columns do not leak into data_df.
energy_df = data_df.copy()

# 0/1 indicators: 1 where the original value is missing, 0 otherwise.
energy_df["Missing Consumption"] = energy_df["Energy Consumption (MWh)"].isna().astype(int)
energy_df["Missing Cost"] = energy_df["Cost (Million $)"].isna().astype(int)

print("\nData with Missing Values Flagged:")
energy_df.head()

Data before forward fill:
  Energy Source  Energy Consumption (MWh)  Cost (Million $)
0         Solar                    1200.0             200.0
1          Wind                       NaN             400.0
2    Hydropower                    2900.0               NaN
3    Geothermal                       NaN             150.0
4       Biomass                    2500.0             250.0
5       Nuclear                    3200.0               NaN

Data after forward fill:

Data with Missing Values Flagged:


  forward_fill = data_df.fillna(method="ffill")


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $),Missing Consumption,Missing Cost
0,Solar,1200.0,200.0,0,0
1,Wind,,400.0,1,0
2,Hydropower,2900.0,,0,1
3,Geothermal,,150.0,1,0
4,Biomass,2500.0,250.0,0,0


In [None]:
# Smallest and largest observed energy consumption.
# Series.min() / Series.max() already reduce the column to a single scalar
# (skipping NaN by default), so wrapping them in np.min / np.max is redundant.
xmin_ec = energy_df["Energy Consumption (MWh)"].min()
xmax_ec = energy_df["Energy Consumption (MWh)"].max()
print("Minimum Energy Consumption (MWh):", xmin_ec)
print("Maximum Energy Consumption (MWh):", xmax_ec)

Minimum Energy Consumption (MWh): 1200.0
Maximum Energy Consumption (MWh): 3200.0


In [None]:
# Min-Max scaling of the energy consumption column.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# fit_transform expects 2-D input, hence the double brackets that select a
# one-column DataFrame. Only the consumption column is rescaled here; the
# cost column is left in its original units.
energy_df["Energy Consumption (MWh)"] = scaler.fit_transform(
    energy_df[["Energy Consumption (MWh)"]]
)

print("\nData after Min-Max Scaling:")
print(energy_df)


Data after Min-Max Scaling:
  Energy Source  Energy Consumption (MWh)  Cost (Million $)  \
0         Solar                      0.00             200.0   
1          Wind                       NaN             400.0   
2    Hydropower                      0.85               NaN   
3    Geothermal                       NaN             150.0   
4       Biomass                      0.65             250.0   
5       Nuclear                      1.00               NaN   

   Missing Consumption  Missing Cost  
0                    0             0  
1                    1             0  
2                    0             1  
3                    1             0  
4                    0             0  
5                    0             1  


In [None]:
# Z-score scaling: standardise both numeric columns with StandardScaler.
from sklearn.preprocessing import StandardScaler

# Columns to standardise; the result is written back over the same columns.
numeric_cols = ["Energy Consumption (MWh)", "Cost (Million $)"]

scaler = StandardScaler()
energy_df[numeric_cols] = scaler.fit_transform(energy_df[numeric_cols])

print("\nData after Z-Score Scaling:")
print(energy_df)



Data after Z-Score Scaling:
  Energy Source  Energy Consumption (MWh)  Cost (Million $)  \
0         Solar                 -1.637804         -0.534522   
1          Wind                       NaN          1.603567   
2    Hydropower                  0.589610               NaN   
3    Geothermal                       NaN         -1.069045   
4       Biomass                  0.065512          0.000000   
5       Nuclear                  0.982683               NaN   

   Missing Consumption  Missing Cost  
0                    0             0  
1                    1             0  
2                    0             1  
3                    1             0  
4                    0             0  
5                    0             1  


In [None]:
# One-hot encoding of the categorical "Energy Source" column.
# pd.get_dummies expands the column into one indicator column per category.
# This is one-hot encoding, not label encoding (which would map each
# category to a single integer code), so the heading below says so.
energy_encoded_df = pd.get_dummies(energy_df, columns=["Energy Source"])
print("\nData after One-Hot Encoding:")
energy_encoded_df.head()



Data after Label Encoding:


Unnamed: 0,Energy Consumption (MWh),Cost (Million $),Missing Consumption,Missing Cost,Energy Source_Biomass,Energy Source_Geothermal,Energy Source_Hydropower,Energy Source_Nuclear,Energy Source_Solar,Energy Source_Wind
0,-1.637804,-0.534522,0,0,False,False,False,False,True,False
1,,1.603567,1,0,False,False,False,False,False,True
2,0.58961,,0,1,False,False,True,False,False,False
3,,-1.069045,1,0,False,True,False,False,False,False
4,0.065512,0.0,0,0,True,False,False,False,False,False


In [None]:
# Expand the categorical "Energy Source" column into one indicator column
# per energy source (one-hot encoding); only the listed column is encoded.
energy_encoded_df = pd.get_dummies(
    data=energy_df,
    columns=["Energy Source"],
)
