In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error

# Step 1: Load the dataset
url = 'https://github.com/dsrscientist/dataset3/blob/main/global_Power_plant_database.csv?raw=true'
data = pd.read_csv(url)

# Step 2: Check the available columns in the dataset
print("Available columns in the dataset:")
print(data.columns)

# Step 3: Handle missing values (if any)
# Fill missing numerical values with the median (if any)
data.fillna(data.median(numeric_only=True), inplace=True)

# Fill missing categorical values with mode (if any)
data.fillna(data.select_dtypes(include=['object']).mode().iloc[0], inplace=True)

# Step 4: Select Features for the Model
# Choosing features that exist in the dataset; replace 'generation_gwh_2019' with available columns
# Example column used here is 'generation_gwh_2017' based on availability
features = ['latitude', 'longitude', 'commissioning_year', 'generation_gwh_2017']  # Adjust according to dataset
X = data[features]

# Step 5: Target Variables
y_primary_fuel = data['primary_fuel']  # for classification
y_capacity_mw = data['capacity_mw']    # for regression

# Step 6: Encode the target variable for primary_fuel (categorical)
le = LabelEncoder()
y_primary_fuel_encoded = le.fit_transform(y_primary_fuel)

# Step 7: Scale the numerical features for better performance
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 8: Split the dataset into training and testing sets
# For Primary Fuel prediction (classification)
X_train_fuel, X_test_fuel, y_train_fuel, y_test_fuel = train_test_split(X_scaled, y_primary_fuel_encoded, test_size=0.2, random_state=42)

# For Capacity MW prediction (regression)
X_train_mw, X_test_mw, y_train_mw, y_test_mw = train_test_split(X_scaled, y_capacity_mw, test_size=0.2, random_state=42)

# Step 9: Train the model for predicting Primary Fuel (Classification)
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_fuel, y_train_fuel)

# Step 10: Predict and Evaluate the Primary Fuel Model
y_pred_fuel = clf.predict(X_test_fuel)
print("\nPrimary Fuel Prediction Accuracy:")
print(accuracy_score(y_test_fuel, y_pred_fuel))
print("\nClassification Report for Primary Fuel Prediction:")
print(classification_report(y_test_fuel, y_pred_fuel))

# Step 11: Train the model for predicting Capacity MW (Regression)
reg = RandomForestRegressor(random_state=42)
reg.fit(X_train_mw, y_train_mw)

# Step 12: Predict and Evaluate the Capacity MW Model
y_pred_capacity = reg.predict(X_test_mw)
print("\nCapacity MW Prediction Mean Squared Error:")
print(mean_squared_error(y_test_mw, y_pred_capacity))

# Step 13: Results Summary
print("\nModel Results Summary:")
print(f"Classification Accuracy (Primary Fuel): {accuracy_score(y_test_fuel, y_pred_fuel)}")
print(f"Mean Squared Error (Capacity MW): {mean_squared_error(y_test_mw, y_pred_capacity)}")


Available columns in the dataset:
Index(['country', 'country_long', 'name', 'gppd_idnr', 'capacity_mw',
       'latitude', 'longitude', 'primary_fuel', 'other_fuel1', 'other_fuel2',
       'other_fuel3', 'commissioning_year', 'owner', 'source', 'url',
       'geolocation_source', 'wepp_id', 'year_of_capacity_data',
       'generation_gwh_2013', 'generation_gwh_2014', 'generation_gwh_2015',
       'generation_gwh_2016', 'generation_gwh_2017', 'generation_data_source',
       'estimated_generation_gwh'],
      dtype='object')

Primary Fuel Prediction Accuracy:
0.7747252747252747

Classification Report for Primary Fuel Prediction:
              precision    recall  f1-score   support

           0       0.40      0.33      0.36         6
           1       0.83      0.80      0.81        55
           2       0.58      0.33      0.42        21
           3       0.83      0.95      0.88        55
           5       0.00      0.00      0.00         5
           6       0.84      0.89      