In [32]:
import pandas as pd
import statsmodels.api as sm
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [33]:
df=pd.read_csv("https://raw.githubusercontent.com/wri/global-power-plant-database/master/source_databases_csv/database_IND.csv")

In [34]:
df.head()

Unnamed: 0,country,country_long,name,gppd_idnr,capacity_mw,latitude,longitude,primary_fuel,other_fuel1,other_fuel2,...,year_of_capacity_data,generation_gwh_2013,generation_gwh_2014,generation_gwh_2015,generation_gwh_2016,generation_gwh_2017,generation_gwh_2018,generation_gwh_2019,generation_data_source,estimated_generation_gwh
0,IND,India,ACME Solar Tower,WRI1020239,2.5,28.1839,73.2407,Solar,,,...,,,,,,,,,,
1,IND,India,ADITYA CEMENT WORKS,WRI1019881,98.0,24.7663,74.609,Coal,,,...,,,,,,,,,,
2,IND,India,AES Saurashtra Windfarms,WRI1026669,39.2,21.9038,69.3732,Wind,,,...,,,,,,,,,,
3,IND,India,AGARTALA GT,IND0000001,135.0,23.8712,91.3602,Gas,,,...,2019.0,,617.789264,843.747,886.004428,663.7745,626.239128,,Central Electricity Authority,
4,IND,India,AKALTARA TPP,IND0000002,1800.0,21.9603,82.4091,Coal,Oil,,...,2019.0,,3035.55,5916.37,6243.0,5385.579736,7279.0,,Central Electricity Authority,


In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 907 entries, 0 to 906
Data columns (total 27 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   country                   907 non-null    object 
 1   country_long              907 non-null    object 
 2   name                      907 non-null    object 
 3   gppd_idnr                 907 non-null    object 
 4   capacity_mw               907 non-null    float64
 5   latitude                  861 non-null    float64
 6   longitude                 861 non-null    float64
 7   primary_fuel              907 non-null    object 
 8   other_fuel1               198 non-null    object 
 9   other_fuel2               1 non-null      object 
 10  other_fuel3               0 non-null      float64
 11  commissioning_year        527 non-null    float64
 12  owner                     342 non-null    object 
 13  source                    907 non-null    object 
 14  url       

In [36]:
# there seems missing values in data . to chk missing values 
print("Number of missing values:\n", df.isnull().sum())

Number of missing values:
 country                       0
country_long                  0
name                          0
gppd_idnr                     0
capacity_mw                   0
latitude                     46
longitude                    46
primary_fuel                  0
other_fuel1                 709
other_fuel2                 906
other_fuel3                 907
commissioning_year          380
owner                       565
source                        0
url                           0
geolocation_source           19
wepp_id                     907
year_of_capacity_data       388
generation_gwh_2013         907
generation_gwh_2014         509
generation_gwh_2015         485
generation_gwh_2016         473
generation_gwh_2017         467
generation_gwh_2018         459
generation_gwh_2019         907
generation_data_source      458
estimated_generation_gwh    907
dtype: int64


In [37]:
# Impute missing values with mean
df.fillna(df.mean(), inplace=True)

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 907 entries, 0 to 906
Data columns (total 27 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   country                   907 non-null    object 
 1   country_long              907 non-null    object 
 2   name                      907 non-null    object 
 3   gppd_idnr                 907 non-null    object 
 4   capacity_mw               907 non-null    float64
 5   latitude                  907 non-null    float64
 6   longitude                 907 non-null    float64
 7   primary_fuel              907 non-null    object 
 8   other_fuel1               198 non-null    object 
 9   other_fuel2               1 non-null      object 
 10  other_fuel3               0 non-null      float64
 11  commissioning_year        907 non-null    float64
 12  owner                     342 non-null    object 
 13  source                    907 non-null    object 
 14  url       

In [39]:
df.drop('country_long', axis=1, inplace=True)


In [40]:
df.shape

(907, 26)

In [41]:
 #Impute missing values with mode
df.fillna(df.mode(), inplace=True)

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 907 entries, 0 to 906
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   country                   907 non-null    object 
 1   name                      907 non-null    object 
 2   gppd_idnr                 907 non-null    object 
 3   capacity_mw               907 non-null    float64
 4   latitude                  907 non-null    float64
 5   longitude                 907 non-null    float64
 6   primary_fuel              907 non-null    object 
 7   other_fuel1               199 non-null    object 
 8   other_fuel2               2 non-null      object 
 9   other_fuel3               0 non-null      float64
 10  commissioning_year        907 non-null    float64
 11  owner                     342 non-null    object 
 12  source                    907 non-null    object 
 13  url                       907 non-null    object 
 14  geolocatio

In [43]:

from sklearn.preprocessing import LabelEncoder
# Convert categorical variables into numerical representation
label_encoder = LabelEncoder()
df['primary_fuel'] = label_encoder.fit_transform(df['primary_fuel'])


In [44]:
df.head()

Unnamed: 0,country,name,gppd_idnr,capacity_mw,latitude,longitude,primary_fuel,other_fuel1,other_fuel2,other_fuel3,...,year_of_capacity_data,generation_gwh_2013,generation_gwh_2014,generation_gwh_2015,generation_gwh_2016,generation_gwh_2017,generation_gwh_2018,generation_gwh_2019,generation_data_source,estimated_generation_gwh
0,IND,ACME Solar Tower,WRI1020239,2.5,28.1839,73.2407,6,Oil,Oil,,...,2019.0,,2431.82359,2428.226946,2467.936859,2547.759305,2600.804099,,Central Electricity Authority,
1,IND,ADITYA CEMENT WORKS,WRI1019881,98.0,24.7663,74.609,1,,,,...,2019.0,,2431.82359,2428.226946,2467.936859,2547.759305,2600.804099,,,
2,IND,AES Saurashtra Windfarms,WRI1026669,39.2,21.9038,69.3732,7,,,,...,2019.0,,2431.82359,2428.226946,2467.936859,2547.759305,2600.804099,,,
3,IND,AGARTALA GT,IND0000001,135.0,23.8712,91.3602,2,,,,...,2019.0,,617.789264,843.747,886.004428,663.7745,626.239128,,Central Electricity Authority,
4,IND,AKALTARA TPP,IND0000002,1800.0,21.9603,82.4091,1,Oil,,,...,2019.0,,3035.55,5916.37,6243.0,5385.579736,7279.0,,Central Electricity Authority,


In [45]:
columns_to_drop = ['other_fuel1', 'other_fuel2', 'other_fuel3',"wepp_id","url","generation_gwh_2013","generation_gwh_2019","estimated_generation_gwh"]
df.drop(columns_to_drop, axis=1, inplace=True)

In [46]:
df.head()

Unnamed: 0,country,name,gppd_idnr,capacity_mw,latitude,longitude,primary_fuel,commissioning_year,owner,source,geolocation_source,year_of_capacity_data,generation_gwh_2014,generation_gwh_2015,generation_gwh_2016,generation_gwh_2017,generation_gwh_2018,generation_data_source
0,IND,ACME Solar Tower,WRI1020239,2.5,28.1839,73.2407,6,2011.0,Solar Paces,National Renewable Energy Laboratory,National Renewable Energy Laboratory,2019.0,2431.82359,2428.226946,2467.936859,2547.759305,2600.804099,Central Electricity Authority
1,IND,ADITYA CEMENT WORKS,WRI1019881,98.0,24.7663,74.609,1,1997.091082,Ultratech Cement ltd,Ultratech Cement ltd,WRI,2019.0,2431.82359,2428.226946,2467.936859,2547.759305,2600.804099,
2,IND,AES Saurashtra Windfarms,WRI1026669,39.2,21.9038,69.3732,7,1997.091082,AES,CDM,WRI,2019.0,2431.82359,2428.226946,2467.936859,2547.759305,2600.804099,
3,IND,AGARTALA GT,IND0000001,135.0,23.8712,91.3602,2,2004.0,,Central Electricity Authority,WRI,2019.0,617.789264,843.747,886.004428,663.7745,626.239128,Central Electricity Authority
4,IND,AKALTARA TPP,IND0000002,1800.0,21.9603,82.4091,1,2015.0,,Central Electricity Authority,WRI,2019.0,3035.55,5916.37,6243.0,5385.579736,7279.0,Central Electricity Authority


In [47]:
df.shape

(907, 18)

In [48]:
# Fill NaN values with the last valid observation in each column
df_filled = df.fillna(method='ffill')

In [49]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 907 entries, 0 to 906
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   country                 907 non-null    object 
 1   name                    907 non-null    object 
 2   gppd_idnr               907 non-null    object 
 3   capacity_mw             907 non-null    float64
 4   latitude                907 non-null    float64
 5   longitude               907 non-null    float64
 6   primary_fuel            907 non-null    int32  
 7   commissioning_year      907 non-null    float64
 8   owner                   342 non-null    object 
 9   source                  907 non-null    object 
 10  geolocation_source      888 non-null    object 
 11  year_of_capacity_data   907 non-null    float64
 12  generation_gwh_2014     907 non-null    float64
 13  generation_gwh_2015     907 non-null    float64
 14  generation_gwh_2016     907 non-null    fl

In [50]:
df.isnull().sum()

country                     0
name                        0
gppd_idnr                   0
capacity_mw                 0
latitude                    0
longitude                   0
primary_fuel                0
commissioning_year          0
owner                     565
source                      0
geolocation_source         19
year_of_capacity_data       0
generation_gwh_2014         0
generation_gwh_2015         0
generation_gwh_2016         0
generation_gwh_2017         0
generation_gwh_2018         0
generation_data_source    457
dtype: int64

In [51]:
# Fill NaN values with the last valid observation in each column
df_filled = df.fillna(method='ffill')

In [52]:
df_filled

Unnamed: 0,country,name,gppd_idnr,capacity_mw,latitude,longitude,primary_fuel,commissioning_year,owner,source,geolocation_source,year_of_capacity_data,generation_gwh_2014,generation_gwh_2015,generation_gwh_2016,generation_gwh_2017,generation_gwh_2018,generation_data_source
0,IND,ACME Solar Tower,WRI1020239,2.5,28.1839,73.2407,6,2011.000000,Solar Paces,National Renewable Energy Laboratory,National Renewable Energy Laboratory,2019.0,2431.823590,2428.226946,2467.936859,2547.759305,2600.804099,Central Electricity Authority
1,IND,ADITYA CEMENT WORKS,WRI1019881,98.0,24.7663,74.6090,1,1997.091082,Ultratech Cement ltd,Ultratech Cement ltd,WRI,2019.0,2431.823590,2428.226946,2467.936859,2547.759305,2600.804099,Central Electricity Authority
2,IND,AES Saurashtra Windfarms,WRI1026669,39.2,21.9038,69.3732,7,1997.091082,AES,CDM,WRI,2019.0,2431.823590,2428.226946,2467.936859,2547.759305,2600.804099,Central Electricity Authority
3,IND,AGARTALA GT,IND0000001,135.0,23.8712,91.3602,2,2004.000000,AES,Central Electricity Authority,WRI,2019.0,617.789264,843.747000,886.004428,663.774500,626.239128,Central Electricity Authority
4,IND,AKALTARA TPP,IND0000002,1800.0,21.9603,82.4091,1,2015.000000,AES,Central Electricity Authority,WRI,2019.0,3035.550000,5916.370000,6243.000000,5385.579736,7279.000000,Central Electricity Authority
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
902,IND,YERMARUS TPP,IND0000513,1600.0,16.2949,77.3568,1,2016.000000,Yashwantrao Krishna ssk,Central Electricity Authority,WRI,2019.0,2431.823590,0.994875,233.596650,865.400000,686.500000,Central Electricity Authority
903,IND,Yelesandra Solar Power Plant,WRI1026222,3.0,12.8932,78.1654,6,1997.091082,Karnataka Power Corporation Limited,Karnataka Power Corporation Limited,Industry About,2019.0,2431.823590,2428.226946,2467.936859,2547.759305,2600.804099,Central Electricity Authority
904,IND,Yelisirur wind power project,WRI1026776,25.5,15.2758,75.5811,7,1997.091082,Karnataka Power Corporation Limited,CDM,WRI,2019.0,2431.823590,2428.226946,2467.936859,2547.759305,2600.804099,Central Electricity Authority
905,IND,ZAWAR MINES,WRI1019901,80.0,24.3500,73.7477,1,1997.091082,Hindustan Zinc ltd,Hindustan Zinc ltd,WRI,2019.0,2431.823590,2428.226946,2467.936859,2547.759305,2600.804099,Central Electricity Authority


In [53]:
df_filled.drop("gppd_idnr", axis=1, inplace=True)

In [54]:
df_filled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 907 entries, 0 to 906
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   country                 907 non-null    object 
 1   name                    907 non-null    object 
 2   capacity_mw             907 non-null    float64
 3   latitude                907 non-null    float64
 4   longitude               907 non-null    float64
 5   primary_fuel            907 non-null    int32  
 6   commissioning_year      907 non-null    float64
 7   owner                   907 non-null    object 
 8   source                  907 non-null    object 
 9   geolocation_source      907 non-null    object 
 10  year_of_capacity_data   907 non-null    float64
 11  generation_gwh_2014     907 non-null    float64
 12  generation_gwh_2015     907 non-null    float64
 13  generation_gwh_2016     907 non-null    float64
 14  generation_gwh_2017     907 non-null    fl

In [55]:
from sklearn.preprocessing import LabelEncoder

# Create an instance of LabelEncoder
label_encoder = LabelEncoder()
 # Specify the categorical column names
# Encode categorical columns
categorical_columns = ['owner', 'generation_data_source','source','country',"name","geolocation_source"] 
for column in categorical_columns:
    df_filled[column] = label_encoder.fit_transform(df_filled[column])

In [56]:
df_filled.head()

Unnamed: 0,country,name,capacity_mw,latitude,longitude,primary_fuel,commissioning_year,owner,source,geolocation_source,year_of_capacity_data,generation_gwh_2014,generation_gwh_2015,generation_gwh_2016,generation_gwh_2017,generation_gwh_2018,generation_data_source
0,0,0,2.5,28.1839,73.2407,6,2011.0,229,109,1,2019.0,2431.82359,2428.226946,2467.936859,2547.759305,2600.804099,0
1,0,1,98.0,24.7663,74.609,1,1997.091082,258,174,2,2019.0,2431.82359,2428.226946,2467.936859,2547.759305,2600.804099,0
2,0,2,39.2,21.9038,69.3732,7,1997.091082,2,21,2,2019.0,2431.82359,2428.226946,2467.936859,2547.759305,2600.804099,0
3,0,3,135.0,23.8712,91.3602,2,2004.0,2,22,2,2019.0,617.789264,843.747,886.004428,663.7745,626.239128,0
4,0,4,1800.0,21.9603,82.4091,1,2015.0,2,22,2,2019.0,3035.55,5916.37,6243.0,5385.579736,7279.0,0


In [57]:
X = df_filled.drop('capacity_mw', axis=1)
y_capacity = df_filled['capacity_mw']
X_train, X_test, y_capacity_train, y_capacity_test = train_test_split(X, y_capacity, test_size=0.2, random_state=42)


In [58]:
#  Train the capacity prediction model
capacity_model = DecisionTreeRegressor(random_state=42)
capacity_model.fit(X_train, y_capacity_train)

In [59]:
 #Evaluate model performance for capacity prediction
y_capacity_pred = capacity_model.predict(X_test)


In [60]:
y_capacity_pred

array([2.60000e+02, 2.40000e+00, 6.00000e+01, 4.60000e+02, 4.00000e+00,
       1.50000e+00, 9.90000e+01, 6.88000e+01, 1.04000e+03, 2.50000e+01,
       1.00000e+01, 4.00000e+01, 1.00000e+01, 5.00000e+01, 1.80000e+02,
       1.48500e+01, 1.80000e+01, 1.95000e+02, 6.30000e+02, 2.04000e+01,
       5.00000e+00, 1.12000e+01, 5.00000e+00, 1.26000e+02, 6.00000e+00,
       1.05000e+01, 1.14750e+03, 1.00000e+01, 5.00000e+00, 2.40000e+01,
       9.90000e+02, 1.26000e+02, 6.60000e+02, 6.00000e+02, 9.60000e+01,
       1.14750e+02, 1.80000e+02, 6.00000e+02, 9.00000e+00, 1.89000e+01,
       2.25000e+01, 3.00000e+02, 3.15000e+02, 4.68570e+02, 1.05000e+01,
       1.32000e+03, 1.00000e+03, 1.25000e+01, 4.45674e+02, 4.00000e+01,
       4.00000e+00, 6.00000e+00, 2.40000e+00, 1.34000e+03, 2.40000e+01,
       4.68570e+02, 1.20000e+02, 1.60000e+02, 9.00000e+01, 3.60000e+01,
       7.50000e+01, 1.05000e+03, 3.60000e+01, 4.45674e+02, 5.00000e+01,
       2.40000e+01, 1.00000e+01, 2.80000e+01, 4.25000e+01, 2.400

In [61]:
capacity_r2 = r2_score(y_capacity_test, y_capacity_pred)

In [63]:
capacity_r2

0.7359578550666757

In [62]:
capacity_mse = mean_squared_error(y_capacity_test, y_capacity_pred)
print("Capacity MSE:", capacity_mse)


Capacity MSE: 80477.07649312088


In [64]:
# Now will build the model to prdict primary fuel

In [65]:
y_primary_fuel = df['primary_fuel']
X_train, X_test, y_primary_fuel_train, y_primary_fuel_test = train_test_split(X, y_primary_fuel, test_size=0.2, random_state=42)

In [66]:
primary_fuel_model = DecisionTreeClassifier(random_state=42)
primary_fuel_model.fit(X_train, y_primary_fuel_train)

In [67]:
#  Evaluate model performance for primary fuel prediction
y_primary_fuel_pred = primary_fuel_model.predict(X_test)

In [68]:
y_primary_fuel_pred

array([2, 7, 3, 3, 3, 7, 3, 7, 1, 1, 1, 1, 7, 1, 3, 7, 7, 3, 3, 3, 6, 7,
       6, 3, 1, 7, 1, 6, 6, 7, 1, 3, 2, 1, 3, 3, 3, 1, 3, 1, 2, 3, 1, 3,
       7, 1, 4, 0, 2, 6, 7, 3, 7, 1, 7, 5, 1, 1, 2, 3, 2, 1, 1, 1, 6, 0,
       3, 6, 7, 2, 3, 7, 3, 3, 1, 1, 2, 1, 3, 3, 5, 6, 1, 3, 3, 1, 2, 7,
       1, 6, 3, 3, 0, 6, 3, 3, 7, 3, 1, 3, 2, 3, 1, 1, 3, 7, 1, 3, 2, 1,
       7, 3, 2, 1, 0, 2, 2, 3, 3, 1, 7, 6, 1, 6, 1, 3, 1, 3, 7, 7, 2, 4,
       2, 7, 3, 3, 3, 1, 7, 3, 6, 3, 3, 1, 7, 6, 3, 1, 5, 7, 1, 6, 7, 3,
       7, 3, 2, 5, 1, 5, 1, 3, 2, 0, 3, 1, 3, 3, 1, 3, 3, 1, 5, 3, 1, 4,
       1, 3, 1, 3, 1, 6])

In [69]:
# Chcking Accuracy

primary_fuel_accuracy = accuracy_score(y_primary_fuel_test, y_primary_fuel_pred)

In [70]:
primary_fuel_accuracy 

1.0

In [None]:
#Hence Accuracy for Primary Fuel prediction  using DecisionvTreeClassifier model is 100% 


In [72]:
import joblib

In [75]:
joblib.dump(capacity_model, 'capacity_model.joblib')
joblib.dump(primary_fuel_model, 'primary_fuel_model.joblib')

['primary_fuel_model.joblib']

In [76]:
from joblib import load

In [77]:
# Load the capacity model
capacity_model = load('capacity_model.joblib')


In [78]:
# Load the primary fuel model
primary_fuel_model = load('primary_fuel_model.joblib')