In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Read database_IND.csv from the wri/global-power-plant-database GitHub repository
# and render the resulting frame for a first look at its columns.
source_url = 'https://raw.githubusercontent.com/wri/global-power-plant-database/master/source_databases_csv/database_IND.csv'
plant_database = pd.read_csv(source_url)
plant_database

In [None]:
# (rows, columns) of the freshly loaded table.
plant_database.shape

In [None]:
# Column names, dtypes and non-null counts.
plant_database.info()

In [None]:
# Remove columns that are not used in the rest of the analysis.
# A wider drop list was tried earlier and is kept here, disabled, for reference:
#plant_database.drop(columns=['country','country_long','url','gppd_idnr','owner','other_fuel2','other_fuel3','year_of_capacity_data','estimated_generation_gwh','wepp_id','generation_gwh_2013'], axis=1,inplace=True)
unused_columns = [
    'estimated_generation_gwh',
    'wepp_id',
    'generation_gwh_2013',
    'generation_gwh_2019',
]
plant_database.drop(columns=unused_columns, inplace=True)

In [None]:
# Re-display the table to confirm which columns remain.
plant_database

In [None]:
# Data type of every remaining column.
plant_database.dtypes

In [None]:
# Number of missing entries per column.
plant_database.isna().sum()

In [None]:
# True if any cell holds one of the placeholder strings ('', 'NA', '-', '?')
# instead of a real value. Note: '' matches only the empty string, not blanks.
placeholders = ['', 'NA', '-', '?']
plant_database.isin(placeholders).any().any()

In [None]:
# Per-column count and percentage of missing values, most incomplete first.
row_count = plant_database.shape[0]
missing_values = plant_database.isna().sum().sort_values(ascending=False)
percentage_missing_values = missing_values.div(row_count).mul(100)
print(pd.DataFrame({'Missing Values': missing_values,
                    '%Missing data': percentage_missing_values}))

In [None]:
# Inspect the longitude distribution before choosing an imputation strategy:
# box plot on the left, density histogram with a KDE curve on the right.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is the
# documented replacement.
fig, (box_ax, hist_ax) = plt.subplots(1, 2, figsize=(15, 7))
sns.boxplot(y='longitude', data=plant_database, ax=box_ax)
box_ax.set_ylabel('longitude')
sns.histplot(plant_database['longitude'], kde=True, stat='density', ax=hist_ax)
hist_ax.set_xlabel('longitude')
fig.tight_layout()
plt.show()

In [None]:
# Inspect the latitude distribution before choosing an imputation strategy:
# box plot on the left, density histogram with a KDE curve on the right.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is the
# documented replacement. The x-axis label typo ('latiitude') is also fixed.
fig, (box_ax, hist_ax) = plt.subplots(1, 2, figsize=(15, 7))
sns.boxplot(y='latitude', data=plant_database, ax=box_ax)
box_ax.set_ylabel('latitude')
sns.histplot(plant_database['latitude'], kde=True, stat='density', ax=hist_ax)
hist_ax.set_xlabel('latitude')
fig.tight_layout()
plt.show()

In [None]:
# Fill gaps in the location columns. The fill value differs per column:
#   geolocation_source -> most frequent value (mode)
#   longitude          -> median
#   latitude           -> mean
fill_values = {
    'geolocation_source': plant_database['geolocation_source'].mode()[0],
    'longitude': plant_database['longitude'].median(),
    'latitude': plant_database['latitude'].mean(),
}
for column, value in fill_values.items():
    plant_database[column] = plant_database[column].fillna(value)

                                                                                

In [None]:
# Recompute the missing-value table after the imputation step.
row_count = plant_database.shape[0]
missing_values = plant_database.isna().sum().sort_values(ascending=False)
percentage_missing_values = missing_values.div(row_count).mul(100)
print(pd.DataFrame({'Missing Values': missing_values,
                    '% Missing data': percentage_missing_values}))

In [None]:
# Summary statistics of the numeric columns.
plant_database.describe()

In [None]:
# Report the largest and smallest installed capacity in the table.
# (Fixes the 'Manimum' typo in the printed label.)
print('Maximum capacity of powerplant: ', plant_database.capacity_mw.max())
print('Minimum capacity of powerplant: ', plant_database.capacity_mw.min())

In [None]:
# Show the plant(s) with the largest capacity. The maximum is computed rather
# than hard-coded (it was 4760) so the cell still works if the data changes.
plant_database.loc[plant_database.capacity_mw == plant_database.capacity_mw.max()]

In [None]:
# Rows whose recorded capacity is exactly zero.
plant_database[plant_database['capacity_mw'].eq(0)]

In [None]:
# Violin plot of installed capacity.
# The series is passed as x= explicitly: a bare positional argument is read
# as `x` by older seaborn and as `data` by seaborn >= 0.12, which changes the
# orientation of the plot between versions.
plt.figure(figsize=(10,6))
sns.violinplot(x=plant_database['capacity_mw'])
plt.show()

In [None]:
# (rows, columns) of the subset of plants larger than 1500 MW.
plant_database[plant_database['capacity_mw'].gt(1500)].shape

In [None]:
# Contingency table: one row per distinct capacity value, one column per
# primary fuel, with row/column totals (margins=True adds the 'All' entries).
pd.crosstab(plant_database['capacity_mw'],plant_database['primary_fuel'],margins=True)

In [None]:
# Share of plants per primary fuel: pie chart (left) and count plot (right).
plt.rcParams["figure.autolayout"] = True
sns.set_palette('husl')
f, ax = plt.subplots(1, 2, figsize=(18, 8))
plant_database['primary_fuel'].value_counts().plot.pie(
    autopct='%2.1f%%',
    textprops={'fontweight': 'bold', 'fontsize': 13},
    ax=ax[0],
    shadow=True,
)
ax[0].set_title('primary_fuel', fontsize=20, fontweight='bold')
ax[0].set_ylabel('')
# The column must be passed as x=: seaborn >= 0.12 takes only `data`
# positionally, so countplot('primary_fuel', data=...) raises TypeError
# (multiple values for 'data').
sns.countplot(x='primary_fuel', data=plant_database, ax=ax[1])
ax[1].set_title('primary_fuel', fontsize=20, fontweight='bold')
ax[1].set_xlabel("primary_fuel", fontsize=18, fontweight='bold')
plt.show()

In [None]:
# Number of plants per primary fuel, most common first.
plant_database.primary_fuel.value_counts()

In [None]:
# Sum of installed capacity over every plant in the table.
total_capacity_mw = plant_database['capacity_mw'].sum()
print('Total power generation capacity of all power plants: ', total_capacity_mw, 'MW')

In [None]:
# Total installed capacity per primary fuel.
# The aggregation is named by string: passing the builtin `sum` to agg() is
# deprecated in recent pandas, which will stop mapping it to the groupby sum.
plant_database.groupby('primary_fuel')['capacity_mw'].agg(['sum'])

In [None]:
# Mean capacity per primary fuel (sns.barplot plots the mean of y per category).
# Changes: the unused `y` variable is removed and the title spelling is fixed.
plt.figure(figsize=(10, 8))
p = sns.barplot(x="primary_fuel", y="capacity_mw", data=plant_database)
plt.title('Comparison between Primary fuel Type and Mean Capacity in mw', fontsize=22, fontweight='bold')
p.set_xlabel('Primary fuel Type', fontsize=18, fontweight='bold')
p.set_ylabel('Mean Capacity in mw', fontsize=18, fontweight='bold')
plt.xticks(fontsize=16, fontweight='bold', rotation=30)
plt.yticks(fontsize=16, fontweight='bold')
plt.show()


In [None]:
# Mean yearly generation per primary fuel, one panel per year.
# Panel placement and palettes are kept as authored: 2017 top-left,
# 2014 top-right, 2015 bottom-left, 2016 bottom-right.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
panels = [
    ('generation_gwh_2017', axes[0, 0], None),
    ('generation_gwh_2014', axes[0, 1], None),
    ('generation_gwh_2015', axes[1, 0], 'Set2'),
    ('generation_gwh_2016', axes[1, 1], 'ch:.25'),
]
for column, panel_ax, palette in panels:
    sns.barplot(x='primary_fuel', y=column, data=plant_database,
                ax=panel_ax, palette=palette)
plt.show()



In [None]:
# Check how the primary fuel is related to the longitude of the power plant.
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so boxplot(series, series) raises TypeError there.
plt.figure(figsize=[10,6])
plt.style.use('ggplot')
plt.title('Comparison between longitude and primary_fuel')
a = sns.boxplot(x=plant_database['primary_fuel'], y=plant_database["longitude"])
a.set_xlabel('Primary fuel Type')
a.set_ylabel('longitude')
plt.show()

In [None]:
# Check how the primary fuel is related to the latitude of the power plant.
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so boxplot(series, series) raises TypeError there.
plt.figure(figsize=[10,6])
plt.style.use('ggplot')
plt.title('Comparison between latitude and primary_fuel')
a = sns.boxplot(x=plant_database['primary_fuel'], y=plant_database["latitude"])
a.set_xlabel('Primary fuel Type')
a.set_ylabel('latitude')
plt.show()

In [None]:
# Check how capacity is related to the latitude of the power plant.
# Fixes: (1) x/y are passed by keyword, which seaborn >= 0.12 requires;
# (2) the axis labels were swapped - latitude is on x, capacity on y.
plt.figure(figsize=[10,6])
plt.style.use('ggplot')
plt.title('Comparison between latitude and capacity')
a = sns.scatterplot(x=plant_database['latitude'], y=plant_database["capacity_mw"])
a.set_xlabel('latitude')
a.set_ylabel('capacity')
plt.show()

In [None]:
# Check how capacity is related to the longitude of the power plant
# (scatter plus fitted regression line).
# Fixes: (1) x/y are passed by keyword, which seaborn >= 0.12 requires;
# (2) the axis labels were swapped - longitude is on x, capacity on y.
plt.figure(figsize=[10,6])
plt.style.use('ggplot')
plt.title('Comparison between longitude and capacity')
a = sns.regplot(x=plant_database['longitude'], y=plant_database["capacity_mw"])
a.set_xlabel('longitude')
a.set_ylabel('capacity')
plt.show()

In [None]:
# Derive the plant age as (2018 - commissioning year) and retire the source
# column. pop() removes 'commissioning_year' from the frame and returns it.
# Not re-runnable: a second run fails because the source column is gone.
reference_year = 2018
plant_database['Power_plant_age'] = reference_year - plant_database.pop('commissioning_year')

In [None]:
# Range of the derived plant age (missing ages are skipped by max/min).
plant_age = plant_database['Power_plant_age']
print('The oldest powerplant age: ', plant_age.max())
print('The youngest powerplant age: ', plant_age.min())

In [None]:
# Show the oldest plant(s). The maximum age is computed rather than
# hard-coded (it was 91) so the cell still works if the data changes.
plant_database.loc[plant_database.Power_plant_age == plant_database.Power_plant_age.max()]

In [None]:
# Mean plant age per primary fuel (title spelling fixed).
plt.figure(figsize=(10, 6))
plt.title("Comparison between primary_fuel and Power plant age")
a = sns.barplot(x="primary_fuel", y="Power_plant_age", data=plant_database, palette="Set2")
a.set_xlabel('Primary fuel Type')
a.set_ylabel('Power_plant_age')
plt.show()

In [None]:
# Check how the plant age relates to capacity (scatter plus regression line).
# Fixes: x/y are passed by keyword, which seaborn >= 0.12 requires; plt.show()
# is added so the cell does not end on the Axes repr; title spelling fixed.
plt.figure(figsize=[10,6])
plt.title('Comparison between Power_plant_age and capacity_mw')
sns.regplot(x=plant_database['Power_plant_age'], y=plant_database['capacity_mw'])
plt.show()

In [None]:
# Mean capacity for each geolocation source.
fig, source_ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=plant_database, x="geolocation_source", y="capacity_mw", ax=source_ax)
plt.show()

In [None]:
# Pie chart of how many plants each geolocation source contributes.
# The labels are taken from the value_counts() index so each wedge is always
# labelled with its own category. The previous hard-coded tuple silently
# mislabels wedges if the frequency order differs, and breaks if the number
# of categories is not exactly three.
source_counts = plant_database['geolocation_source'].value_counts()
labels = source_counts.index
fig, ax = plt.subplots(figsize=(6,5))
ax.pie(source_counts, labels=labels, autopct='%1.2f%%', shadow=True)
plt.show()


In [None]:
# Numeric columns whose distributions and outliers are plotted in later cells.
Numerical = (
    ['capacity_mw', 'latitude', 'longitude']
    + [f'generation_gwh_{year}' for year in range(2014, 2018)]
    + ['Power_plant_age']
)

In [None]:
# Distribution of every numeric column on a 3x3 grid (at most nine panels).
# sns.distplot is deprecated; histplot(kde=True, stat='density') is the
# documented replacement. enumerate() replaces the manual panel counter.
plt.figure(figsize=(20,15))
for plotnumber, col in enumerate(Numerical[:9], start=1):
    ax = plt.subplot(3, 3, plotnumber)
    sns.histplot(plant_database[col], kde=True, stat='density', ax=ax)
    plt.xlabel(col, fontsize=20)
plt.tight_layout()

In [None]:
# Capacity against reported generation, one panel per year, 2014 to 2017
# (row-major: top-left, top-right, bottom-left, bottom-right).
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
for panel_ax, year in zip(axes.flat, range(2014, 2018)):
    sns.scatterplot(data=plant_database, x=f'generation_gwh_{year}',
                    y='capacity_mw', ax=panel_ax)
plt.show()

In [None]:
# Pairwise scatter plots (and per-column distributions on the diagonal) for the
# columns of the frame, to eyeball relations with capacity_mw.
sns.pairplot(plant_database)

In [None]:
#Encoding categorical data
#plant_database.drop(columns=['name'],axis=1,inplace=True)
#plant_database.drop(columns=['generation_data_source'],axis=1,inplace=True)
#Category=['primary_fuel', 'other_fuel1', 'source', 'geolocation_source']


In [None]:
# Label-encode the categorical columns.
# Fixes a NameError on a fresh kernel: `Category` was only ever defined in a
# commented-out cell. The two free-text columns listed in that same cell are
# dropped here too, because the later correlation / modelling steps need a
# numeric frame; errors='ignore' keeps the cell re-runnable.
from sklearn.preprocessing import LabelEncoder

plant_database.drop(columns=['name', 'generation_data_source'], errors='ignore', inplace=True)

Category = ['primary_fuel', 'other_fuel1', 'source', 'geolocation_source']
le = LabelEncoder()
for i in Category:
    # Skip columns already removed by a later cell when this one is re-run.
    if i in plant_database.columns:
        plant_database[i] = le.fit_transform(plant_database[i])
plant_database.head()

In [None]:
# Box plot of every numeric column on a 3x3 grid to spot outliers.
# The series is passed as x= explicitly: a bare positional argument is read
# as `x` by older seaborn and as `data` by seaborn >= 0.12, which flips the
# box orientation. enumerate() replaces the manual panel counter.
plt.figure(figsize=(20,25))
for plotnumber, col in enumerate(Numerical[:9], start=1):
    ax = plt.subplot(3, 3, plotnumber)
    sns.boxplot(x=plant_database[col], color='cyan')
    plt.xlabel(col, fontsize=12)
plt.tight_layout()

In [None]:
# Remove the two provenance columns; they are not used as model features.
provenance_columns = ['source', 'geolocation_source']
plant_database.drop(columns=provenance_columns, inplace=True)

In [None]:
# Snapshot of the frame before outlier handling.
# The call parentheses were missing: `.copy` without () stores the bound
# method, not a copy of the data.
plant_database_2 = plant_database.copy()

In [None]:
# Per-column spread used for the outlier filter in the next cell.
# NOTE: despite the names, Q1 is the 0th quantile (the column minimum) and Q3
# is the 85th percentile, so `IQR` is not a true interquartile range.
# numeric_only=True restricts the computation to numeric columns; without it
# pandas >= 2.0 raises if any non-numeric column is still present.
plant_database_3 = plant_database.copy()
Q1 = plant_database_3.quantile(0, numeric_only=True)
Q3 = plant_database_3.quantile(0.85, numeric_only=True)
IQR = Q3 - Q1
print(IQR)

In [None]:
# Keep only rows where no column falls outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# Comparisons with NaN are False, so missing values never mark a row.
# Fixes: (1) the comparison is limited to the columns the fences were computed
# for, so a leftover non-numeric column cannot raise; (2) .copy() makes `data`
# an independent frame - later cells assign to it and drop columns in place,
# which on a slice triggers SettingWithCopyWarning.
fenced = plant_database_3[Q1.index]
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier_row = ((fenced < lower_fence) | (fenced > upper_fence)).any(axis=1)
data = plant_database_3.loc[~is_outlier_row].copy()
print(data.shape)

In [None]:
# Share of rows removed by the outlier filter.
# Computed from the actual frames instead of the hard-coded 908 / 876, and
# divided by the ORIGINAL row count: loss = (before - after) / before.
rows_before = len(plant_database_3)
rows_after = len(data)
print("\033[1m"+'Percentage Data Loss :'+"\033[0m", ((rows_before - rows_after) / rows_before) * 100, '%')

In [None]:
# Annotated heat map of the pairwise correlations, colour scale fixed to [-1, 1].
corr_matrix = data.corr()
plt.figure(figsize=(21, 13))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='0.3f',
    annot_kws={'size': 10},
    vmin=-1,
    vmax=1,
    square=True,
    cmap="cool",
)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

In [None]:
# Correlation of every feature with capacity_mw, strongest positive first;
# the trivial self-correlation entry is removed before plotting.
capacity_corr = data.corr()['capacity_mw'].sort_values(ascending=False)
capacity_corr = capacity_corr.drop(labels='capacity_mw')
plt.figure(figsize=(14, 7))
capacity_corr.plot.bar(color='c')
plt.xlabel('Features', fontsize=10)
plt.ylabel('Capacity', fontsize=10)
plt.title('Correlation between capacity and features using bar plot', fontsize=20)
plt.show()

In [None]:
# Correlation of every feature with primary_fuel, strongest positive first.
# Fix: the entry removed is now 'primary_fuel' itself (its trivial 1.0
# self-correlation). The cell previously dropped 'capacity_mw' - a copy-paste
# slip that hid a real feature and left the self-correlation bar in the plot.
plt.figure(figsize=(14,7))
data.corr()['primary_fuel'].sort_values(ascending=False).drop(['primary_fuel']).plot(kind='bar',color='c')
plt.xlabel('Features',fontsize=10)
plt.ylabel('primary_fuel',fontsize=10)
plt.title('Correlation between primary_fuel and features using bar plot',fontsize=20)
plt.show()

In [None]:
# Skewness of each column before any power transform is applied.
data.skew()

In [None]:
# Columns to de-skew, and the Yeo-Johnson transformer applied in the next cell.
from sklearn.preprocessing import PowerTransformer

skew = [
    'longitude',
    'other_fuel1',
    *[f'generation_gwh_{year}' for year in range(2014, 2018)],
    'Power_plant_age',
]
scaler = PowerTransformer(method='yeo-johnson')

In [None]:
# Fit the transformer on the skewed columns and write the transformed values
# back in place, then preview the result.
transformed_values = scaler.fit_transform(data[skew].to_numpy())
data[skew] = transformed_values
data[skew].head()

In [None]:
# Skewness of each column after the Yeo-Johnson transform, for comparison
# with the values printed before it.
data.skew()

In [None]:
# other_fuel1 is removed: it correlates poorly and is mostly missing.
data.drop(columns='other_fuel1', inplace=True)

In [None]:
# Fill missing Power_plant_age values with the column mean.
# Power_plant_age is one of the columns power-transformed earlier, so the mean
# is taken on the transformed scale.
data['Power_plant_age'] = data['Power_plant_age'].fillna(data['Power_plant_age'].mean())

In [None]:
# Remove the four yearly generation columns (2014-2017) from the feature set.
generation_columns = [f'generation_gwh_{year}' for year in range(2014, 2018)]
data.drop(columns=generation_columns, inplace=True)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report,f1_score

In [None]:
# Separate the target (primary_fuel) from the feature matrix.
target_column = 'primary_fuel'
Y = data[target_column]
X = data.drop(columns=[target_column])

In [None]:
# Class counts of the target, to see how imbalanced primary_fuel is.
# This cell only inspects the counts; it does not resample or re-weight.
data.primary_fuel.value_counts()

In [None]:
# Feature scaling (currently disabled):
#from sklearn.preprocessing import StandardScaler
#scaler= StandardScaler()
#X_scale = scaler.fit_transform(X)

In [None]:
# 70/30 hold-out split with a fixed seed, followed by the shape of each part.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.3, random_state=99)
split_report = (
    ('Training feature matrix size:', X_train),
    ('Training target vector size:', Y_train),
    ('Test feature matrix size:', X_test),
    ('Test target vector size:', Y_test),
)
for label, part in split_report:
    print(label, part.shape)

In [None]:
# Try split seeds 1..999 and remember the one whose 70/30 split gives the
# highest logistic-regression accuracy on its own test part.
# NOTE(review): the seed is selected on test accuracy, so the reported
# "best accuracy" is an optimistic estimate, not an unbiased one.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report,f1_score

maxAccu, maxRS = 0, 0
for seed in range(1, 1000):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=seed)
    log_reg = LogisticRegression()
    log_reg.fit(X_train, Y_train)
    y_pred = log_reg.predict(X_test)
    acc = accuracy_score(Y_test, y_pred)
    # Strict '>' keeps the first seed that reaches the maximum.
    if acc > maxAccu:
        maxAccu, maxRS = acc, seed
print('Best accuracy is', maxAccu ,'on Random_state', maxRS)

In [None]:
# Logistic regression on the 70/30 split with seed 737: fit, predict, then
# print accuracy, confusion matrix and the per-class report.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.3, random_state=737)
log_reg = LogisticRegression()
log_reg.fit(X_train, Y_train)
y_pred = log_reg.predict(X_test)

lr_accuracy = accuracy_score(Y_test, y_pred)
lr_confusion = confusion_matrix(Y_test, y_pred)
lr_report = classification_report(Y_test, y_pred)

print('\033[1m'+'Logistics Regression Evaluation'+'\033[0m')
print('\n')
print('\033[1m'+'Accuracy Score of Logistics Regression :'+'\033[0m', lr_accuracy)
print('\n')
print('\033[1m'+'Confusion matrix of Logistics Regression :'+'\033[0m \n', lr_confusion)
print('\n')
print('\033[1m'+'classification Report of Logistics Regression'+'\033[0m \n', lr_report)

In [None]:
# Decision tree on the 70/30 split with seed 737: fit, predict, then print
# accuracy, confusion matrix and the per-class report.
# random_state is fixed on the estimator as well: tie-breaking between
# equally good splits is randomised, so an unseeded tree can give different
# scores on every run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=737, test_size=.3)
dtc=DecisionTreeClassifier(random_state=737)
dtc.fit(X_train,Y_train)
y_pred=dtc.predict(X_test)
print('\033[1m'+'DecisionTreeClassifier Evaluation'+'\033[0m')
print('\n')
print('\033[1m'+'Accuracy Score of DecisionTreeClassifier :'+'\033[0m', accuracy_score(Y_test, y_pred))
print('\n')
print('\033[1m'+'Confusion matrix of DecisionTreeClassifier :'+'\033[0m \n',confusion_matrix(Y_test, y_pred))
print('\n')
print('\033[1m'+'classification Report of DecisionTreeClassifier'+'\033[0m \n',classification_report(Y_test, y_pred))

In [None]:
# Random forest on the 70/30 split with seed 737: fit, predict, then print
# accuracy, confusion matrix and the per-class report.
# random_state is fixed on the estimator as well: bootstrap sampling and
# feature sub-sampling are random, so an unseeded forest gives different
# scores on every run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=737, test_size=.3)
rfc=RandomForestClassifier(random_state=737)
rfc.fit(X_train,Y_train)
y_pred=rfc.predict(X_test)
print('\033[1m'+'RandomForestClassifier Evaluation'+'\033[0m')
print('\n')
print('\033[1m'+'Accuracy Score of RandomForestClassifier :'+'\033[0m', accuracy_score(Y_test, y_pred))
print('\n')
print('\033[1m'+'Confusion matrix of RandomForestClassifier :'+'\033[0m \n',confusion_matrix(Y_test, y_pred))
print('\n')
print('\033[1m'+'classification Report of RandomForestClassifier'+'\033[0m \n',classification_report(Y_test, y_pred))

In [None]:
# 5-fold cross-validation of each candidate classifier.
# Fix: `X_scale` is never defined by an active cell (the scaling cell is
# commented out), so this loop raised NameError on a fresh kernel. Each model
# is now wrapped in a pipeline with StandardScaler and scored on X, so the
# scaler is fitted on the training folds only and no statistics leak from the
# validation fold.
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

model=[LogisticRegression(),
       DecisionTreeClassifier(),
        RandomForestClassifier(),
        ExtraTreesClassifier()]

for m in model:
    score = cross_val_score(make_pipeline(StandardScaler(), m), X, Y, cv =5)
    print('\n')
    print('\033[1m'+'Cross Validation Score', m, ':'+'\033[0m\n')
    print("Score :" ,score)
    print("Mean Score :",score.mean())
    print("Std deviation :",score.std())
    print('\n')
    print('============================================================================================================')