# EDA for the Power Generation in India

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt

import seaborn as sns

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

In [None]:
# Column layout used for the CSV: two key columns followed by an
# Actual/Estimated pair for each generation source.
power_column_names = ['Date', 'Region'] + [
    f"{source} {kind}"
    for source in ('Thermal', 'Nuclear', 'Hydro')
    for kind in ('Actual', 'Estimated')
]

# skiprows=1 discards the first line of the file; the names above are used
# as the header instead. thousands=',' makes pandas treat ',' as a digit
# group separator when parsing numbers.
power_df = pd.read_csv(
    "../input/daily-power-generation-in-india-20172020/file.csv",
    skiprows=1,
    names=power_column_names,
    thousands=',',
)

The state table had some problems, so we will scrape it from the internet (Wikipedia) instead!

In [None]:
# read_html returns a list with one DataFrame per table found on the page.
df = pd.read_html("https://en.wikipedia.org/wiki/List_of_states_and_union_territories_of_India_by_area")

# NOTE(review): the table position (1) and the row labels dropped below are
# hard-coded against the page layout at the time of writing; verify them if
# the Wikipedia page has changed.
unused_columns = ["Country of comparable size (land mass)", "Ref", "Rank"]
unwanted_rows = [28, 35, 36, 37]

state_df = (
    df[1]
    .drop(columns=unused_columns)
    .drop(index=unwanted_rows)
    .reset_index(drop=True)
)

Using the 2011 census we can also get the approximate population of each state. This is more useful than the number of states, as not all states have the same population.

In [None]:
population_url = "https://en.wikipedia.org/wiki/List_of_states_and_union_territories_of_India_by_population"

# At this point population_df is the list of all tables parsed from the page.
population_df = pd.read_html(population_url)

In [None]:
# Map the scraped column headers to the short names used in the rest of the notebook.
population_column_names = {
    "State or union territory": "State",
    "Population": "Population",
    "Area[16]": "Area",
    "Density[a]": "Density",
}

# NOTE: this rebinds population_df from the list of scraped tables to a single
# DataFrame, so the scraping cell must be run again before re-running this one.
population_df = (
    population_df[1][list(population_column_names)]
    .rename(columns=population_column_names)
    # Remove the footnote marker so the name matches the other table.
    .replace("Manipur[c]", "Manipur")
)

In [None]:
# Inner join on the state name: rows whose name does not appear in both
# tables are dropped. The redundant left-hand key column is removed afterwards.
state_df = (
    state_df
    .merge(
        population_df,
        how="inner",
        left_on="State / Union territory (UT)",
        right_on="State",
    )
    .drop(columns=["State / Union territory (UT)"])
)

In [None]:
# Convert Population to integers. The values are expected to be text with
# thousands separators and possibly a bracketed footnote marker (e.g. "[b]");
# strip commas and ANY "[...]" marker rather than one hard-coded footnote.
# Casting to str first keeps the cell re-runnable once the column is already int.
state_df["Population"] = (
    state_df["Population"]
    .astype(str)
    .str.replace(r"\[[^\]]*\]|,", "", regex=True)
    .str.strip()
    .astype(int)
)

# Use a single spelling for the north-eastern region.
state_df["Region"] = state_df["Region"].replace("Northeastern", "NorthEastern")

In [None]:
# Relabel every "Central" value in the frame as "Western" (in place).
state_df.replace({"Central": "Western"}, inplace=True)

# Number of states and total population per region.
region_groups = state_df.groupby("Region")[["State", "Population"]]
region_groups.agg({"State": "count", "Population": "sum"})

In [None]:
# Per-region summary, indexed by Region: number of states and total population.
# (The original statement ended with a dangling "\" line continuation before a
# blank line; it has been removed.)
state_distribution = (
    state_df
    .groupby("Region")[["State", "Population"]]
    .agg({"State": "count", "Population": "sum"})
)

# One bar chart per summary column, side by side.
fig = make_subplots(rows=1, cols=2)

panels = [
    ("State", 'State per Region'),
    ("Population", 'Population per Region'),
]
for panel_col, (column, trace_name) in enumerate(panels, start=1):
    fig.add_trace(
        go.Bar(
            x=state_distribution.index,
            y=state_distribution[column],
            name=trace_name,
        ),
        row=1,
        col=panel_col,
    )

fig.update_layout(
    showlegend=False,
    title={
        'text': "Distribution of States and Population in Regions",
        'y': 0.9,
        'x': 0.5,
    },
    yaxis_title="Frequency",    # y axis of the left panel
    yaxis2_title="Population",  # y axis of the right panel
    xaxis_title="",
)

fig.show()

As we can see from the plot above, even though the NorthEastern region has 8 states, it has the lowest population.

There are a lot of Northern states, while there are very few Central states. If the data were divided into finer sub-regions (e.g. NorthWest, SouthEast, etc.), it might have been even more useful.

In [None]:
# Preview the first five rows of the generation data.
power_df.head(n=5)

In [None]:
# Fraction of missing values per column, laid out as a single-row table.
missing = power_df.isna().mean().to_frame().T

# Shade the row so that columns with more missing data stand out.
missing.style.background_gradient(cmap='Reds', axis=1)

40% of the Nuclear Generation data is missing :( 

Let's explore a little to find out why so much of the data is missing!

In [None]:
# Count, per region, the rows where the actual nuclear generation is missing.
nuclear_missing = power_df["Nuclear Actual"].isna()
power_df.loc[nuclear_missing, "Region"].value_counts()

Apparently the only regions with NaN values are the Eastern and NorthEastern regions; maybe they don't have nuclear plants. Let's check this by computing the total energy generation for each region: if these two regions have no nuclear energy production at all, we can accept our hypothesis and impute the NaNs with 0.

In [None]:
# Every column except the two key columns holds a generation figure.
power_columns = power_df.columns.difference(["Date", "Region"])

# Total of each generation column per region.
region_totals = power_df.groupby("Region")[power_columns].sum()

# Colour only the nuclear cells of the two regions under investigation.
highlighted_cells = (
    ["Eastern", "NorthEastern"],
    ["Nuclear Actual", "Nuclear Estimated"],
)
region_totals.style.background_gradient(cmap='hsv', subset=highlighted_cells)

In [None]:
# Impute every missing value in the frame with 0, modifying power_df in place.
power_df.fillna(value=0, inplace=True)

In [None]:
# One correlation heatmap (Actual vs Estimated) per generation source, drawn
# side by side. The three panels used to be copy-pasted and their titles were
# misspelled ("Eenrgy"); they are now produced by a single loop.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for ax, source in zip(axes, ["Hydro", "Nuclear", "Thermal"]):
    # Columns whose name starts with the source, e.g. "Hydro Actual" / "Hydro Estimated".
    source_corr = power_df.filter(regex=f"^{source}.*").corr()

    sns.heatmap(
        source_corr,
        annot=True,
        cbar=False,
        # Take the tick labels from the matrix itself so they always match the data.
        xticklabels=source_corr.columns,
        yticklabels=source_corr.index,
        ax=ax,
    )
    ax.set_title(f"{source} Energy Correlation")
    plt.setp(ax.get_xticklabels(), rotation=0)
    plt.setp(ax.get_yticklabels(), va="center")

plt.show()

The Estimated and Actual values of all three energy sources are highly correlated; we don't even need a statistical test to confirm it.

Now let's sum the columns to get the total generation of energy, one for the actual values and one for the estimated values.

In [None]:
# Total generation = Thermal + Nuclear + Hydro, computed separately for the
# Actual and the Estimated figures.
# The source columns are listed explicitly: a pattern such as ".*Actual.*"
# would also match the "Total Generation Actual" column created here, so
# re-running the cell would add the previous total into the new one.
power_sources = ["Thermal", "Nuclear", "Hydro"]

for kind in ("Actual", "Estimated"):
    source_columns = [f"{source} {kind}" for source in power_sources]
    power_df[f"Total Generation {kind}"] = power_df[source_columns].sum(axis=1)

Since they are all very highly correlated, let's study only the Actual Power Generation.

Let's see the time plot of the Energy production in India.

In [None]:
# Columns to aggregate: the per-source figures plus the two totals.
total_columns = ["Total Generation Actual", "Total Generation Estimated"]
columns_by_date = power_columns.union(total_columns)

# Sum over all regions for each date; Date comes back as a regular column.
power_by_date = (
    power_df
    .groupby("Date")[columns_by_date]
    .sum()
    .reset_index()
)

In [None]:
# (legend label, series) for each line, in the order they are added to the figure.
# "Average Generation" is the total divided by the three sources.
line_specs = [
    ('Thermal Generation', power_by_date["Thermal Actual"]),
    ('Average Generation', power_by_date["Total Generation Actual"]/3),
    ('Hydro Generation', power_by_date["Hydro Actual"]),
    ('Nuclear Generation', power_by_date["Nuclear Actual"]),
]

fig = go.Figure()

for trace_name, values in line_specs:
    fig.add_trace(
        go.Scatter(
            x=power_by_date["Date"],
            y=values,
            name=trace_name,
            mode='lines',
        )
    )

fig.update_layout(
    title={
        'text': "Power Generation in India",
        'y': 0.95,
        'x': 0.5,
    },
    yaxis_title="Power Generation in MU",
    xaxis_title="",
    # Keep the legend in the order the traces were added.
    legend={'traceorder': 'normal'},
)

fig.show()

Something happened on 22 September 2017: there is a sharp dip on just that day, or maybe it is simply a data error. Another thing we note is how heavily power generation depends on thermal resources, thus consuming a lot of fossil fuels, while Nuclear Energy Generation is very low compared to the average generation.

Let's see the total energy generation over the 3 years grouped by region. It needs to be normalized to be comparable across regions, because the regions differ widely in size (some have 10 states while some have only 2); here we divide by the population of each region and express the result per 10,000 people.

In [None]:
generation_columns = ['Total Generation Actual', 'Thermal Actual',
                      'Nuclear Actual', 'Hydro Actual']

# Total generation per region, joined with the per-region state count and
# population (state_distribution is indexed by Region).
power_by_region = (
    power_df
    .groupby("Region")[generation_columns]
    .sum()
    .reset_index()
    .merge(state_distribution, on="Region")
)

# Express every generation figure per 10,000 inhabitants of the region.
for column in generation_columns:
    power_by_region[column] = power_by_region[column]/power_by_region["Population"]*10000

In [None]:
# (source name, bar colour) for each group of bars, in drawing order.
bar_specs = [
    ("Thermal", '#ef553b'),
    ("Hydro", '#636efa'),
    ("Nuclear", '#2ca02c'),
]

figure = go.Figure()

for source, bar_colour in bar_specs:
    per_capita = power_by_region[source + " Actual"]
    figure.add_trace(
        go.Bar(
            x=power_by_region.Region,
            y=per_capita,
            name=source,
            textposition='outside',
            marker_color=bar_colour,
            # Value label shown above each bar, rounded to two decimals.
            text=per_capita.round(2),
        )
    )

# update_layout returns the figure, so as the last expression of the cell it
# also renders the chart.
figure.update_layout(
    showlegend=True,
    barmode='group',
    title={
        'text': "Power Generation for every 10,000 person in each Region",
        'y': 0.95,
        'x': 0.5,
    },
)

Finally we draw a choropleth for India. First we download the GeoJSON file of the Indian states, then we open it with GeoPandas and dissolve the states into the various regions. One thing to note is that the energy of the Central states is not present in the dataset (it may be aggregated into some other region, for example Western). It turns out, from an answer by the dataset author, that these two states are to be considered Western states.

The choropleth code has been heavily copied from a Stack Overflow answer (only the choropleth graph code), so I will leave a [link](https://stackoverflow.com/questions/60910962/is-there-any-way-to-draw-india-map-in-plotly) to the original question.

In [None]:
# Download the GeoJSON with the Indian state boundaries into the working
# directory (saved as india_states.geojson). The URL points at a fixed
# revision of the gist.
!wget https://gist.githubusercontent.com/jbrobst/56c13bbbf9d97d187fea01ca62ea5112/raw/e388c4cae20aa53cb5090210a42ebb9b765c0a36/india_states.geojson

In [None]:
import geopandas as gpd

# Load the state boundaries from the GeoJSON file in the working directory
# and preview the first five rows.
states = gpd.read_file('india_states.geojson')
states.head(n=5)

In [None]:
# The two sources spell this name differently ("and" vs "&"); switch to the
# "&" spelling so the names line up when joining with the GeoJSON data.
name_fixes = {"Jammu and Kashmir": "Jammu & Kashmir"}
state_region = state_df[["State", "Region"]].replace(name_fixes)

In [None]:
# Attach the region to every state geometry (ST_NM holds the state name in
# the GeoJSON data).
states_with_region = states.merge(state_region,
                                  left_on="ST_NM",
                                  right_on="State")

# Merge the state shapes of each region into one geometry per region and keep
# only the region name and its geometry.
in_regions = (
    states_with_region
    .dissolve(by="Region")
    .reset_index()
    [["Region", "geometry"]]
)

In [None]:
# Region-level choropleth: each dissolved region is coloured by its total
# actual generation per 10,000 inhabitants.
fig = px.choropleth(
    power_by_region,
    geojson=in_regions,
    # Match the "Region" property of each feature with the "Region" column.
    featureidkey='properties.Region',
    locations='Region',
    color='Total Generation Actual',
    color_continuous_scale='Reds',
)

# Zoom to the plotted shapes and hide the base map.
fig.update_geos(fitbounds="locations", visible=False)

fig.update_layout(
    title={
        # "10,000" (not "10.000") to agree with the bar chart title.
        'text': "Total Electricity Generation for every 10,000 person 2017/2020",
        'xanchor': 'center',
        'x': 0.5,
        'yref': 'paper',
        'yanchor': 'bottom',
        'y': 0.9,
        'pad': {'b': 10},
    },
    margin={'r': 0, 't': 30, 'l': 0, 'b': 0},
    height=550,
    width=750,
)

# Replace the default colour bar with a compact one in the bottom-left corner.
fig.layout.coloraxis.colorbar = {
    'title': {'text': "Electricity Generation"},
    'thickness': 15,
    'len': 0.35,
    'bgcolor': 'rgba(255,255,255,0.6)',
    'xanchor': 'left',
    'x': 0.01,
    'yanchor': 'bottom',
    'y': 0.05,
}

fig.show(config={'scrollZoom': False})

We also do a population-wise choropleth :), just to show how to build a state-level choropleth.

In [None]:
# State-level choropleth coloured by population. The state name is switched to
# the "&" spelling so it matches the ST_NM property of the GeoJSON features.
population_by_state = state_df.replace("Jammu and Kashmir", "Jammu & Kashmir")

fig = px.choropleth(
    population_by_state,
    geojson=states,
    featureidkey='properties.ST_NM',
    locations='State',
    color='Population',
    color_continuous_scale='Reds',
)

# Zoom to the plotted shapes and hide the base map.
fig.update_geos(fitbounds="locations", visible=False)

fig.update_layout(
    title={
        'text': "Population in each State",
        'xanchor': 'center',
        'x': 0.5,
        'yref': 'paper',
        'yanchor': 'bottom',
        'y': 0.9,
        'pad': {'b': 10},
    },
    margin={'r': 0, 't': 30, 'l': 0, 'b': 0},
    height=550,
    width=750,
)

# Replace the default colour bar with a compact one in the bottom-left corner.
fig.layout.coloraxis.colorbar = {
    'title': {'text': "Population"},
    'thickness': 15,
    'len': 0.35,
    'bgcolor': 'rgba(255,255,255,0.6)',
    'xanchor': 'left',
    'x': 0.01,
    'yanchor': 'bottom',
    'y': 0.05,
}

fig.show(config={'scrollZoom': False})