In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import plotly.express as px

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the CSV of energy shares by source into `df`, then do all the cleaning on a
# separate copy named `energy`.
df = pd.read_csv('/kaggle/input/energy-consumption-and-generation-in-the-globe/share-energy-consum-by-source.csv')
energy = df.copy()

### Now let's clean and organize the data 

In [None]:
# Preview the first rows to check the column names and value formats before cleaning
energy.head()

In [None]:
# Normalise the column names in one pass: strip every space, then lowercase
energy.columns = energy.columns.str.replace(' ', '').str.lower()

In [None]:
# Count the missing values in each column
energy.isna().sum()

In [None]:
# The 'code' column is only an abbreviation of the country name, so it is dropped.
# The frame is rebound rather than modified with inplace=True, and errors='ignore' makes
# the cell safe to run again after 'code' has already been removed.
energy = energy.drop(columns='code', errors='ignore')

In [None]:
# Preview the frame again to confirm the renamed columns and the removal of 'code'
energy.head()

In [None]:
# Which countries reduced their share of oil-generated energy over time?
#
# Build the per-country correlation matrix, keep only its 'oil(%subenergy)' column and
# flatten the (entity, variable) index into ordinary columns. The second index level has
# no name, so reset_index() labels it 'level_1'.
# NOTE: a correlation with 'year' measures how consistently the share moves over time,
# not how large the change is.
energy1 = (
    energy.groupby('entity')
    .corr()[['oil(%subenergy)']]
    .reset_index()
)

# Keep only the rows holding the correlation between 'year' and 'oil(%subenergy)'
energy1 = energy1[energy1['level_1'] == 'year']

In [None]:
# Country (or countries, on a tie) with the lowest year/oil correlation
energy1[energy1['oil(%subenergy)'].eq(energy1['oil(%subenergy)'].min())]

In [None]:
# Per the result of the previous cell, Indonesia has the lowest year/oil correlation.
# Plot its oil share against the year with a fitted regression line.

graph = energy.query("entity == 'Indonesia'")
sns.regplot(x='year', y='oil(%subenergy)', data=graph)

In [None]:
# Now the opposite case: the country (or countries, on a tie) with the highest year/oil correlation
energy1[energy1['oil(%subenergy)'].eq(energy1['oil(%subenergy)'].max())]

In [None]:
# Oil share against the year for Poland, with a fitted regression line
graph = energy.query("entity == 'Poland'")
sns.regplot(x='year', y='oil(%subenergy)', data=graph)

In [None]:
# Has the United States decreased or increased its share of oil-generated energy?
# Look up its year/oil correlation.

energy1.loc[energy1['entity'].eq('United States')]

In [None]:
# Oil share against the year for the United States, with a fitted regression line
graph = energy.query("entity == 'United States'")
sns.regplot(x='year', y='oil(%subenergy)', data=graph)

### Now we will focus on just one country, China.

In [None]:
# Keep China's rows from the year 2000 onwards. The original note describes the range as
# 2000 to 2019; no upper bound is applied here, so the last year is whatever the data holds.
# .copy() makes `china` an independent frame, so changing it later neither raises
# SettingWithCopyWarning nor risks touching `energy`.
china = energy[(energy['entity'] == 'China') & (energy['year'] > 1999)].copy()
china.head()

In [None]:
# Every remaining row is China, so 'entity' carries no information and is removed.
# Rebinding (instead of inplace=True on a slice of `energy`) avoids SettingWithCopyWarning,
# and errors='ignore' lets the cell be re-run after the column is already gone.
china = china.drop(columns='entity', errors='ignore')

In [None]:
# One regression panel per energy source: each source's share plotted against the year
china_sources = [
    "oil(%subenergy)", "coal(%subenergy)", 'solar(%subenergy)', 'nuclear(%subenergy)',
    'hydro(%subenergy)', 'wind(%subenergy)', 'gas(%subenergy)', 'otherrenewables(%subenergy)',
]
sns.pairplot(china, x_vars=["year"], y_vars=china_sources, kind='reg', height=6)

Through the graphs plotted above, we can see that China is reducing its dependence on coal (though coal is still the main source of energy production), and it is investing in less polluting sources.

In [None]:
# Heatmap of the pairwise correlations between all columns of `china`,
# with each coefficient written inside its cell (annot=True)

sns.heatmap(china.corr(), annot=True)

In [None]:
# Enlarge the default figure size; this is a global seaborn/matplotlib setting, so it also
# applies to the plots drawn in later cells
sns.set(rc={'figure.figsize':(12,9)})
# Coal share of China's energy for each year, as vertical bars
sns.barplot(x="year", y="coal(%subenergy)", data=china, orient='v')

In [None]:
# Hydro share of China's energy for each year, as vertical bars
sns.barplot(x="year", y="hydro(%subenergy)", data=china, orient='v')

In [None]:
# Next: the percentage increase or decrease of each energy source between 2000 and 2019.
# First build a two-column frame holding the 2000 row (column 1) and the 2019 row (column 2).
# The rows are selected by their 'year' value rather than by hard-coded index labels, so the
# cell keeps working if the row order or index of the source data changes. A missing year
# raises IndexError instead of silently picking the wrong row.

china1 = pd.DataFrame({
    1: china.loc[china['year'] == 2000].iloc[0],
    2: china.loc[china['year'] == 2019].iloc[0],
})
china1

In [None]:
# DataFrame.pct_change gives the fractional change relative to the previous element. With
# axis='columns' that is column 2 relative to column 1, so column 1 itself becomes NaN.
# Multiplying by 100 turns the fraction into a percentage.
# The rename on the last line only affects what is displayed; `china1` keeps the label 2.

china1 = china1.pct_change(axis='columns').mul(100)
china1.rename(columns={2: '%change'})

## Inferential statistics

In [None]:
# Import the modules 
import scipy.stats

In [None]:
# Divide the 'energy' dataset into three regional samples: Europe, Asia and America.
# Each sample keeps only the energy-share columns. 'entity' and 'year' are removed with a
# non-inplace drop, which returns a new frame instead of mutating a slice of `energy`
# (the in-place form raises SettingWithCopyWarning).

europe = energy[energy['entity'].isin(['Germany', 'France', 'United Kingdom', 'Italy', 'Belgium', 'Spain', 'Switzerland', 'Sweden', 'Belarus', 'Bulgaria',
                                       'Croatia', 'Denmark'])].drop(columns=['entity', 'year'])

# 'Singapore' was previously misspelled as 'Siangapore'; isin() matches exact strings, so
# the country was silently left out of the Asian sample.
asia = energy[energy['entity'].isin(['Bangladesh', 'China', 'Japan', 'Malaysia', 'Indonesia', 'Kazakhstan', 'Taiwan', 'Singapore', 'South Korea', 'Turkey',
                                     'Vietnam'])].drop(columns=['entity', 'year'])

america = energy[energy['entity'].isin(['United States', 'Canada', 'Mexico', 'Brazil', 'Chile', 'Colombia', 'Ecuador', 'Peru'])].drop(columns=['entity', 'year'])

In [None]:
# First we perform Levene's test for equality of variances. If its p-value is below the chosen
# significance level (typically 0.05), the observed differences in sample variances are unlikely
# to have arisen by random sampling from populations with equal variances. In that case the null
# hypothesis of equal variances is rejected and the population variances are taken to differ.
# center='mean' selects the mean-centred form of the test.

scipy.stats.levene(asia['oil(%subenergy)'], europe['oil(%subenergy)'], center='mean')

### Now we will perform a t-test. Its hypotheses are:

* H0: µ1 = µ2 ("there is no difference in the mean value of oil(%subenergy) between Asia and Europe")
* H1: µ1 ≠ µ2 ("there is a difference in the mean value of oil(%subenergy) between Asia and Europe")

In [None]:
# Two-sample t-test on the oil share of Asia versus Europe, judged at a significance level of 0.05.
# ttest_ind is called with its defaults, i.e. the pooled (equal-variance) form.
# NOTE(review): confirm that the Levene result above supports equal variances; otherwise
# equal_var=False (Welch's t-test) would be the appropriate call.
scipy.stats.ttest_ind(asia['oil(%subenergy)'], europe['oil(%subenergy)'])

CONCLUSION: We fail to reject the null hypothesis, because there is not enough evidence of a statistically significant difference between Asia and Europe in the share of energy generated from oil.

In [None]:
# Repeat the Asia-vs-Europe t-test for every energy-share column, to see for which of them
# the null hypothesis cannot be rejected

listCol = asia.columns.to_list()

for column in listCol:
    tStatistic, p_value = scipy.stats.ttest_ind(asia[column], europe[column])
    print(f'The {column} t-test data is: t_statisc = {tStatistic}     p-value = {p_value}')

As the tests above show, we fail to reject the null hypothesis for two columns, 'gas(%subenergy)' and 'oil(%subenergy)'. This means there is not enough evidence of a statistically significant difference between Asia and Europe in the share of energy generated from those sources.

### Now we are going to perform the ANOVA test

H0: µ1 = µ2 = µ3 (the three population means are equal)

H1: At least one of the means differ

In [None]:
# One-way ANOVA across the three regional samples, run once per energy-share column.
# f_oneway returns an F statistic (not a t statistic), so it is named and printed as such.
for column in listCol:
    f_statistic, p_value = scipy.stats.f_oneway(asia[column], europe[column], america[column])
    print(f'The {column} ANOVA data is: F_statistic = {f_statistic}     p-value = {p_value}')

gas(%subenergy) is the only category for which there is not enough evidence that at least one of the means differs.

### Correlation: Using the dataset 'asia', which variables are correlated with each other?

In [None]:
# Pearson coefficient and p-value for every distinct pair of columns in `asia`.
# A pair is kept when its p-value is below 0.05, i.e. when the linear correlation is
# statistically significant at the 5% level.

pair_labels = []
coefficients = []
p_values = []
for position, first in enumerate(listCol):
    # Start after `first`: a column is never paired with itself (that is r = 1 by
    # definition) and each pair is tested once instead of in both orders.
    for second in listCol[position + 1:]:
        coefficient, p_value = scipy.stats.pearsonr(asia[first], asia[second])
        if p_value < 0.05:
            pair_labels.append(f'{first} and {second}')
            coefficients.append(coefficient)
            p_values.append(p_value)

# The table is built once, after the loops, and under its own name. Previously it was
# rebuilt on every significant pair and stored in `df`, which overwrote the raw data
# loaded at the top of the notebook (and left `df` as the raw data if no pair qualified).
asia_corr = pd.DataFrame({'columns': pair_labels, 'Pearson coeficient': coefficients, 'p-value': p_values})

asia_corr.head()

## Linear regression modeling

In [None]:
# Rebuild the European sample for the regression, this time keeping 'entity' and 'year'.
# .copy() makes it an independent frame, so changing its columns afterwards does not act
# on a slice of `energy` (which would raise SettingWithCopyWarning).
europe = energy[energy['entity'].isin(['Germany', 'France', 'United Kingdom', 'Italy', 'Belgium', 'Spain', 'Switzerland', 'Sweden', 'Belarus', 'Bulgaria',
                                       'Croatia', 'Denmark'])].copy()

In [None]:
# Shorten the column names so they are valid identifiers inside the regression formulas.
# The result is rebound instead of using inplace=True: an in-place rename on a frame sliced
# from `energy` raises SettingWithCopyWarning.
europe = europe.rename(columns={'oil(%subenergy)':'oil', 'coal(%subenergy)':'coal', 'solar(%subenergy)':'solar', 'nuclear(%subenergy)':'nuclear',
                                'hydro(%subenergy)':'hydro', 'wind(%subenergy)':'wind', 'gas(%subenergy)':'gas', 'otherrenewables(%subenergy)':'other'})

In [None]:
# Ordinary least squares of the oil share on the year, fitted on all European rows at once.
# NOTE(review): the rows of every country are pooled into one fit, so the slope is an
# average trend across countries; per-country differences are not modelled.
model = sm.OLS.from_formula("oil ~ year", data=europe)
result = model.fit()
result.summary()

#### Based on the result above, it is possible to conclude that, when comparing two measurements taken one year apart, a country will have on average 0.4894 units lower oil-generated energy than in the previous year. This difference is statistically significant, based on the p-value shown under the column labeled P>|t|. This means that there is strong evidence of a real association between year and oil.

In [None]:
# Multiple regression for Germany only: oil share explained by year, nuclear share and coal share
model = sm.OLS.from_formula(
    "oil ~ year + nuclear + coal",
    data=europe.query("entity == 'Germany'"),
)
result = model.fit()
result.summary()

When we add two more variables, it is possible to notice that the independent variables ('year', 'nuclear' and 'coal') have a negative effect on the dependent variable ('oil').