In [5]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [15]:
def read_world_bank_data(filename):
    """Load a World Bank style indicator CSV and return it in two layouts.

    The file is expected to contain 'Country Name' and 'Indicator Name'
    columns, with every remaining column (after the clean-up below) holding
    the values for one year.

    Parameters
    ----------
    filename : str or path-like
        Location of the CSV file; passed unchanged to ``pandas.read_csv``.

    Returns
    -------
    df_years : pandas.DataFrame
        One row per (country_name, indicator_name), one column per year.
    df_countries : pandas.DataFrame
        One row per (indicator_name, year), one column per country.

    Raises
    ------
    KeyError
        If 'Country Name' or 'Indicator Name' is missing from the file
        (raised by ``melt`` when the id columns cannot be found).
    ValueError
        If a (country, indicator) pair occurs more than once, because
        ``DataFrame.pivot`` refuses duplicate index/column combinations.
    """
    # Read CSV file into a Pandas dataframe
    df = pd.read_csv(filename)

    # Remove the identifier columns that are not used downstream.
    # errors='ignore' keeps this working for files that lack them.
    df = df.drop(columns=['Indicator Code', 'Country Code'], errors='ignore')

    # pandas names a column with an empty header 'Unnamed: <position>', so the
    # number changes with the width of the file. Drop every such column rather
    # than the single hard-coded 'Unnamed: 66'; otherwise a file with a
    # different number of year columns raises KeyError, and an undropped
    # column would later be melted as if it were a year.
    unnamed_cols = [col for col in df.columns if str(col).startswith('Unnamed:')]
    df = df.drop(columns=unnamed_cols)

    # Rename the columns for convenience
    df = df.rename(columns={'Country Name': 'country_name',
                            'Indicator Name': 'indicator_name'})

    # Replace each NaN with the mean of the other values in the same row.
    # Vectorised equivalent of a row-wise apply(fillna(mean)): rows that are
    # entirely NaN have a NaN mean and therefore stay NaN.
    numeric_cols = df.select_dtypes(include='number').columns
    year_values = df[numeric_cols]
    row_means = year_values.mean(axis=1)
    df[numeric_cols] = year_values.where(year_values.notna(), row_means, axis=0)

    # Melt the dataframe to get years as rows
    df_melted = df.melt(id_vars=['country_name', 'indicator_name'],
                        var_name='year', value_name='value')

    # Pivot back so that years are columns, then flatten the index
    df_years = df_melted.pivot(index=['country_name', 'indicator_name'],
                               columns='year', values='value').reset_index()

    # Pivot so that countries are columns, then flatten the index
    df_countries = df_melted.pivot(index=['indicator_name', 'year'],
                                   columns='country_name', values='value').reset_index()

    return df_years, df_countries

# read_world_bank_data returns a pair (years-as-columns, countries-as-columns).
# `df_cleaned` keeps holding that pair exactly as before; the two frames are
# also unpacked so each one can be referred to by name.
df_cleaned = read_world_bank_data('world_bank_dataset.csv')
df_years, df_countries = df_cleaned


In [None]:
# function to form a dataframe with Year, GDP and Country
def extract_columns(df_cleaned, n_years=10):
    """Return the Year / GDP / Country columns for the first ``n_years`` rows.

    Parameters
    ----------
    df_cleaned : pandas.DataFrame
        Frame containing at least the columns 'Year', 'GDP in USD' and
        'Country'. Presumably sorted by year in descending order, so that the
        first rows are the most recent years — verify against the caller.
    n_years : int, default 10
        Number of leading rows to keep.

    Returns
    -------
    pandas.DataFrame
        A copy holding the three columns, in that order, for the first
        ``n_years`` rows (fewer if the input is shorter). Row labels of the
        input are preserved.

    Raises
    ------
    KeyError
        If any of the three columns is missing.
    """
    wanted = ['Year', 'GDP in USD', 'Country']
    # Positional slicing: .loc[:10] is label-based and includes BOTH ends, so
    # on a default 0..n index it returned 11 rows instead of the intended 10.
    return df_cleaned[wanted].iloc[:n_years].copy()

# Build a single Year / GDP / Country dataframe out of the per-country frames
def form_gdp_df():
    """Stack the extract_columns() output of seven per-country frames.

    Reads the module-level frames df_cleaned_in, df_cleaned_us,
    df_cleaned_cn, df_cleaned_jp, df_cleaned_ca, df_cleaned_gb and
    df_cleaned_za, in that order.

    Returns
    -------
    pandas.DataFrame
        The concatenated rows with a fresh 0..n-1 index.
    """
    # Order matters: it fixes the row order of the combined frame.
    per_country = [
        extract_columns(frame)
        for frame in (
            df_cleaned_in,
            df_cleaned_us,
            df_cleaned_cn,
            df_cleaned_jp,
            df_cleaned_ca,
            df_cleaned_gb,
            df_cleaned_za,
        )
    ]
    # ignore_index=True discards the per-frame row labels
    return pd.concat(per_country, ignore_index=True)

# Assemble the combined per-country GDP table
gdp_df = form_gdp_df()

# Preview the first rows
print("Few records from the Dataframe containing Year, GDP and Country:")
display(gdp_df.head())

# 7x5 inch figure; the seaborn style is applied after the figure is created
plt.figure(figsize=(7, 5))
sns.set(style="whitegrid")

# One line per country, told apart by both colour (hue) and marker (style)
ax = sns.lineplot(
    data=gdp_df,
    x='Year',
    y='GDP in USD',
    hue='Country',
    style="Country",
    palette="Set2",
    markers=True,
    dashes=False,
    linewidth=2.5,
)

In [None]:
# Collect the population and electricity-use columns for India and China
def form_in_cn_df():
    """Return selected columns of the India and China frames, stacked.

    Reads the module-level frames df_cleaned_in (India) and df_cleaned_cn
    (China).

    Returns
    -------
    pandas.DataFrame
        India's rows followed by China's rows, restricted to
        'Total Population', 'Electric Power Consumption(kWH per capita)' and
        'Country'. The original row labels are kept, so labels may repeat.
    """
    wanted = ['Total Population', 'Electric Power Consumption(kWH per capita)', 'Country']
    # India first, then China
    return pd.concat([frame[wanted] for frame in (df_cleaned_in, df_cleaned_cn)])

# Build the India + China comparison table
in_cn_df = form_in_cn_df()

# Preview the first rows
print("Few records from the selected features: ")
display(in_cn_df.head())

# Scatter of per-capita electricity use against population, coloured by country
plt.figure(figsize=(7, 5))
sns.set(style="whitegrid")
ax = sns.scatterplot(
    data=in_cn_df,
    x='Total Population',
    y='Electric Power Consumption(kWH per capita)',
    hue='Country',
    palette="bright",
)

In [None]:
# Canada: electricity use, population and year, for rows from index label 3 onward
df = df_cleaned_ca.loc[3:, ['Electric Power Consumption(kWH per capita)', 'Total Population', 'Year']]

print("First few records of the data: ")
display(df.head())

# line plot
plt.figure(figsize=(6, 5))
sns.set(style="whitegrid")
# No `palette` argument: there is no `hue` variable, so seaborn ignored the
# palette (and recent versions warn about it). The result is bound to `ax`,
# as in the other plotting cells, which also keeps the Axes repr out of the
# cell output.
ax = sns.lineplot(
    data=df,
    x='Total Population',
    y='Electric Power Consumption(kWH per capita)',
    linewidth=2.5,
)

In [None]:
# Plot three energy-consumption indicators against Year for India.
# The data comes from df_cleaned_in, which this notebook uses as the India
# frame, so the title says India (it previously said Russia).
# Rows with index labels below 5 are skipped.
india_energy = df_cleaned_in.loc[5:]

# column in the frame -> text shown in the legend
energy_series = {
    'Electric Power Consumption(kWH per capita)': 'Electric Power Consumption(kWH per capita)',
    'Renewable Energy Consumption (%)': 'Renewable Energy Consumption(%)',
    'Fossil Fuel Consumption (%)': 'Fossil Fuel Consumption(%)',
}
for column, legend_label in energy_series.items():
    plt.plot(india_energy['Year'], india_energy[column], '.-', label=legend_label)

plt.legend(loc='best')
plt.title("Energy Consumption in India\n")
plt.xlabel('Year')
plt.ylabel('Energy Consumption')
plt.show()