In [77]:
import pandas as pd
import numpy as np

In [78]:
def petaToGiga(row):
    """Scale the 'Energy Supply' entry of ``row`` by one million, in place.

    The function name indicates a petajoule -> gigajoule conversion
    (1 PJ = 1,000,000 GJ).

    Args:
        row: mapping-like object (for example a DataFrame row) that has an
            'Energy Supply' entry.

    Returns:
        The same ``row`` object, with its 'Energy Supply' entry updated.
    """
    gigajoules_per_petajoule = 1000000
    row['Energy Supply'] = row['Energy Supply'] * gigajoules_per_petajoule
    return row

In [79]:
def getData():
    """Load and clean the three raw data sets used by the answer_* functions.

    Reads 'Energy Indicators.xls', 'world_bank.csv' and 'scimagojr-3.xlsx'
    from the current working directory.

    Returns:
        tuple: ``(energy, GDP, ScimEn)`` DataFrames.
            * energy -- columns 'Country', 'Energy Supply' (scaled by
              ``petaToGiga``), 'Energy Supply per Capita' and '% Renewable';
              "..." placeholders are turned into NaN.
            * GDP -- the CSV with its first parsed row promoted to column
              labels, a few country names shortened, and the 2006-2015
              column labels guaranteed to be strings.
            * ScimEn -- the spreadsheet exactly as read.
    """
    # Reading and parsing energy ('Energy Indicators.xls').
    # `skipfooter` is the keyword accepted by current pandas; the older
    # `skip_footer` spelling has been removed.
    energy = pd.read_excel('Energy Indicators.xls', header=None, skipfooter=2)
    energy = (energy.drop([0, 1], axis=1)  # remove first two columns
                    .dropna()  # drop rows containing NaN
                    .drop(9)  # drop the row labelled 9
                    .rename(columns={2: 'Country',
                                     3: 'Energy Supply',
                                     4: 'Energy Supply per Capita',
                                     5: '% Renewable'})
                    # Pruning: strip digits and " (...)" suffixes from string cells.
                    .replace(regex=True, to_replace=[r'\d', r' \(([^)]+)\)'], value=r'')
                    # Replace "..." with NaN and shorten some country names.
                    # np.nan is used because the np.NaN alias is gone in NumPy 2.
                    .replace(to_replace=["...", "Republic of Korea", "United States of America",
                                         "United Kingdom of Great Britain and Northern Ireland",
                                         "China, Hong Kong Special Administrative Region"],
                             value=[np.nan, "South Korea", "United States", "United Kingdom", "Hong Kong"])
                    .apply(petaToGiga, axis=1))  # convert petajoules to gigajoules

    # Reading GDP ('world_bank.csv').
    GDP = pd.read_csv('world_bank.csv', header=None, skiprows=4)
    # Maps the numeric labels 2006..2015 to their string form.
    year_labels = {year: str(year) for year in range(2006, 2016)}
    GDP = (GDP.rename(columns=GDP.iloc[0])  # first remaining row holds the column labels
              .drop(0)
              .replace(to_replace=["Korea, Rep.", "Iran, Islamic Rep.", "Hong Kong SAR, China"],
                       value=["South Korea", "Iran", "Hong Kong"])
              .rename(columns=year_labels))

    # Reading ScimEn ('scimagojr-3.xlsx').
    ScimEn = pd.read_excel('scimagojr-3.xlsx')
    return energy, GDP, ScimEn


In [80]:
def answer_one():
    """Build the combined table of energy, GDP and ScimEn data.

    Rows of ``energy`` containing NaN are discarded, GDP is limited to
    'Country Name' plus the 2006-2015 columns, and only the first 15 rows of
    ScimEn are kept (presumably the 15 top-ranked countries -- confirm
    against the spreadsheet). The three frames are inner-joined on the
    country name.

    Returns:
        pandas.DataFrame indexed by 'Country' with the ScimEn columns first,
        then the energy columns, then the yearly GDP columns '2006'..'2015'.
    """
    energy, GDP, ScimEn = getData()

    years = [str(year) for year in range(2006, 2016)]
    energy_fields = ['Energy Supply', 'Energy Supply per Capita', '% Renewable']
    scimen_fields = ['Rank', 'Country', 'Documents', 'Citable documents', 'Citations',
                     'Self-citations', 'Citations per document', 'H index']

    energy_complete = energy.dropna()
    gdp_recent = GDP[['Country Name'] + years]  # prune GDP
    scimen_head = ScimEn[scimen_fields][:15]  # prune ScimEn

    # Merging and cleaning dataframes
    energy_gdp = pd.merge(energy_complete, gdp_recent, how="inner",
                          left_on="Country", right_on="Country Name")
    energy_gdp = energy_gdp.drop(['Country Name'], axis=1)
    combined = pd.merge(energy_gdp, scimen_head, how="inner", on='Country')
    combined = combined.set_index('Country')

    ordered = [field for field in scimen_fields if field != 'Country'] + energy_fields + years
    return combined[ordered]

In [82]:
def answer_two():
    """Count entries across the three data sets by inclusion-exclusion.

    The result is the sum of the three frame lengths, minus the sizes of the
    three pairwise inner joins, plus the size of the three-way inner join.
    This equals the number of distinct countries only if every country name
    appears at most once per frame, which is not checked here.

    Returns:
        int: the inclusion-exclusion total.
    """
    energy, GDP, ScimEn = getData()

    def inner_join(left, right, left_key, right_key):
        # Inner join of two frames on the given key columns.
        return pd.merge(left, right, how='inner', left_on=left_key, right_on=right_key)

    energy_gdp = inner_join(energy, GDP, 'Country', 'Country Name')
    energy_scimen = inner_join(energy, ScimEn, 'Country', 'Country')
    gdp_scimen = inner_join(GDP, ScimEn, 'Country Name', 'Country')
    all_three = inner_join(energy_gdp, ScimEn, 'Country', 'Country')

    singles = len(energy.index) + len(GDP.index) + len(ScimEn.index)
    pairs = len(energy_gdp.index) + len(energy_scimen.index) + len(gdp_scimen.index)
    return singles - pairs + len(all_three.index)

In [83]:
def avg(row):
    """Average the '2006'..'2015' entries of ``row`` with ``np.average``.

    Args:
        row: label-indexed row (e.g. a pandas Series) containing the ten
            yearly labels '2006' through '2015'.

    Returns:
        The arithmetic mean of those ten entries.
    """
    year_labels = [str(year) for year in range(2006, 2016)]
    return np.average(row[year_labels])

In [84]:
def answer_three():
    """Average 2006-2015 GDP of every country in the answer_one table.

    Returns:
        pandas.Series indexed by country, holding the per-row result of
        ``avg`` and ordered from largest to smallest.
    """
    Top15 = answer_one()
    avgGDP = Top15.apply(avg, axis=1)
    # Series.sort (which sorted in place) no longer exists in pandas;
    # sort_values returns the sorted Series instead.
    return avgGDP.sort_values(ascending=False)

In [85]:
def answer_four():
    """GDP change from 2006 to 2015 for the sixth entry of answer_three.

    Returns:
        The '2015' value minus the '2006' value of the country found at
        position 5 of the Series returned by ``answer_three``.
    """
    Top15 = answer_one()
    ranking = answer_three()
    sixth_country = ranking.index[5]
    gdp_2006 = Top15.loc[sixth_country, '2006']
    gdp_2015 = Top15.loc[sixth_country, '2015']
    return gdp_2015 - gdp_2006

In [86]:
def answer_five():
    """Mean 'Energy Supply per Capita' over the answer_one table.

    Returns:
        float: ``np.average`` of the column, converted to a Python float.
    """
    per_capita = answer_one()['Energy Supply per Capita']
    return float(np.average(per_capita))

In [87]:
def answer_six():
    """Find the country with the largest '% Renewable' value.

    Returns:
        tuple: ``(country, value)`` where ``value`` is the column maximum
        and ``country`` is the index label of the first row holding it.
    """
    Top15 = answer_one()
    renewable = Top15['% Renewable']
    highest = np.max(renewable)
    leaders = Top15[renewable == highest]
    return leaders.index[0], highest

In [88]:
def answer_seven():
    """Find the country with the highest self-citation share.

    The share is 'Self-citations' divided by 'Citations'.

    Returns:
        tuple: ``(country, share)`` for the first row holding the maximum.
    """
    Top15 = answer_one()
    share = Top15['Self-citations'] / Top15['Citations']
    Top15['citation_ration'] = share
    peak = np.max(Top15['citation_ration'])
    top_rows = Top15[Top15['citation_ration'] == peak]
    return top_rows.index[0], peak

In [89]:
def answer_eight():
    """Return the country with the third-largest estimated population.

    The estimate is 'Energy Supply' divided by 'Energy Supply per Capita'.

    Returns:
        The index label (country) at position 2 after sorting the estimate
        in descending order.
    """
    Top15 = answer_one()
    Top15['population_ratio'] = Top15['Energy Supply'] / Top15['Energy Supply per Capita']
    # DataFrame.sort no longer exists in pandas; sort_values replaces it.
    ranked = Top15.sort_values('population_ratio', ascending=False)
    return ranked.index[2]

In [90]:
def answer_nine():
    """Pearson correlation of energy per capita with citable docs per person.

    Population is estimated as 'Energy Supply' / 'Energy Supply per Capita';
    citable documents per person is 'Citable documents' / population.

    Returns:
        The entry of the Pearson correlation matrix relating
        'Energy Supply per Capita' to 'citation document per person'.
    """
    frame = answer_one()
    energy_per_capita = frame['Energy Supply per Capita']
    frame['population'] = frame['Energy Supply'] / energy_per_capita
    frame['citation document per person'] = frame['Citable documents'] / frame['population']
    correlations = frame.corr(method='pearson')
    return correlations.loc['Energy Supply per Capita', 'citation document per person']

In [91]:
def plot9():
    import matplotlib as plt
    %matplotlib inline
    
    Top15 = answer_one()
    Top15['PopEst'] = Top15['Energy Supply'] / Top15['Energy Supply per Capita']
    Top15['Citable docs per Capita'] = Top15['Citable documents'] / Top15['PopEst']
    Top15.plot(x='Citable docs per Capita', y='Energy Supply per Capita', kind='scatter', xlim=[0, 0.0006])

In [None]:
def answer_ten():
    """Flag countries whose '% Renewable' is at or above the median.

    Returns:
        pandas.Series named 'HighRenew', indexed by country, holding 1 where
        the country's '% Renewable' is >= the median of the column and 0
        otherwise, sorted in ascending order of the flag.
    """
    Top15 = answer_one()
    renewable = Top15['% Renewable']
    median = np.median(renewable)
    # One vectorised comparison replaces the row-by-row set_value loop
    # (DataFrame.set_value has been removed from pandas). The flags are
    # integers, matching the literals 1 and 0 used for them.
    high_renew = (renewable >= median).astype(int).rename('HighRenew')
    # Series.sort no longer exists in pandas; sort_values returns the sorted copy.
    return high_renew.sort_values()

In [96]:
# Continent assigned to each of the 15 countries handled by the answer_*
# functions; keys are the country names used as index labels there.
ContinentDict = {
    'China': 'Asia',
    'United States': 'North America',
    'Japan': 'Asia',
    'United Kingdom': 'Europe',
    'Russian Federation': 'Europe',
    'Canada': 'North America',
    'Germany': 'Europe',
    'India': 'Asia',
    'France': 'Europe',
    'South Korea': 'Asia',
    'Italy': 'Europe',
    'Spain': 'Europe',
    'Iran': 'Asia',
    'Australia': 'Australia',
    'Brazil': 'South America',
}

In [97]:
def answer_eleven():
    """Summarise the estimated population per continent.

    Population is estimated as 'Energy Supply' / 'Energy Supply per Capita'
    and each country is assigned a continent through ``ContinentDict``.

    Returns:
        pandas.DataFrame indexed by 'Continent' with the columns 'size',
        'sum', 'mean' and 'std' computed over the estimate.

    Raises:
        KeyError: if a country in the table is missing from ``ContinentDict``.
    """
    Top15 = answer_one()
    Top15['PopEst'] = Top15['Energy Supply'] / Top15['Energy Supply per Capita']
    # A single column assignment replaces the per-row set_value loop
    # (DataFrame.set_value has been removed from pandas).
    Top15['Continent'] = [ContinentDict[country] for country in Top15.index]
    # Passing a renaming dict to a SeriesGroupBy.agg is no longer supported;
    # a list of aggregation names yields the same four column labels.
    return Top15.groupby('Continent')['PopEst'].agg(['size', 'sum', 'mean', 'std'])

In [None]:
def answer_twelve():
    """Count countries per continent and '% Renewable' bin.

    '% Renewable' is cut into 5 bins with ``pd.cut`` and each country is
    assigned a continent through ``ContinentDict``.

    Returns:
        pandas.Series of group sizes indexed by ('Continent', '% Renewable').

    Raises:
        KeyError: if a country in the table is missing from ``ContinentDict``.
    """
    Top15 = answer_one()
    # A single column assignment replaces the per-row set_value loop
    # (DataFrame.set_value has been removed from pandas).
    Top15['Continent'] = [ContinentDict[country] for country in Top15.index]
    by_continent = (Top15.reset_index(level=0)
                         .set_index(['Continent', 'Country']))
    renewable_bins = pd.cut(by_continent['% Renewable'], 5)
    # Move the bin label into the index next to the continent so both can be
    # used as grouping levels.
    binned = (renewable_bins.reset_index()
                            .set_index(['Continent', '% Renewable']))
    return binned.groupby(level=['Continent', '% Renewable']).size()

In [None]:
def answer_thirteen():
    """Format the estimated population with thousands separators.

    Population is estimated as 'Energy Supply' / 'Energy Supply per Capita'.

    Returns:
        pandas.Series named 'PopEst', indexed by country, whose values are
        the estimates rendered as strings with ``"{:,}".format``.
    """
    Top15 = answer_one()
    pop_est = Top15['Energy Supply'] / Top15['Energy Supply per Capita']
    # Format each value directly. The earlier value-based replace() rescanned
    # the whole Series for every row and failed when two rows shared a value,
    # because the second one had already been turned into a string.
    return pop_est.map("{:,}".format).rename('PopEst')

In [None]:
def plot_optional():
    import matplotlib as plt
    %matplotlib inline
    Top15 = answer_one()
    ax = Top15.plot(x='Rank', y='% Renewable', kind='scatter', 
                    c=['#e41a1c','#377eb8','#e41a1c','#4daf4a','#4daf4a','#377eb8','#4daf4a','#e41a1c',
                       '#4daf4a','#e41a1c','#4daf4a','#4daf4a','#e41a1c','#dede00','#ff7f00'], 
                    xticks=range(1,16), s=6*Top15['2014']/10**10, alpha=.75, figsize=[16,6]);

    for i, txt in enumerate(Top15.index):
        ax.annotate(txt, [Top15['Rank'][i], Top15['% Renewable'][i]], ha='center')

    print("This is an example of a visualization that can be created to help understand the data. \
This is a bubble chart showing % Renewable vs. Rank. The size of the bubble corresponds to the countries' \
2014 GDP, and the color corresponds to the continent.")