In [9]:
import pandas as pd
import os
import unittest

In [3]:
pathToData = os.path.join(os.getcwd(), "Data")
pathToGDPTable = os.path.join(pathToData, "GDP Per Capita.xlsx")
pathToRanking = os.path.join(pathToData, "World Innovation Index.csv")

Unnamed: 0,Country ISO3,Country Name,Indicator Id,Indicator,Subindicator Type,2013,2014,2015,2016,2017,2018,2019,2020
0,ABW,Aruba,40261,Institutions,Rank,,,,129.0,,,,
1,ABW,Aruba,40262,Institutions,Score (0-100),,,,0.0,,,,
2,ABW,Aruba,40266,Political stability,Value,,,1.3,,,,,
3,ABW,Aruba,40270,Political stability and absence of violence/te...,Index,,1.3,,1.2,,,,
4,ABW,Aruba,40274,Government effectiveness,Index,,1.2,1.2,0.9,,,,


In [4]:
def renameCountry(df, curName, newName):
    """
        Change the country in innovation ranking dataset to it's name in the gdpPerCapita dataset.
        Arguments:
            df: The dataframe to edit
            curName: The country's current name in the innovation ranking
            newName: The country's name in the gdpPerCapita dataset
    """
    df.loc[df['Country Name'] == curName] = newName

In [5]:
def chooseRanking(rank2019, rank2020):
    """
        Choose the more recent of ranks.
        rank2019: the rank in 2019
        rank2020: the rank in 2020
    """
    
    if (not pd.isna(rank2020)):
        return rank2020
    elif(not pd.isna(rank2019)):
        return rank2019
    else:
        return 200
    
def addRankingColumn(globalInnovationIndexRanking):
    """
        Add a column to globalInnovationIndexRanking that gets the most recent ranking if it's available.
    """
    globalInnovationIndexRanking['Ranking'] = [chooseRanking(twoRanks[0], twoRanks[1]) for twoRanks in zip(globalInnovationIndexRanking['2019'], globalInnovationIndexRanking['2020'])]

In [39]:
def cleanFootNoteCells(combinedDF, cellsToClean):
    """
        Clean cells from the GDP table with footnotes. 
        Arguments:
            combinedDF: the dataframe to edit
            cellsToClean: a list of length three tuples of (country, column, actual_value)
    """
    for c in cellsToClean:
        combinedDF.loc[combinedDF['Country/Territory'] == c[0], c[1]] = c[2]

In [76]:
def mostRecentGDPEstimate(IMFEstimate, UNEstimate, WorldBankEstimate):
    """
        Get the most recent GDP per capita estimate out of the three agencies. 
        Arguments will be tuples (estimate, year). 
        If all values are 0, return 0
    """
    
    orderedEstimates = sorted([IMFEstimate, UNEstimate, WorldBankEstimate], key=lambda x: (int(x[1]), int(x[0])))
    return orderedEstimates[2][0]

def createEstimateColumn(combinedDF):
    """
        In the dataframe combinedDF, add a column that gives the most recent available GDP Per Capita estimate
    """
    imfTuple = zip(combinedDF['IMF Estimate'], combinedDF['IMF Year'])
    unTuple = zip(combinedDF['UN Estimate'], combinedDF['UN Year'])
    wbTuple = zip(combinedDF['World Bank Estimate'], combinedDF['World Bank Year'])
    combinedDF['GDP Per Capita'] = [mostRecentGDPEstimate(r[0], r[1], r[2]) for r in zip(imfTuple, unTuple, wbTuple)]

In [32]:
class TestRename(unittest.TestCase):
    
    def test_rename(self):
        rankDF = pd.read_csv(pathToRanking)
        renameCountry(rankDF, "Aruba", "Arb")
        self.assertEqual(rankDF.iloc[1]["Country Name"], "Arb")

testAddRankingColumn (__main__.TestAddRankingColumn) ... FAIL
test_rename (__main__.TestRename) ... 

18377    7.0
Name: Ranking, dtype: float64


ok

FAIL: testAddRankingColumn (__main__.TestAddRankingColumn)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "C:\Users\14159\AppData\Local\Temp\ipykernel_12732\3366704955.py", line 9, in testAddRankingColumn
    self.assertEqual(finlandRanking.item, 7.0)
AssertionError: <bound method IndexOpsMixin.item of 18377    7.0
Name: Ranking, dtype: float64> != 7.0

----------------------------------------------------------------------
Ran 2 tests in 0.212s

FAILED (failures=1)


<unittest.main.TestProgram at 0x234d0bf3f10>

In [38]:
class TestAddRankingColumn(unittest.TestCase):
    def testAddRankingColumn(self):
        rankDF = pd.read_csv(pathToRanking)
        rankDF = rankDF.loc[(rankDF['Indicator'] == "Global Innovation Index") 
                      & (rankDF['Subindicator Type'] == "Rank")]
        addRankingColumn(rankDF)
        finlandRanking = rankDF.loc[rankDF['Country Name'] == "Finland",'Ranking'].iat[0]
        self.assertEqual(finlandRanking, 7.0)     

testAddRankingColumn (__main__.TestAddRankingColumn) ... ok
test_rename (__main__.TestRename) ... ok

----------------------------------------------------------------------
Ran 2 tests in 0.256s

OK


<unittest.main.TestProgram at 0x234d0c02a90>

In [48]:
class TestCleanFootNoteCells(unittest.TestCase):
    def testClean(self):
        gdpDF = pd.read_excel(pathToGDPTable)
        cleanFootNoteCells(gdpDF,  [("Tanzania", "UN Year", "2020")])
        self.assertEqual(gdpDF.loc[gdpDF["Country/Territory"] == "Tanzania", "UN Year"].iat[0], "2020")

In [82]:
class TestEstimateColumn(unittest.TestCase):
    def testEstimateColumn(self):
        gdpDF = pd.read_excel(pathToGDPTable)[:7]
        gdpDF[['IMF Estimate', 'UN Estimate', 'World Bank Estimate']] = gdpDF[['IMF Estimate', 'UN Estimate', 'World Bank Estimate']].apply(pd.to_numeric)
        createEstimateColumn(gdpDF)
        self.assertEqual(gdpDF.loc[gdpDF["Country/Territory"] == "Cayman Islands", "GDP Per Capita"].iat[0], "23,881")
            

In [83]:
unittest.main(argv=[''], verbosity=2, exit=False)

testAddRankingColumn (__main__.TestAddRankingColumn) ... ok
testClean (__main__.TestCleanFootNoteCells) ... ok
testEstimateColumn (__main__.TestEstimateColumn) ... ERROR
test_rename (__main__.TestRename) ... ok

ERROR: testEstimateColumn (__main__.TestEstimateColumn)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "pandas\_libs\lib.pyx", line 2315, in pandas._libs.lib.maybe_convert_numeric
ValueError: Unable to parse string "135,046"

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\14159\AppData\Local\Temp\ipykernel_12732\331329645.py", line 4, in testEstimateColumn
    gdpDF[['IMF Estimate', 'UN Estimate', 'World Bank Estimate']] = gdpDF[['IMF Estimate', 'UN Estimate', 'World Bank Estimate']].apply(pd.to_numeric)
  File "c:\users\14159\appdata\local\programs\python\python39\lib\site-packages\pandas\core\frame.py", line 8839, in apply
    return op.a

<unittest.main.TestProgram at 0x234d1f199d0>

ValueError: invalid literal for int() with base 10: '123,123'