In [3]:
import pandas as pd
import os
import unittest

In [4]:
# Every input dataset is read from a "Data" folder inside the current working directory.
pathToData = os.path.join(os.getcwd(), "Data")
# GDP table (Excel) and innovation ranking (CSV) both sit directly in that folder.
pathToGDPTable, pathToRanking = (
    os.path.join(pathToData, fileName)
    for fileName in ("GDP Per Capita.xlsx", "World Innovation Index.csv")
)

In [5]:
def renameCountry(df, curName, newName):
    """
        Change a country in the innovation ranking dataset to its name in the gdpPerCapita dataset.
        Only the 'Country Name' cell of each matching row is rewritten; all other columns
        keep their values. The dataframe is edited in place and nothing is returned.
        If no row has curName as its 'Country Name', the dataframe is left unchanged.
        Arguments:
            df: The dataframe to edit (must contain a 'Country Name' column)
            curName: The country's current name in the innovation ranking
            newName: The country's name in the gdpPerCapita dataset
    """
    # The column selector is essential: with a row mask alone, .loc assigns
    # newName to every column of the matching rows, destroying their data.
    df.loc[df['Country Name'] == curName, 'Country Name'] = newName

In [6]:
def chooseRanking(rank2019, rank2020):
    """
        Pick the newest rank that is actually present.
        The 2020 rank is preferred; the 2019 rank is used only when the 2020 rank
        is missing (as judged by pd.isna). When both are missing, 200 is returned
        as a placeholder rank.
        rank2019: the rank in 2019
        rank2020: the rank in 2020
    """
    # Walk the candidates from most recent to oldest and stop at the first one present.
    for candidate in (rank2020, rank2019):
        if not pd.isna(candidate):
            return candidate
    return 200
    
def addRankingColumn(globalInnovationIndexRanking):
    """
        Write a 'Ranking' column into globalInnovationIndexRanking (in place).
        For every row the value is chooseRanking applied to that row's '2019' and
        '2020' entries, i.e. the most recent ranking that is available.
    """
    ranks2019 = globalInnovationIndexRanking['2019']
    ranks2020 = globalInnovationIndexRanking['2020']
    # map() pairs the two columns positionally, one call per row.
    globalInnovationIndexRanking['Ranking'] = list(map(chooseRanking, ranks2019, ranks2020))

In [7]:
def cleanFootNoteCells(combinedDF, cellsToClean):
    """
        Overwrite individual cells of the GDP table whose contents carry footnotes.
        The dataframe is edited in place; nothing is returned.
        Arguments:
            combinedDF: the dataframe to edit (rows are looked up by 'Country/Territory')
            cellsToClean: a list of length three tuples of (country, column, actual_value)
    """
    for entry in cellsToClean:
        country, column, actualValue = entry[0], entry[1], entry[2]
        # Re-evaluated on every pass so earlier replacements are taken into account.
        rowMask = combinedDF['Country/Territory'] == country
        combinedDF.loc[rowMask, column] = actualValue

In [42]:
def makeValuesInts(df):
    """
        Strip the thousands separators from the GDP estimate columns.
        Every value in "IMF Estimate", "UN Estimate" and "World Bank Estimate" is
        turned into its string form with all "," characters removed. The cleaned
        values stay strings (no numeric conversion is done here), and df is
        edited in place.
    """
    estimateColumns = ("IMF Estimate", "UN Estimate", "World Bank Estimate")
    for columnName in estimateColumns:
        withoutCommas = []
        for rawValue in df[columnName]:
            withoutCommas.append(str(rawValue).replace(",", ""))
        df[columnName] = withoutCommas

In [9]:
def mostRecentGDPEstimate(IMFEstimate, UNEstimate, WorldBankEstimate):
    """
        Return the GDP per capita estimate belonging to the latest year among the three agencies.
        Each argument is a tuple (estimate, year). When several agencies share the
        latest year, the one with the largest integer-truncated estimate wins.
        If every estimate is 0, the result is 0.
    """

    def yearThenValue(pair):
        # Year is the primary key; the estimate only breaks ties between equal years.
        return int(pair[1]), int(pair[0])

    candidates = [IMFEstimate, UNEstimate, WorldBankEstimate]
    candidates.sort(key=yearThenValue)
    # After the ascending sort the most recent entry is last; hand back its estimate.
    return candidates[-1][0]

def createEstimateColumn(combinedDF):
    """
        Add a 'GDP Per Capita' column to combinedDF (in place).
        For each row the IMF, UN and World Bank (estimate, year) pairs are passed to
        mostRecentGDPEstimate, and its result becomes the row's value.
    """
    estimateYearColumns = (
        ('IMF Estimate', 'IMF Year'),
        ('UN Estimate', 'UN Year'),
        ('World Bank Estimate', 'World Bank Year'),
    )
    # One iterator of (estimate, year) pairs per agency, in IMF / UN / World Bank order.
    pairsPerAgency = [
        zip(combinedDF[estimateCol], combinedDF[yearCol])
        for estimateCol, yearCol in estimateYearColumns
    ]
    combinedDF['GDP Per Capita'] = [
        mostRecentGDPEstimate(imfPair, unPair, wbPair)
        for imfPair, unPair, wbPair in zip(*pairsPerAgency)
    ]

In [40]:
class TestRename(unittest.TestCase):
    """
        Checks for the data-cleaning helpers defined in this notebook.
        Every test reads its input from the files at pathToRanking / pathToGDPTable,
        so the expected values below are tied to the contents of those files.
    """

    def test_rename(self):
        """renameCountry should replace 'Aruba' with 'Arb' in the 'Country Name' column."""
        rankDF = pd.read_csv(pathToRanking)
        renameCountry(rankDF, "Aruba", "Arb")
        # NOTE(review): relies on the row at position 1 of the CSV being an Aruba row.
        self.assertEqual(rankDF.iloc[1]["Country Name"], "Arb")

    def testAddRankingColumn(self):
        """addRankingColumn should give Finland the rank 7.0 in the filtered ranking rows."""
        rankDF = pd.read_csv(pathToRanking)
        # Take an explicit copy: addRankingColumn assigns a new column, and assigning
        # into a filtered selection of another frame triggers SettingWithCopyWarning.
        rankDF = rankDF.loc[(rankDF['Indicator'] == "Global Innovation Index")
                            & (rankDF['Subindicator Type'] == "Rank")].copy()
        addRankingColumn(rankDF)
        finlandRanking = rankDF.loc[rankDF['Country Name'] == "Finland", 'Ranking'].iat[0]
        self.assertEqual(finlandRanking, 7.0)

    def testClean(self):
        """cleanFootNoteCells should overwrite Tanzania's 'UN Year' cell with the given value."""
        gdpDF = pd.read_excel(pathToGDPTable)
        cleanFootNoteCells(gdpDF, [("Tanzania", "UN Year", "2020")])
        self.assertEqual(gdpDF.loc[gdpDF["Country/Territory"] == "Tanzania", "UN Year"].iat[0], "2020")

    def testCleanValues(self):
        """makeValuesInts should leave comma-free strings in the estimate columns."""
        gdpDF = pd.read_excel(pathToGDPTable)
        makeValuesInts(gdpDF)
        self.assertEqual(gdpDF.loc[gdpDF['Country/Territory'] == "Luxembourg", "World Bank Estimate"].iat[0],
                         "115874")
        self.assertEqual(gdpDF.loc[gdpDF['Country/Territory'] == "Canada", "IMF Estimate"].iat[0],
                         "57406")

    def testEstimateColumn(self):
        """createEstimateColumn should yield 95191 for the Cayman Islands."""
        # Only the first seven rows are used; copy them so the in-place edits below
        # operate on an independent frame rather than on a slice of the full table.
        gdpDF = pd.read_excel(pathToGDPTable)[:7].copy()
        makeValuesInts(gdpDF)
        estimateColumns = ['IMF Estimate', 'UN Estimate', 'World Bank Estimate']
        # makeValuesInts leaves strings behind; convert them to numbers before comparing.
        gdpDF[estimateColumns] = gdpDF[estimateColumns].apply(pd.to_numeric)
        createEstimateColumn(gdpDF)
        self.assertEqual(gdpDF.loc[gdpDF["Country/Territory"] == "Cayman Islands", "GDP Per Capita"].iat[0], 95191)

In [43]:
# Run every TestCase defined in the notebook. argv=[''] stops unittest from parsing the
# kernel's own command-line arguments, and exit=False keeps it from calling sys.exit().
# The trailing semicolon stops the notebook from echoing the returned TestProgram object.
unittest.main(argv=[''], verbosity=2, exit=False);

testAddRankingColumn (__main__.TestRename) ... ok
testClean (__main__.TestRename) ... ok
testCleanValues (__main__.TestRename) ... ok
testEstimateColumn (__main__.TestRename) ... ok
test_rename (__main__.TestRename) ... ok

----------------------------------------------------------------------
Ran 5 tests in 0.866s

OK


<unittest.main.TestProgram at 0x1ef9c815b80>