In [5]:
import pandas as pd
import unittest
import numpy as np


In [6]:
# Execute Methods.ipynb inside this kernel so the names it defines are usable
# in the cells below (its own unit tests also run, as the output shows).
# NOTE(review): checkRange, dropRegionRows, createYearList and
# getItemToProductionElementDictionary are not defined in this notebook;
# presumably they come from Methods.ipynb - confirm.
%run Methods.ipynb

test_prepData (__main__.TestDataPrep) ... ok
test_withNA (__main__.TestDataPrep) ... ok
test_getItemsThatHaveElement (__main__.TestGetItemWithElements) ... ok
test_yieldPrepData (__main__.TestYieldDataPrep) ... ok

----------------------------------------------------------------------
Ran 4 tests in 0.075s

OK


In [7]:
# Scratch fixture: a 27-row FAO-shaped frame (3 areas x 3 items x 3 elements)
# whose '2019' and '2020' columns hold seeded random integers in [100, 100000).
# The seed is set first and '2019' is drawn before '2020', so the values are
# reproducible only while that order is kept.
# NOTE(review): no later cell visible here reads testData/testDf; the unit
# test below builds its own frame with 'Y2019'/'Y2020' columns instead.
np.random.seed(1)
testData = {'Area': (['Indonesia'] * 3 + ['China'] * 3 + ['UK'] * 3) * 3,
            'Item': ['Apples'] * 9 + ['Grapes'] * 9 + ['Beehives'] * 9,
            'Unit': ['hg/ha', 'tonnes', 'ha'] * 6 + ["Unit"] * 9,
            'Element': ['Yield', 'Production', 'Area'] * 6 + ['Yield', 'Stocks', 'Count'] * 3,
           '2019': np.random.randint(100, 100000, size = 27), '2020': np.random.randint(100, 100000, size = 27)}

testDf = pd.DataFrame(testData)
# Copies the frame to the system clipboard.
# NOTE(review): looks like leftover manual-inspection code; it needs a
# clipboard backend and may fail on headless machines - consider removing.
testDf.to_clipboard()

In [15]:
def subsetProductionData(agDF):
    """
        Keep only the rows of an agriculture dataframe that describe production.

        A row is kept when its 'Element' value is one of "Production",
        "Stocks", "Laying" or "Prod Popultn".
        Args:
            agDF: Agriculture dataframe from FAO; must have an 'Element' column.
        Returns: the matching rows of agDF, with their original index labels.
            The dataframe passed in is not modified.
    """
    isProductionRow = agDF['Element'].isin(
        ["Production", "Stocks", "Laying", "Prod Popultn"]
    )
    return agDF[isProductionRow]

In [18]:
def helperGetYearVals(df, year):
    """
        Get the values of one year column with missing entries replaced by 1.

        The column read is "Y" + str(year), e.g. "Y2019" for year 2019.
        NaN entries become 1 (not 0) - presumably so the values can later be
        used as a divisor; confirm with callers.
        The replacement is done on a new Series, so the caller's dataframe is
        left untouched (the previous implementation wrote the 1s through a
        column selected from df, which could modify df and trigger pandas'
        SettingWithCopyWarning).
        Arguments:
            df: dataframe containing a "Y<year>" column
            year: integer
        Returns: a pandas Series with the same index as df
        Raises: KeyError if df has no "Y<year>" column
    """
    return df["Y" + str(year)].fillna(1)

def helperCleanYearColumn(df, yearCol):
    """
        Helper for productionChangeDF.
        Clean one year column in place by replacing its NaN and 0 values with 1
        (productionChangeDF divides by the cleaned start-year column).
        Only yearCol is touched: NaN values in any other column of df are left
        as they are. Previously the NaN fill was applied to the whole frame.
        Arguments:
            df: dataframe, modified in place
            yearCol: label of the column to clean (productionChangeDF passes
                the year as an int)
        Returns: None
    """
    # Restrict the NaN fill to the requested column instead of the whole frame.
    df[yearCol] = df[yearCol].fillna(1)
    df.loc[df[yearCol] == 0, yearCol] = 1

def productionChangeDF(agDF, y1, y2):
    """
        Compute, per item, the relative change in total production between
        years y1 and y2.

        Steps: the "Y<y1>" / "Y<y2>" columns of a copy of agDF are renamed to
        the bare years, only production rows are kept (subsetProductionData),
        values are summed per (Item, Element, Unit) over all remaining areas,
        NaN/0 totals are replaced by 1 (helperCleanYearColumn), and the change
        is computed as (total_y2 - total_y1) / total_y1 rounded to 2 decimals.
        The change is a fraction (0.25 means +25%), not multiplied by 100.
        Args:
            agDF: Agriculture dataframe from FAO; not modified.
            y1: int, the start year
            y2: int, the end year
        Returns: a dataframe with columns
            ["Item", "Element", "Unit", y1, y2, "Change"]
    """
    # checkRange is defined outside this notebook; presumably it validates the
    # year pair - confirm in Methods.ipynb.
    checkRange(y1, y2)

    # dropRegionRows is defined outside this notebook; presumably it removes
    # aggregate region rows so that only countries are summed - confirm.
    countryRows = dropRegionRows(agDF.copy())
    countryRows = countryRows.rename(
        columns={"Y" + str(year): year for year in (y1, y2)}
    )

    production = subsetProductionData(countryRows)
    production = production[['Area', 'Item', 'Element', "Unit", y1, y2]]

    perItem = production.groupby(["Item", "Element", "Unit"])
    startTotals = perItem[y1].sum().reset_index()
    endTotals = perItem[y2].sum().reset_index()

    # No explicit key: the merge joins on every shared column
    # (Item, Element, Unit).
    totalProdDf = pd.merge(startTotals, endTotals, how="left")
    for yearCol in (y1, y2):
        helperCleanYearColumn(totalProdDf, yearCol)

    totalProdDf['Change'] = [
        round((endTotal - startTotal) / startTotal, 2)
        for startTotal, endTotal in zip(totalProdDf[y1], totalProdDf[y2])
    ]
    return totalProdDf

In [19]:
def getYearsAllNANForItems(agData):
    """
        For each item in the ag dataset, list the year columns in which the
        item's production element is NAN for every row.

        The production element of each item comes from
        getItemToProductionElementDictionary, and the year columns from
        createYearList(1961, 2020); both are defined outside this notebook
        (presumably the latter yields the dataset's year column labels -
        confirm, including whether 2020 itself is covered).
        NOTE: an item with no matching rows reports every year, because
        .all() over an empty selection is True.
        Arguments:
            agData: the agriculture dataset from the FAO
        Returns: a dictionary of (item -> list of all-NAN year columns);
            items without any all-NAN year are left out.
    """
    productionElementOf = getItemToProductionElementDictionary(agData)
    yearColumns = createYearList(1961, 2020)

    allNanYearsByItem = {}
    for item, element in productionElementOf.items():
        isItemRow = (agData['Item'] == item) & (agData['Element'] == element)
        itemRows = agData[isItemRow]

        emptyYears = [year for year in yearColumns if np.isnan(itemRows[year]).all()]
        if emptyYears:
            allNanYearsByItem[item] = emptyYears

    return allNanYearsByItem

In [20]:
class TestItemToProdElement(unittest.TestCase):
    def createTestData(self):
        """
            Create dataframe for testing
        """
        np.random.seed(1)
        testData = {'Area Code': ([1] * 3 +[2] * 3 + [3] * 3) * 3,
                    'Area': (['Indonesia'] * 3 + ['China'] * 3 + ['UK'] * 3) * 3,
            'Item': ['Apples'] * 9 + ['Grapes'] * 9 + ['Beehives'] * 9,
            'Element': ['Yield', 'Production', 'Area'] * 6 + ['Yield', 'Stocks', 'Count'] * 3,
            'Unit': ['hg/ha', 'tonnes', 'ha'] * 6 + ["unit"] * 9,        
           'Y2019': np.random.randint(100, 100000, size = 27), 'Y2020': np.random.randint(100, 100000, size = 27)}
        testDf = pd.DataFrame(testData)
        return testDf
    
    def test_productionChange(self):
        testDf = self.createTestData()
        expectedData = {'Item': ['Apples', 'Grapes', 'Beehives'], 'Unit': ['tonnes', 'tonnes', 'unit'], 
                       'Element': ["Production", "Production", "Stocks"], 
                       'Percent Change': [-0.53, 0.34, 0.25]}
        expectedDF = pd.DataFrame(expectedData)
        actual = productionChangeDF(testDf, 2019, 2020)
        
        for item in ["Apples", "Grapes", "Beehives"]:
            actualValue = actual.loc[actual["Item"] == item, "Change"].iat[0]
            expectedValue = expectedDF.loc[expectedDF['Item'] == item, "Percent Change"].iat[0]
            self.assertAlmostEqual(expectedValue, actualValue, delta=0.01)

In [21]:
# Run every TestCase currently defined in the kernel's __main__ namespace.
# argv=[''] stops unittest from parsing the kernel's own command-line
# arguments; exit=False stops it from calling sys.exit() when the run ends.
unittest.main(argv=[''], verbosity=2, exit=False)

test_prepData (__main__.TestDataPrep) ... ok
test_withNA (__main__.TestDataPrep) ... ok
test_getItemsThatHaveElement (__main__.TestGetItemWithElements) ... ok
test_productionChange (__main__.TestItemToProdElement) ... ok
test_yieldPrepData (__main__.TestYieldDataPrep) ... ok

----------------------------------------------------------------------
Ran 5 tests in 0.120s

OK


<unittest.main.TestProgram at 0x25414e598b0>