In [24]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
import copy
import unittest

In [25]:
# Build the path to the CSV under a "Data" folder in the current working
# directory, so the notebook must be run from the folder that contains "Data".
pathToData = os.path.join(os.getcwd(), "Data", "Production_Crops_Livestock_E_All_Data.csv")
# The file is decoded as latin-1 (see the encoding argument).
agData = pd.read_csv(pathToData, encoding="latin-1")
# Last expression of the cell: displays the distinct values of the "Item" column.
agData['Item'].unique()

array(['Almonds, with shell', 'Anise, badian, fennel, coriander',
       'Apples', 'Apricots', 'Asses', 'Barley', 'Berries nes',
       'Butter and ghee, sheep milk', 'Butter, cow milk', 'Camels',
       'Cattle', 'Cheese, goat milk', 'Cheese, sheep milk', 'Chickens',
       'Cotton lint', 'Cottonseed', 'Eggs, hen, in shell', 'Fat, camels',
       'Fat, cattle', 'Fat, goats', 'Fat, sheep', 'Figs',
       'Fruit, citrus nes', 'Fruit, fresh nes', 'Fruit, stone nes',
       'Goats', 'Grapes', 'Hides, cattle, fresh', 'Honey, natural',
       'Horses', 'Linseed', 'Maize', 'Meat, camel', 'Meat, cattle',
       'Meat, chicken', 'Meat, game', 'Meat, goat', 'Meat, sheep',
       'Melons, other (inc.cantaloupes)', 'Milk, skimmed cow',
       'Milk, whole fresh camel', 'Milk, whole fresh cow',
       'Milk, whole fresh goat', 'Milk, whole fresh sheep', 'Millet',
       'Molasses', 'Mules', 'Nuts nes', 'Offals, edible, camels',
       'Offals, edible, cattle', 'Offals, edible, goats',
       'Offa

In [26]:
# Preview a small slice of the Grapes production rows.
# The frame is named for the item it actually holds (the filter below selects
# "Grapes", not "Oranges").
agDataGrapes = agData[(agData["Item"] == "Grapes") & (agData["Element"] == "Production")]
# Keep only the identifying columns plus the two most recent year columns.
agDataGrapes = agDataGrapes[["Area", "Area Code", "Unit", "Element", "Item", "Y2019", "Y2020"]]
# Positions 10-14 of the filtered rows.
agDataGrapes = agDataGrapes.iloc[10:15]
print(agDataGrapes)

                                  Area  Area Code    Unit     Element    Item  \
4352                           Belgium        255  tonnes  Production  Grapes   
4674                Belgium-Luxembourg         15  tonnes  Production  Grapes   
5731  Bolivia (Plurinational State of)         19  tonnes  Production  Grapes   
6100            Bosnia and Herzegovina         80  tonnes  Production  Grapes   
6606                            Brazil         21  tonnes  Production  Grapes   

          Y2019      Y2020  
4352        NaN        NaN  
4674        NaN        NaN  
5731    23549.0    22235.0  
6100    39289.0    44695.0  
6606  1485806.0  1435596.0  


In [27]:
def setElement(df, element):
    """
        Restrict the data frame to a single metric.
        Arguments:
            df: the data frame; it must have an "Element" column.
            element: String, the "Element" value to keep.
        Return the rows of df whose "Element" value equals element.
    """
    matchesElement = df['Element'] == element
    return df[matchesElement]

def createYearList(y1, y2):
    """
        Build the year column names "Y<y1>", "Y<y1 + 1>", ..., "Y<y2>".
        Arguments:
            y1: integer, the first year of the range
            y2: integer, the last year of the range (included)
        Return a list of column-name strings, one per year from y1 to y2
        inclusive. The list is empty when y2 is smaller than y1.
    """
    return ["Y" + str(year) for year in range(y1, y2 + 1)]

def cropData(df, agItem):
    """
        Keep only the rows for one crop/food.
        Arguments:
            df: the dataframe; it must have an "Item" column.
            agItem: String, the crop/food to keep. It is validated against the
                    module-level itemsInData collection, not against df itself.
        Return: the rows of df whose "Item" equals agItem.
        Raises ValueError when agItem is not in itemsInData.
    """
    if agItem in itemsInData:
        return df[df["Item"] == agItem]
    raise ValueError("{item} is not an item in the data frame".format(item = agItem))

def checkYear(year):
    """
        Tell whether a year lies inside the span covered by the dataset.
        The span is read from the module-level firstYrInData and lastYrInData,
        and both ends are included.
        Arguments:
            year: int, the year to check
        Return True when the year is inside the span, False otherwise.
    """
    return firstYrInData <= year <= lastYrInData

def checkRange(year1, year2):
    """
        Check that a year range is valid. Raise an error if it's not.
        Arguments:
            year1: int, the first year of the range
            year2: int, the last year of the range
        Return: None when the range is valid.
        Raises ValueError when year2 is before year1, or when either year falls
        outside the span accepted by checkYear (firstYrInData..lastYrInData).
    """
    if year2 < year1:
        raise ValueError("{y2} is before {y1}".format(y2=year2, y1=year1))
    if not (checkYear(year1) and checkYear(year2)):
        # Report the bounds of the dataset, not the rejected input years, so
        # the message tells the caller which range is actually allowed.
        raise ValueError(
            "Years must be within the range of {y1} and {y2}".format(
                y1=firstYrInData, y2=lastYrInData
            )
        )

def getYearData(df, year1, year2, keepCols):
    """
        Select the non-year columns plus the year columns for year1..year2.
        Arguments:
            df: the dataframe
            year1: integer, the first year column wanted. Between 1961 and 2020 for the agriculture dataframe.
            year2: integer, the last year column wanted. Between 1961 and 2020 for the agriculture dataframe.
            keepCols: list, the columns to keep besides the year columns.
                      It is copied first, so the caller's list is not modified.
        Return: df restricted to keepCols followed by the year columns.
        Raises ValueError (from checkRange) when the year range is invalid.
    """
    checkRange(year1, year2)
    wantedColumns = copy.deepcopy(keepCols)
    yearColumns = createYearList(year1, year2)
    wantedColumns.extend(yearColumns)
    return df[wantedColumns]

def subsetAgData(df, crop, y1, y2, keepCols):
    """
        Narrow the agriculture data to one crop and one span of years.
        Rows are filtered with cropData and columns are then selected
        with getYearData.
        Arguments:
            df: the dataframe.
            crop: String, the item to filter for
            y1: int, the start year for the data
            y2: int, the end year for the data
            keepCols: list of columns that are not the year columns
        Return: the rows for crop, with keepCols and the year columns y1..y2.
    """
    return getYearData(cropData(df, crop), y1, y2, keepCols)

def getItemUnit(df):
    """
        Look up the unit of measurement of an already-subsetted data frame.
        Arguments:
            df: the dataframe; it must have a "Unit" column.
        Return: the first distinct value found in the "Unit" column.
        An empty frame has no units, so indexing fails with IndexError.
    """
    distinctUnits = df['Unit'].unique()
    return distinctUnits[0]

def dropRegionRows(df):
    """
        Return the agriculture data frame without the region rows.
        A row is kept when its "Area Code" is below 420.
        NOTE(review): this presumes every aggregate region has a code of 420
        or higher -- confirm against the dataset's area code list.
    """
    belowRegionCodes = df["Area Code"] < 420
    return df[belowRegionCodes]

def addOtherSum(df):
    """
        Collapse the rows labelled "Other" into one summed row per year.
        Arguments:
            df: The dataframe; it must have "Label", "Year" and "Amount" columns.
        Return: a new dataframe with columns Label, Year, Amount. It holds every
        row whose label is not "Other", followed by one "Other" row per year
        carrying the sum of that year's "Other" amounts. The index is renumbered
        and the input frame is not modified.
    """
    isOther = df["Label"] == "Other"
    # One total per year; as_index=False already leaves "Year" as an ordinary
    # column, so no reset_index()/drop("index") round trip is needed.
    otherTotals = (
        df[isOther]
        .groupby(["Year"], as_index=False)["Amount"]
        .sum()
        .assign(Label="Other")
    )
    # Select by boolean mask rather than dropping by index label: with a
    # non-unique index, a label-based drop would also remove rows that are not
    # "Other" but happen to share an index label with one.
    namedRows = df.loc[~isOther, ["Label", "Year", "Amount"]]
    return pd.concat([namedRows, otherTotals], ignore_index=True)

def getTopXSubset(df, topX):
    """
        Label each row by whether its area is one of the top X producers.
        Areas are ranked by the sum of "Amount" over all rows of df. Rows of
        the topX highest-ranked areas get their area name as label; every other
        row is labelled "Other".
        Arguments:
            df: the dataframe; it must have "Area" and "Amount" columns.
            topX: int, how many areas keep their own name.
        Return: the same df object, with a "Label" column added in place.
    """
    rankedTotals = (
        df.groupby(["Area"])['Amount']
        .sum()
        .reset_index()
        .sort_values(by = ['Amount'], ascending = [False])
    )
    leaders = rankedTotals['Area'].to_numpy()[:topX]
    labels = []
    for area in df['Area']:
        labels.append(area if area in leaders else "Other")
    df['Label'] = labels
    return df

def yearsToRows(df, yearColumns):
    """
        Reshape the frame from one column per year to one row per year (melt).
        Arguments:
            df: the dataset (agData); it must have the "Area Code", "Area",
                "Item", "Element" and "Unit" columns, which are kept as
                identifiers.
            yearColumns: the year columns to turn into rows
        Return: the melted frame, where "Year" holds the former column name
        and "Amount" holds its value.
    """
    idColumns = ["Area Code", "Area", "Item", "Element", "Unit"]
    return df.melt(
        id_vars = idColumns,
        value_vars = yearColumns,
        var_name = "Year",
        value_name = "Amount",
    )


def findMidPoint(y1, y2):
    """
        Calculate the year in between two years
        Arguments:
            y1: int
            y2: int
        Return: int, the midpoint year between the two years. When the two
        years are an odd number of years apart, the midpoint is rounded down.
    """
    # Floor division keeps the result an int year; "/" would produce a float
    # such as 2010.0 or 2010.5.
    return (y1 + y2) // 2

def prepData(df, element, item, y1, y2, keepCols, topX):
    """
        Run the full preparation pipeline that gets the data ready for plotting.
        Steps, in order: filter by element, subset to the item and years, read
        the unit, drop region rows, move years to rows, label the top
        producers, and sum the remaining areas into "Other".
        Arguments:
            df: the dataframe
            element: String, the element like "Production"
            item: String, the crop/food to filter for
            y1: int, start year
            y2: int, end year
            keepCols: the columns besides the year columns
            topX: int, the number of top producers for the plot
        Returns a tuple of (prepped data frame, unit).
    """
    byElement = setElement(df, element)
    itemSubset = subsetAgData(byElement, item, y1, y2, keepCols)
    # The unit is read before the region rows are removed.
    unit = getItemUnit(itemSubset)
    countriesOnly = dropRegionRows(itemSubset)
    longForm = yearsToRows(countriesOnly, createYearList(y1, y2))
    labelled = getTopXSubset(longForm, topX)
    return (addOtherSum(labelled), unit)

In [48]:
# Load the data first, so that everything derived from agData below comes from
# this cell's own read instead of a frame left behind by an earlier cell.
pathToData = os.path.join(os.getcwd(), "Data", "Production_Crops_Livestock_E_All_Data.csv")
agData = pd.read_csv(pathToData, encoding="latin-1")

# Module-level values read by the helper functions:
#   itemsInData                 -> cropData
#   firstYrInData, lastYrInData -> checkYear
itemsInData = sorted(agData["Item"].unique())
elementsToChooseFrom = agData["Element"].unique()
keepCols = ["Area Code", "Area", "Item", "Element", "Unit"]
fruit = "Oranges"
year1 = 2000
year2 = 2020
firstYrInData = 1961
lastYrInData = 2020
topX = 10

# Columns kept in the small test fixtures: identifiers plus two year columns.
keepColumnsList = ["Area", "Area Code", "Unit", "Element", "Item", "Y2019", "Y2020"]

# Fixture 1: the first five Oranges production rows.
agDataOranges = agData[(agData["Item"] == "Oranges") & (agData["Element"] == "Production")]
agDataOranges = agDataOranges[keepColumnsList]
agDataOranges = agDataOranges.head(5)

# Fixture 2: Grapes production rows at positions 10-14 of the filtered frame.
agDataGrapes = agData[(agData["Item"] == "Grapes") & (agData["Element"] == "Production")]
agDataGrapes = agDataGrapes[keepColumnsList]
agDataGrapes = agDataGrapes.iloc[10:15]



class TestDataPrep(unittest.TestCase):
    """Checks prepData against the module-level agDataOranges and agDataGrapes frames."""

    def _otherAmount(self, frame, yearLabel):
        # First "Other" amount recorded for the given year column label.
        isOtherForYear = (frame['Label'] == "Other") & (frame['Year'] == yearLabel)
        return list(frame.loc[isOtherForYear, "Amount"])[0]

    def test_prepData(self):
        # With topX = 3 the labels must be the three expected areas plus
        # "Other", and the unit must be tonnes.
        frame, unit = prepData(agDataOranges, "Production", "Oranges", 2019, 2020, keepCols, 3)
        expectedLabels = ["Algeria", "Argentina", "Australia", "Other"]
        self.assertCountEqual(expectedLabels, frame['Label'].unique())
        self.assertEqual("tonnes", unit)

    def test_withNA(self):
        # Pins the per-year "Other" totals for the Grapes frame with topX = 2.
        frame = prepData(agDataGrapes, "Production", "Grapes", 2019, 2020, keepCols, 2)[0]
        other2019 = self._otherAmount(frame, "Y2019")
        other2020 = self._otherAmount(frame, "Y2020")
        self.assertEqual(23549.0, other2019)
        self.assertEqual(22235.0, other2020)
                                            
                                               
                                            
        
    
    
# Run the tests inside the notebook kernel. argv=[''] keeps unittest from
# parsing the kernel's own command-line arguments, and exit=False stops it
# from calling sys.exit() when the run finishes.
unittest.main(argv=[''], verbosity=2, exit=False)
    

test_prepData (__main__.TestDataPrep) ... ok
test_withNA (__main__.TestDataPrep) ... ok

----------------------------------------------------------------------
Ran 2 tests in 0.035s

OK


<unittest.main.TestProgram at 0x268285dcd30>