# Functionality to create a stackplot of the top producers of food items

In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
import copy

Read in the agriculture data from the FAO and look at its basic structure.

In [2]:
# Path to the FAO production CSV, expected in a "Data" folder under the
# current working directory.
pathToData = os.path.join(os.getcwd(), "Data", "Production_Crops_Livestock_E_All_Data.csv")
# The file is decoded as latin-1 when it is loaded into the data frame.
agData = pd.read_csv(pathToData, encoding="latin-1")

In [3]:
# Display the first ten rows of the loaded data.
agData.head(10)

Unnamed: 0,Area Code,Area,Item Code,Item,Element Code,Element,Unit,Y1961,Y1961F,Y1962,...,Y2016,Y2016F,Y2017,Y2017F,Y2018,Y2018F,Y2019,Y2019F,Y2020,Y2020F
0,2,Afghanistan,221,"Almonds, with shell",5312,Area harvested,ha,,,,...,19481.0,,19793.0,,20053.0,,29203.0,,22134.0,
1,2,Afghanistan,221,"Almonds, with shell",5419,Yield,hg/ha,,,,...,16859.0,Fc,13788.0,Fc,17161.0,Fc,13083.0,Fc,17759.0,Fc
2,2,Afghanistan,221,"Almonds, with shell",5510,Production,tonnes,,,,...,32843.0,,27291.0,,34413.0,,38205.0,,39307.0,
3,2,Afghanistan,711,"Anise, badian, fennel, coriander",5312,Area harvested,ha,,M,,...,24500.0,Im,26500.0,Im,25333.0,Im,25444.0,Im,25759.0,Im
4,2,Afghanistan,711,"Anise, badian, fennel, coriander",5419,Yield,hg/ha,,,,...,7075.0,Fc,7149.0,Fc,7142.0,Fc,7123.0,Fc,7138.0,Fc
5,2,Afghanistan,711,"Anise, badian, fennel, coriander",5510,Production,tonnes,,M,,...,17333.0,Im,18944.0,Im,18093.0,Im,18123.0,Im,18387.0,Im
6,2,Afghanistan,515,Apples,5312,Area harvested,ha,2220.0,F,2220.0,...,19365.0,,26847.0,,28381.0,,27559.0,,25643.0,
7,2,Afghanistan,515,Apples,5419,Yield,hg/ha,68018.0,Fc,68018.0,...,72762.0,Fc,63487.0,Fc,76527.0,Fc,90832.0,Fc,105626.0,Fc
8,2,Afghanistan,515,Apples,5510,Production,tonnes,15100.0,*,15100.0,...,140903.0,,170443.0,,217192.0,,250324.0,,270857.0,
9,2,Afghanistan,526,Apricots,5312,Area harvested,ha,4820.0,F,4820.0,...,8595.0,,18067.0,,18510.0,,17719.0,,17481.0,


In [4]:
# Display the data type of every column.
agData.dtypes

Area Code         int64
Area             object
Item Code         int64
Item             object
Element Code      int64
                 ...   
Y2018F           object
Y2019           float64
Y2019F           object
Y2020           float64
Y2020F           object
Length: 127, dtype: object

In [14]:
# Display the column labels of the data frame.
agData.columns

Index(['Area Code', 'Area', 'Item Code', 'Item', 'Element Code', 'Element',
       'Unit', 'Y1961', 'Y1961F', 'Y1962',
       ...
       'Y2016', 'Y2016F', 'Y2017', 'Y2017F', 'Y2018', 'Y2018F', 'Y2019',
       'Y2019F', 'Y2020', 'Y2020F'],
      dtype='object', length=127)

In [9]:
# See what crops there are to choose from: collect the distinct values of the
# "Item" column in sorted order and print them.
itemsInData = sorted(agData["Item"].unique())
print(itemsInData)

['Agave fibres nes', 'Almonds, with shell', 'Anise, badian, fennel, coriander', 'Apples', 'Apricots', 'Areca nuts', 'Artichokes', 'Asparagus', 'Asses', 'Avocados', 'Bambara beans', 'Bananas', 'Barley', 'Bastfibres, other', 'Beans, dry', 'Beans, green', 'Beef and Buffalo Meat', 'Beehives', 'Beer of barley', 'Beeswax', 'Berries nes', 'Blueberries', 'Brazil nuts, with shell', 'Broad beans, horse beans, dry', 'Buckwheat', 'Buffaloes', 'Butter and Ghee', 'Butter and ghee, sheep milk', 'Butter, buffalo milk', 'Butter, cow milk', 'Butter, goat milk', 'Cabbages and other brassicas', 'Camelids, other', 'Camels', 'Canary seed', 'Carobs', 'Carrots and turnips', 'Cashew nuts, with shell', 'Cashewapple', 'Cassava', 'Cassava leaves', 'Castor oil seed', 'Cattle', 'Cattle and Buffaloes', 'Cauliflowers and broccoli', 'Cereals nes', 'Cereals, Total', 'Cheese (All Kinds)', 'Cheese, buffalo milk', 'Cheese, goat milk', 'Cheese, sheep milk', 'Cheese, skimmed cow milk', 'Cheese, whole cow milk', 'Cherries', 

In [10]:
def cropData(df, agItem):
    """
        Filter the data for a specific crop/food.
        Arguments:
            df: the dataframe. Must have an "Item" column.
            agItem: String, the crop/food to filter for. Must be an item in the "Item" column of df.
        Return: the rows of df whose "Item" equals agItem.
        Raise: ValueError if agItem does not appear in the "Item" column of df.
    """
    # Validate against the frame that was actually passed in, so the check is
    # correct for any data frame and does not rely on notebook-level state.
    itemMask = df["Item"] == agItem
    if not itemMask.any():
        txt = "{item} is not an item in the data frame".format(item = agItem)
        raise ValueError(txt)
    return df[itemMask]

# First and last years accepted by checkYear (both inclusive).
firstYrInData = 1961
lastYrInData = 2020

def checkYear(year):
    """
        Tell whether YEAR lies within the span of years in the dataset,
        i.e. from firstYrInData through lastYrInData inclusive.
        Return True when it does, False when it does not.
    """
    return firstYrInData <= year <= lastYrInData

def getYearData(df, year1, year2):
    """
        Get the identifying columns plus the year columns between year1 and year2 (inclusive).
        Arguments:
            df: the dataframe. Must contain the columns "Area Code", "Area", "Item",
                "Element", "Unit" and a "Y<year>" column for every requested year.
            year1: integer, the beginning year column. Between 1961 and 2020 for the agriculture dataframe.
            year2: integer, the ending year column. Between 1961 and 2020 for the agriculture dataframe.
        Return: the ag data restricted to the identifying columns and the requested year columns.
        Raise: ValueError if year2 is before year1, or if either year is rejected by checkYear.
    """
    if year2 < year1:
        txt = "{y2} is before {y1}".format(y2=year2, y1 = year1)
        raise ValueError(txt)
    if not (checkYear(year1) and checkYear(year2)):
        # Report the bounds of the dataset rather than the rejected arguments,
        # so the message tells the caller which years are acceptable.
        raise ValueError("Years must be within the range of {y1} and {y2}".format(y1=firstYrInData, y2=lastYrInData))
    getCols = ["Area Code", "Area", "Item", "Element", "Unit"]
    getCols.extend("Y" + str(y) for y in range(year1, year2 + 1))
    return df[getCols]

Filter data for a specific crop and use data from a specific range of years

In [11]:
def subsetAgData(df, crop, y1, y2):
    """
        Narrow the agriculture data down to a single crop and a span of years.
        The rows are filtered with cropData and the columns are then selected
        with getYearData.
        Arguments:
            df: the dataframe.
            crop: String, the item to filter for
            y1: int, the start year for the data
            y2: int, the end year for the data
        Return: a subset of the agriculture dataframe
    """
    return getYearData(cropData(df, crop), y1, y2)

The data includes aggregate rows for regions, such as "Europe" and "Western Europe". According to the Definitions and Standards page on the FAO website, Area Code is the same as Country Code, and any Area Code of 420 or above denotes a region rather than a country.

In [12]:
def dropRegionRows(agData):
    """
        Remove the region rows from the agriculture data frame.
        Argument:
            agData: the dataframe. Must have an "Area Code" column.
        Return: the rows whose "Area Code" is below 420.
    """
    firstRegionCode = 420
    isCountryRow = agData["Area Code"] < firstRegionCode
    return agData[isCountryRow]

Get the sum, mean, median, and the X-th highest production amount (the top-X cutoff) for the crop for each year in the range.

In [None]:
def getTopTenPoint(arr, topX):
    """
        Get the cutoff value for the topX values in an array.
        Arguments:
            arr: a list or numpy array of numbers of at least length topX. It is not modified.
            topX: int, the cutoff position, counted from 1. For top 10, topX = 10.
        Return: the topX-th largest value in arr.
        Raise: ValueError if topX is less than 1,
            IndexError if arr holds fewer than topX values.
    """
    if topX < 1:
        # A position below 1 would turn into a negative index and silently
        # pick a value from the small end of the data.
        raise ValueError("topX must be at least 1")
    # np.sort returns a sorted copy, so the caller's data is left untouched and
    # plain Python lists are ordered the same way as numpy arrays.
    descending = np.sort(np.asarray(arr))[::-1]
    return descending[topX - 1]

def createStatsDict(y1, y2, topX):
    """
        Create a dictionary for each year in a range with the sum, mean, median,
        and the X cutoff point of production amounts.
        Arguments:
            y1: int, the start year
            y2: int, the end year of the range
            topX: int, the cuttoff position. For top 10, topX = 10
    """
    yearCols =  ["Y" + str(y) for y in range(y1, y2 + 1)]