# USDA - National Agricultural Statistics Service

### Import Python tools and Jupyter configuration

In [1]:
# Load the lab_black IPython extension for this kernel session
# (presumably auto-formats cells with Black; confirm it is installed).
%load_ext lab_black

In [2]:
import pandas as pd
import geopandas as gpd
import altair as alt
from datetime import timedelta
import numpy as np

In [3]:
# Raise pandas' display limits: up to 100 columns, 1000 rows, and no cap on
# the width of an individual cell's text.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)

In [4]:
# Data source: USDA NASS Quick Stats, https://quickstats.nass.usda.gov/

### Read data from downloaded CSV

In [5]:
# Load the downloaded CSV. The ANSI code columns and the year are read as
# strings rather than numbers.
text_dtypes = {"County ANSI": str, "State ANSI": str, "Year": str}
src = pd.read_csv(
    "data/raw/lettuce_acres_harvested_total_five_census_years.csv", dtype=text_dtypes
)

In [6]:
# Normalize the headers: lower-case them and turn spaces into underscores.
src.columns = [header.lower().replace(" ", "_") for header in src.columns]

In [7]:
# Keep only the 2017 "LETTUCE - ACRES HARVESTED" rows and the columns used
# downstream.
is_lettuce_harvest_2017 = (src["data_item"] == "LETTUCE - ACRES HARVESTED") & (
    src["year"] == "2017"
)
keep_columns = [
    "year",
    "geo_level",
    "state",
    "state_ansi",
    "county",
    "county_ansi",
    "data_item",
    "value",
]

# Select rows and columns in a single .loc call and take an explicit copy.
# Later cells rename and overwrite columns on src_slim; without the copy those
# writes target a slice of `src` and can trigger SettingWithCopyWarning.
src_slim = src.loc[is_lettuce_harvest_2017, keep_columns].copy()

In [8]:
# Rename the ANSI code columns to the FIPS names used in the rest of the notebook.
fips_names = {"state_ansi": "state_fips", "county_ansi": "county_fips"}
src_slim = src_slim.rename(columns=fips_names)

In [9]:
# Convert the descriptive text columns to Title Case.
for text_column in ("geo_level", "county", "state", "data_item"):
    src_slim[text_column] = src_slim[text_column].str.title()

### Handle withheld and rounded-to-zero values (Arizona missing?)

In [11]:
# D = Withheld to avoid disclosing data for individual operations.
# Z = Less than half the rounding unit.

In [12]:
src_slim["value"] = (
    src_slim["value"]
    .str.strip(" ")
    .str.replace("(D)", "0", regex=False)
    .str.replace("(Z)", "0", regex=False)
    .str.replace(",", "", regex=False)
)

In [13]:
src_slim["value"] = src_slim["value"].astype(int)

In [14]:
# Build the combined county code by appending the county code to the state code.
src_slim["fips"] = src_slim["state_fips"].add(src_slim["county_fips"])

In [15]:
# Work from an independent deep copy so the analysis below leaves src_slim untouched.
df = src_slim.copy(deep=True)

---

### What's up with Arizona? 

#### All counties

In [23]:
# Show the five rows with the largest harvested-acre values.
df.sort_values(by="value", ascending=False).head(5)

In [24]:
# Average of the value column across every row in df.
df.loc[:, "value"].mean()

154.50122399020808

#### Subset just one place

In [21]:
# Restrict to California rows, then show the five largest values.
in_california = df["state"].eq("California")
df.loc[in_california].sort_values(by="value", ascending=False).head(5)

### Aggregate with methods such as groupby

In [30]:
# Average value per state (one entry per distinct value of `state`).
df.groupby(["state"])["value"].agg("mean")

state
Alabama              0.500000
Alaska              16.200000
Arizona              2.416667
Arkansas             0.586207
California        4778.117647
Colorado            58.000000
Connecticut         16.250000
Delaware             3.000000
Florida              3.175000
Georgia              0.396825
Hawaii              72.250000
Idaho                4.166667
Illinois             1.250000
Indiana              0.538462
Iowa                 0.529412
Kansas               0.322581
Kentucky             0.705128
Louisiana            0.480000
Maine                9.437500
Maryland             4.600000
Massachusetts       25.071429
Michigan             1.986111
Minnesota            1.716981
Mississippi          0.909091
Missouri             0.657534
Montana              1.250000
Nebraska             0.600000
Nevada               0.833333
New Hampshire        7.000000
New Jersey          63.700000
New Mexico          45.944444
New York            19.067797
North Carolina       2.333333
Nort

#### Those in Arizona

In [22]:
# Restrict to Arizona rows, then show the five largest values.
in_arizona = df["state"].eq("Arizona")
df.loc[in_arizona].sort_values(by="value", ascending=False).head(5)

Unnamed: 0,year,geo_level,state,state_fips,county,county_fips,data_item,value,fips
31,2017,County,Arizona,4,Navajo,17,Lettuce - Acres Harvested,15,4017
32,2017,County,Arizona,4,Yavapai,25,Lettuce - Acres Harvested,4,4025
37,2017,County,Arizona,4,Santa Cruz,23,Lettuce - Acres Harvested,4,4023
35,2017,County,Arizona,4,Pima,19,Lettuce - Acres Harvested,3,4019
28,2017,County,Arizona,4,Coconino,5,Lettuce - Acres Harvested,2,4005


---

### Export to JSON

In [34]:
# Write df as a JSON array with one object per row, indented by two spaces.
export_path = "data/processed/lettuce_acres_harvested_counties_2017.json"
df.to_json(path_or_buf=export_path, orient="records", indent=2)