# Load Libraries and packages

In [3]:
import pandas as pd
import csv
import os

# Data Collection

In [4]:
# 1) Read the corn yield CSV into a pandas DataFrame
file_name = "240917_corn_yield_data.csv"

full_path = os.path.join(file_name)
df_corn_yield = pd.read_csv(full_path)

# 2) Select only the columns we care about:
df_corn_yield = df_corn_yield[
    ["Year", "State ANSI", "County ANSI", "Ag District Code", "Data Item", "Value", "CV (%)"]
]

#3) Rename the columns to your preferred naming:
df_corn_yield.columns = ["year", "state_ansi", "county_ansi", "district_code", "data_item", "value", "cv"]

#orint the corn yield DataFrame sample
print("Corn Yield DataFrame:\n", df_corn_yield.head())

#print all unique items in the Data Item column ;)
unique_data_items = df_corn_yield["data_item"].unique()
print("\nUnique Data Items in the corn yield data:")
print(unique_data_items)


Corn Yield DataFrame:
    year  state_ansi  county_ansi  district_code  \
0  2023          17          NaN             99   
1  2023          17        107.0             40   
2  2023          17        115.0             40   
3  2023          17        125.0             40   
4  2023          17        113.0             40   

                                    data_item  value   cv  
0  CORN, GRAIN - YIELD, MEASURED IN BU / ACRE  208.5  1.0  
1  CORN, GRAIN - YIELD, MEASURED IN BU / ACRE  211.1  1.6  
2  CORN, GRAIN - YIELD, MEASURED IN BU / ACRE  225.3  2.8  
3  CORN, GRAIN - YIELD, MEASURED IN BU / ACRE  208.1  3.6  
4  CORN, GRAIN - YIELD, MEASURED IN BU / ACRE  223.3  2.2  

Unique Data Items in the corn yield data:
['CORN, GRAIN - YIELD, MEASURED IN BU / ACRE'
 'CORN, GRAIN, IRRIGATED - YIELD, MEASURED IN BU / ACRE'
 'CORN, GRAIN, NON-IRRIGATED - YIELD, MEASURED IN BU / ACRE']


In [5]:
#remove '99' districts
df_corn_yield = df_corn_yield[df_corn_yield["district_code"] != "99"]

#drop rows where county_ansi is NaN (prevents IntCastingNaNError)
df_corn_yield = df_corn_yield.dropna(subset=["county_ansi"])

#convert county_ansi to int
df_corn_yield["county_ansi"] = df_corn_yield["county_ansi"].astype(int)

#(Optional) Convert county_ansi back to a zero-padded string (3 digits) to match NOAA if needed
df_corn_yield["county_ansi"] = df_corn_yield["county_ansi"].astype(str).str.zfill(3)

#reverse district_code if you need "10" → "01" for matching NOAA divisions
df_corn_yield["district_code"] = df_corn_yield["district_code"].apply(
    lambda x: str(x)[::-1].zfill(2)
)

print(df_corn_yield[["year", "state_ansi", "county_ansi", "district_code"]].head())

   year  state_ansi county_ansi district_code
1  2023          17         107            04
2  2023          17         115            04
3  2023          17         125            04
4  2023          17         113            04
5  2023          17         129            04


In [6]:
# Compute the min and max year for each state in the corn yield dataset
state_year_ranges = df_corn_yield.groupby("state_ansi")["year"].agg(["min", "max"])
print("Year ranges per state:")
print(state_year_ranges)

# Get the highest minimum year (i.e. the maximum of the minimum years)
highest_min_year = state_year_ranges["min"].max()

# Get the lowest maximum year (i.e. the minimum of the maximum years)
lowest_max_year = state_year_ranges["max"].min()

print(f"Highest minimum year across states: {highest_min_year}")
print(f"Lowest maximum year across states: {lowest_max_year}")

Year ranges per state:
             min   max
state_ansi            
17          1925  2023
19          1926  2023
27          1921  2023
31          1918  2023
Highest minimum year across states: 1926
Lowest maximum year across states: 2023


In [7]:
# Assuming df_corn_yield is your DataFrame and highest_min_year and lowest_max_year have been computed
df_adjusted = df_corn_yield[
    (df_corn_yield["year"] >= highest_min_year) &
    (df_corn_yield["year"] <= lowest_max_year)
].copy()

print("Adjusted DataFrame:")
print(df_adjusted.head())

# Check
lowest_year = df_adjusted["year"].min()
print("Lowest year found in df_adjusted:", lowest_year)

df_corn_yield = df_adjusted.copy()

Adjusted DataFrame:
   year  state_ansi county_ansi district_code  \
1  2023          17         107            04   
2  2023          17         115            04   
3  2023          17         125            04   
4  2023          17         113            04   
5  2023          17         129            04   

                                    data_item  value   cv  
1  CORN, GRAIN - YIELD, MEASURED IN BU / ACRE  211.1  1.6  
2  CORN, GRAIN - YIELD, MEASURED IN BU / ACRE  225.3  2.8  
3  CORN, GRAIN - YIELD, MEASURED IN BU / ACRE  208.1  3.6  
4  CORN, GRAIN - YIELD, MEASURED IN BU / ACRE  223.3  2.2  
5  CORN, GRAIN - YIELD, MEASURED IN BU / ACRE  214.4  3.9  
Lowest year found in df_adjusted: 1926


In [8]:
# Only continue with the total yield data, not irrigated or non-irrigated
corn_yield_df = df_corn_yield[df_corn_yield["data_item"] == "CORN, GRAIN - YIELD, MEASURED IN BU / ACRE"]
print("Filtered Corn Yield DataFrame:")
print(corn_yield_df.head())

Filtered Corn Yield DataFrame:
   year  state_ansi county_ansi district_code  \
1  2023          17         107            04   
2  2023          17         115            04   
3  2023          17         125            04   
4  2023          17         113            04   
5  2023          17         129            04   

                                    data_item  value   cv  
1  CORN, GRAIN - YIELD, MEASURED IN BU / ACRE  211.1  1.6  
2  CORN, GRAIN - YIELD, MEASURED IN BU / ACRE  225.3  2.8  
3  CORN, GRAIN - YIELD, MEASURED IN BU / ACRE  208.1  3.6  
4  CORN, GRAIN - YIELD, MEASURED IN BU / ACRE  223.3  2.2  
5  CORN, GRAIN - YIELD, MEASURED IN BU / ACRE  214.4  3.9  


### Check for missing data

In [9]:
# Ensure that the yield column is numeric and that 'year' is an integer
corn_yield_df["value"] = pd.to_numeric(corn_yield_df["value"], errors="coerce")
corn_yield_df["year"] = corn_yield_df["year"].astype(int)

# Dictionary to store the missing report (keyed by (state, county_ansi))
missing_report = {}

# Group by state and county to build the missing report
for (state, county), group in corn_yield_df.groupby(["state_ansi", "county_ansi"]):
    # Get the years present in the group (even if the yield is NaN)
    years_present = set(group["year"].unique())
    # Compare against the fixed expected range (from 1926 to 2023)
    missing_rows = sorted(set(range(1926, 2023 + 1)) - years_present)
    
    # Find years where a row exists but the yield 'value' is NaN
    missing_values = sorted(group.loc[group["value"].isna(), "year"].unique())
    
    if missing_rows or missing_values:
        missing_report[(state, county)] = {
            "missing_rows": missing_rows,
            "missing_values": missing_values
        }

# Print the missing report header
print("Yield data covers years 1926 to 2023")
for (state, county), d in missing_report.items():
    print(f"State {state}, County {county}:")
    if d["missing_rows"]:
        print(f"  Rows missing for years: {d['missing_rows']}")
    if d["missing_values"]:
        print(f"  Years with missing yield value: {d['missing_values']}")

# Compute overall reporting counts
all_counties = list(corn_yield_df.groupby(["state_ansi", "county_ansi"]).groups.keys())
no_missing_count = len(all_counties) - len(missing_report)
exactly_one_missing = 0
more_than_one_missing = 0

for rep in missing_report.values():
    total_missing = len(rep["missing_rows"]) + len(rep["missing_values"])
    if total_missing == 1:
        exactly_one_missing += 1
    elif total_missing > 1:
        more_than_one_missing += 1

print("\nSummary:")
print(f"Total counties: {len(all_counties)}")
print(f"Counties with no missing data: {no_missing_count}")
print(f"Counties with exactly 1 missing year: {exactly_one_missing}")
print(f"Counties with more than 1 missing year: {more_than_one_missing}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  corn_yield_df["value"] = pd.to_numeric(corn_yield_df["value"], errors="coerce")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  corn_yield_df["year"] = corn_yield_df["year"].astype(int)


Yield data covers years 1926 to 2023
State 17, County 001:
  Rows missing for years: [2021]
State 17, County 003:
  Rows missing for years: [2012, 2013, 2015, 2021, 2023]
State 17, County 005:
  Rows missing for years: [2013, 2016, 2018]
State 17, County 007:
  Rows missing for years: [2012]
State 17, County 009:
  Rows missing for years: [2019]
State 17, County 013:
  Rows missing for years: [2013, 2016, 2018]
State 17, County 023:
  Rows missing for years: [2013, 2016, 2019]
State 17, County 025:
  Rows missing for years: [2012]
State 17, County 029:
  Rows missing for years: [2016, 2022]
State 17, County 031:
  Rows missing for years: [2008, 2011, 2012, 2013, 2014, 2015, 2017, 2018, 2020, 2022, 2023]
State 17, County 033:
  Rows missing for years: [2016]
State 17, County 035:
  Rows missing for years: [2018, 2021]
State 17, County 039:
  Rows missing for years: [2013, 2019, 2023]
State 17, County 041:
  Rows missing for years: [2021]
State 17, County 043:
  Rows missing for years: [

In [10]:
# First, build the same missing_report as before …
# (I’ll assume you already have that dict around)

# Create a list of tuples: ((state, county), total_missing)
missing_counts = [
    ((state, county),
     len(d["missing_rows"]) + len(d["missing_values"]))
    for (state, county), d in missing_report.items()
]

# Sort descending by total_missing
missing_counts_sorted = sorted(
    missing_counts,
    key=lambda item: item[1],
    reverse=True
)

print("Counties sorted by total missing years (rows + values):")
for (state, county), total in missing_counts_sorted:
    d = missing_report[(state, county)]
    rows = d["missing_rows"]
    vals = d["missing_values"]
    print(f"State {state}, County {county}: {total} missing years")
    if rows:
        print(f"   • Missing rows: {rows}")
    if vals:
        print(f"   • Missing values: {vals}")

Counties sorted by total missing years (rows + values):
State 31, County 075: 58 missing years
   • Missing rows: [1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]
State 27, County 077: 57 missing years
   • Missing rows: [1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1979, 1986, 1987, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]
State 27, County 137: 54 missing years
   • Missing rows: [1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1970, 1971, 1972, 1973,

#### Save loaded and processed yield data

In [13]:
# Convert ANSI codes to strings with leading zeros:
#   state_ansi   → 2 digits (e.g. “04”, “17”)
#   county_ansi  → 3 digits (e.g. “001”, “107”)
#   district_code→ 2 digits

corn_yield_df = corn_yield_df.copy()

corn_yield_df['state_ansi']    = corn_yield_df['state_ansi'].astype(str).str.zfill(2)
corn_yield_df['county_ansi']   = corn_yield_df['county_ansi'].astype(str).str.zfill(3)
corn_yield_df['district_code'] = corn_yield_df['district_code'].astype(str).str.zfill(2)

# Save to CSV without the index
corn_yield_df.to_csv('df_corn_yield.csv', index=False)
