In [1]:
import geopandas as gpd
import pandas as pd

# Normalizing chemical use in pesticides

In [2]:
#load the cleaned ag pesticide records (with geometry) from the parquet file
ag_pesticides = gpd.read_parquet(path="importing_ag_data/ag_pesticides.parquet")

In [3]:
#add pct_chem_used: lbs of chemical applied per lb of product, so rows are comparable
ag_pesticides.loc[:, "pct_chem_used"] = ag_pesticides["lbs_chm_used"].div(ag_pesticides["lbs_prd_used"])

In [4]:
#swap nulls for 0.0 so the column stays a float
#NOTE: after this step a 0.0 can mean either "missing" or a real zero ratio
ag_pesticides["pct_chem_used"] = ag_pesticides["pct_chem_used"].fillna(0.0)

In [5]:
#preview the first rows to check the new pct_chem_used column
ag_pesticides.head()

Unnamed: 0,use_no,geometry,lbs_chm_used,lbs_prd_used,acre_planted,acre_treated,applic_dt,applic_time,site_loc_id,aer_gnd_ind,chemname,REGIONNAME,site_name,county_name,month,day,pct_chem_used
0,956467,"POLYGON ((-122.79807 38.66381, -122.79169 38.6...",6.216,7.77,169.53,2.59,04/08/2020,1630.0,1,G,SULFUR,NORTH COAST,"GRAPE, WINE",Sonoma,4,8,0.8
1,956473,"POLYGON ((-122.79807 38.66381, -122.79169 38.6...",10.68,13.35,169.53,4.45,04/08/2020,1630.0,1,G,SULFUR,NORTH COAST,"GRAPE, WINE",Sonoma,4,8,0.8
2,950977,"POLYGON ((-122.44568 38.15707, -122.44565 38.1...",,0.0023,142.0,0.43,04/10/2020,811.0,131,G,,SAN FRANCISCO BAY,"GRAPE, WINE",Sonoma,4,10,0.0
3,950981,"POLYGON ((-122.44568 38.15707, -122.44565 38.1...",0.015974,0.0652,142.0,0.49,04/10/2020,821.0,131,G,GLUFOSINATE-AMMONIUM,SAN FRANCISCO BAY,"GRAPE, WINE",Sonoma,4,10,0.245
4,950993,"POLYGON ((-122.85427 38.51788, -122.84768 38.5...",3.324537,3.9625,27.5,3.17,04/15/2020,650.0,115,G,COPPER OXIDE (OUS),NORTH COAST,"GRAPE, WINE",Sonoma,4,15,0.839


In [6]:
#use this function to find the avg pct_chem_used values for non-null entries in each group (group by county and site)
def avgs_to_dict(df):
    '''
    Maps each (county_name, site_name) group to the average ratio of
    lbs_chm_used to lbs_prd_used, using only rows where lbs_chm_used is not null.

    Ratios that are not finite (lbs_prd_used equal to 0) are treated as missing,
    so a single zero-weight product row cannot turn a group average into inf.
    Groups with no non-null lbs_chm_used rows do not appear in the result.

    inputs: dataframe with columns county_name, site_name, lbs_prd_used, lbs_chm_used
    outputs: dictionary keyed by (county_name, site_name) tuples, values are the group means
    '''
    group_cols = ["county_name", "site_name"]
    has_chem = df["lbs_chm_used"].notna()
    #only the columns that are actually needed, so frames without extra columns still work
    reported = df.loc[has_chem, group_cols + ["lbs_prd_used", "lbs_chm_used"]].copy()
    ratio = reported["lbs_chm_used"] / reported["lbs_prd_used"]
    #division by a zero lbs_prd_used gives +/-inf; mark it missing so mean() skips it
    reported["non_null_pct"] = ratio.replace([float("inf"), float("-inf")], float("nan"))
    return reported.groupby(group_cols)["non_null_pct"].mean().to_dict()
  
#build the lookup of average pct of chemicals, keyed by (county_name, site_name)
fill_dict = avgs_to_dict(ag_pesticides)

In [7]:
#a few (county, site) groups have no rows with a non-null lbs_chm_used, so they have no group average;
#for those, fall back to the average pct_chem_used of the county alone
groups_without_avg = [
    ('Madera', 'INDUSTRIAL HEMP'),
    ('San Luis Obispo', 'UNKNOWN'),
    ('El Dorado', 'BOYSENBERRY'),
    ('Monterey', 'PASTURELAND'),
]

#only rows that reported lbs_chm_used count toward the county average:
#the other rows hold a 0.0 placeholder in pct_chem_used and would pull the mean down
reported_chem = ag_pesticides["lbs_chm_used"].notna()
for county, site in groups_without_avg:
    in_county = ag_pesticides["county_name"] == county
    fill_dict[(county, site)] = ag_pesticides.loc[in_county & reported_chem, "pct_chem_used"].mean()

In [8]:
#fill in each 0.0 pct_chem_used entry with the average pct_chem_used for its group (found in the dictionary)

#select the placeholder rows once with a boolean mask instead of looking rows up one by one;
#this does not depend on the index being 0..n-1 and avoids a slow scalar .loc call per row
needs_fill = ag_pesticides["pct_chem_used"] == 0.0
if needs_fill.any():
    group_keys = zip(ag_pesticides.loc[needs_fill, "county_name"],
                     ag_pesticides.loc[needs_fill, "site_name"])
    #a group missing from fill_dict still raises KeyError, so gaps are not filled silently
    ag_pesticides.loc[needs_fill, "pct_chem_used"] = [fill_dict[key] for key in group_keys]

In [8]:
#This is another method to fill in the missing pct_chem_used values, but it is more expensive

# #takes about 8 minutes to run 

# for index, row in ag_pesticides.iterrows():
#     if row["pct_chem_used"] == 0.0:
#         group_tuple = (row["county_name"], row["site_name"])
#         ag_pesticides.loc[index, 'pct_chem_used'] = fill_dict[group_tuple]
            

In [9]:
#sanity check: list any rows still holding 0.0 in pct_chem_used (expected to be none after the fill)
ag_pesticides.loc[ag_pesticides["pct_chem_used"].eq(0.0)]

Unnamed: 0,use_no,geometry,lbs_chm_used,lbs_prd_used,acre_planted,acre_treated,applic_dt,applic_time,site_loc_id,aer_gnd_ind,chemname,REGIONNAME,site_name,county_name,month,day,pct_chem_used


In [None]:
#preview the first rows again after the fill step
ag_pesticides.head()

In [10]:
#write the modified df to its own parquet file so these steps do not have to be run again
ag_pesticides.to_parquet("../CA_Pesticide_Project/ag_pesticides_modified.parquet")