In [1]:
import geopandas as gpd
import pandas as pd

In [2]:
# Load the 2020 agricultural pesticide-use records (no geometry), reading
# only the columns that are needed.
pur_path = "/srv/data/my_shared_data_folder/cpr/pur2020_ag.parquet"
pur_columns = [
    "use_no",
    "lbs_chm_used",
    "lbs_prd_used",
    "acre_planted",
    "acre_treated",
    "applic_dt",
    "applic_time",
    "site_loc_id",
    "aer_gnd_ind",
    "chemname",
    "REGIONNAME",
    "site_name",
    "county_name",
]

# Read and immediately drop fully duplicated rows
# (this reduces the total number of rows from ~7.5mil to ~3.7mil).
ag_pur = pd.read_parquet(
    pur_path, engine="fastparquet", columns=pur_columns
).drop_duplicates()

In [3]:
def month(column_value):
    '''
    Takes in a full "month/day/year" date string and returns the month part

    input: value in each row of the column (string, or a missing value
           such as None or NaN)
    returns: the text before the first "/" (string), or None when the
             value is missing
    '''
    # Missing values may show up as None or as float NaN; neither has
    # .split(), so both are mapped to None.
    if pd.isna(column_value):
        return None

    return column_value.split("/")[0]

def day(column_value):
    '''
    Takes in a full "month/day/year" date string and returns the day part

    input: full date value in each row of the column (string, or a
           missing value such as None or NaN)
    returns: the text between the first and second "/" (string), or None
             when the value is missing
    '''
    # Missing values may show up as None or as float NaN; neither has
    # .split(), so both are mapped to None.
    if pd.isna(column_value):
        return None

    return column_value.split("/")[1]

In [4]:
# Add "month" and "day" columns, each extracted from the application date
# stored in the "applic_dt" column.
applic_dates = ag_pur["applic_dt"]

ag_pur["month"] = applic_dates.apply(month)
ag_pur["day"] = applic_dates.apply(day)

In [5]:
# Write the augmented dataframe (now including "month" and "day") to a new
# parquet file.
output_path = "../importing_ag_data/ag_info.parquet"
ag_pur.to_parquet(path=output_path)