In [1]:
# Reload edited modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

# Move up one directory so that the relative 'data/...' paths used below resolve.
# NOTE(review): the change is relative to the current directory, so re-running
# this cell moves up one more level each time — run it once per kernel session.
%cd ..

/Users/rubenbroekx/Documents/Projects/radix-co2-reduction


# Correct Beck

Clean up and correct Beck's field dataset.

In [2]:
import re
import pandas as pd

from pathlib import Path
from datetime import datetime
from typing import Optional

In [3]:
# Read the raw Beck's export; the first CSV column is used as the row index.
raw_csv = Path.cwd() / 'data/beck_corn_data.csv'
beck = pd.read_csv(raw_csv, index_col=0)
# Discard the extra 'Unnamed: 0.1' column that is carried along in the CSV.
del beck['Unnamed: 0.1']
print("Number of samples:", len(beck))
beck

Number of samples: 2488


Unnamed: 0,harvest_date,id,lat,lng,pdf,planted_date,tillage,year
0,HARVESTED: 10/01/2017,67968.0,35.288577,-86.214094,67968.pdf,PLANTED: 04/14/2017,TILLAGE: No-Till,2017.0
1,HARVESTED: 09/30/2017,67971.0,35.246390,-87.093086,67971.pdf,PLANTED: 04/25/2017,TILLAGE: No-Till,2017.0
2,HARVESTED: 10/28/2017,67946.0,42.078668,-95.574360,67946.pdf,PLANTED: 04/23/2017\n\nHARVESTED: 10/28/2017 S...,TILLAGE: Field Cultivation,2017.0
3,HARVESTED: 10/30/2017,71032.0,40.614919,-88.539258,71032.pdf,PLANTED: 04/24/2017,TILLAGE: Strip-Till,2017.0
4,HARVESTED: 12/04/2017,68073.0,38.813362,-83.577890,68073.pdf,PLANTED: 04/26/2017,TILLAGE: No-Till,2017.0
...,...,...,...,...,...,...,...,...
2483,HARVESTED: 10/20/2020,102983.0,40.570445,-93.066438,102983.pdf,PLANTED: 04/30/2020,TILLAGE: No-Till,2020.0
2484,HARVESTED: 09/27/2020,102860.0,44.686568,-95.178946,102860.pdf,PLANTED: 04/23/2020,TILLAGE: Conv.-Till / Disk-Rip,2020.0
2485,HARVESTED: 09/24/2020,102797.0,38.251729,-87.496576,102797.pdf,PLANTED: 04/02/2020,TILLAGE: Chisel / Field Cultivation,2020.0
2486,HARVESTED: 10/01/2020,103511.0,40.628187,-89.453044,103511.pdf,PLANTED: 06/01/2020,TILLAGE: Field Cultivation,2020.0


## PDF column

Unnecessary, so drop it.

In [4]:
# The 'pdf' column is not needed for the corrected dataset, so remove it.
del beck['pdf']
print("Number of samples:", len(beck))

Number of samples: 2488


## Dates

Fix the planted and harvest dates.

In [5]:
def extract_harvest_date(d: str) -> Optional[str]:
    """Parse the harvest date out of a raw ``HARVESTED: mm/dd/yyyy`` value.

    The date is read from the fixed position directly after the
    ``"HARVESTED: "`` prefix; anything that follows the date is ignored.

    Args:
        d: Raw cell value, e.g. ``"HARVESTED: 10/01/2017"``.

    Returns:
        The date formatted as ``YYYY-MM-DD``, or ``None`` when the value is not
        a string (e.g. a NaN placeholder for a missing cell) or no valid date
        is found at the expected position.
    """
    # Missing values are not strings and would make re.sub raise a TypeError.
    if not isinstance(d, str):
        return None
    start = len("HARVESTED: ")
    stop = start + len("mm/dd/yyyy")
    # Newlines are collapsed to a single space before slicing out the date.
    raw_date = re.sub(r"[\n]+", " ", d)[start:stop]
    try:
        return datetime.strptime(raw_date, "%m/%d/%Y").strftime("%Y-%m-%d")
    except ValueError:
        return None
    
# Normalise every harvest date, then discard the rows that could not be parsed.
beck['harvest_date'] = list(map(extract_harvest_date, beck['harvest_date']))
beck.dropna(subset=['harvest_date'], inplace=True)
print("Number of samples:", len(beck))

Number of samples: 2432


In [6]:
def extract_planting_date(d: str) -> Optional[str]:
    """Parse the planting date out of a raw ``PLANTED: mm/dd/yyyy`` value.

    The date is read from the fixed position directly after the
    ``"PLANTED: "`` prefix; anything that follows the date (for example an
    appended harvest note) is ignored.

    Args:
        d: Raw cell value, e.g. ``"PLANTED: 04/14/2017"``.

    Returns:
        The date formatted as ``YYYY-MM-DD``, or ``None`` when the value is not
        a string (e.g. a NaN placeholder for a missing cell) or no valid date
        is found at the expected position.
    """
    # Missing values are not strings and would make re.sub raise a TypeError.
    if not isinstance(d, str):
        return None
    start = len("PLANTED: ")
    stop = start + len("mm/dd/yyyy")
    # Newlines are collapsed to a single space before slicing out the date.
    raw_date = re.sub(r"[\n]+", " ", d)[start:stop]
    try:
        return datetime.strptime(raw_date, "%m/%d/%Y").strftime("%Y-%m-%d")
    except ValueError:
        return None
    
# Normalise every planting date, then discard the rows that could not be parsed.
beck['planted_date'] = list(map(extract_planting_date, beck['planted_date']))
beck.dropna(subset=['planted_date'], inplace=True)
print("Number of samples:", len(beck))

Number of samples: 2430


## Typing

Ensure correct typing in dataset.

In [7]:
# Store the identifier and the year as integers.
for column in ('id', 'year'):
    beck[column] = beck[column].astype('int64')
print("Number of samples:", len(beck))

Number of samples: 2430


## Tillage

Correct the tillage.

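**NOTE: Currently, only the 'Conv.-Till' and 'No-Till' classes are kept; 'Field Cultivation' and 'Chisel / Field Cultivation' are mapped to 'Conv.-Till', and all other tillage types are dropped.**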

In [8]:
# Uncomment to inspect how often each raw tillage label occurs (most frequent first):
# from collections import Counter
# sorted(Counter(beck.tillage).items(), key=lambda x: -x[1])

In [9]:
# Tillage classes that are allowed in the corrected dataset.
TILLAGE_TYPES = ('Conv.-Till', 'No-Till')

In [10]:
def extract_tillage(t: str) -> Optional[str]:
    """Map a raw ``TILLAGE: ...`` label onto one of the kept tillage classes.

    Returns 'No-Till' or 'Conv.-Till' for the recognised labels, and None for
    every other value.
    """
    if t == 'TILLAGE: No-Till':
        return 'No-Till'

    # Raw labels that are all treated as conventional tillage.
    conventional_labels = (
        'TILLAGE: Conv.-Till',
        'TILLAGE: Field Cultivation',
        'TILLAGE: Chisel / Field Cultivation',
    )
    return 'Conv.-Till' if t in conventional_labels else None
    
# Map every raw label to its class, then discard the rows without a kept class.
beck['tillage'] = list(map(extract_tillage, beck['tillage']))
beck.dropna(subset=['tillage'], inplace=True)
print("Number of samples:", len(beck))

Number of samples: 1591


In [11]:
# Manual correction: maps a sample id to the tillage class it should get.
TILLAGE = {
}

print(f"Corrected {len(TILLAGE)} samples")
for sample_id, tillage in TILLAGE.items():
    assert tillage in TILLAGE_TYPES, f"Unknown tillage type {tillage!r} for id {sample_id}"
    # Use .loc with a boolean mask: .at only accepts a single scalar row label,
    # not an Index of matching rows. Ids that are not present match no rows.
    beck.loc[beck['id'] == sample_id, 'tillage'] = tillage
print("Number of samples:", len(beck))

Corrected 0 samples
Number of samples: 1591


## Cover Crop

Add information about cover crops (manually checked).

In [12]:
# Start without a cover-crop annotation for any sample; only the manually
# checked ids receive a True/False value afterwards.
beck['cover_crop'] = None

In [13]:
# Manually checked cover-crop annotations: sample id -> whether a cover crop
# was used. Every id must appear only once; a repeated key in a dict literal
# is silently overwritten by its last occurrence.
COVER_CROP = {
    94546: True,
    103524: True,
    103476: True,
    103293: True,
    103036: True,
    103154: True,
    103038: True,
    103423: True,
    103256: True,
    103249: True,
    103536: True,
    94114: True,
    103688: True,
    57901: True,
    103071: True,
    94513: True,
    103639: True,
    102834: True,
    102784: True,
    93836: True,
    68001: True,
    102919: True,
    103424: True,
    104133: True,
    94523: True,
    94625: True,
    93727: True,
    69422: True,
    70781: True,
    70782: True,
    80077: True,
    103420: True,
    71025: True,
    93837: True,
    70432: True,
    80080: True,
    79661: True,
    103088: True,
    103514: True,
    69399: True,
    68286: True,
    79391: True,
    68170: True,
    103575: True,
    102777: True,
    103698: False,
    103810: False,
    104403: False,
    104394: False,
    104353: False,
    104312: False,
    79482: False,
    68364: False,
    103408: False,
    104551: False,
    104354: False,
    103671: False,
    103910: False,
    80304: False,
    79981: False,
    103916: False,
    103837: False,
    102922: False,
    93796: False,
    79239: False,
    93741: False,
    93783: False,
    103569: False,
    103352: False,
}

# The count below is the number of annotations, not the number of rows changed.
print(f"Annotated {len(COVER_CROP)} samples")
for sample_id, has_cover_crop in COVER_CROP.items():
    # Use .loc with a boolean mask: .at only accepts a single scalar row label,
    # not an Index of matching rows. Ids that are not present match no rows.
    beck.loc[beck['id'] == sample_id, 'cover_crop'] = has_cover_crop
print("Number of samples:", len(beck))

Annotated 69 samples
Number of samples: 1591


## Save

Save the adjusted dataset.

In [14]:
# Write the corrected dataset next to the raw data, without the row index.
corrected_csv = Path.cwd() / 'data/beck_corrected.csv'
beck.to_csv(corrected_csv, index=False)