In [1]:
import pandas as pd

In [12]:
# Load the raw contracts export that the following cells filter and clean.
dat = pd.read_csv(filepath_or_buffer="full_contracts_dataset.csv")

# Remove Some Rows

- Rows without a valid URL cannot be used.
- Rows without a valid contract record (no PIID) cannot be used.

In [13]:
# Rows without a valid URL, kept under their own name for inspection.
no_url = dat[~dat['Valid URL']]

# Keep only rows that have a valid URL and a recorded PIID. Chaining the two
# filters (rather than calling dropna(inplace=True) on an already-filtered
# frame) and taking an explicit copy gives the following cells an independent
# frame to assign columns on.
dat = dat[dat['Valid URL']].dropna(subset=['PIID']).copy()

# Handle Contracts where costs are not separated into current/totals

If a contract has only the current value recorded and its total column is empty, the current value is used as the total.

In [14]:
def handle_missing_totals(current_val, total_val):
    """Pick the total value, falling back to the current value when it is missing.

    Args:
        current_val: Value to use when ``total_val`` is missing.
        total_val: Preferred value; returned unless ``pd.isna`` flags it.

    Returns:
        ``current_val`` if ``pd.isna(total_val)`` is true, otherwise ``total_val``.
    """
    return current_val if pd.isna(total_val) else total_val

# Where a total column is missing, fall back to the matching current-value
# column. Series.fillna aligns on the index, so this is the column-wise
# equivalent of calling handle_missing_totals on every row, without building a
# row Series per value.
for total_col, current_col in [
    ('totalBaseAndExercisedOptionsValue', 'baseAndExercisedOptionsValue'),
    ('totalUltimateContractValue', 'ultimateContractValue'),
    ('totalObligatedAmount', 'obligatedAmount'),
]:
    dat[total_col] = dat[total_col].fillna(dat[current_col])

# Replace some NaN values with "UNKNOWN"

For important categorical columns where no information is recorded, the missing values are categorized as "UNKNOWN".

In [15]:
# Vendor location fields that get an explicit placeholder instead of NaN.
columns_to_replace_with_unknown = [
    'vendorState', 'vendorCity', 'vendorZip', 'vendorCongressionalDistrict',
]

# Fill one column at a time; only missing entries are overwritten.
for column_name in columns_to_replace_with_unknown:
    dat[column_name] = dat[column_name].fillna('UNKNOWN')

# Convert Dollar figures into usable values

The dollar amounts are formatted as strings. Values that are empty, or that say "SEE FPDS" (as some DOGE values do), are set to 0.

In [16]:
def clean_money(dollars: str) -> float:
    """Convert a dollar-formatted cell into a float.

    Args:
        dollars: Raw cell value. Normally a string such as ``"$1,234.50"``, but
            ``None``, NaN, blank strings and values that are already numeric
            are accepted too.

    Returns:
        The amount as a float. Missing or blank values and the "SEE FPDS"
        placeholder become ``0.0``.

    Raises:
        ValueError: If the text left after removing ``$``, ``,`` and carriage
            returns is not a valid number.
    """
    if dollars is None or pd.isna(dollars) or str(dollars).strip() == '':
        return 0.0

    # Any numeric scalar (Python or NumPy, int or float) is already a usable
    # amount; only strings need their formatting stripped.
    if pd.api.types.is_number(dollars):
        return float(dollars)

    cleaned = str(dollars).replace('\r', '').replace('$', '').replace(',', '').strip()

    # Placeholder used instead of an amount; tolerate stray whitespace and case.
    if cleaned.upper() == "SEE FPDS":
        return 0.0

    return float(cleaned)

# Dollar-formatted columns to convert to floats.
money_columns = [
    'Doge Value',
    'obligatedAmount',
    'totalObligatedAmount',
    'baseAndExercisedOptionsValue',
    'totalBaseAndExercisedOptionsValue',
    'ultimateContractValue',
    'totalUltimateContractValue',
    'feesPaidForUseOfService',
]

# clean_money only needs the cell itself, so map it over each column instead of
# building a full row Series per value with a row-wise apply.
for money_col in money_columns:
    dat[money_col] = dat[money_col].map(clean_money)

In [18]:
# Persist the cleaned frame; index=True (the to_csv default) keeps writing the
# row index as the first column.
dat.to_csv(path_or_buf="full_contracts_dataset_CLEANED.csv", index=True)