In [1]:
import pandas as pd

In [16]:
dat = pd.read_csv("full_contracts_dataset.csv")

# Clean Data

- remove rows
    - If no valid URL
    - If no valid record PIID (ex https://www.fpds.gov/common/jsp/LaunchWebPage.jsp?command=execute&requestid=239771231&version=1.5)
- put all dollar figures into numbers  
    - Doge Value
        - How to handle "SEE FPDS"? -> Set to $0
    - obligatedAmount  
    - totalObligatedAmount  
    - baseAndExercisedOptionsValue  
    - totalBaseAndExercisedOptionsValue  
    - ultimateContractValue  
    - feesPaidForUseOfService  

- handle records with missing totals  
    - Replace the NaN total values with the current values  
        - ultimateContractValue -> totalUltimateContractValue  
        - baseAndExercisedOptionsValue -> totalBaseAndExercisedOptionsValue  
        - obligatedAmount -> totalObligatedAmount  
- Convert dates to datetime
    - displayLastModifiedDate
More to come

In [17]:
#Remove/replace x and y columns

dat.head()

Unnamed: 0,Doge Agency,Doge Upload Date,Contract URL,Doge Value,Doge Desc,Valid URL,agencyID,PIID,modNumber,transactionNumber,...,emailAddress,individualOrderLimit,fixedFeeValue,feeRangeLowerValue,feeRangeUpperValue,orderingProcedure,displayIDVType,typeOfIDC,multipleOrSingleAwardIDC,whoCanUse
0,COMMODITY FUTURES TRADING COMMISSION,2/11/2025,https://www.fpds.gov/ezsearch/jsp/viewLinkCont...,$0,RENEW WEST PRINT SUBSCRIPTIONS FOR VARIOUS LEG...,True,9507,9523ZY19F0048,P00001,0.0,...,,,,,,,,,,
1,COMMODITY FUTURES TRADING COMMISSION,2/13/2025,https://www.fpds.gov/ezsearch/jsp/viewLinkCont...,"$60,373",ANNUAL SUBSCRIPTION TO POLITICO PRO,True,9507,9523ZY21P0041,P00003,0.0,...,,,,,,,,,,
2,CONSUMER FINANCIAL PROTECTION BUREAU,2/11/2025,https://www.fpds.gov/ezsearch/jsp/viewLinkCont...,"$148,350",BLOOMBERG TERMINALS,True,955F,9531CB24C0024,P00001,0.0,...,,,,,,,,,,
3,CONSUMER FINANCIAL PROTECTION BUREAU,2/11/2025,https://www.fpds.gov/ezsearch/jsp/viewLinkCont...,"$584,172",BLOOMBERG GOVERNMENT SUBSCRIPTIONS,True,955F,9531CB23F0100,P00002,0.0,...,,,,,,,,,,
4,CONSUMER FINANCIAL PROTECTION BUREAU,2/11/2025,https://www.fpds.gov/ezsearch/jsp/viewLinkCont...,"$522,660",POLITICO PRO,True,955F,9531CB25P0012,P00001,0.0,...,,,,,,,,,,


In [19]:
no_url = dat[~dat['Valid URL']]
no_url

dat = dat[dat['Valid URL']]

#If no valid record PIID
dat.dropna(subset=['PIID'], inplace=True)

In [20]:
def handle_missing_totals(current_val, total_val):
    if pd.isna(total_val):
        return current_val
    else:
        return total_val

dat['totalBaseAndExercisedOptionsValue'] = dat.apply(lambda row: handle_missing_totals(row['baseAndExercisedOptionsValue'], row['totalBaseAndExercisedOptionsValue']), axis=1)
dat['totalUltimateContractValue'] = dat.apply(lambda row: handle_missing_totals(row['ultimateContractValue'], row['totalUltimateContractValue']), axis=1)
dat['totalObligatedAmount'] = dat.apply(lambda row: handle_missing_totals(row['obligatedAmount'], row['totalObligatedAmount']), axis=1)

In [21]:
dat[['vendorState', 'vendorCity', 'vendorZip', 'vendorCongressionalDistrict']] = dat[['vendorState', 'vendorCity', 'vendorZip', 'vendorCongressionalDistrict']].fillna('UNKNOWN')

In [22]:
def clean_money(dollars: str) -> str:
    if dollars is None or pd.isna(dollars) or str(dollars).strip() == '':
        return 0.0

    if type(dollars) is float:
        return dollars
    
    dollars = dollars.replace('\r', '')
    dollars = dollars.replace('$', '')
    dollars = dollars.replace(',', '')

    if dollars == "SEE FPDS":
        dollars = 0


    return float(dollars)

dat['Doge Value'] = dat.apply(lambda row: clean_money(row['Doge Value']), axis=1)
dat['obligatedAmount'] = dat.apply(lambda row: clean_money(row['obligatedAmount']), axis=1)
dat['totalObligatedAmount'] = dat.apply(lambda row: clean_money(row['totalObligatedAmount']), axis=1)
dat['baseAndExercisedOptionsValue'] = dat.apply(lambda row: clean_money(row['baseAndExercisedOptionsValue']), axis=1)
dat['totalBaseAndExercisedOptionsValue'] = dat.apply(lambda row: clean_money(row['totalBaseAndExercisedOptionsValue']), axis=1)
dat['ultimateContractValue'] = dat.apply(lambda row: clean_money(row['ultimateContractValue']), axis=1)
dat['totalUltimateContractValue'] = dat.apply(lambda row: clean_money(row['totalUltimateContractValue']), axis=1)
dat['feesPaidForUseOfService'] = dat.apply(lambda row: clean_money(row['feesPaidForUseOfService']), axis=1)

In [8]:
sum_rows = []

def format_as_money(amount: float) -> str:
    """Formats a float as a string representing money, 
    including commas and two decimal places."""
    return "${:,.2f}".format(amount)

## How much money does DOGE claim to have saved in contracts on doge.gov?

What is the sum of the "Value" column on the doge "savings" tab?

In [30]:
sum_rows.append({
    "Question": "How much money does DOGE claim to have saved in cancelled contracts?",
    "Answer": format_as_money(sum(dat['Doge Value']))
})

## How much money was actually saved?
We want to take the ultimate total potential contract value, and subtract all exercised options (money already spent on the contract).
This would reflect the maximum potential savings after termination of a contract in progress.

(totalUltimateContractValue - totalBaseAndExercisedOptionsValue)

"Being as generous as possible, this is the most that could be saved. Why is your number more than doubled?"

In [31]:
def calculate_savings(row):
    tcv = row['totalUltimateContractValue']
    spent = row['totalBaseAndExercisedOptionsValue']
    return tcv - spent

dat['maximumSavingsRealized'] = dat.apply(lambda row: calculate_savings(row), axis=1)


sum_rows.append({
    "Question": "How much money was actually saved?",
    "Answer": format_as_money(sum(dat['maximumSavingsRealized']))
})

## How much money was wasted by canceling contracts early?

For in progress contracts that did not complete, how much money was already invested that the government will not realize the benefit of?  
i.e. sunk costs.  

We want to look at the total exercised options, and estimate overhead costs for contract procurement.  

DISCLAIMER: This assumes that the intent behind the award requires the contract to be executed completely/fully funded in order to be realized. There may be circumstances where some value was still salvaged, however this granularity of information cannot be determined with our level of access.

In [32]:
def calculate_sunk_costs(row):
    ESTIMATED_OVERHEAD_COSTS = 0
    sunk_cost = row['totalBaseAndExercisedOptionsValue']
    return sunk_cost + ESTIMATED_OVERHEAD_COSTS

dat['sunkCosts'] = dat.apply(lambda row: calculate_sunk_costs(row), axis=1)
sum(dat['sunkCosts'])

sum_rows.append({
    "Question": "How much money was wasted by terminating contracts early?",
    "Answer": format_as_money(sum(dat['sunkCosts']))
})

## How much money is each state losing?

To determine this, we'll want to know, by state, how much potential spending is being withheld. This will use the derived column 'maximumSavingsRealized' to indicate funding that is no longer flowing to companies in the respective states.  

We will create a smaller dataframe with just the state code (vendor and savings, and groupby the state

In [33]:
only_state = dat[['PIID', 'vendorState', 'maximumSavingsRealized']]
BY_STATE = only_state.groupby('vendorState', as_index=False)['maximumSavingsRealized'].sum()
BY_STATE = BY_STATE.sort_values(by='maximumSavingsRealized', ascending=False)
BY_STATE['maximumSavingsRealized'] = BY_STATE.apply(lambda row: format_as_money(row['maximumSavingsRealized']), axis=1)
BY_STATE.rename(columns={'maximumSavingsRealized': 'Money Lost'}, inplace=True)

sum_rows.append({
    "Question": "How much money is each state losing?",
    "Answer": "See Sheet 2"
})

## How much money is each district losing?

Again, same methodology, just with congressional district.

In [34]:
#vendorCongressionalDistrict
only_district = dat[['PIID', 'vendorCongressionalDistrict', 'maximumSavingsRealized']]
BY_DISTRICT = only_district.groupby('vendorCongressionalDistrict', as_index=False)['maximumSavingsRealized'].sum()
BY_DISTRICT = BY_DISTRICT.sort_values(by='maximumSavingsRealized', ascending=False)
BY_DISTRICT['maximumSavingsRealized'] = BY_DISTRICT.apply(lambda row: format_as_money(row['maximumSavingsRealized']), axis=1)
BY_DISTRICT.rename(columns={'maximumSavingsRealized': 'Money Lost'}, inplace=True)
sum_rows.append({
    "Question": "How much money is each district losing?",
    "Answer": "See Sheet 3"
})

## How much money is each company losing?

Similar methodology as by state, just grouped by company name.

In [35]:
only_company = dat[['PIID', 'vendorName', 'maximumSavingsRealized']]
BY_COMPANY = only_company.groupby('vendorName', as_index=False)['maximumSavingsRealized'].sum()
BY_COMPANY = BY_COMPANY.sort_values(by='maximumSavingsRealized', ascending=False)
BY_COMPANY['maximumSavingsRealized'] = BY_COMPANY.apply(lambda row: format_as_money(row['maximumSavingsRealized']), axis=1)
BY_COMPANY.rename(columns={'maximumSavingsRealized': 'Money Lost'}, inplace=True)
sum_rows.append({
    "Question": "How much money is each company losing?",
    "Answer": "See Sheet 4"
})

## How many private sector jobs will be lost? (estimate)

Take the average median salary of each state (provided by sofi) to calculate the 

## Summarize findings

Put it all together in one sheet.

In [36]:
from pandas import ExcelWriter
def save_xls(list_dfs, xls_path):
    with ExcelWriter(xls_path) as writer:
        for n, df in enumerate(list_dfs):
            df.to_excel(writer,'Sheet %s' % (n + 1), index=False)

QUESTIONS = pd.DataFrame(sum_rows)
save_xls([QUESTIONS, BY_STATE, BY_DISTRICT, BY_COMPANY], "summary.xlsx")

  df.to_excel(writer,'Sheet %s' % (n + 1), index=False)
