In [None]:
import pandas as pd
import re

# Lift the row cap so displayed frames are never truncated.
pd.options.display.max_rows = None
# Render every float with exactly three decimal places.
pd.options.display.float_format = lambda value: '%.3f' % value

def currency(x):
    """Parse a formatted currency string into a float.

    Thousands separators and dollar signs are dropped, and an opening
    parenthesis is turned into a minus sign, so ``"($1,234.50)"`` becomes
    ``-1234.5``.

    Args:
        x: Raw cell text such as ``"$1,234.50"``; may be empty or ``None``.

    Returns:
        The amount as a float, or ``0.0`` when ``x`` is empty, ``None`` or
        contains nothing but whitespace.

    Raises:
        ValueError: If the cleaned text is still not a valid number.
    """
    if not x:
        return 0.0
    cleaned = (
        x
        .replace(',', '')
        .replace('$', '')
        .replace('(', '-')
        .replace(')', '')
        .strip()
    )
    # A blank cell carries no amount; float('') would raise ValueError.
    return float(cleaned) if cleaned else 0.0

def clean_merchant(x):
    """Normalise a raw merchant name so repeated vendors group together.

    The name is title-cased and every digit 0-9 is removed. If the result
    contains one of the known Amazon markers it collapses to a single
    canonical label; otherwise the title-cased, digit-free text is returned
    as is (other punctuation and spacing are left untouched).
    """
    stripped = re.sub('[0-9]', '', x.title())
    # Checked in order: the first marker found decides the label.
    canonical_labels = (
        ('Amzn Mktp Us', 'Amzn Mktp US'),
        ('Amazon.Com', 'Amazon.com'),
    )
    for marker, label in canonical_labels:
        if marker in stripped:
            return label
    return stripped

# Years whose transaction files are loaded; one CSV per entry.
years = [2018, 2019, 2020]

# Read each yearly CSV, cleaning three columns while parsing, and stack the
# results into a single frame.
df = (
    pd.concat(
        (
            pd.read_csv(
                f"../input/san-jose-public-employee-compensation/procurement-card-transactions-{year}.csv",
                converters = {
                    'Transaction Amount': currency,
                    'Cardholder Name': lambda x: x.title(),
                    'Merchant Name': clean_merchant, # Remove trans id, store #, etc
                }
            )
            # Tag every row with its source year (stored as text, not a number).
            .assign(Year = str(year))
            for year in years
        ),
        # Each file is read with its own default 0..n-1 row labels; renumber
        # so the combined frame does not carry duplicate index labels.
        ignore_index=True,
    )
)
# Summary statistics for every column, numeric and non-numeric alike.
df.describe(include='all')

# What are the top merchants for the Police Department, by total spend?

In [None]:
# Police-department spend per merchant: total amount and the most frequent
# merchant category, largest totals first, limited to the first 200 merchants.
(
    df
    .query("Department in ['POLICE DEPARTMENT','POLICE']")
    .groupby(['Merchant Name'])
    .agg(
        TotalSpent=pd.NamedAgg(column='Transaction Amount', aggfunc='sum'),
        Category=pd.NamedAgg(
            column='Merchant Category Code Description',
            aggfunc=pd.Series.mode,
        ),
    )
    .sort_values('TotalSpent', ascending=False)
    .head(200)
)

# Who submits the expense reports? (Police Department cardholders, ranked by total spend)

In [None]:
# Police-department spend per cardholder: total amount, number of
# transactions and most frequent merchant, largest totals first (first 100).
(
    df
    .query("Department in ['POLICE DEPARTMENT', 'POLICE']")
    .groupby('Cardholder Name')
    .agg(
        TotalSpent=pd.NamedAgg(column='Transaction Amount', aggfunc='sum'),
        TransCnt=pd.NamedAgg(column='Transaction Date', aggfunc='count'),
        TopMerchant=pd.NamedAgg(column='Merchant Name', aggfunc=pd.Series.mode),
    )
    .sort_values('TotalSpent', ascending=False)
    .head(100)
)

# Let's see the top 500 Police Department transactions by amount

In [None]:
# The 500 police-department rows with the largest transaction amounts,
# biggest first.
(
    df
    .query("Department in ['POLICE DEPARTMENT', 'POLICE']")
    .sort_values('Transaction Amount', ascending=False)
    .head(500)
)