In [1]:
import pandas as pd
import numpy as np

In [2]:
# Show up to 50 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 50)

In [3]:
def read_in_csv(file_path='./parking-geo.csv'):
    """Read the parking-ticket CSV using an explicit dtype for each known column.

    Declaring the dtypes up front (32-bit ints, categoricals for repeated
    labels) is meant to keep the loaded frame memory efficient.

    Parameters
    ----------
    file_path : str or file-like
        Forwarded unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded data; 'issue_date' and 'ticket_queue_date' are parsed as
        dates instead of being kept as strings.
    """
    # Columns grouped by the dtype they are loaded with.
    int32_columns = (
        'ticket_number', 'fine_level1_amount', 'fine_level2_amount',
        'notice_number', 'year',
    )
    float64_columns = (
        'current_amount_due', 'total_payments', 'penalty', 'geocode_accuracy',
    )
    text_columns = (
        'violation_location', 'license_plate_number', 'zipcode',
        'dismissal_reason', 'officer', 'address', 'license_hash',
        'geocoded_address', 'geocoded_lng', 'geocoded_lat',
    )
    category_columns = (
        'license_plate_state', 'license_plate_type', 'violation_code',
        'violation_description', 'unit', 'unit_description', 'vehicle_make',
        'ticket_queue', 'notice_level', 'hearing_disposition', 'month',
        'hour', 'ward', 'geocode_accuracy_type', 'geocoded_city',
        'geocoded_state',
    )

    column_dtypes = {}
    for names, dtype in (
        (int32_columns, np.int32),
        (float64_columns, np.float64),
        (text_columns, str),
        (category_columns, 'category'),
    ):
        column_dtypes.update(dict.fromkeys(names, dtype))

    # Timestamps are parsed into datetimes -- still better than strings.
    date_columns = ['issue_date', 'ticket_queue_date']

    # Reading the full file into memory takes quite a while.
    return pd.read_csv(file_path, dtype=column_dtypes, parse_dates=date_columns)

In [4]:
def filter_data(df_raw, min_year = 1995, max_year = 2019):
    """Keep only well-geocoded Chicago tickets inside a year window that have a ward.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Needs the columns 'year', 'geocode_accuracy_type', 'geocode_accuracy',
        'geocoded_city' and 'ward'.
    min_year, max_year
        EXCLUSIVE bounds: a row is kept when ``min_year < year < max_year``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df_raw`` that pass every condition below.
    """
    accepted_accuracy_types = ['rooftop', 'range_interpolation', 'intersection', 'point']

    year_in_window = (df_raw['year'] > min_year) & (df_raw['year'] < max_year)
    accuracy_type_ok = df_raw['geocode_accuracy_type'].isin(accepted_accuracy_types)
    accuracy_score_ok = df_raw['geocode_accuracy'] > 0.7
    geocoded_in_chicago = df_raw['geocoded_city'] == 'Chicago'

    df_located = df_raw[
        year_in_window & accuracy_type_ok & accuracy_score_ok & geocoded_in_chicago
    ]

    # Rows without a ward are dropped.
    has_ward = df_located['ward'].notnull()
    return df_located[has_ward]

In [30]:
def calculate_summary_stats(df_raw, min_year = 1995, max_year = 2019, filter_and_group_data=True):
    """Compute ticket counts, dollar totals and share statistics.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Ticket-level data. The columns read here are 'ticket_number',
        'current_amount_due', 'fine_level1_amount', 'total_payments',
        'unit_description', 'hearing_disposition', 'ticket_queue' and
        'notice_level'. When ``filter_and_group_data`` is True, 'ward' and
        the columns used by ``filter_data`` are needed as well.
    min_year, max_year
        Forwarded to ``filter_data``; unused when ``filter_and_group_data``
        is False.
    filter_and_group_data : bool
        True  -> filter with ``filter_data`` and compute every statistic per ward.
        False -> compute every statistic once over all rows of ``df_raw``.

    Returns
    -------
    pandas.DataFrame
        (grouped mode) one row per ward, index cast to int and sorted; for
        every statistic ``<name>`` there is also a ``<name>_rank`` column,
        ranked in descending order of the statistic.
    pandas.Series
        (ungrouped mode) one numeric entry per statistic, no rank entries.

    Notes
    -----
    Ranks are cast with ``astype(int)``: fractional ranks produced by ties
    are truncated, and a NaN statistic (e.g. a ward with zero tickets) makes
    the cast raise.
    """
    if filter_and_group_data:
        df_filtered = filter_data(df_raw, min_year, max_year)
    else:
        df_filtered = df_raw

    def per_ward_or_overall(frame):
        # observed=False is spelled out so that, when 'ward' is categorical,
        # a ward with no rows in a sub-category still yields a count of 0
        # instead of disappearing, whatever the installed pandas defaults to.
        if filter_and_group_data:
            return frame.groupby('ward', observed=False)
        return frame

    # Row selections for each ticket sub-category, keyed by the prefix used
    # in the '<prefix>_ticket_count' / '<prefix>_ticket_count_pct' outputs.
    category_masks = {
        'police': df_filtered['unit_description'].isin(['CPD', 'CPD-Other', 'CPD-Airport']),
        'contested': df_filtered['hearing_disposition'].isin(['Liable', 'Not Liable']),
        'paid': df_filtered['ticket_queue'] == 'Paid',
        'dismissed': df_filtered['ticket_queue'] == 'Dismissed',
        'seized_or_suspended': df_filtered['notice_level'].isin(['SEIZ', 'DLS']),
        'bankruptcy': df_filtered['ticket_queue'] == 'Bankruptcy',
    }

    # Totals over all (filtered) tickets.
    all_tickets = per_ward_or_overall(df_filtered)
    ticket_count = all_tickets['ticket_number'].count()

    out_dict = dict()
    out_dict['ticket_count'] = ticket_count
    out_dict['current_amount_due'] = all_tickets['current_amount_due'].sum()
    out_dict['fine_level1_amount'] = all_tickets['fine_level1_amount'].sum()
    out_dict['total_payments'] = all_tickets['total_payments'].sum()
    out_dict['avg_per_ticket'] = out_dict['fine_level1_amount'] / ticket_count
    # Share of the money owed-or-paid that has actually been paid.
    out_dict['paid_pct'] = out_dict['total_payments'] / (
        out_dict['current_amount_due'] + out_dict['total_payments']
    )

    # Count and share of all tickets for each sub-category.
    for prefix, mask in category_masks.items():
        count_key = prefix + '_ticket_count'
        subset = per_ward_or_overall(df_filtered[mask])
        out_dict[count_key] = subset['ticket_number'].count()
        out_dict[count_key + '_pct'] = out_dict[count_key] / ticket_count

    if not filter_and_group_data:
        # Build the Series in one step: numeric dtype is inferred and no
        # empty, dtype-less Series is created first.
        return pd.Series(out_dict)

    # Grouped mode: one column per statistic plus its descending rank.
    df_out = pd.DataFrame()
    for key, values in out_dict.items():
        df_out[key] = values
        df_out[key+'_rank'] = values.rank(ascending=False).astype(int)
    df_out.index = df_out.index.astype(int)
    return df_out.sort_index()

In [28]:
def calculate_top_level_summaries(df_in, columns):
    """Return overall (unfiltered, ungrouped) statistics for the requested names.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Ticket-level data handed to ``calculate_summary_stats``.
    columns : list
        Statistic names to select from the result.

    Returns
    -------
    The entries of the ungrouped summary selected by ``columns``.
    """
    # Filtering and grouping are switched off, so the year bounds are unused.
    overall_stats = calculate_summary_stats(
        df_in,
        min_year=None,
        max_year=None,
        filter_and_group_data=False,
    )
    selected = overall_stats[columns]
    return selected

In [26]:
def calculate_top_5_per_year(df_raw, min_year=1995, max_year=2019):
    """Find, for each year, the five violation codes with the most tickets.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Ticket-level data; filtered with ``filter_data`` before counting.
    min_year, max_year
        Forwarded to ``filter_data``.

    Returns
    -------
    pandas.DataFrame
        Indexed by (year, violation_code) with a 'ticket_number' column
        holding the ticket count; up to five rows per year.
    """
    tickets = filter_data(df_raw, min_year, max_year)

    # Count tickets for every (year, violation_code) pair.
    counts_by_year_and_code = (
        tickets[['ticket_number', 'year', 'violation_code']]
        .groupby(['year', 'violation_code'])
        .count()
    )

    # Within each year keep the five largest counts.
    yearly_top_five = [
        year_counts.nlargest(5, columns='ticket_number')
        for _, year_counts in counts_by_year_and_code.groupby('year')
    ]

    return pd.concat(yearly_top_five)

In [8]:
%%time
# Load the ticket data from read_in_csv's default path ('./parking-geo.csv').
# takes quite a while...
df = read_in_csv()

CPU times: user 9min 21s, sys: 2min 43s, total: 12min 5s
Wall time: 12min 49s


In [10]:
%%time
# Per-ward summary. filter_data compares years with strict inequalities,
# so bounds (1995, 2019) keep tickets from 1996 through 2018.
df_1996to2018 = calculate_summary_stats(df, min_year=1995, max_year=2019)

CPU times: user 1min 23s, sys: 3min 6s, total: 4min 29s
Wall time: 5min 29s


In [11]:
%%time
# Per-ward summary. The year bounds are exclusive, so (2012, 2018) keeps
# tickets from 2013 through 2017.
df_2013to2017 = calculate_summary_stats(df, min_year=2012, max_year=2018)

CPU times: user 20.6 s, sys: 13.3 s, total: 33.9 s
Wall time: 29 s


In [12]:
%%time
# Five most-ticketed violation codes for each year; the exclusive bounds
# (2012, 2018) keep tickets from 2013 through 2017.
df_top_five_2013to2017 = calculate_top_5_per_year(df, min_year=2012, max_year=2018)

CPU times: user 9.94 s, sys: 2.66 s, total: 12.6 s
Wall time: 7.66 s


In [13]:
%%time
# All column names of the per-ward summary, and the subset that excludes the
# columns whose names end in '_pct' or '_rank'.
col_list = list(df_2013to2017.columns)
final_col_list = [name for name in col_list if not name.endswith(('_pct', '_rank'))]

CPU times: user 86 µs, sys: 1e+03 ns, total: 87 µs
Wall time: 90.8 µs


In [31]:
%%time
# Totals over the whole frame: calculate_top_level_summaries runs the summary
# without filtering or ward grouping and keeps only the names in final_col_list.
df_top_level_summaries = calculate_top_level_summaries(df, final_col_list)

CPU times: user 53 s, sys: 2min 22s, total: 3min 15s
Wall time: 3min 56s


In [33]:
# Write every result table to disk as CSV. All four outputs use an explicit
# '.csv' extension so the files are named consistently.
df_1996to2018.to_csv('df_1996to2018.csv')
df_2013to2017.to_csv('df_2013to2017.csv')
df_top_five_2013to2017.to_csv('df_top_five_2013to2017.csv')
df_top_level_summaries.to_csv('df_top_level_summaries.csv')

In [34]:
# Reference per-ward tables, indexed by ward and sorted ascending so they can
# be compared with the pandas results.
# NOTE(review): presumably these CSVs come from a SQL pipeline, judging by the
# '*_sql_minus_pandas' names below -- confirm.
df_check = pd.read_csv('./wardstotals.csv', index_col='ward').sort_index()
df_check5yr = pd.read_csv('./wardstotals5yr.csv', index_col='ward').sort_index()

# Restrict the reference tables to the columns produced by the pandas summary.
df_1996to2018_check = df_check.loc[:, col_list]
df_2013to2017_check = df_check5yr.loc[:, col_list]

# Element-wise difference: reference value minus pandas value.
wardstotals_sql_minus_pandas = df_1996to2018_check - df_1996to2018
wardstotals5yr_sql_minus_pandas = df_2013to2017_check - df_2013to2017

In [35]:
# Save the reference-minus-pandas difference tables for inspection.
wardstotals5yr_sql_minus_pandas.to_csv('./wardstotals5yr_sql_minus_pandas.csv')
wardstotals_sql_minus_pandas.to_csv('./wardstotals_sql_minus_pandas.csv')