# 12. Performance

## Exercise 48 - Categories

In [4]:
# load only the five columns this exercise needs
# (the head() output shows the frame keeps the file's column order, not the usecols order)
parking = pd.read_csv(
    root_path + 'nyc-parking-violations-2020.csv',
    usecols=[
        'Plate ID',
        'Registration State',
        'Vehicle Make',
        'Vehicle Color',
        'Vehicle Body Type',
    ],
    low_memory=False,
)

parking.head()

Unnamed: 0,Plate ID,Registration State,Vehicle Body Type,Vehicle Make,Vehicle Color
0,J58JKX,NJ,SDN,HONDA,BK
1,KRE6058,PA,SUBN,ME/BE,BLK
2,444326R,NJ,SDN,LEXUS,BLACK
3,F728330,OH,SDN,CHEVR,
4,FMY9090,NY,SUBN,JEEP,GREY


In [5]:
# determine amount of memory currently used: total bytes across all columns
# (deep=True also measures the contents of object-dtype columns)
parking.memory_usage(deep=True).sum()

3797397954

In [6]:
# convert every loaded column to the category dtype, one column at a time
for name in (
    'Plate ID',
    'Registration State',
    'Vehicle Make',
    'Vehicle Color',
    'Vehicle Body Type',
):
    parking[name] = parking[name].astype('category')

In [7]:
# new memory usage: same deep byte total, now measured on the category-typed columns
parking.memory_usage(deep=True).sum()

480673817

In [8]:
# bytes saved by the conversion: usage before minus usage after
# (both figures are copied from the two memory_usage outputs)
3_797_397_954 - 480_673_817

3316724137

In [9]:
# fraction of the original footprint that remains: new usage / old usage
480_673_817 / 3_797_397_954

0.1265797851114553

In [10]:
# what types are columns now? (confirms each converted column reports the category dtype)
parking.dtypes

Plate ID              category
Registration State    category
Vehicle Body Type     category
Vehicle Make          category
Vehicle Color         category
dtype: object

In [11]:
# .info(memory_usage='deep') reports the deep memory usage in a larger,
# human-readable unit (megabytes here) instead of raw bytes
parking.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12495734 entries, 0 to 12495733
Data columns (total 5 columns):
 #   Column              Dtype   
---  ------              -----   
 0   Plate ID            category
 1   Registration State  category
 2   Vehicle Body Type   category
 3   Vehicle Make        category
 4   Vehicle Color       category
dtypes: category(5)
memory usage: 458.4 MB


### Exercise 48b

In [13]:
# which column made the least sense to convert to category?
# 'Plate ID', as it has many unique values (few repeats per value)

In [14]:
# which made the most sense to turn into a category?

# re-read the CSV: the earlier cells converted `parking` to category dtype,
# and this reload starts again from the columns as read from the file
parking = pd.read_csv(root_path + 'nyc-parking-violations-2020.csv',
                      usecols=['Plate ID',
                               'Registration State',
                               'Vehicle Make',
                               'Vehicle Color',
                               'Vehicle Body Type'],
                      low_memory=False)

# non-null rows per distinct value in each column, highest first: the more
# often values repeat, the more a category dtype pays off
# (a mid-cell parking.head() was removed here; only a cell's last expression
# is displayed, so it computed a result that was never shown)
(parking.count() / parking.nunique()).sort_values(ascending=False)

Registration State    183760.794118
Vehicle Body Type       7619.200734
Vehicle Color           6383.835443
Vehicle Make            2386.432630
Plate ID                   3.850149
dtype: float64

In [15]:
# would standardizing inconsistent spellings (e.g. BK / BLK / BLACK for color) help?
# yes, standardizing the spellings would result in fewer categories, resulting in less memory usage

In [16]:
# sample the file: keep every column but stop after the first 10,000 rows
ten_thousand = pd.read_csv(
    root_path + 'nyc-parking-violations-2020.csv',
    nrows=10_000,
)
ten_thousand.head()

Unnamed: 0,Summons Number,Plate ID,Registration State,Plate Type,Issue Date,Violation Code,Vehicle Body Type,Vehicle Make,Issuing Agency,Street Code1,...,Vehicle Color,Unregistered Vehicle?,Vehicle Year,Meter Number,Feet From Curb,Violation Post Code,Violation Description,No Standing or Stopping Violation,Hydrant Violation,Double Parking Violation
0,1477633194,J58JKX,NJ,PAS,05/08/1972 12:00:00 AM,16,SDN,HONDA,P,8730,...,BK,0.0,0,-,0,,,,,
1,1449715424,KRE6058,PA,PAS,08/29/1977 12:00:00 AM,98,SUBN,ME/BE,P,86530,...,BLK,0.0,0,-,0,,,,,
2,1455779155,444326R,NJ,PAS,10/03/1988 12:00:00 AM,20,SDN,LEXUS,P,27030,...,BLACK,0.0,0,-,0,,,,,
3,1458800908,F728330,OH,PAS,01/03/1990 12:00:00 AM,21,SDN,CHEVR,P,33030,...,,0.0,0,-,0,,,,,
4,1466038676,FMY9090,NY,PAS,02/14/1990 12:00:00 AM,21,SUBN,JEEP,S,45130,...,GREY,0.0,2015,-,0,,,,,


In [17]:
# rank columns by non-null rows per distinct value and show the top 10,
# i.e. the columns that would benefit most from conversion to category
repeats_per_value = ten_thousand.count() / ten_thousand.nunique()
repeats_per_value.sort_values(ascending=False).head(10)

Violation Description                5615.000000
Violation Legal Code                 5615.000000
Law Section                          3333.333333
Unregistered Vehicle?                2169.000000
Violation County                     1086.333333
Issuing Agency                        909.090909
Feet From Curb                        833.333333
Violation In Front Of Or Opposite     796.800000
Date First Observed                   400.000000
Plate Type                            344.827586
dtype: float64

## Exercise 49 - Faster reading and writing

In [19]:
# load in data: the same five columns, read again for the
# file-format read/write timings below
parking = pd.read_csv(root_path + 'nyc-parking-violations-2020.csv',
                      usecols=['Plate ID',
                               'Registration State',
                               'Vehicle Make',
                               'Vehicle Color',
                               'Vehicle Body Type'],
                     low_memory=False)

In [20]:
# time writing to 3 different file formats

# output file names and the matching DataFrame writer methods, listed in the
# same order so that zip() pairs each file with its writer
file_names = ['parking.csv', 'parking.json', 'parking.feather']
convert_methods = [parking.to_csv, parking.to_json, parking.to_feather]

In [21]:
# sanity-check the timer: two back-to-back readings show the
# overhead of the measurement itself
start = time.perf_counter()
stop = time.perf_counter()

duration = stop - start
duration

2.2299995180219412e-05

In [22]:
# time all 3 writers in a loop: each file name is paired with its writer method
for file, write_to in zip(file_names, convert_methods):
    start = time.perf_counter()
    write_to(file)
    duration = time.perf_counter() - start
    print(f'{file}: {duration} seconds')

parking.csv: 20.741107499983627 seconds
parking.json: 10.354889100010041 seconds
parking.feather: 5.399322999990545 seconds


In [23]:
# view the size of the created files: every path in the working directory
# whose name starts with 'parking', with its size in bytes
root = 'parking'
for file in glob.glob(f'{root}*'):
    # pad the name to 27 characters so the sizes line up; ',' adds thousands separators
    print(f'{file:27}: {os.stat(file).st_size:,}')

parking.csv                : 424,952,751
parking.feather            : 403,367,042
parking.json               : 1,069,427,589


In [24]:
# time reading each file back into a DataFrame
# readers are listed in the same order as file_names so zip() pairs them up
read_methods = [pd.read_csv, pd.read_json, pd.read_feather]

for file, read_from in zip(file_names, read_methods):
    start = time.perf_counter()
    read_from(file)  # the loaded frame is discarded; only the elapsed time matters
    duration = time.perf_counter() - start
    print(f'{file}: {duration} seconds')

parking.csv: 7.324452900036704 seconds
parking.json: 75.53041779994965 seconds
parking.feather: 6.40810679999413 seconds


### Exercise 49b

In [26]:
# time read_csv on the same file with the pyarrow parsing engine
start = time.perf_counter()
pd.read_csv('parking.csv', engine='pyarrow')  # result discarded; only timing matters
duration = time.perf_counter() - start
duration

7.549705000012182

In [27]:
# does specifying dtypes take more or less time to read?
# (every column read earlier is declared as str up front)
t0 = time.perf_counter()

pd.read_csv('parking.csv',
            dtype={'Plate ID': str,
                   'Registration State': str,
                   'Vehicle Make': str,
                   'Vehicle Color': str,
                   'Vehicle Body Type': str})
t1 = time.perf_counter()
duration = t1 - t0
duration

# result: declaring the dtypes like this was slightly slower in this case
# than the plain read_csv timing measured earlier

7.723472299985588

In [28]:
# memory usage with the NumPy vs PyArrow dtype backend?
# each frame is measured inline and never bound to a name, so only the
# two byte totals are kept

# NumPy (no dtype_backend argument given)
np_mem = pd.read_csv('parking.csv').memory_usage(deep=True).sum()

# PyArrow
pa_mem = pd.read_csv('parking.csv', dtype_backend='pyarrow').memory_usage(deep=True).sum()

In [29]:
# bytes saved using PyArrow (NumPy-backed total minus PyArrow-backed total)
np_mem - pa_mem

3293170241

In [30]:
# proportion: PyArrow-backed footprint as a fraction of the NumPy-backed footprint
pa_mem / np_mem

0.15502622079296735

## Exercise 50 - "query" and "eval"

In [32]:
# load in data: six columns this time, parsed with the pyarrow CSV engine
parking = pd.read_csv(root_path + 'nyc-parking-violations-2020.csv',
                      usecols=['Plate ID',
                               'Registration State',
                               'Plate Type',
                               'Feet From Curb',
                               'Vehicle Make',
                               'Vehicle Color'],
                      engine='pyarrow')
parking.head()

Unnamed: 0,Plate ID,Registration State,Plate Type,Feet From Curb,Vehicle Make,Vehicle Color
0,J58JKX,NJ,PAS,0,HONDA,BK
1,KRE6058,PA,PAS,0,ME/BE,BLK
2,444326R,NJ,PAS,0,LEXUS,BLACK
3,F728330,OH,PAS,0,CHEVR,
4,FMY9090,NY,PAS,0,JEEP,GREY


In [33]:
# shorten the column names so the query()/eval() strings used later
# can refer to them as plain names
short_names = {
    'Plate ID': 'pid',
    'Registration State': 'state',
    'Plate Type': 'ptype',
    'Feet From Curb': 'feet',
    'Vehicle Make': 'make',
    'Vehicle Color': 'color',
}
parking = parking.rename(columns=short_names)
parking.head()

Unnamed: 0,pid,state,ptype,feet,make,color
0,J58JKX,NJ,PAS,0,HONDA,BK
1,KRE6058,PA,PAS,0,ME/BE,BLK
2,444326R,NJ,PAS,0,LEXUS,BLACK
3,F728330,OH,PAS,0,CHEVR,
4,FMY9090,NY,PAS,0,JEEP,GREY


In [34]:
# find all cars with registration state NY, NJ, CT using df.loc
# (a single .isin() membership test builds the boolean mask)
%timeit parking.loc[parking['state'].isin(['NY', 'NJ', 'CT'])]

1.54 s ± 27.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [35]:
# same selection using df.query, written as three equality tests joined with |
%timeit parking.query("state == 'NY' | state == 'NJ' | state == 'CT'")

2.19 s ± 59.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [36]:
# compare to boolean index with .loc? The same three == tests joined with |,
# i.e. the like-for-like counterpart of the df.query expression
%timeit parking.loc[(parking['state'] == 'NY') | (parking['state'] == 'NJ') | (parking['state'] == 'CT')]

3.46 s ± 103 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [37]:
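# My original .isin() syntax was already efficient, so no speed was gained by using df.query
# (query was slower here: 2.19 s vs 1.54 s); query did beat the chained == masks with .loc (3.46 s)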

In [38]:
# perform many queries using df.loc, df.query, df.eval and compare times

# cars from NY
# cars from NY with passenger (PAS) plates
# white cars from NY with PAS plates
# white cars from NY with PAS plates parked > 1 foot from curb
# white Toyota make cars from NY with PAS plates parked > 1 foot from curb

In [39]:
# using df.loc
# each step adds one more condition to the boolean mask, joined with &

# cars from NY
%timeit parking.loc[parking['state'] == 'NY']

# cars from NY with passenger (PAS) plates
%timeit parking.loc[(parking['state'] == 'NY') & (parking['ptype'] == 'PAS')]

# white cars from NY with PAS plates
%timeit parking.loc[(parking['state'] == 'NY') & (parking['ptype'] == 'PAS') & (parking['color'] == 'WHITE')]

# white cars from NY with PAS plates parked > 1 foot from curb
%timeit parking.loc[(parking['state'] == 'NY') & (parking['ptype'] == 'PAS') & (parking['color'] == 'WHITE') & (parking['feet'] > 1)]

# white Toyota make cars from NY with PAS plates parked > 1 foot from curb
%timeit parking.loc[(parking['state'] == 'NY') & (parking['ptype'] == 'PAS') & (parking['color'] == 'WHITE') & (parking['feet'] > 1) & (parking['make'] == 'TOYOT')]

1.75 s ± 84.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
2.28 s ± 36.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
2.41 s ± 83.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
2.33 s ± 9.37 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
3.12 s ± 29.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [40]:
# using df.query
# the same five selections, each written as a single expression string

# cars from NY
%timeit parking.query("state == 'NY'")

# cars from NY with passenger (PAS) plates
%timeit parking.query("state == 'NY' & ptype == 'PAS'")

# white cars from NY with PAS plates
%timeit parking.query("state == 'NY' & ptype == 'PAS' & color == 'WHITE'")

# white cars from NY with PAS plates parked > 1 foot from curb
%timeit parking.query("state == 'NY' & ptype == 'PAS' & color == 'WHITE' & feet > 1")

# white Toyota make cars from NY with PAS plates parked > 1 foot from curb
%timeit parking.query("state == 'NY' & ptype == 'PAS' & color == 'WHITE' & feet > 1 & make == 'TOYOT'")

1.4 s ± 28.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.57 s ± 33 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.18 s ± 19 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.17 s ± 15.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.45 s ± 16.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [41]:
# using df.eval
# eval() evaluates the expression string and its result is then used to
# index parking, so the row selection is a separate step in this version

# cars from NY
%timeit parking[parking.eval("state == 'NY'")]

# cars from NY with passenger (PAS) plates
%timeit parking[parking.eval("state == 'NY' & ptype == 'PAS'")]

# white cars from NY with PAS plates
%timeit parking[parking.eval("state == 'NY' & ptype == 'PAS' & color == 'WHITE'")]

# white cars from NY with PAS plates parked > 1 foot from curb
%timeit parking[parking.eval("state == 'NY' & ptype == 'PAS' & color == 'WHITE' & feet > 1")]

# white Toyota make cars from NY with PAS plates parked > 1 foot from curb
%timeit parking[parking.eval("state == 'NY' & ptype == 'PAS' & color == 'WHITE' & feet > 1 & make == 'TOYOT'")]

1.4 s ± 24.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.59 s ± 25.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.19 s ± 25.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.18 s ± 20.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.46 s ± 33.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Exercise 50b

In [43]:
# testing various syntax and engines

In [44]:
# variant 1: conditions joined with the keyword "and", no engine argument
%timeit parking.query("state == 'NY' and ptype == 'PAS' and color == 'WHITE' and feet > 1 and make == 'TOYOT'")

1.46 s ± 27.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [45]:
# variant 2: conditions joined with "&", no engine argument
%timeit parking.query("state == 'NY' & ptype == 'PAS' & color == 'WHITE' & feet > 1 & make == 'TOYOT'")

1.42 s ± 8.35 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [46]:
# variant 3: conditions joined with the keyword "and", engine='python'
%timeit parking.query("state == 'NY' and ptype == 'PAS' and color == 'WHITE' and feet > 1 and make == 'TOYOT'", engine='python')

1.49 s ± 34.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [47]:
# variant 4: conditions joined with "&", engine='python'
%timeit parking.query("state == 'NY' & ptype == 'PAS' & color == 'WHITE' & feet > 1 & make == 'TOYOT'", engine='python')

1.46 s ± 14.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [48]:
# query cars ticketed more than 1 meter from the curb using df.loc and df.query
# (the 'feet' column is in feet, so 1 meter is written as the 3.28 threshold)
%timeit parking.loc[parking['feet'] > 3.28]
%timeit parking.query("feet > 3.28")

58.5 ms ± 1.25 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
97.9 ms ± 1.09 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [49]:
# more than 1 meter from curb and from NY (also tested .isin() to show how powerful it is)
# order of the three timings: .loc with a one-element .isin(), .loc with ==, then df.query
%timeit parking.loc[(parking['feet'] > 3.28) & (parking['state'].isin(['NY']))]
%timeit parking.loc[(parking['feet'] > 3.28) & (parking['state'] == 'NY')]
%timeit parking.query("feet > 3.28 & state == 'NY'")

486 ms ± 6.45 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
841 ms ± 19.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
511 ms ± 3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Chapter 12 Notes
- df.memory_usage(deep=True) includes memory of object data types that normally is not included in the calculation
- (df.count() / df.nunique()).sort_values(ascending=False) is helpful in determining which columns could benefit most from being converted to categorical dtype
- df.info(memory_usage='deep') reports deep memory usage in a larger, human-readable unit (e.g. MB, GB) instead of raw bytes
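- pd.read_csv(filename, engine='pyarrow') can be up to 20x faster (although no speed-up was observed in this notebook's test: 7.55 s vs 7.32 s); can also use pd.read_feather('myfile.feather') to read a feather file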
- Using pd.read_csv(filename, dtype_backend='pyarrow') can take advantage of Apache Arrow dtypes in the backend. Simple comparisons currently faster with PyArrow, but more complex groupings and joins are still currently faster with NumPy backend.
- df.to_feather('myfile.feather') writes the DataFrame to a feather file
- From my testing, using engine='python' vs. 'numexpr' didn't make any computational difference with df.query. Using query on smaller dfs can end up being slower but it's a great option to be comfortable with when working with larger datasets. Bottleneck is often at assignment/retrieval of elements, not calculations, which could be the case here.
- .isin() is extremely efficient, even when using only one value in the list