In [1]:
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Location of the 2020 NYC parking-violations CSV.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own copy of the data before running.
filename = '/Users/reuven/Courses/Current/data/nyc-parking-violations-2020.csv'

# Load only the four columns this notebook works with.
df = pd.read_csv(
    filename,
    usecols=['Issue Date', 'Vehicle Make', 'Vehicle Color', 'Street Name'],
)

In [3]:
# Peek at the first ten rows of the loaded frame.
df.head(n=10)

Unnamed: 0,Issue Date,Vehicle Make,Street Name,Vehicle Color
0,05/08/1972 12:00:00 AM,HONDA,43 ST,BK
1,08/29/1977 12:00:00 AM,ME/BE,UNION ST,BLK
2,10/03/1988 12:00:00 AM,LEXUS,CLERMONT AVENUE,BLACK
3,01/03/1990 12:00:00 AM,CHEVR,DIVISION AVE,
4,02/14/1990 12:00:00 AM,JEEP,GRAND ST,GREY
5,07/21/1990 12:00:00 AM,HYUN,B 99 ST,GY
6,09/19/1990 12:00:00 AM,INTER,W/S/O WASHINGTON ST,WH
7,10/14/1990 12:00:00 AM,BMW,BAINBRIDGE AVE,BLK
8,07/25/1991 12:00:00 AM,FORD,94 ST,GREY
9,01/01/2000 12:00:00 AM,CMCKU,E 54 ST,RED


In [4]:
# How often does each recorded vehicle color show up among the tickets?
# Only the ten most frequent values are displayed.
(
    df['Vehicle Color']
    .value_counts()
    .head(10)
)

Vehicle Color
WH       2344858
GY       2307704
BK       2066374
WHITE    1061234
BL        775124
RD        483298
BLACK     465110
GREY      306787
BROWN     292348
SILVE     191477
Name: count, dtype: int64

In [5]:
# Show the first fifteen rows of the frame.
df.head(n=15)

Unnamed: 0,Issue Date,Vehicle Make,Street Name,Vehicle Color
0,05/08/1972 12:00:00 AM,HONDA,43 ST,BK
1,08/29/1977 12:00:00 AM,ME/BE,UNION ST,BLK
2,10/03/1988 12:00:00 AM,LEXUS,CLERMONT AVENUE,BLACK
3,01/03/1990 12:00:00 AM,CHEVR,DIVISION AVE,
4,02/14/1990 12:00:00 AM,JEEP,GRAND ST,GREY
...,...,...,...,...
10,01/02/2000 12:00:00 AM,HONDA,ROCKAWAY AVE,GREY
11,01/02/2000 12:00:00 AM,HONDA,E 167 ST,GRAY
12,01/02/2000 12:00:00 AM,HONDA,ROCKAWAY AVE,BLACK
13,01/02/2000 12:00:00 AM,HONDA,ROCKAWAY AVE,BLACK


In [6]:
# Series.replace takes the value to look for and the value to substitute.
# The result is a new Series; nothing is assigned back to df here.
df['Vehicle Color'].replace(to_replace='BK', value='BLACK')

0           BLACK
1             BLK
2           BLACK
3             NaN
4            GREY
            ...  
12495729       BR
12495730      BLK
12495731       GY
12495732      BLK
12495733    WHITE
Name: Vehicle Color, Length: 12495734, dtype: object

In [7]:
# Preview how the BK -> BLACK substitution changes the 15 most common colors.
(
    df['Vehicle Color']
    .replace(to_replace='BK', value='BLACK')
    .value_counts()
    .head(15)
)

Vehicle Color
BLACK    2531484
WH       2344858
GY       2307704
WHITE    1061234
BL        775124
RD        483298
GREY      306787
BROWN     292348
SILVE     191477
GR        182929
BLUE      178298
RED       161693
TN        120576
BR        102204
YW         98700
Name: count, dtype: int64

In [None]:
# One replace call per spelling works, but it gets tedious quickly.
df['Vehicle Color'] = (
    df['Vehicle Color']
    .replace('BK', 'BLACK')
    .replace('WH', 'WHITE')
)

In [8]:
# A dict of {recorded spelling: canonical name} handles many substitutions
# in a single replace call.
color_map = {
    'BK': 'BLACK',
    'BLK': 'BLACK',
    'WH': 'WHITE',
    'GY': 'GRAY',
    'GREY': 'GRAY',
}

# Display the cleaned column; nothing is assigned back to df on this line.
df['Vehicle Color'].replace(to_replace=color_map)

0           BLACK
1           BLACK
2           BLACK
3             NaN
4            GRAY
            ...  
12495729       BR
12495730    BLACK
12495731     GRAY
12495732    BLACK
12495733    WHITE
Name: Vehicle Color, Length: 12495734, dtype: object

In [9]:
# Store the normalized colors back into the frame so later cells see them.
df['Vehicle Color'] = df['Vehicle Color'].replace(to_replace=color_map)

In [10]:
# Twenty most frequent values in the color column.
df['Vehicle Color'].value_counts().head(n=20)

Vehicle Color
WHITE    3406092
GRAY     2655345
BLACK    2623023
BL        775124
RD        483298
BROWN     292348
SILVE     191477
GR        182929
BLUE      178298
RED       161693
TN        120576
BR        102204
YW         98700
OTHER      60245
GREEN      58765
GL         54851
GRY        46527
MR         42812
WHT        35433
YELLO      32792
Name: count, dtype: int64

In [11]:
# Extend the mapping with three more abbreviations (BL, RD, GR) and apply it.
color_map = {
    'BK': 'BLACK', 'BLK': 'BLACK',
    'WH': 'WHITE',
    'GY': 'GRAY', 'GREY': 'GRAY',
    'BL': 'BLUE',
    'RD': 'RED',
    'GR': 'GREEN',
}

df['Vehicle Color'] = df['Vehicle Color'].replace(to_replace=color_map)

# Re-check the twenty most common colors after the substitution.
df['Vehicle Color'].value_counts().head(n=20)

Vehicle Color
WHITE    3406092
GRAY     2655345
BLACK    2623023
BLUE      953422
RED       644991
BROWN     292348
GREEN     241694
SILVE     191477
TN        120576
BR        102204
YW         98700
OTHER      60245
GL         54851
GRY        46527
MR         42812
WHT        35433
YELLO      32792
WHI        29760
OR         28100
BK.        27830
Name: count, dtype: int64

In [12]:
# Read the CSV again so that df holds the original, unmodified color values
# (the 'Vehicle Color' column was overwritten by the replacements above).
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own copy of the data before running.
filename = '/Users/reuven/Courses/Current/data/nyc-parking-violations-2020.csv'

df = pd.read_csv(
    filename,
    usecols=['Issue Date', 'Vehicle Make', 'Vehicle Color', 'Street Name'],
)

In [13]:
# First ten values of the color column.
df['Vehicle Color'].head(n=10)

0       BK
1      BLK
2    BLACK
3      NaN
4     GREY
5       GY
6       WH
7      BLK
8     GREY
9      RED
Name: Vehicle Color, dtype: object

In [15]:
# Anything that starts with B and ends with K should be considered black,
# which a regular expression can express: ^B.*K$
#
# The pattern is keyed by column name so that only 'Vehicle Color' is
# rewritten. An unrestricted df.replace(..., regex=True) tests every string
# column, so a make or street name that happens to begin with B and end with K
# (for instance a value such as 'BUICK') would also be turned into 'BLACK'.
# The result is a new DataFrame; df itself is not modified.
df.replace(
    {'Vehicle Color': r'^B.*K$'},
    {'Vehicle Color': 'BLACK'},
    regex=True,
)

Unnamed: 0,Issue Date,Vehicle Make,Street Name,Vehicle Color
0,05/08/1972 12:00:00 AM,HONDA,43 ST,BLACK
1,08/29/1977 12:00:00 AM,ME/BE,UNION ST,BLACK
2,10/03/1988 12:00:00 AM,LEXUS,CLERMONT AVENUE,BLACK
3,01/03/1990 12:00:00 AM,CHEVR,DIVISION AVE,
4,02/14/1990 12:00:00 AM,JEEP,GRAND ST,GREY
...,...,...,...,...
12495729,01/03/2040 12:00:00 AM,FORD,3RD AVE,BR
12495730,04/19/2045 12:00:00 AM,HONDA,PELHAM PARK DR,BLACK
12495731,01/17/2049 12:00:00 AM,FORD,LYDIG AVE,GY
12495732,12/19/2063 12:00:00 AM,TOYOT,E 68 STREET,BLACK
