# 5. Cleaning data

## Exercise 25 - Parking cleanup

In [257]:
# load in CSV, specify columns
parking = pd.read_csv(r'D:\Documents\Data Analysis\Datasets\pandas-workout-data\data\nyc-parking-violations-2020.csv',
                     usecols=['Plate ID', 'Registration State', 'Vehicle Make', 'Vehicle Color', 'Violation Time', 'Street Name'])
parking.head()

Unnamed: 0,Plate ID,Registration State,Vehicle Make,Violation Time,Street Name,Vehicle Color
0,J58JKX,NJ,HONDA,0523P,43 ST,BK
1,KRE6058,PA,ME/BE,0428P,UNION ST,BLK
2,444326R,NJ,LEXUS,0625A,CLERMONT AVENUE,BLACK
3,F728330,OH,CHEVR,1106A,DIVISION AVE,
4,FMY9090,NY,JEEP,1253A,GRAND ST,GREY


In [258]:
# how many rows?
num_rows = parking.shape[0]
num_rows

12495734

In [259]:
# drop NA and count rows remaining
num_notna = parking.dropna().shape[0]
num_notna

12048375

In [260]:
# how much money lost due to missing data if $100 per ticket?
print(f'${(num_rows - num_notna) * 100:,}')

$44,735,900


In [261]:
# only remove if license plate, state, car make, and/or street name missing
num_selective_missing = parking.dropna(subset=['Plate ID', 'Registration State', 'Vehicle Make', 'Street Name']).shape[0]
num_selective_missing

12431949

In [262]:
# how much money lost now?
print(f'${(num_rows - num_selective_missing) * 100:,}')

$6,378,500


In [263]:
# find how many missing those columns but make is not required now
num_missing_make_not_required = parking.dropna(subset=['Plate ID', 'Registration State', 'Street Name']).shape[0]
num_missing_make_not_required

12494116

In [264]:
# how much money lost now?
print(f'${(num_rows - num_missing_make_not_required) * 100:,}')

$161,800


### Exercise 25b

In [266]:
# how many rows removed if requiring 3/4 columns
num_missing_thresh = len(parking.dropna(subset=['Plate ID', 'Registration State', 'Vehicle Make', 'Street Name'], thresh=3).index)
num_rows - num_missing_thresh

253

In [267]:
# which column has greatest NA values
parking.isna().sum().sort_values(ascending=False)

Vehicle Color         391982
Vehicle Make           62420
Street Name             1417
Violation Time           278
Plate ID                 202
Registration State         0
dtype: int64

In [268]:
# replace 'BLANKPLATE' with NA
parking['Plate ID'] = parking['Plate ID'].replace('BLANKPLATE', pd.NA)

In [269]:
# rerun NA counts
parking.isna().sum()

Plate ID                9084
Registration State         0
Vehicle Make           62420
Violation Time           278
Street Name             1417
Vehicle Color         391982
dtype: int64

## Exercise 26 - Celebrity deaths

In [271]:
# load in CSV, specify columns, parse dates
celebs = pd.read_csv(r'D:\Documents\Data Analysis\Datasets\pandas-workout-data\data\celebrity_deaths_2016.csv',
                     usecols=['dateofdeath', 'age'],
                     parse_dates=['dateofdeath'])
celebs.head()

Unnamed: 0,dateofdeath,age
0,2016-01-01,71
1,2016-01-01,74
2,2016-01-01,79
3,2016-01-01,45
4,2016-01-01,83


In [272]:
# extract month and make a new column
celebs['month'] = celebs['dateofdeath'].dt.month
celebs.head()

Unnamed: 0,dateofdeath,age,month
0,2016-01-01,71,1
1,2016-01-01,74,1
2,2016-01-01,79,1
3,2016-01-01,45,1
4,2016-01-01,83,1


In [273]:
# make month column the index
celebs = celebs.set_index('month')
celebs.head()

Unnamed: 0_level_0,dateofdeath,age
month,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2016-01-01,71
1,2016-01-01,74
1,2016-01-01,79
1,2016-01-01,45
1,2016-01-01,83


In [274]:
# sort index
celebs = celebs.sort_index()
celebs

Unnamed: 0_level_0,dateofdeath,age
month,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2016-01-01,71
1,2016-01-21,47
1,2016-01-21,87
1,2016-01-21,90
1,2016-01-21,73
...,...,...
12,2016-12-10,63
12,2016-12-10,20
12,2016-12-10,57
12,2016-12-10,78


In [275]:
# determine if all 'age' values can be interpreted as digits
celebs['age'].str.isdigit().all()

False

In [276]:
# clean nonintegers from age column

# fill missing values with placeholder, 'False'
celebs['age'] = celebs['age'].fillna('False')

# only select rows where age is digit
celebs = celebs.loc[celebs['age'].str.isdigit(), :]
celebs

Unnamed: 0_level_0,dateofdeath,age
month,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2016-01-01,71
1,2016-01-21,47
1,2016-01-21,87
1,2016-01-21,90
1,2016-01-21,73
...,...,...
12,2016-12-10,63
12,2016-12-10,20
12,2016-12-10,57
12,2016-12-10,78


In [277]:
# convert age into integer dtype
celebs['age'] = celebs['age'].astype(np.int64)

# data types won't be properly changed with loc, even if this throws an error

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  celebs['age'] = celebs['age'].astype(np.int64)


In [278]:
celebs.dtypes

dateofdeath    datetime64[ns]
age                     int64
dtype: object

In [279]:
# avg age of celebrities who died February to July 2016
celebs.loc[range(2,8), 'age'].mean()

77.17887409200968

### Exercise 26b

In [281]:
# create multi-index with day and month

# reset index
celebs = celebs.reset_index()

# extract day
celebs['day'] = celebs['dateofdeath'].dt.day

# set multi-index
celebs = celebs.set_index(['month', 'day'])
celebs

Unnamed: 0_level_0,Unnamed: 1_level_0,dateofdeath,age
month,day,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,2016-01-01,71
1,21,2016-01-21,47
1,21,2016-01-21,87
1,21,2016-01-21,90
1,21,2016-01-21,73
...,...,...,...
12,10,2016-12-10,63
12,10,2016-12-10,20
12,10,2016-12-10,57
12,10,2016-12-10,78


In [282]:
# avg age Feb 15 to July 15
celebs.loc[(celebs['dateofdeath'] >= '2016-02-15') & (celebs['dateofdeath'] <= '2016-07-15'), 'age'].mean()

77.05183037332367

In [283]:
# load in causeofdeath column
cause = pd.read_csv(r'D:\Documents\Data Analysis\Datasets\pandas-workout-data\data\celebrity_deaths_2016.csv',
                     usecols=['causeofdeath'])
cause.head()

Unnamed: 0,causeofdeath
0,brain cancer
1,cancer
2,cancer
3,complications of a stroke
4,heart failure


In [284]:
# top 5 causes of death
cause.value_counts().head()

causeofdeath      
 cancer               248
 heart attack         125
 traffic collision     56
 lung cancer           51
 pneumonia             50
Name: count, dtype: int64

In [285]:
# will NA with 'unknown'
cause = cause.fillna('unknown')
cause.value_counts().head()

causeofdeath      
unknown               5008
 cancer                248
 heart attack          125
 traffic collision      56
 lung cancer            51
Name: count, dtype: int64

## Exercise 27 - Titanic interpolation

In [287]:
# read in Excel file
titanic = pd.read_excel(r'D:\Documents\Data Analysis\Datasets\pandas-workout-data\data\titanic3.xls')
titanic.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.34,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [288]:
# which columns have NA values, how many?
titanic.isna().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

In [289]:
# determine what to do for columns with missing values
titanic.loc[titanic['age'].isna(),:]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
15,1,0,"Baumann, Mr. John D",male,,0,0,PC 17318,25.93,,S,,,"New York, NY"
37,1,1,"Bradley, Mr. George (""George Arthur Brayton"")",male,,0,0,111427,26.55,,S,9,,"Los Angeles, CA"
40,1,0,"Brewe, Dr. Arthur Jackson",male,,0,0,112379,39.60,,C,,,"Philadelphia, PA"
46,1,0,"Cairns, Mr. Alexander",male,,0,0,113798,31.00,,S,,,
59,1,1,"Cassebeer, Mrs. Henry Arthur Jr (Eleanor Genev...",female,,0,0,17770,27.72,,C,5,,"New York, NY"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1293,3,0,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.05,,S,,,
1297,3,0,"Wiseman, Mr. Phillippe",male,,0,0,A/4. 34244,7.25,,S,,,
1302,3,0,"Yousif, Mr. Wazli",male,,0,0,2647,7.22,,C,,,
1303,3,0,"Yousseff, Mr. Gerious",male,,0,0,2627,14.46,,C,,,


### Exercise 27b

In [291]:
# create series with index as unique embarked values and values as most common destination for that embarked value
destinations = pd.Series([])

for embarked_from in titanic['embarked'].dropna().unique():
    destinations.loc[embarked_from] = titanic.loc[titanic['embarked']==embarked_from, 'home.dest'].value_counts().index[0]

destinations

S           New York, NY
C           New York, NY
Q    Ireland Chicago, IL
dtype: object

In [292]:
# replace NA values in home.dest with values from embarked
titanic.loc['home.dest'] = titanic['home.dest'].fillna(titanic['embarked'])

In [293]:
# use most common destinations to replace values
titanic.loc['home.dest'] = titanic['home.dest'].replace(destinations)

## Exercise 28 - Inconsistent data

In [295]:
# read in CSV, specify columns
violations = pd.read_csv(r'D:\Documents\Data Analysis\Datasets\pandas-workout-data\data\nyc-parking-violations-2020.csv',
                        usecols=['Plate ID', 'Registration State', 'Vehicle Make', 'Vehicle Color', 'Street Name'])
violations.head()

Unnamed: 0,Plate ID,Registration State,Vehicle Make,Street Name,Vehicle Color
0,J58JKX,NJ,HONDA,43 ST,BK
1,KRE6058,PA,ME/BE,UNION ST,BLK
2,444326R,NJ,LEXUS,CLERMONT AVENUE,BLACK
3,F728330,OH,CHEVR,DIVISION AVE,
4,FMY9090,NY,JEEP,GRAND ST,GREY


In [296]:
# how many unique vehicle colors
len(violations['Vehicle Color'].value_counts(dropna=False))

1897

In [297]:
# 30 most common colors
violations['Vehicle Color'].value_counts(dropna=False).index[:30]

Index([   'WH',    'GY',    'BK', 'WHITE',    'BL',    'RD', 'BLACK',     nan,
        'GREY', 'BROWN', 'SILVE',    'GR',  'BLUE',   'RED',    'TN',    'BR',
          'YW',   'BLK', 'OTHER', 'GREEN',    'GL',   'GRY',    'MR',  'GRAY',
         'WHT', 'YELLO',   'WHI',    'OR',   'BK.',    'WT'],
      dtype='object', name='Vehicle Color')

In [298]:
# create color lists
white_vals = ['WH', 'WHT', 'WHI', 'WT', 'WHITE', 'WT.', 'W', 'WH.', 'WHT.']
gray_vals = ['GY', 'GREY', 'GRY', 'GRAY', 'GY.', 'LTGY', 'LTG', 'DKGY', 'GYGY', 'DKG']
black_vals = ['BK', 'BLACK', 'BLK', 'BK.', 'BL.', 'BLK.']
blue_vals = ['BL', 'BLUE', 'BLU']
red_vals = ['RD', 'RED', 'RD.']
brown_vals = ['BROWN', 'BR', 'BRN']
yellow_vals = ['YW', 'YELLOW', 'YELLO']
silver_vals = ['SILVER', 'SILVE', 'SL', 'SIL', 'SL.']
orange_vals = ['ORANGE', 'OR', 'ORANG']
purple_vals = ['PURPLE', 'PR', 'PURPL']
green_vals = ['GREEN', 'GR', 'GRN', 'GN']

In [299]:
# consolidate colors
violations.loc[violations['Vehicle Color'].isin(white_vals), 'Vehicle Color'] = 'WHITE'
violations.loc[violations['Vehicle Color'].isin(gray_vals), 'Vehicle Color'] = 'GRAY'
violations.loc[violations['Vehicle Color'].isin(black_vals), 'Vehicle Color'] = 'BLACK'
violations.loc[violations['Vehicle Color'].isin(blue_vals), 'Vehicle Color'] = 'BLUE'
violations.loc[violations['Vehicle Color'].isin(red_vals), 'Vehicle Color'] = 'RED'
violations.loc[violations['Vehicle Color'].isin(brown_vals), 'Vehicle Color'] = 'BROWN'
violations.loc[violations['Vehicle Color'].isin(yellow_vals), 'Vehicle Color'] = 'YELLOW'
violations.loc[violations['Vehicle Color'].isin(silver_vals), 'Vehicle Color'] = 'SILVER'
violations.loc[violations['Vehicle Color'].isin(orange_vals), 'Vehicle Color'] = 'ORANGE'
violations.loc[violations['Vehicle Color'].isin(purple_vals), 'Vehicle Color'] = 'PURPLE'
violations.loc[violations['Vehicle Color'].isin(green_vals), 'Vehicle Color'] = 'GREEN'

In [300]:
# updated common colors
violations['Vehicle Color'].value_counts(dropna=False).index[:30]

Index([ 'WHITE',   'GRAY',  'BLACK',   'BLUE',    'RED',  'BROWN',      nan,
        'GREEN', 'SILVER', 'YELLOW',     'TN',  'OTHER',     'GL',     'MR',
       'ORANGE',   'GOLD',    'TAN', 'PURPLE',    'LT/',    'DK/',      'B',
          'BRO',   'BKGY',   'WHBL',   'DKBL',     'BN',  'BLUE.',   'WHGY',
        'UNKNO',   'RED.'],
      dtype='object', name='Vehicle Color')

In [301]:
# how many total colors now?
len(violations['Vehicle Color'].value_counts(dropna=False))

1860

In [302]:
# can rinse and repeat to further reduce

### Exercise 28b

In [304]:
# total count of unique vehicle makes
violations['Vehicle Make'].value_counts()

Vehicle Make
TOYOT    1395273
HONDA    1343265
FORD     1328063
NISSA    1119587
CHEVR     711464
          ...   
BEAVE          1
NELSO          1
HOWBY          1
BONEE          1
KIA (          1
Name: count, Length: 5210, dtype: int64

In [305]:
# function to clean strings
def fix_this(x):
    # returns true if not a string
    if not isinstance(x, str):
        return x
    # instantiate empty output string
    output = ''
    # loop through each char in string, adding if in ascii_uppercase library
    for char in x.strip().upper():
        if char in string.ascii_uppercase:
            output += char
    # return cleaned string
    return output

In [306]:
# apply string-cleaning function
violations.loc['Vehicle Make'] = violations['Vehicle Make'].apply(fix_this)

# return updated value counts
violations['Vehicle Make'].value_counts()

Vehicle Make
TOYOT    1395273
HONDA    1343265
FORD     1328063
NISSA    1119587
CHEVR     711464
          ...   
BEAVE          1
NELSO          1
HOWBY          1
BONEE          1
KIA (          1
Name: count, Length: 5210, dtype: int64

In [307]:
# how standardized are the street names?
violations['Street Name'].value_counts()

# need to standardize capitalization, whether 'th, rd' etc. is used after numbers, abbreviations for streets and street types

Street Name
Broadway                180225
3rd Ave                 133003
5th Ave                  78211
2nd Ave                  75533
Madison Ave              75419
                         ...  
I/O W 164 ST                 1
HUTINSON RIVER PARKW         1
BEACH 58                     1
HUTINSON RIVER PARK          1
W/S/O 182 STREET             1
Name: count, Length: 57757, dtype: int64

In [308]:
# need to clean registration state column?
violations['Registration State'].unique()

# 68 'states' include Canadian provinces, also "99"?

array(['NJ', 'PA', 'OH', 'NY', 'NC', '99', 'ME', 'GV', 'IN', 'CT', 'TX',
       'SC', 'MA', 'FL', 'IL', 'MN', 'OK', 'VA', 'ID', 'CA', 'MS', 'NE',
       'HI', 'NB', 'AL', 'WA', 'VT', 'WY', 'WI', 'NH', 'MD', 'KY', 'GA',
       'RI', 'AR', 'TN', 'AZ', 'NV', 'QB', 'DE', 'MO', 'OR', 'MI', 'NM',
       'UT', 'ON', 'DP', 'LA', 'WV', 'CO', 'SD', 'DC', 'IA', 'MT', 'AK',
       'KS', 'MX', 'ND', 'AB', 'PR', 'PE', 'BC', 'SK', 'NS', 'FO', 'NT',
       'MB', 'YT', nan], dtype=object)