#### Import dependencies:

In [1]:
import sys #access to system parameters https://docs.python.org/3/library/sys.html
print("Python version: {}". format(sys.version))

import pandas as pd #collection of functions for data processing and analysis modeled after R dataframes with SQL like features
print("pandas version: {}". format(pd.__version__))

import matplotlib #collection of functions for scientific and publication-ready visualization
import matplotlib as plt
print("matplotlib version: {}". format(matplotlib.__version__))

import numpy as np #foundational package for scientific computing
print("NumPy version: {}". format(np.__version__))

import scipy as sp #collection of functions for scientific computing and advance mathematics
print("SciPy version: {}". format(sp.__version__)) 

import IPython
from IPython import display #pretty printing of dataframes in Jupyter notebook
print("IPython version: {}". format(IPython.__version__)) 

import sklearn #collection of machine learning algorithms
print("scikit-learn version: {}". format(sklearn.__version__))

#misc libraries
import random
import time
#ignore warnings
import warnings
warnings.filterwarnings('ignore')
print('-'*25)
import re
import os
import matplotlib.pyplot as plt
%matplotlib inline

Python version: 3.8.8 (default, Apr 13 2021, 12:59:45) 
[Clang 10.0.0 ]
pandas version: 1.2.4
matplotlib version: 3.3.4
NumPy version: 1.20.1
SciPy version: 1.6.2
IPython version: 7.29.0
scikit-learn version: 0.24.1
-------------------------


#### Download data file if it has not been downloaded already:

In [19]:
#url = 'https://data.cityofnewyork.us/api/views/qgea-i56i/rows.csv?accessType=DOWNLOAD'
fn_src = 'https://data.cityofnewyork.us/api/views/h9gi-nx95/rows.csv?accessType=DOWNLOAD'
fn_dst = 'NYPD_Motor_Vehicle_Collisions_Crashe.csv'
from six.moves import urllib

if os.path.isfile(fn_dst):
    print('File %s has already been downloaded' % fn_dst)
else:
    print('Fetching file %s[2.4GB]. This may take a while...' % fn_dst)
    urllib.request.urlretrieve(fn_src, fn_dst)
    print('File %s has been downloaded' % fn_dst)

File NYPD_Motor_Vehicle_Collisions_Crashe.csv has already been downloaded


#### Open a stream to the data file so we don't have to load the whole data set into main memory

In [20]:
from openclean.pipeline import stream
ds_full = stream(fn_dst)

In [21]:
print(f'The dataset contains {ds_full.count():,} rows.')

The dataset contains 1,848,299 rows.


In [22]:
ds_full.head()

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
0,04/14/2021,5:32,,,,,,BRONX WHITESTONE BRIDGE,,,...,Unspecified,,,,4407480,Sedan,Sedan,,,
1,04/13/2021,21:35,BROOKLYN,11217.0,40.68358,-73.97617,"(40.68358, -73.97617)",,,620 ATLANTIC AVENUE,...,,,,,4407147,Sedan,,,,
2,04/15/2021,16:15,,,,,,HUTCHINSON RIVER PARKWAY,,,...,,,,,4407665,Station Wagon/Sport Utility Vehicle,,,,
3,04/13/2021,16:00,BROOKLYN,11222.0,,,,VANDERVORT AVENUE,ANTHONY STREET,,...,Unspecified,,,,4407811,Sedan,,,,
4,04/12/2021,8:25,,,0.0,0.0,"(0.0, 0.0)",EDSON AVENUE,,,...,Unspecified,,,,4406885,Station Wagon/Sport Utility Vehicle,Sedan,,,
5,04/13/2021,17:11,,,,,,VERRAZANO BRIDGE UPPER,,,...,Unspecified,,,,4407883,Sedan,Box Truck,,,
6,04/13/2021,17:30,QUEENS,11106.0,,,,33 st,31ave,,...,Unspecified,,,,4408019,Sedan,Sedan,,,
7,04/16/2021,23:30,,,,,,SHORE PARKWAY,,,...,,,,,4408060,Sedan,,,,
8,04/11/2021,17:00,,,,,,GOWANUS RAMP,,,...,Other Vehicular,,,,4406314,Sedan,Sedan,,,
9,04/16/2021,21:15,,,,,,BRONX RIVER PARKWAY RAMP,,,...,Unspecified,,,,4408149,Station Wagon/Sport Utility Vehicle,Sedan,,,


#### Use openclean to profile the data set:

In [6]:
from openclean.profiling.column import DefaultColumnProfiler

#profiles = ds.profile(default_profiler=DefaultColumnProfiler)
profiles = ds_full.profile(default_profiler=DefaultColumnProfiler)

In [7]:
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
CRASH DATE,1848299,0,3447,0.001865,11.687347
CRASH TIME,1848299,0,1440,0.000779,8.931235
BOROUGH,1848299,571200,5,4e-06,2.118409
ZIP CODE,1848299,571422,232,0.000182,7.221101
LATITUDE,1848299,215704,122527,0.07505,15.634194
LONGITUDE,1848299,215704,96067,0.058843,15.342925
LOCATION,1848299,215704,244820,0.149958,16.18665
ON STREET NAME,1848299,378234,16137,0.010977,10.601064
CROSS STREET NAME,1848299,665839,19278,0.016303,11.809442
OFF STREET NAME,1848299,1562434,180625,0.631854,16.922181


In [23]:
# Print the most frequent data type for each column.

print('Schema\n------')
for col in ds_full.columns:
    p = profiles.column(col)
    print("  '{}' ({})".format(col, p['datatypes']['distinct'].most_common(1)[0][0]))

Schema
------
  'CRASH DATE' (date)
  'CRASH TIME' (str)
  'BOROUGH' (str)
  'ZIP CODE' (int)
  'LATITUDE' (float)
  'LONGITUDE' (float)
  'LOCATION' (str)
  'ON STREET NAME' (str)
  'CROSS STREET NAME' (str)
  'OFF STREET NAME' (str)
  'NUMBER OF PERSONS INJURED' (int)
  'NUMBER OF PERSONS KILLED' (int)
  'NUMBER OF PEDESTRIANS INJURED' (int)
  'NUMBER OF PEDESTRIANS KILLED' (int)
  'NUMBER OF CYCLIST INJURED' (int)
  'NUMBER OF CYCLIST KILLED' (int)
  'NUMBER OF MOTORIST INJURED' (int)
  'NUMBER OF MOTORIST KILLED' (int)
  'CONTRIBUTING FACTOR VEHICLE 1' (str)
  'CONTRIBUTING FACTOR VEHICLE 2' (str)
  'CONTRIBUTING FACTOR VEHICLE 3' (str)
  'CONTRIBUTING FACTOR VEHICLE 4' (str)
  'CONTRIBUTING FACTOR VEHICLE 5' (str)
  'COLLISION_ID' (int)
  'VEHICLE TYPE CODE 1' (str)
  'VEHICLE TYPE CODE 2' (str)
  'VEHICLE TYPE CODE 3' (str)
  'VEHICLE TYPE CODE 4' (str)
  'VEHICLE TYPE CODE 5' (str)


## Profiling & Cleaning of each field in the data set

In [24]:
profiles.minmax('CRASH DATE')

Unnamed: 0,min,max
date,2012-07-01,2021-12-07


In [25]:
from datetime import datetime
#datetime.strptime('2014-12-04', '%Y-%m-%d').date()


def validate_date(sourceValue):
    dummy_date = datetime.strptime("01/01/1000", '%m/%d/%Y').date()
    
    try:
        source_date = sourceValue
        start_date = datetime.strptime("01/07/2012", '%m/%d/%Y').date()
        end_date = datetime.strptime("07/12/2021", '%m/%d/%Y').date()
    
        if source_date == '':
            source_date = dummy_date
        
        source_date = datetime.strptime(sourceValue, '%m/%d/%Y').date()
        
        if source_date >= start_date and source_date <= end_date:
            return source_date
        else:
            return dummy_date
    except:
        return dummy_date
    

ds_full = ds_full.update('CRASH DATE', validate_date)

In [26]:
profiles.minmax('CRASH TIME')

Unnamed: 0,min,max
str,0:00,9:59


In [27]:
profiles.minmax('ZIP CODE')

Unnamed: 0,min,max
int,10000.0,11697.0
str,,


In [28]:
profiles.column('ZIP CODE').get('topValues')

[('11207', 24006),
 ('11101', 17327),
 ('11236', 16726),
 ('11203', 16189),
 ('10019', 16058),
 ('11385', 15857),
 ('11234', 15815),
 ('11201', 15547),
 ('10016', 15520),
 ('10036', 15429)]

In [29]:
profiles.column('BOROUGH').get('topValues')

[('BROOKLYN', 402138),
 ('QUEENS', 342100),
 ('MANHATTAN', 293456),
 ('BRONX', 185604),
 ('STATEN ISLAND', 53801)]

In [108]:
ds_full.distinct('BOROUGH')

Counter({'UNKNOWN': 571200,
         'BROOKLYN': 402138,
         'QUEENS': 342100,
         'STATEN ISLAND': 53801,
         'BRONX': 185604,
         'MANHATTAN': 293456})

In [33]:
def validate_crime_indicator(sourceValue):
    UNKNOWN = "UNKNOWN"
    
    try:
        crime_indicator_list = ["BROOKLYN", "QUEENS", "STATEN ISLAND","BRONX","MANHATTAN"]
        
        if sourceValue in crime_indicator_list:
            return sourceValue
        else:
            return UNKNOWN
    except:
        return UNKNOWN

    
ds_full = ds_full.update('BOROUGH', validate_crime_indicator)

In [35]:
profiles.column('COLLISION_ID').get('topValues')

[('4407480', 1),
 ('4407147', 1),
 ('4407665', 1),
 ('4407811', 1),
 ('4406885', 1),
 ('4407883', 1),
 ('4408019', 1),
 ('4408060', 1),
 ('4406314', 1),
 ('4408149', 1)]

In [34]:
ds_full.distinct('COLLISION_ID')

Counter({'4407480': 1,
         '4407147': 1,
         '4407665': 1,
         '4407811': 1,
         '4406885': 1,
         '4407883': 1,
         '4408019': 1,
         '4408060': 1,
         '4406314': 1,
         '4408149': 1,
         '4406488': 1,
         '4408310': 1,
         '4406097': 1,
         '4136992': 1,
         '4277087': 1,
         '4395664': 1,
         '4397513': 1,
         '4403773': 1,
         '4405244': 1,
         '4405914': 1,
         '4408191': 1,
         '4407366': 1,
         '4407778': 1,
         '4407461': 1,
         '4407407': 1,
         '4407900': 1,
         '4407760': 1,
         '4407746': 1,
         '4408143': 1,
         '4407638': 1,
         '4407958': 1,
         '4407885': 1,
         '4407616': 1,
         '4408038': 1,
         '4408224': 1,
         '4407392': 1,
         '4407765': 1,
         '4407821': 1,
         '4407520': 1,
         '4380668': 1,
         '4407902': 1,
         '4407971': 1,
         '4408071': 1,
         '4

In [36]:
ds_full.distinct('ON STREET NAME')

Counter({'BRONX WHITESTONE BRIDGE': 168,
         '': 378234,
         'HUTCHINSON RIVER PARKWAY': 168,
         'VANDERVORT AVENUE': 23,
         'EDSON AVENUE                    ': 235,
         'VERRAZANO BRIDGE UPPER': 159,
         '33 st': 1,
         'SHORE PARKWAY': 94,
         'GOWANUS RAMP                    ': 716,
         'BRONX RIVER PARKWAY RAMP': 48,
         'BEVERLEY ROAD                   ': 989,
         'GREENFIELD AVENUE': 1,
         'BROOKLYN BRIDGE                 ': 1237,
         'GOLD STREET': 16,
         'BRUCKNER BLVD': 1,
         'COURT STREET': 33,
         'RALPH AVENUE': 137,
         'BARCLAY AVENUE': 10,
         'BRONX RIVER PARKWAY': 279,
         'MAJOR DEEGAN EXPRESSWAY RAMP': 64,
         'CROSS ISLAND PARKWAY': 508,
         'MYRTLE AVENUE': 205,
         'GARRISON AVENUE': 20,
         'HUGUENOT AVENUE': 18,
         'BOSTON ROAD': 145,
         'GLENWOOD ROAD': 66,
         'WEST 94 STREET': 7,
         'PRINCE STREET': 28,
         'BROAD

In [37]:
profiles.minmax('LATITUDE')

Unnamed: 0,min,max
float,30.78418,43.344444
int,0.0,0.0


In [39]:
profiles.column('LATITUDE').get('topValues')

[('0', 2117),
 ('40.861862', 732),
 ('40.8047', 691),
 ('40.608757', 670),
 ('40.696033', 662),
 ('40.798256', 626),
 ('40.6960346', 587),
 ('40.759308', 579),
 ('40.7606005', 474),
 ('40.820305', 467)]

In [40]:
def validate_lat(sourceValue):
    UNKNOWN = -999
    
    try:
        if sourceValue >= 39 and sourceValue <= 42:  
            return sourceValue
        else:
            return UNKNOWN
    except:
        return UNKNOWN

    
ds_full = ds_full.update('LATITUDE', validate_lat)

In [38]:
profiles.minmax('LONGITUDE')

Unnamed: 0,min,max
float,-201.35999,-32.768513
int,-74.0,0.0


In [41]:
profiles.column('LONGITUDE').get('topValues')

[('0', 2117),
 ('-73.91282', 716),
 ('-73.98453', 695),
 ('-74.038086', 672),
 ('-73.91243', 645),
 ('-73.89063', 613),
 ('-73.9845292', 587),
 ('-73.882744', 552),
 ('-73.89686', 543),
 ('-73.89083', 538)]

In [42]:
def validate_lat(sourceValue):
    UNKNOWN = -999
    
    try:
        if sourceValue >= -75 and sourceValue <= -72:  
            return sourceValue
        else:
            return UNKNOWN
    except:
        return UNKNOWN

    
ds_full = ds_full.update('LATITUDE', validate_lat)

In [43]:
ds_full.distinct('LOCATION')

Counter({'': 215704,
         '(40.68358, -73.97617)': 47,
         '(0.0, 0.0)': 2117,
         '(40.69754, -73.98312)': 27,
         '(40.843464, -73.836)': 1,
         '(40.692547, -73.990974)': 66,
         '(40.626457, -73.918)': 74,
         '(40.526894, -74.16728)': 16,
         '(40.840775, -73.87246)': 18,
         '(40.694035, -73.72679)': 42,
         '(40.857365, -73.84657)': 107,
         '(40.698807, -73.91837)': 41,
         '(40.71402, -73.74827)': 2,
         '(40.815, -73.89402)': 33,
         '(40.55079, -74.20098)': 9,
         '(40.655903, -73.89817)': 2,
         '(40.890076, -73.819855)': 53,
         '(40.650402, -73.89422)': 22,
         '(40.79335, -73.97275)': 29,
         '(40.72538, -74.00011)': 7,
         '(40.75184, -73.90358)': 50,
         '(40.649788, -73.9622)': 31,
         '(40.686928, -73.920815)': 2,
         '(40.712963, -73.93647)': 5,
         '(40.801285, -73.95394)': 19,
         '(40.633976, -74.02211)': 1,
         '(40.69168, -73.999344)'

In [44]:
profiles.column('CROSS STREET NAME').get('topValues')

[('3 AVENUE                        ', 9843),
 ('BROADWAY                        ', 9685),
 ('2 AVENUE                        ', 8421),
 ('5 AVENUE                        ', 7051),
 ('7 AVENUE                        ', 6634),
 ('8 AVENUE                        ', 6580),
 ('3 AVENUE', 5660),
 ('1 AVENUE                        ', 5318),
 ('BROADWAY', 5215),
 ('PARK AVENUE                     ', 4847)]

In [45]:
ds_full.distinct('CROSS STREET NAME')

Counter({'': 665839,
         'ANTHONY STREET': 26,
         '31ave': 1,
         'EAST 21 STREET': 422,
         'OSGOOD AVENUE': 38,
         'CONCORD STREET': 92,
         '\x7f�ST 138 STREET': 1,
         'JORALEMON STREET': 105,
         'AVENUE K': 676,
         'HYLAN BOULEVARD': 750,
         'LONGWOOD AVENUE': 212,
         'ROPES AVENUE': 47,
         'EAST 108 STREET': 142,
         'BROADWAY': 5215,
         'WOOSTER STREET': 72,
         '58 STREET': 467,
         'CHURCH AVENUE': 1324,
         'PELHAM PARKWAY NORTH': 153,
         'BLAKE AVENUE': 397,
         'BUFFALO AVENUE': 304,
         'WHITE PLAINS ROAD': 1368,
         'SOUTH 5 STREET': 125,
         '184 STREET': 162,
         '155 STREET': 234,
         'STUYVESANT AVENUE': 135,
         '24 AVENUE': 289,
         'PARK AVENUE': 2692,
         'SYCAMORE STREET': 50,
         'STEWART AVENUE': 121,
         '85 STREET': 376,
         'GROVE STREET': 143,
         'ANDREWS AVENUE': 130,
         '64 PLACE': 67,
 

In [24]:
def validate_age_group(sourceValue):
    UNKNOWN = "UNKNOWN"
    
    try:
        values = ["25-44", "18-24", "45-64", "<18", "65+","UNKNOWN"]
        
        if sourceValue in values:
            return sourceValue
        else:
            return UNKNOWN
    except:
        return UNKNOWN

    
ds_full = ds_full.update('PERP_AGE_GROUP', validate_age_group)

In [46]:
profiles.column('NUMBER OF PERSONS INJURED').get('topValues')

[('0', 1464140),
 ('1', 297634),
 ('2', 56151),
 ('3', 18520),
 ('4', 6941),
 ('5', 2747),
 ('6', 1131),
 ('7', 489),
 ('8', 207),
 ('9', 115)]

In [47]:
ds_full.distinct('NUMBER OF PERSONS INJURED')

Counter({'0': 1464140,
         '1': 297634,
         '7': 489,
         '2': 56151,
         '3': 18520,
         '5': 2747,
         '4': 6941,
         '6': 1131,
         '9': 115,
         '8': 207,
         '17': 5,
         '10': 71,
         '16': 8,
         '': 18,
         '11': 41,
         '14': 8,
         '15': 7,
         '18': 5,
         '20': 2,
         '13': 18,
         '12': 27,
         '22': 3,
         '31': 1,
         '19': 4,
         '27': 1,
         '32': 1,
         '24': 3,
         '43': 1})

In [49]:
def validate_NPI(sourceValue):
    UNKNOWN = "-999"
    
    try:
        values = ['0','1','2','3','4','5','6','7','8','9','10','17','16','11','14','15','18','20','13','12','22','31','19','27','32','24','43']
        
        if sourceValue in values:
            return sourceValue
        else:
            return UNKNOWN
    except:
        return UNKNOWN

    
ds_full = ds_full.update('NUMBER OF PERSONS INJURED', validate_NPI)

In [50]:
profiles.column('NUMBER OF PERSONS KILLED').get('topValues')

[('0', 1845915),
 ('1', 2282),
 ('2', 57),
 ('3', 9),
 ('4', 3),
 ('8', 1),
 ('5', 1)]

In [51]:
ds_full.distinct('NUMBER OF PERSONS KILLED')

Counter({'0': 1845915,
         '1': 2282,
         '2': 57,
         '3': 9,
         '4': 3,
         '': 31,
         '8': 1,
         '5': 1})

In [52]:
def validate_NPK(sourceValue):
    UNKNOWN = "-999"
    
    try:
        values = ['0','1','2','3','4','5','8']
        
        if sourceValue in values:
            return sourceValue
        else:
            return UNKNOWN
    except:
        return UNKNOWN

    
ds_full = ds_full.update('NUMBER OF PERSONS KILLED', validate_NPK)

In [53]:
profiles.column('NUMBER OF PEDESTRIANS KILLED').get('topValues')

[('0', 1847053), ('1', 1234), ('2', 11), ('6', 1)]

In [54]:
ds_full.distinct('NUMBER OF PEDESTRIANS KILLED')

Counter({'0': 1847053, '1': 1234, '2': 11, '6': 1})

In [55]:
profiles.column('NUMBER OF PEDESTRIANS INJURED').get('topValues')

[('0', 1755767),
 ('1', 89176),
 ('2', 2960),
 ('3', 306),
 ('4', 50),
 ('5', 20),
 ('6', 11),
 ('7', 3),
 ('9', 2),
 ('27', 1)]

In [56]:
ds_full.distinct('NUMBER OF PEDESTRIANS INJURED')

Counter({'0': 1755767,
         '1': 89176,
         '2': 2960,
         '3': 306,
         '4': 50,
         '6': 11,
         '5': 20,
         '7': 3,
         '27': 1,
         '9': 2,
         '13': 1,
         '15': 1,
         '8': 1})

In [59]:
profiles.column('NUMBER OF CYCLIST INJURED').get('topValues')

[('0', 1804356), ('1', 43487), ('2', 438), ('3', 17), ('4', 1)]

In [60]:
ds_full.distinct('NUMBER OF CYCLIST INJURED')

Counter({'0': 1804356, '1': 43487, '2': 438, '3': 17, '4': 1})

In [62]:
ds_full.distinct('NUMBER OF CYCLIST KILLED')

Counter({'0': 1848116, '1': 182, '2': 1})

In [63]:
profiles.column('NUMBER OF CYCLIST KILLED').get('topValues')

[('0', 1848116), ('1', 182), ('2', 1)]

In [65]:
profiles.column('NUMBER OF MOTORIST INJURED').get('topValues')

[('0', 1600620),
 ('1', 166509),
 ('2', 51610),
 ('3', 17984),
 ('4', 6802),
 ('5', 2708),
 ('6', 1092),
 ('7', 465),
 ('8', 199),
 ('9', 112)]

In [66]:
profiles.column('NUMBER OF MOTORIST KILLED').get('topValues')

[('0', 1847383), ('1', 860), ('2', 44), ('3', 9), ('4', 2), ('5', 1)]

In [67]:
ds_full.distinct('CONTRIBUTING FACTOR VEHICLE 1')

Counter({'Following Too Closely': 96498,
         'Unspecified': 652406,
         'Pavement Slippery': 18039,
         'Driver Inattention/Distraction': 358908,
         'Other Vehicular': 58003,
         'Passing Too Closely': 43585,
         'Passing or Lane Usage Improper': 47726,
         'Driver Inexperience': 27967,
         'Failure to Yield Right-of-Way': 107265,
         'Brakes Defective': 6029,
         'Turning Improperly': 45962,
         'Unsafe Speed': 20711,
         'Backing Unsafely': 69814,
         'Reaction to Uninvolved Vehicle': 17328,
         'View Obstructed/Limited': 11740,
         'Steering Failure': 2397,
         'Traffic Control Disregarded': 30324,
         'Drugs (illegal)': 695,
         'Aggressive Driving/Road Rage': 7915,
         'Fell Asleep': 4615,
         'Pedestrian/Bicyclist/Other Pedestrian Error/Confusion': 7379,
         'Alcohol Involvement': 18670,
         'Unsafe Lane Changing': 36152,
         'Pavement Defective': 2226,
         'Ot

In [68]:
profiles.column('CONTRIBUTING FACTOR VEHICLE 1').get('topValues')

[('Unspecified', 652406),
 ('Driver Inattention/Distraction', 358908),
 ('Failure to Yield Right-of-Way', 107265),
 ('Following Too Closely', 96498),
 ('Backing Unsafely', 69814),
 ('Other Vehicular', 58003),
 ('Passing or Lane Usage Improper', 47726),
 ('Fatigued/Drowsy', 47149),
 ('Turning Improperly', 45962),
 ('Passing Too Closely', 43585)]

In [70]:
ds_full.distinct('CONTRIBUTING FACTOR VEHICLE 2')

Counter({'Unspecified': 1329552,
         '': 268393,
         'Other Vehicular': 29555,
         'Unsafe Speed': 3984,
         'Driver Inattention/Distraction': 84288,
         'Driver Inexperience': 6060,
         'Failure to Yield Right-of-Way': 15604,
         'Following Too Closely': 16361,
         'Turning Improperly': 8229,
         'Passing or Lane Usage Improper': 11126,
         'View Obstructed/Limited': 2720,
         'Oversized Vehicle': 2118,
         'Passing Too Closely': 7728,
         'Unsafe Lane Changing': 6047,
         'Traffic Control Disregarded': 6276,
         'Pavement Slippery': 3795,
         'Reaction to Uninvolved Vehicle': 2968,
         'Fell Asleep': 483,
         'Aggressive Driving/Road Rage': 1465,
         'Backing Unsafely': 7529,
         'Alcohol Involvement': 1392,
         'Outside Car Distraction': 2501,
         'Pedestrian/Bicyclist/Other Pedestrian Error/Confusion': 1773,
         'Obstruction/Debris': 636,
         'Glare': 462,
       

In [69]:
profiles.column('CONTRIBUTING FACTOR VEHICLE 2').get('topValues')

[('Unspecified', 1329552),
 ('Driver Inattention/Distraction', 84288),
 ('Other Vehicular', 29555),
 ('Following Too Closely', 16361),
 ('Failure to Yield Right-of-Way', 15604),
 ('Passing or Lane Usage Improper', 11126),
 ('Fatigued/Drowsy', 10831),
 ('Turning Improperly', 8229),
 ('Passing Too Closely', 7728),
 ('Backing Unsafely', 7529)]

In [71]:
ds_full.distinct('VEHICLE TYPE CODE 1')

Counter({'Sedan': 471353,
         'Station Wagon/Sport Utility Vehicle': 374662,
         'Taxi': 45627,
         '�MBU': 1,
         'Pick-up Truck': 29732,
         'Box Truck': 20621,
         'Bus': 17305,
         '': 10216,
         'Ambulance': 3026,
         'Tow Truck / Wrecker': 1076,
         'E-Bike': 1203,
         'Van': 7666,
         'Flat Bed': 2027,
         'Moped': 723,
         'Tractor Truck Diesel': 8766,
         'AMBULANCE': 2476,
         'Bike': 11110,
         'Trailer': 14,
         'Motorcycle': 6194,
         'Garbage or Refuse': 1789,
         'SCHOOL BUS': 11,
         'Lift Boom': 198,
         'scooter': 8,
         'CATER': 2,
         'PKUP': 2,
         'Carry All': 1664,
         'FDNY Ambul': 4,
         '3-Door': 243,
         'Beverage Truck': 266,
         'SEMI TRAIL': 2,
         'Dump': 3105,
         'Convertible': 3266,
         '4 dr sedan': 40084,
         'dump': 28,
         'Flat Rack': 282,
         'GARBAGE TR': 6,
         'Motor

In [72]:
profiles.column('VEHICLE TYPE CODE 1').get('topValues')

[('Sedan', 471353),
 ('PASSENGER VEHICLE', 416206),
 ('Station Wagon/Sport Utility Vehicle', 374662),
 ('SPORT UTILITY / STATION WAGON', 180291),
 ('Taxi', 45627),
 ('4 dr sedan', 40084),
 ('TAXI', 31911),
 ('Pick-up Truck', 29732),
 ('VAN', 25266),
 ('OTHER', 22966)]

In [74]:
ds_full.distinct('VEHICLE TYPE CODE 2')

Counter({'Sedan': 342483,
         '': 319543,
         'Box Truck': 23113,
         'Station Wagon/Sport Utility Vehicle': 280673,
         'Taxi': 34778,
         'Ambulance': 1691,
         'Van': 7262,
         'Bike': 23564,
         'Dump': 3285,
         'Garbage or Refuse': 1727,
         'PK': 1792,
         'Pick-up Truck': 27371,
         'Tow Truck / Wrecker': 1039,
         'E-Bike': 2270,
         'Motorcycle': 4869,
         'Tractor Truck Gasoline': 1365,
         'Bus': 15414,
         'Motorbike': 471,
         'Chassis Cab': 679,
         'Tractor Truck Diesel': 8572,
         'Concrete Mixer': 452,
         'Multi-Wheeled Vehicle': 82,
         'fire truck': 13,
         'E-Scooter': 1495,
         'Hopper': 24,
         'FDNY TRUCK': 16,
         'VAN/TRUCK': 1,
         'Pedicab': 98,
         'Moped': 825,
         'FDNY ENGIN': 2,
         'Carry All': 1725,
         'FIRE TRUCK': 608,
         'Convertible': 2230,
         'Dirt Bike': 2,
         'BOOM LIFT': 

In [73]:
profiles.column('VEHICLE TYPE CODE 2').get('topValues')

[('Sedan', 342483),
 ('PASSENGER VEHICLE', 318607),
 ('Station Wagon/Sport Utility Vehicle', 280673),
 ('SPORT UTILITY / STATION WAGON', 140204),
 ('UNKNOWN', 81464),
 ('Taxi', 34778),
 ('4 dr sedan', 30049),
 ('TAXI', 27702),
 ('Pick-up Truck', 27371),
 ('Bike', 23564)]

In [78]:
SELECTED_COLUMNS = [\
    'CRASH DATE',\
    'CRASH TIME',\
    'BOROUGH',\
    'ZIP CODE',\
    'LATITUDE',\
    'LONGITUDE',\
    'ON STREET NAME',\
    'CROSS STREET NAME',\
    'OFF STREET NAME',\
    'NUMBER OF PERSONS INJURED',\
    'NUMBER OF PERSONS KILLED',\
    'NUMBER OF PEDESTRIANS INJURED',\
    'NUMBER OF PEDESTRIANS KILLED',\
    'NUMBER OF CYCLIST INJURED',\
    'NUMBER OF CYCLIST KILLED',\
    'NUMBER OF MOTORIST INJURED',\
    'NUMBER OF MOTORIST KILLED',\
    'CONTRIBUTING FACTOR VEHICLE 1',\
    'CONTRIBUTING FACTOR VEHICLE 2',\
    'CONTRIBUTING FACTOR VEHICLE 3',\
    'CONTRIBUTING FACTOR VEHICLE 4',\
    'CONTRIBUTING FACTOR VEHICLE 5',\
    'COLLISION_ID',\
    'VEHICLE TYPE CODE 1',\
    'VEHICLE TYPE CODE 2',\
    'VEHICLE TYPE CODE 3',\
    'VEHICLE TYPE CODE 4',\
    'VEHICLE TYPE CODE 5',\
]

ds_full = ds_full.select(SELECTED_COLUMNS)

In [79]:
ds_full.head()

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,NUMBER OF PERSONS INJURED,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
0,2021-04-14,5:32,UNKNOWN,,-999,,BRONX WHITESTONE BRIDGE,,,0,...,Unspecified,,,,4407480,Sedan,Sedan,,,
1,2021-04-13,21:35,BROOKLYN,11217.0,-999,-73.97617,,,620 ATLANTIC AVENUE,1,...,,,,,4407147,Sedan,,,,
2,2021-04-15,16:15,UNKNOWN,,-999,,HUTCHINSON RIVER PARKWAY,,,0,...,,,,,4407665,Station Wagon/Sport Utility Vehicle,,,,
3,2021-04-13,16:00,BROOKLYN,11222.0,-999,,VANDERVORT AVENUE,ANTHONY STREET,,0,...,Unspecified,,,,4407811,Sedan,,,,
4,2021-04-12,8:25,UNKNOWN,,-999,0.0,EDSON AVENUE,,,0,...,Unspecified,,,,4406885,Station Wagon/Sport Utility Vehicle,Sedan,,,
5,2021-04-13,17:11,UNKNOWN,,-999,,VERRAZANO BRIDGE UPPER,,,0,...,Unspecified,,,,4407883,Sedan,Box Truck,,,
6,2021-04-13,17:30,QUEENS,11106.0,-999,,33 st,31ave,,0,...,Unspecified,,,,4408019,Sedan,Sedan,,,
7,2021-04-16,23:30,UNKNOWN,,-999,,SHORE PARKWAY,,,0,...,,,,,4408060,Sedan,,,,
8,2021-04-11,17:00,UNKNOWN,,-999,,GOWANUS RAMP,,,1,...,Other Vehicular,,,,4406314,Sedan,Sedan,,,
9,2021-04-16,21:15,UNKNOWN,,-999,,BRONX RIVER PARKWAY RAMP,,,0,...,Unspecified,,,,4408149,Station Wagon/Sport Utility Vehicle,Sedan,,,


# VALIDATING DATA AFTER CLEANING

In [80]:
data_df = pd.read_csv("NYPD_Motor_Vehicle_Collisions_Crashe.csv")

In [82]:
data_df = data_df.drop(['VEHICLE TYPE CODE 3','VEHICLE TYPE CODE 4','VEHICLE TYPE CODE 5','CONTRIBUTING FACTOR VEHICLE 3','OFF STREET NAME','CONTRIBUTING FACTOR VEHICLE 4','CONTRIBUTING FACTOR VEHICLE 5'],axis=1)

In [85]:
def validate_vtc(data,column_name):
    data[column_name].replace(np.nan, 'Unspecified', inplace=True)
    return data

In [90]:
data_df = validate_cf(data_df,'CONTRIBUTING FACTOR VEHICLE 1')
data_df = validate_cf(data_df,'CONTRIBUTING FACTOR VEHICLE 2')
data_df = validate_cf(data_df,'VEHICLE TYPE CODE 1')
data_df = validate_cf(data_df,'VEHICLE TYPE CODE 2')

In [92]:
def validate_ST(data,column_name):
    data[column_name].replace(np.nan, 'UNKNOWN', inplace=True)
    return data

In [94]:
data_df = validate_ST(data_df,'CROSS STREET NAME')
data_df = validate_ST(data_df,'ON STREET NAME')

In [98]:
data_df['ON STREET NAME'] = data_df['ON STREET NAME'].replace('\s+', ' ', regex=True)
data_df['CROSS STREET NAME'] = data_df['CROSS STREET NAME'].replace('\s+', ' ', regex=True)

In [104]:
def validate_nan(data,column_name):
    data[column_name].replace(np.nan, 'UNKNOWN', inplace=True)
    return data

In [115]:
data_df = validate_nan(data_df,'NUMBER OF PERSONS INJURED')
data_df = validate_nan(data_df,'NUMBER OF PERSONS KILLED')
data_df = validate_nan(data_df,'LOCATION')
data_df = validate_nan(data_df,'BOROUGH')

In [109]:
def validate_LATLON(data,column_name):
    data[column_name].replace(np.nan, -999, inplace=True)
    return data

In [116]:
data_df = validate_nan(data_df,'LATITUDE')
data_df = validate_nan(data_df,'LONGITUDE')
data_df = validate_nan(data_df,'ZIP CODE')

In [117]:
ds_full_jc = stream(data_df)
profiles = ds_full_jc.profile(default_profiler=DefaultColumnProfiler)
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
CRASH DATE,1848299,0,3447,0.001865,11.687347
CRASH TIME,1848299,0,1440,0.000779,8.931235
BOROUGH,1848299,0,6,3e-06,2.355797
ZIP CODE,1848299,0,423,0.000229,6.469487
LATITUDE,1848299,0,122528,0.066292,14.329432
LONGITUDE,1848299,0,96068,0.051976,14.072155
LOCATION,1848299,0,244821,0.132457,14.817414
ON STREET NAME,1848299,0,16120,0.008722,9.162703
CROSS STREET NAME,1848299,0,19149,0.01036,8.49741
NUMBER OF PERSONS INJURED,1848299,0,28,1.5e-05,0.9684


In [118]:
data_df.to_csv('filtered10.csv')