#### Import dependencies:

In [8]:
import sys #access to system parameters https://docs.python.org/3/library/sys.html
print("Python version: {}". format(sys.version))

import pandas as pd #collection of functions for data processing and analysis modeled after R dataframes with SQL like features
print("pandas version: {}". format(pd.__version__))

import matplotlib #collection of functions for scientific and publication-ready visualization
# `plt` is the conventional alias for the pyplot interface; aliasing the top-level
# package as `plt` would make calls such as plt.plot() fail.
import matplotlib.pyplot as plt
print("matplotlib version: {}". format(matplotlib.__version__))

import numpy as np #foundational package for scientific computing
print("NumPy version: {}". format(np.__version__))

import scipy as sp #collection of functions for scientific computing and advance mathematics
print("SciPy version: {}". format(sp.__version__)) 

import IPython
from IPython import display #pretty printing of dataframes in Jupyter notebook
print("IPython version: {}". format(IPython.__version__)) 

import sklearn #collection of machine learning algorithms
print("scikit-learn version: {}". format(sklearn.__version__))

#misc libraries
import random
import time
#ignore warnings
import warnings
warnings.filterwarnings('ignore')
print('-'*25)
import re
import os
import matplotlib.pyplot as plt
%matplotlib inline

Python version: 3.8.5 (default, Sep  4 2020, 02:22:02) 
[Clang 10.0.0 ]
pandas version: 1.1.3
matplotlib version: 3.3.2
NumPy version: 1.19.5
SciPy version: 1.5.2
IPython version: 7.19.0
scikit-learn version: 0.23.2
-------------------------


#### Download data file if it has not been downloaded already:

In [9]:
#url = 'https://data.cityofnewyork.us/api/views/qgea-i56i/rows.csv?accessType=DOWNLOAD'
fn_src = 'https://data.cityofnewyork.us/api/views/57mv-nv28/rows.csv?accessType=DOWNLOAD'
fn_dst = 'NYPD_Complaint_Map.csv'

# The standard library provides urlretrieve directly; no compatibility shim is needed.
import urllib.request

if os.path.isfile(fn_dst):
    print('File %s has already been downloaded' % fn_dst)
else:
    print('Fetching file %s[2.4GB]. This may take a while...' % fn_dst)
    # Download under a temporary name and rename only on success, so an interrupted
    # transfer never leaves a truncated file that the isfile() check above would
    # mistake for a complete download on the next run.
    fn_tmp = fn_dst + '.part'
    try:
        urllib.request.urlretrieve(fn_src, fn_tmp)
        os.replace(fn_tmp, fn_dst)
    finally:
        # Only present here if the download or the rename failed.
        if os.path.exists(fn_tmp):
            os.remove(fn_tmp)
    print('File %s has been downloaded' % fn_dst)

File NYPD_Complaint_Map.csv has already been downloaded


#### Open a stream to the data file so we don't have to load the whole data set into main memory

In [10]:
from openclean.pipeline import stream
ds_full = stream(fn_dst)

In [11]:
print(f'The dataset contains {ds_full.count():,} rows.')

The dataset contains 7,375,993 rows.


In [12]:
ds_full.head()

Unnamed: 0,CMPLNT_NUM,CMPLNT_FR_DT,CMPLNT_FR_TM,CMPLNT_TO_DT,CMPLNT_TO_TM,ADDR_PCT_CD,RPT_DT,KY_CD,OFNS_DESC,PD_CD,...,LOC_OF_OCCUR_DESC,PREM_TYP_DESC,JURIS_DESC,PARKS_NM,HADEVELOPT,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Lat_Lon
0,394506329,12/31/2019,17:30:00,,,32.0,12/31/2019,118,DANGEROUS WEAPONS,793.0,...,,STREET,N.Y. POLICE DEPT,,,999937,238365,40.82092679700002,-73.94332421899996,"(40.82092679700002, -73.94332421899996)"
1,968873685,12/29/2019,16:31:00,12/29/2019,16:54:00,47.0,12/29/2019,113,FORGERY,729.0,...,,STREET,N.Y. POLICE DEPT,,,1022508,261990,40.88570140600008,-73.86164032499995,"(40.885701406000074, -73.86164032499995)"
2,509837549,12/15/2019,18:45:00,,,109.0,12/29/2019,578,HARRASSMENT 2,638.0,...,FRONT OF,STREET,N.Y. POLICE DEPT,,,1034178,209758,40.74228115600005,-73.81982408,"(40.74228115600005, -73.81982408)"
3,352454313,12/28/2019,01:00:00,,,47.0,12/28/2019,126,MISCELLANEOUS PENAL LAW,117.0,...,REAR OF,STREET,N.Y. POLICE DEPT,,,1026412,258211,40.87531145100007,-73.84754521099995,"(40.87531145100007, -73.84754521099995)"
4,248803469,09/05/2008,21:41:00,,,,09/05/2008,101,MURDER & NON-NEGL. MANSLAUGHTER,,...,OUTSIDE,,N.Y. POLICE DEPT,,,1001215,193881,40.698827283,-73.938819047,"(40.698827283, -73.938819047)"
5,293718737,12/27/2019,22:00:00,,,9.0,12/27/2019,107,BURGLARY,223.0,...,FRONT OF,RESIDENCE - APT. HOUSE,N.Y. POLICE DEPT,,,989665,201866,40.72075882100006,-73.98046642299995,"(40.72075882100006, -73.98046642299995)"
6,552685226,12/27/2019,20:10:00,12/27/2019,20:15:00,79.0,12/27/2019,117,DANGEROUS DRUGS,521.0,...,,STREET,N.Y. POLICE DEPT,,,1001545,192836,40.695958362000056,-73.93763162199998,"(40.695958362000056, -73.93763162199998)"
7,134037758,12/26/2019,20:00:00,12/27/2019,07:15:00,101.0,12/27/2019,341,PETIT LARCENY,321.0,...,FRONT OF,STREET,N.Y. POLICE DEPT,,,1054394,162186,40.61157006600007,-73.74736517199995,"(40.61157006600007, -73.74736517199995)"
8,855385879,12/26/2019,19:57:00,,,44.0,12/26/2019,361,OFF. AGNST PUB ORD SENSBLTY &,639.0,...,,STREET,N.Y. POLICE DEPT,,,1007027,245405,40.84023413800003,-73.91768411399994,"(40.84023413800003, -73.91768411399994)"
9,241602326,12/25/2019,23:00:00,12/26/2019,14:25:00,20.0,12/26/2019,341,PETIT LARCENY,321.0,...,,STREET,N.Y. POLICE DEPT,,,987147,220853,40.77287456000005,-73.98954212299998,"(40.77287456000005, -73.98954212299998)"


#### Use openclean to profile the data set:

In [13]:
from openclean.profiling.column import DefaultColumnProfiler

#profiles = ds.profile(default_profiler=DefaultColumnProfiler)
profiles = ds_full.profile(default_profiler=DefaultColumnProfiler)

In [14]:
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
CMPLNT_NUM,7375993,0,7373143,0.9996136,22.813633
CMPLNT_FR_DT,7375993,655,8606,0.001166862,12.425578
CMPLNT_FR_TM,7375993,48,1441,0.0001953648,8.136466
CMPLNT_TO_DT,7375993,1704204,6825,0.001203324,12.417984
CMPLNT_TO_TM,7375993,1699541,1441,0.0002538558,8.862856
ADDR_PCT_CD,7375993,2166,77,1.044234e-05,6.14869
RPT_DT,7375993,0,5479,0.0007428152,12.405384
KY_CD,7375993,0,74,1.003255e-05,4.170727
OFNS_DESC,7375993,18823,71,9.65045e-06,4.006583
PD_CD,7375993,6278,432,5.861828e-05,5.913459


In [15]:
# Print the most frequent data type for each column.

print('Schema\n------')
for column_name in ds_full.columns:
    # most_common(1) yields a single (datatype, count) pair; only the label is printed.
    type_counts = profiles.column(column_name)['datatypes']['distinct']
    dominant_type, _ = type_counts.most_common(1)[0]
    print("  '{}' ({})".format(column_name, dominant_type))

Schema
------
  'CMPLNT_NUM' (int)
  'CMPLNT_FR_DT' (date)
  'CMPLNT_FR_TM' (date)
  'CMPLNT_TO_DT' (date)
  'CMPLNT_TO_TM' (date)
  'ADDR_PCT_CD' (int)
  'RPT_DT' (date)
  'KY_CD' (int)
  'OFNS_DESC' (str)
  'PD_CD' (int)
  'PD_DESC' (str)
  'CRM_ATPT_CPTD_CD' (str)
  'LAW_CAT_CD' (str)
  'BORO_NM' (str)
  'LOC_OF_OCCUR_DESC' (str)
  'PREM_TYP_DESC' (str)
  'JURIS_DESC' (str)
  'PARKS_NM' (str)
  'HADEVELOPT' (str)
  'X_COORD_CD' (int)
  'Y_COORD_CD' (int)
  'Latitude' (float)
  'Longitude' (float)
  'Lat_Lon' (str)


### Field Name & Description:
    'CMPLNT_NUM' (int)                 Complaint Number
    'CMPLNT_FR_DT' (date)              Complaint From Date
    'CMPLNT_FR_TM' (date)              Complaint From Time
    'CMPLNT_TO_DT' (date)              Complaint To Date
    'CMPLNT_TO_TM' (date)              Complaint To Time
    'ADDR_PCT_CD' (int)                Code of Precinct in which the Incident Occurred
    'RPT_DT' (date)                    Report Date
    'KY_CD' (int)                      "Key Code": Offense Classification Code (3 digits)
    'OFNS_DESC' (str)                  Offense Description
    'PD_CD' (int)                      PD Code of Offense. More granular than Key Code
    'PD_DESC' (str)                    PD Description of Offense.
    'CRM_ATPT_CPTD_CD' (str)           Whether Crime was Attempted or Completed (values: 'COMPLETED', 'ATTEMPTED')
    'LAW_CAT_CD' (str)                 Level of Offense (values: 'FELONY', 'VIOLATION', 'MISDEMEANOR')
    'BORO_NM' (str)                    Name of Borough in which Incident Occurred
    'LOC_OF_OCCUR_DESC' (str)          Description of where the incident occurred with respect to the premises
                                       (values:'FRONT OF', 'REAR OF', 'OUTSIDE', 'INSIDE', 'OPPOSITE OF')
    'PREM_TYP_DESC' (str)              Description of the type of premises in which the Incident Occurred
    'JURIS_DESC' (str)                 Description of Jurisdiction in which Incident Occurred
    'JURISDICTION_CODE' (int)          Jurisdiction Code
    'PARKS_NM' (str)                   Name of Park in which Incident Occurred, if Applicable
    'HADEVELOPT' (str)                 Name of NYCHA Housing Development in which Incident Occurred, if Applicable
    'HOUSING_PSA' (int)                Housing PSA
    'X_COORD_CD' (int)                 X-coordinate, New York State Plane Coordinate System
    'Y_COORD_CD' (int)                 Y-coordinate, New York State Plane Coordinate System
    'SUSP_AGE_GROUP' (int)             Age Group of Suspect
    'SUSP_RACE' (str)                  Race of Suspect
    'SUSP_SEX' (str)                   Sex of Suspect
    'TRANSIT_DISTRICT' (int)           Transit-District code
    'Latitude' (float)                 Global Latitude of Location where Incident Occurred
    'Longitude' (float)                Global Longitude of Location where Incident Occurred
    'Lat_Lon' (str)                    'Latitude' and 'Longitude' together
    'PATROL_BORO' (str)                Patrol Borough
    'STATION_NAME' (str)               Station Name
    'VIC_AGE_GROUP' (int)              Age Group of Victim
    'VIC_RACE' (str)                   Race of Victim
    'VIC_SEX' (str)                    Sex of Victim
    
    
    (Note: some field descriptions were taken from https://www1.nyc.gov/assets/nypd/downloads/pdf/analysis_and_planning/incident_level_data_footnotes.pdf)
   

## Profiling & Cleaning of each field in the data set

In [16]:
profiles.minmax('CMPLNT_NUM')

Unnamed: 0,min,max
int,100000065,999999904


In [17]:
profiles.minmax('CMPLNT_FR_DT')

Unnamed: 0,min,max
date,1010-05-14 00:00:00,2020-12-31


In [18]:
from datetime import datetime
#datetime.strptime('2014-12-04', '%Y-%m-%d').date()


def validate_date(sourceValue):
    """Parse an ``MM/DD/YYYY`` string and keep it only if it lies in 2006-2020.

    Parameters
    ----------
    sourceValue : str
        Raw cell value, expected in ``%m/%d/%Y`` format.

    Returns
    -------
    datetime.date
        The parsed date when it falls between 01/01/2006 and 12/31/2020
        (both inclusive). Otherwise the sentinel date 1000-01-01 is returned;
        this covers empty strings, unparseable or non-string values, and
        dates outside the accepted range.
    """
    dummy_date = datetime(1000, 1, 1).date()
    start_date = datetime(2006, 1, 1).date()
    end_date = datetime(2020, 12, 31).date()

    try:
        source_date = datetime.strptime(sourceValue, '%m/%d/%Y').date()
    except (TypeError, ValueError):
        # TypeError: value is not a string (e.g. None);
        # ValueError: empty or malformed date text.
        return dummy_date

    if start_date <= source_date <= end_date:
        return source_date
    return dummy_date
    

ds_full = ds_full.update('CMPLNT_FR_DT', validate_date)

In [19]:
profiles.minmax('CMPLNT_FR_TM')

Unnamed: 0,min,max
date,2021-12-12,2021-12-12 23:59:00


In [20]:
profiles.minmax('CMPLNT_TO_DT')

Unnamed: 0,min,max
date,1010-10-15 00:00:00,2090-04-06


In [21]:
ds_full = ds_full.update('CMPLNT_TO_DT', validate_date)

In [22]:
profiles.minmax('CMPLNT_TO_TM')

Unnamed: 0,min,max
date,2021-12-12 00:00:00,2021-12-12 23:59:00
str,24:00:00,24:00:00


In [23]:
profiles.minmax('ADDR_PCT_CD')

Unnamed: 0,min,max
int,1,123


In [24]:
profiles.column('ADDR_PCT_CD').get('topValues')

[('75', 237049),
 ('43', 189629),
 ('44', 184214),
 ('40', 178413),
 ('14', 166925),
 ('46', 155839),
 ('52', 154649),
 ('73', 151060),
 ('120', 145011),
 ('67', 139888)]

In [25]:
profiles.minmax('RPT_DT')

Unnamed: 0,min,max
date,2006-01-01,2020-12-31


In [26]:
ds_full = ds_full.update('RPT_DT', validate_date)

In [27]:
profiles.minmax('KY_CD')

Unnamed: 0,min,max
int,101,881


In [28]:
profiles.column('KY_CD').get('topValues')

[('341', 1244164),
 ('578', 945459),
 ('344', 774250),
 ('109', 638981),
 ('351', 619342),
 ('361', 383375),
 ('235', 342576),
 ('106', 286139),
 ('105', 266449),
 ('107', 254293)]

In [29]:
profiles.column('OFNS_DESC').get('topValues')

[('PETIT LARCENY', 1244155),
 ('HARRASSMENT 2', 945389),
 ('ASSAULT 3 & RELATED OFFENSES', 774177),
 ('CRIMINAL MISCHIEF & RELATED OF', 744951),
 ('GRAND LARCENY', 638972),
 ('DANGEROUS DRUGS', 427058),
 ('OFF. AGNST PUB ORD SENSBLTY &', 383332),
 ('FELONY ASSAULT', 286085),
 ('ROBBERY', 266447),
 ('BURGLARY', 254292)]

In [30]:
ds_full.distinct('OFNS_DESC')

Counter({'DANGEROUS WEAPONS': 161447,
         'FORGERY': 72712,
         'HARRASSMENT 2': 945389,
         'MISCELLANEOUS PENAL LAW': 185789,
         'MURDER & NON-NEGL. MANSLAUGHTER': 6278,
         'BURGLARY': 254292,
         'DANGEROUS DRUGS': 427058,
         'PETIT LARCENY': 1244155,
         'OFF. AGNST PUB ORD SENSBLTY &': 383332,
         'GRAND LARCENY': 638972,
         'FELONY ASSAULT': 286085,
         'ASSAULT 3 & RELATED OFFENSES': 774177,
         'ARSON': 17658,
         'RAPE': 21648,
         'SEX CRIMES': 88030,
         'GRAND LARCENY OF MOTOR VEHICLE': 133845,
         'ROBBERY': 266447,
         'CRIMINAL MISCHIEF & RELATED OF': 744951,
         'THEFT-FRAUD': 76655,
         'VEHICLE AND TRAFFIC LAWS': 91809,
         'CRIMINAL TRESPASS': 82135,
         'OFFENSES INVOLVING FRAUD': 23076,
         'FRAUDS': 44473,
         'OFFENSES AGAINST PUBLIC ADMINI': 137502,
         'OFFENSES AGAINST THE PERSON': 18170,
         'ADMINISTRATIVE CODE': 16287,
         'I

In [31]:
profiles.minmax('PD_CD')

Unnamed: 0,min,max
int,100,975


In [32]:
# Print the most frequent values in column 'PD_CD'

profiles.column('PD_CD').get('topValues')

[('101', 640218),
 ('638', 613331),
 ('639', 374673),
 ('333', 364252),
 ('637', 332066),
 ('338', 302018),
 ('254', 259062),
 ('321', 237811),
 ('109', 232452),
 ('259', 208063)]

In [33]:
# Print the most frequent values in column 'PD_DESC'

profiles.column('PD_DESC').get('topValues')

[('ASSAULT 3', 640218),
 ('HARASSMENT,SUBD 3,4,5', 613331),
 ('AGGRAVATED HARASSMENT 2', 374673),
 ('LARCENY,PETIT FROM STORE-SHOPL', 364252),
 ('HARASSMENT,SUBD 1,CIVILIAN', 332066),
 ('LARCENY,PETIT FROM BUILDING,UN', 302018),
 ('MISCHIEF, CRIMINAL 4, OF MOTOR', 259062),
 ('LARCENY,PETIT FROM AUTO', 237811),
 ('ASSAULT 2,1,UNCLASSIFIED', 232452),
 ('CRIMINAL MISCHIEF,UNCLASSIFIED 4', 208063)]

In [34]:
ds_full.distinct('PD_DESC')

Counter({'WEAPONS POSSESSION 3': 56612,
         'FORGERY,ETC.,UNCLASSIFIED-FELO': 60754,
         'HARASSMENT,SUBD 3,4,5': 613331,
         'RECKLESS ENDANGERMENT 1': 27671,
         '': 6278,
         'BURGLARY,RESIDENCE,NIGHT': 55288,
         'CONTROLLED SUBSTANCE, SALE 5': 7853,
         'LARCENY,PETIT FROM AUTO': 237811,
         'AGGRAVATED HARASSMENT 2': 374673,
         'LARCENY,GRAND FROM STORE-SHOPL': 24843,
         'LARCENY,PETIT BY ACQUIRING LOS': 11742,
         'ASSAULT 2,1,UNCLASSIFIED': 232452,
         'OBSTR BREATH/CIRCUL': 25907,
         'LARCENY,PETIT FROM OPEN AREAS,': 113412,
         'ARSON 2,3,4': 11456,
         'LARCENY,PETIT FROM STORE-SHOPL': 364252,
         'RAPE 3': 3107,
         'SEXUAL ABUSE 3,2': 42771,
         'LARCENY,PETIT OF LICENSE PLATE': 55147,
         'LARCENY,GRAND OF AUTO': 116902,
         'ASSAULT 3': 640218,
         'SEXUAL ABUSE': 4554,
         'ROBBERY,CLOTHING': 746,
         'MISCHIEF, CRIMINAL 4, OF MOTOR': 259062,
         'R

In [35]:
ds_full.distinct('CRM_ATPT_CPTD_CD')

Counter({'COMPLETED': 7250300, 'ATTEMPTED': 125686, '': 7})

In [36]:
def validate_crime_indicator(sourceValue):
    """Return ``sourceValue`` if it is a known crime-status code, else ``"UNKNOWN"``.

    Accepted values are ``"COMPLETED"`` and ``"ATTEMPTED"``. Anything else,
    including the empty string and ``None``, is mapped to ``"UNKNOWN"``.
    """
    UNKNOWN = "UNKNOWN"
    crime_indicator_list = ("COMPLETED", "ATTEMPTED")

    try:
        return sourceValue if sourceValue in crime_indicator_list else UNKNOWN
    except Exception:
        # Best effort: a value whose comparison fails counts as unknown.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a long-running pipeline can still be stopped.
        return UNKNOWN

    
ds_full = ds_full.update('CRM_ATPT_CPTD_CD', validate_crime_indicator)

In [37]:
ds_full.distinct('LAW_CAT_CD')

Counter({'FELONY': 2275064, 'VIOLATION': 960209, 'MISDEMEANOR': 4140720})

In [38]:
def validate_law_cat_cd(sourceValue):
    """Return ``sourceValue`` if it is a known offense level, else ``"UNKNOWN"``.

    Accepted values are ``"FELONY"``, ``"MISDEMEANOR"`` and ``"VIOLATION"``.
    Anything else, including the empty string and ``None``, is mapped to
    ``"UNKNOWN"``.
    """
    UNKNOWN = "UNKNOWN"
    values = ("FELONY", "MISDEMEANOR", "VIOLATION")

    try:
        if sourceValue in values:
            return sourceValue
    except Exception:
        # Best effort: treat a value that cannot be compared as unknown, while
        # letting KeyboardInterrupt/SystemExit through (a bare except hid them).
        pass
    return UNKNOWN

    
ds_full = ds_full.update('LAW_CAT_CD', validate_law_cat_cd)

In [39]:
ds_full.distinct('BORO_NM')

Counter({'MANHATTAN': 1771637,
         'BRONX': 1599801,
         'QUEENS': 1463554,
         '': 11329,
         'BROOKLYN': 2186681,
         'STATEN ISLAND': 342991})

In [40]:
def validate_boro_nm(sourceValue):
    """Return ``sourceValue`` if it is one of the five borough names, else ``"UNKNOWN"``.

    Accepted values are ``"BRONX"``, ``"BROOKLYN"``, ``"MANHATTAN"``,
    ``"QUEENS"`` and ``"STATEN ISLAND"`` (exact, upper-case match). Anything
    else, including the empty string and ``None``, is mapped to ``"UNKNOWN"``.
    """
    UNKNOWN = "UNKNOWN"
    values = ("BRONX", "BROOKLYN", "MANHATTAN", "QUEENS", "STATEN ISLAND")

    try:
        is_known = sourceValue in values
    except Exception:
        # Best effort: an incomparable value counts as unknown. Unlike a bare
        # except, this does not swallow KeyboardInterrupt or SystemExit.
        is_known = False

    return sourceValue if is_known else UNKNOWN

    
ds_full = ds_full.update('BORO_NM', validate_boro_nm)

In [41]:
ds_full.distinct('LOC_OF_OCCUR_DESC')

Counter({'': 1543800,
         'FRONT OF': 1727410,
         'REAR OF': 157302,
         'OUTSIDE': 3813,
         'INSIDE': 3747880,
         'OPPOSITE OF': 195788})

In [42]:
def validate_Loc_of_occur_desc(sourceValue):
    """Return ``sourceValue`` if it is a known location descriptor, else ``"UNKNOWN"``.

    Accepted values are ``"FRONT OF"``, ``"REAR OF"``, ``"OUTSIDE"``,
    ``"INSIDE"`` and ``"OPPOSITE OF"``. Anything else, including the empty
    string and ``None``, is mapped to ``"UNKNOWN"``.
    """
    UNKNOWN = "UNKNOWN"
    values = ("FRONT OF", "REAR OF", "OUTSIDE", "INSIDE", "OPPOSITE OF")

    try:
        return sourceValue if sourceValue in values else UNKNOWN
    except Exception:
        # Best effort: an incomparable value counts as unknown. Narrowed from a
        # bare except so KeyboardInterrupt/SystemExit are no longer swallowed.
        return UNKNOWN

    
ds_full = ds_full.update('LOC_OF_OCCUR_DESC', validate_Loc_of_occur_desc)

In [43]:
profiles.column('PREM_TYP_DESC').get('topValues')

[('STREET', 2352311),
 ('RESIDENCE - APT. HOUSE', 1564899),
 ('RESIDENCE-HOUSE', 725940),
 ('RESIDENCE - PUBLIC HOUSING', 553247),
 ('OTHER', 199770),
 ('COMMERCIAL BUILDING', 190978),
 ('CHAIN STORE', 167139),
 ('TRANSIT - NYC SUBWAY', 160448),
 ('DEPARTMENT STORE', 144809),
 ('GROCERY/BODEGA', 94682)]

In [44]:
ds_full.distinct('PREM_TYP_DESC')

Counter({'STREET': 2352311,
         '': 40745,
         'RESIDENCE - APT. HOUSE': 1564899,
         'DEPARTMENT STORE': 144809,
         'OTHER': 199770,
         'RESIDENCE-HOUSE': 725940,
         'RESIDENCE - PUBLIC HOUSING': 553247,
         'ABANDONED BUILDING': 2827,
         'HOSPITAL': 35193,
         'DRUG STORE': 70116,
         'CEMETERY': 854,
         'PUBLIC BUILDING': 43986,
         'FOOD SUPERMARKET': 40053,
         'COMMERCIAL BUILDING': 190978,
         'GYM/FITNESS FACILITY': 22943,
         'CHAIN STORE': 167139,
         'BUS STOP': 6212,
         'GROCERY/BODEGA': 94682,
         'FAST FOOD': 36335,
         'STORE UNCLASSIFIED': 31669,
         'HOTEL/MOTEL': 33547,
         'RESTAURANT/DINER': 85640,
         'BANK': 30868,
         'BRIDGE': 7743,
         'SHOE': 5142,
         'PARKING LOT/GARAGE (PUBLIC)': 40170,
         'CHURCH': 12912,
         'ATM': 6088,
         'HIGHWAY/PARKWAY': 21915,
         'PUBLIC SCHOOL': 85002,
         'PARK/PLAYGROUND': 

In [45]:
profiles.column('JURIS_DESC').get('topValues')

[('N.Y. POLICE DEPT', 6556968),
 ('N.Y. HOUSING POLICE', 562181),
 ('N.Y. TRANSIT POLICE', 163810),
 ('PORT AUTHORITY', 34258),
 ('OTHER', 24147),
 ('DEPT OF CORRECTIONS', 9638),
 ('POLICE DEPT NYC', 8955),
 ('TRI-BORO BRDG TUNNL', 5677),
 ('HEALTH & HOSP CORP', 3692),
 ('N.Y. STATE POLICE', 1883)]

In [46]:
ds_full.distinct('JURIS_DESC')

Counter({'N.Y. POLICE DEPT': 6556968,
         'METRO NORTH': 822,
         'N.Y. HOUSING POLICE': 562181,
         'N.Y. STATE POLICE': 1883,
         'DEPT OF CORRECTIONS': 9638,
         'OTHER': 24147,
         'HEALTH & HOSP CORP': 3692,
         'N.Y. TRANSIT POLICE': 163810,
         'PORT AUTHORITY': 34258,
         'U.S. PARK POLICE': 282,
         'STATN IS RAPID TRANS': 439,
         'NYC PARKS': 1125,
         'TRI-BORO BRDG TUNNL': 5677,
         'NEW YORK CITY SHERIFF OFFICE': 279,
         'N.Y. STATE PARKS': 456,
         'LONG ISLAND RAILRD': 548,
         'NYS DEPT TAX AND FINANCE': 106,
         'CONRAIL': 16,
         'AMTRACK': 188,
         'FIRE DEPT (FIRE MARSHAL)': 477,
         'POLICE DEPT NYC': 8955,
         'SEA GATE POLICE DEPT': 30,
         'NYC DEPT ENVIRONMENTAL PROTECTION': 14,
         'NYS DEPT ENVIRONMENTAL CONSERVATION': 1,
         'DISTRICT ATTORNEY OFFICE': 1})

In [47]:
profiles.column('PARKS_NM').get('topValues')

[('NA', 4118985),
 ('CENTRAL PARK', 1641),
 ('FLUSHING MEADOWS CORONA PARK', 1315),
 ('CONEY ISLAND BEACH & BOARDWALK', 1060),
 ('WASHINGTON SQUARE PARK', 778),
 ('RIVERSIDE PARK', 614),
 ('PROSPECT PARK', 536),
 ('UNION SQUARE PARK', 534),
 ('MARCUS GARVEY PARK', 433),
 ("RANDALL'S ISLAND PARK", 413)]

In [48]:
ds_full.distinct('PARKS_NM')

Counter({'': 3229345,
         'NA': 4118985,
         'GRAND ARMY PLAZA BROOKLYN': 11,
         'SUNSET PARK': 226,
         'HAPPY WARRIOR PLAYGROUND': 15,
         'POE PARK': 33,
         'BRYANT PARK': 299,
         'HUDSON RIVER PARK': 144,
         'CITY HALL PARK': 20,
         'SARA D. ROOSEVELT PARK': 345,
         'BELT PARKWAY/SHORE PARKWAY': 81,
         'HIGHBRIDGE PARK MANHATTAN SIDE': 248,
         'ASTORIA PARK': 115,
         'QUEENSBRIDGE PARK': 21,
         'KISSENA PARK': 77,
         'FLUSHING MEADOWS CORONA PARK': 1315,
         'MCCARREN PARK': 194,
         'HOLCOMBE RUCKER PARK': 23,
         'CONFERENCE HOUSE PARK': 18,
         'MARCUS GARVEY PARK': 433,
         'CENTRAL PARK': 1641,
         'VAN CORTLANDT PARK': 258,
         'HARLEM RIVER PARK': 159,
         'CALLAHAN-KELLY PLAYGROUND': 30,
         'RICHMAN (ECHO) PARK': 214,
         'FORT TRYON PARK': 105,
         'UNION SQUARE PARK': 534,
         'WASHINGTON SQUARE PARK': 778,
         'ST. VARTAN

In [49]:
profiles.column('HADEVELOPT').get('topValues')

[('CASTLE HILL', 7555),
 ('VAN DYKE I', 6108),
 ('MARCY', 5597),
 ('BUTLER', 5231),
 ('GRANT', 5202),
 ('LINCOLN', 5076),
 ('LINDEN', 5039),
 ('DOUGLASS', 4879),
 ('PINK', 4861),
 ('FARRAGUT', 4753)]

In [50]:
ds_full.distinct('HADEVELOPT')

Counter({'': 7029181,
         'LA GUARDIA': 2715,
         'BOSTON SECOR': 1694,
         'WILLIAMSBURG': 3627,
         'SACK WERN': 1323,
         'HAMMEL': 1683,
         'COOPER PARK': 2182,
         'WOODSON': 334,
         'ARMSTRONG I': 1818,
         'WALD': 3074,
         'RED HOOK EAST': 3124,
         'REDFERN': 1927,
         'CLAREMONT PARKWAY-FRANKLIN AVENUE': 258,
         'RICHMOND TERRACE': 2178,
         'BORINQUEN PLAZA I': 2056,
         'ASTORIA': 2101,
         'DOUGLASS': 4879,
         'WEST FARMS ROAD REHAB': 316,
         'CASTLE HILL': 7555,
         'CARLETON MANOR': 551,
         'BUSHWICK II (GROUPS A & C)': 771,
         'WAGNER': 1987,
         'TILDEN': 4035,
         'ATLANTIC TERMINAL SITE 4B': 832,
         'TOMPKINS': 4069,
         'CARVER': 729,
         'PARK ROCK REHAB': 458,
         'HUGHES APARTMENTS': 2292,
         'TODT HILL': 1196,
         'BAISLEY PARK': 496,
         'BEACH 41ST STREET-BEACH CHANNEL DRIVE': 781,
         'BORINQUEN PL

In [51]:
profiles.minmax('X_COORD_CD')

Unnamed: 0,min,max
int,111,1067298


In [52]:
profiles.minmax('Y_COORD_CD')

Unnamed: 0,min,max
int,111,7250292


In [53]:
profiles.minmax('Latitude')

Unnamed: 0,min,max
float,40.11271,59.657274


In [54]:
def validate_lat(sourceValue):
    """Keep a latitude only if it lies in the closed range [40, 41].

    Parameters
    ----------
    sourceValue : str or number
        Raw cell value. Values read from a CSV stream arrive as text, so the
        value is converted with ``float()`` before the range check; comparing
        a string with a number directly raises ``TypeError``, which previously
        turned every textual latitude into the sentinel.

    Returns
    -------
    The original ``sourceValue`` (unchanged, same type) when it is in range,
    otherwise the sentinel ``-999``. Empty, missing, non-numeric and NaN
    values all yield the sentinel.
    """
    UNKNOWN = -999

    try:
        latitude = float(sourceValue)
    except (TypeError, ValueError):
        # TypeError: None or another non-numeric object; ValueError: '' or bad text.
        return UNKNOWN

    # NaN fails both comparisons and therefore falls through to UNKNOWN.
    if 40 <= latitude <= 41:
        return sourceValue
    return UNKNOWN

    
ds_full = ds_full.update('Latitude', validate_lat)

In [55]:
profiles.minmax('Longitude')

Unnamed: 0,min,max
float,-77.519206,-73.684788


In [56]:
def validate_long(sourceValue):
    """Keep a longitude only if it lies in the closed range [-74, -73].

    Parameters
    ----------
    sourceValue : str or number
        Raw cell value. Values read from a CSV stream arrive as text, so the
        value is converted with ``float()`` before the range check; comparing
        a string with a number directly raises ``TypeError``, which previously
        turned every textual longitude into the sentinel.

    Returns
    -------
    The original ``sourceValue`` (unchanged, same type) when it is in range,
    otherwise the sentinel ``-999``. Empty, missing, non-numeric and NaN
    values all yield the sentinel.
    """
    UNKNOWN = -999

    try:
        longitude = float(sourceValue)
    except (TypeError, ValueError):
        # TypeError: None or another non-numeric object; ValueError: '' or bad text.
        return UNKNOWN

    # NOTE(review): the bounds are kept exactly as originally written. Parts of
    # New York City appear to lie west of -74 (e.g. Staten Island) — confirm
    # that [-74, -73] is really the intended window before relying on it.
    if -74 <= longitude <= -73:
        return sourceValue
    return UNKNOWN

    
# Apply the longitude check to the 'Longitude' column. Running it on 'Latitude'
# would reject every latitude (none lies in the longitude range) and leave
# 'Longitude' unvalidated.
ds_full = ds_full.update('Longitude', validate_long)

In [57]:
profiles.minmax('Lat_Lon')

Unnamed: 0,min,max
str,"(40.112709974, -77.519206334)","(59.657273946, -73.872926184)"


In [58]:
# Columns to keep in the pipeline output, listed in output order.
SELECTED_COLUMNS = [
    # complaint identifier and occurrence window
    'CMPLNT_NUM', 'CMPLNT_FR_DT', 'CMPLNT_FR_TM', 'CMPLNT_TO_DT', 'CMPLNT_TO_TM',
    # precinct and report date
    'ADDR_PCT_CD', 'RPT_DT',
    # offense classification
    'KY_CD', 'OFNS_DESC', 'PD_CD', 'PD_DESC', 'CRM_ATPT_CPTD_CD', 'LAW_CAT_CD',
    # location descriptors
    'BORO_NM', 'LOC_OF_OCCUR_DESC', 'PREM_TYP_DESC', 'JURIS_DESC',
    'PARKS_NM', 'HADEVELOPT',
    # coordinates
    'X_COORD_CD', 'Y_COORD_CD', 'Latitude', 'Longitude', 'Lat_Lon',
]

ds_full = ds_full.select(SELECTED_COLUMNS)

In [59]:
ds_full

<openclean.pipeline.DataPipeline at 0x7fe8b5683310>

# VALIDATING DATA AFTER CLEANING

In [60]:
# NOTE(review): this loads the raw downloaded CSV again (same file name as fn_dst).
# The updates chained onto the openclean pipeline `ds_full` earlier in this notebook
# are not written out in any visible cell, so `data_df` presumably does not include
# them — confirm that validating the raw file here is intended.
data_df = pd.read_csv("NYPD_Complaint_Map.csv")

In [61]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7375993 entries, 0 to 7375992
Data columns (total 24 columns):
 #   Column             Dtype  
---  ------             -----  
 0   CMPLNT_NUM         int64  
 1   CMPLNT_FR_DT       object 
 2   CMPLNT_FR_TM       object 
 3   CMPLNT_TO_DT       object 
 4   CMPLNT_TO_TM       object 
 5   ADDR_PCT_CD        float64
 6   RPT_DT             object 
 7   KY_CD              int64  
 8   OFNS_DESC          object 
 9   PD_CD              float64
 10  PD_DESC            object 
 11  CRM_ATPT_CPTD_CD   object 
 12  LAW_CAT_CD         object 
 13  BORO_NM            object 
 14  LOC_OF_OCCUR_DESC  object 
 15  PREM_TYP_DESC      object 
 16  JURIS_DESC         object 
 17  PARKS_NM           object 
 18  HADEVELOPT         object 
 19  X_COORD_CD         float64
 20  Y_COORD_CD         float64
 21  Latitude           float64
 22  Longitude          float64
 23  Lat_Lon            object 
dtypes: float64(6), int64(2), object(16)
memory usage: 

In [62]:
ds_full_jc = stream(data_df)
profiles = ds_full_jc.profile(default_profiler=DefaultColumnProfiler)
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
CMPLNT_NUM,7375993,0,7373143,0.9996136,22.813633
CMPLNT_FR_DT,7375993,655,8606,0.001166862,12.425578
CMPLNT_FR_TM,7375993,48,1441,0.0001953648,8.136466
CMPLNT_TO_DT,7375993,1704204,6825,0.001203324,12.417984
CMPLNT_TO_TM,7375993,1699541,1441,0.0002538558,8.862856
ADDR_PCT_CD,7375993,2166,77,1.044234e-05,6.14869
RPT_DT,7375993,0,5479,0.0007428152,12.405384
KY_CD,7375993,0,74,1.003255e-05,4.170727
OFNS_DESC,7375993,18823,71,9.65045e-06,4.006583
PD_CD,7375993,6278,432,5.861828e-05,5.913459


In [63]:
data_df['CMPLNT_TO_DT'].unique()

array([nan, '12/29/2019', '12/27/2019', ..., '08/09/1983', '09/05/2000',
       '07/22/2002'], dtype=object)

In [64]:
def validate_CMPLNT_TO_DT(data,column_name):
    """Replace missing values in ``data[column_name]`` with the string 'UNKNOWN'.

    The column is updated on ``data`` itself and the same DataFrame is returned.
    """
    # Assign the filled column back to the frame. Calling an in-place method on
    # the data[column_name] selection is chained assignment, which is not
    # guaranteed to reach ``data`` (it does not under pandas Copy-on-Write).
    data[column_name] = data[column_name].fillna('UNKNOWN')
    return data
data_df = validate_CMPLNT_TO_DT(data_df,'CMPLNT_TO_DT')

In [65]:
data_df['CMPLNT_TO_TM'].unique()

array([nan, '16:54:00', '20:15:00', ..., '06:51:00', '05:47:00',
       '24:00:00'], dtype=object)

In [66]:
def validate_CMPLNT_TO_TM(data,column_name):
    """Replace missing values in ``data[column_name]`` with the string 'UNKNOWN'.

    The column is updated on ``data`` itself and the same DataFrame is returned.
    """
    # Assign the filled column back to the frame. Calling an in-place method on
    # the data[column_name] selection is chained assignment, which is not
    # guaranteed to reach ``data`` (it does not under pandas Copy-on-Write).
    data[column_name] = data[column_name].fillna('UNKNOWN')
    return data
data_df = validate_CMPLNT_TO_TM(data_df,'CMPLNT_TO_TM')

In [67]:
data_df['LOC_OF_OCCUR_DESC'].unique()

array([nan, 'FRONT OF', 'REAR OF', 'OUTSIDE', 'INSIDE', 'OPPOSITE OF'],
      dtype=object)

In [68]:
def validate_LOC_OF_OCCUR_DESC(data,column_name):
    """Replace missing values in ``data[column_name]`` with the string 'UNKNOWN'.

    The column is updated on ``data`` itself and the same DataFrame is returned.
    """
    # Assign the filled column back to the frame. Calling an in-place method on
    # the data[column_name] selection is chained assignment, which is not
    # guaranteed to reach ``data`` (it does not under pandas Copy-on-Write).
    data[column_name] = data[column_name].fillna('UNKNOWN')
    return data
data_df = validate_LOC_OF_OCCUR_DESC(data_df,'LOC_OF_OCCUR_DESC')

In [69]:
data_df = data_df.drop(['PARKS_NM','HADEVELOPT'],axis=1)

In [70]:
data_df = data_df.dropna()

In [71]:
ds_full_jc = stream(data_df)
profiles = ds_full_jc.profile(default_profiler=DefaultColumnProfiler)
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
CMPLNT_NUM,7294765,0,7291946,0.9996136,22.797657
CMPLNT_FR_DT,7294765,0,8489,0.001163711,12.423801
CMPLNT_FR_TM,7294765,0,1441,0.0001975389,8.132579
CMPLNT_TO_DT,7294765,0,6808,0.000933272,10.341031
CMPLNT_TO_TM,7294765,0,1442,0.000197676,7.609011
ADDR_PCT_CD,7294765,0,77,1.055551e-05,6.147985
RPT_DT,7294765,0,5479,0.0007510866,12.403614
KY_CD,7294765,0,73,1.000718e-05,4.149982
OFNS_DESC,7294765,0,70,9.595923e-06,3.999898
PD_CD,7294765,0,431,5.908347e-05,5.901381


In [72]:
data_df.to_csv('filtered3.csv')