#### Import dependencies:

In [1]:
import sys #access to system parameters https://docs.python.org/3/library/sys.html
print("Python version: {}". format(sys.version))

import pandas as pd #collection of functions for data processing and analysis modeled after R dataframes with SQL like features
print("pandas version: {}". format(pd.__version__))

import matplotlib #collection of functions for scientific and publication-ready visualization
import matplotlib as plt
print("matplotlib version: {}". format(matplotlib.__version__))

import numpy as np #foundational package for scientific computing
print("NumPy version: {}". format(np.__version__))

import scipy as sp #collection of functions for scientific computing and advance mathematics
print("SciPy version: {}". format(sp.__version__)) 

import IPython
from IPython import display #pretty printing of dataframes in Jupyter notebook
print("IPython version: {}". format(IPython.__version__)) 

import sklearn #collection of machine learning algorithms
print("scikit-learn version: {}". format(sklearn.__version__))

#misc libraries
import random
import time
#ignore warnings
import warnings
warnings.filterwarnings('ignore')
print('-'*25)
import re
import os
import matplotlib.pyplot as plt
%matplotlib inline

Python version: 3.8.8 (default, Apr 13 2021, 12:59:45) 
[Clang 10.0.0 ]
pandas version: 1.2.4
matplotlib version: 3.3.4
NumPy version: 1.20.1
SciPy version: 1.6.2
IPython version: 7.29.0
scikit-learn version: 0.24.1
-------------------------


#### Download data file if it has not been downloaded already:

In [3]:
#url = 'https://data.cityofnewyork.us/api/views/qgea-i56i/rows.csv?accessType=DOWNLOAD'
fn_src = 'https://data.cityofnewyork.us/api/views/5ucz-vwe8/rows.csv?accessType=DOWNLOAD'
fn_dst = 'NYPD-Shooting_Incident_Data-YTD.csv'

from six.moves import urllib

if os.path.isfile(fn_dst):
    print('File %s has already been downloaded' % fn_dst)
else:
    print('Fetching file %s[2.4GB]. This may take a while...' % fn_dst)
    urllib.request.urlretrieve(fn_src, fn_dst)
    print('File %s has been downloaded' % fn_dst)

Fetching file NYPD-Shooting_Incident_Data-YTD.csv[2.4GB]. This may take a while...
File NYPD-Shooting_Incident_Data-YTD.csv has been downloaded


#### Open a stream to the data file so we don't have to load the whole data set into main memory

In [4]:
from openclean.pipeline import stream
ds_full = stream(fn_dst)

In [5]:
print(f'The dataset contains {ds_full.count():,} rows.')

The dataset contains 1,531 rows.


In [6]:
ds_full.head()

Unnamed: 0,INCIDENT_KEY,OCCUR_DATE,OCCUR_TIME,BORO,PRECINCT,JURISDICTION_CODE,LOCATION_DESC,STATISTICAL_MURDER_FLAG,PERP_AGE_GROUP,PERP_SEX,PERP_RACE,VIC_AGE_GROUP,VIC_SEX,VIC_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,New Georeferenced Column
0,230162224,06/27/2021,04:34:00,BROOKLYN,75,0,,False,,,,<18,M,BLACK,1014210,182758,40.668262749000064,-73.89200350499993,POINT (-73.89200350499993 40.668262749000064)
1,228561932,05/21/2021,21:06:00,BROOKLYN,77,0,,False,<18,M,BLACK,25-44,F,WHITE,994341,185941,40.67704431000004,-73.96362125499998,POINT (-73.96362125499998 40.677044310000035)
2,233790860,09/18/2021,23:17:00,BRONX,44,0,,True,,,,18-24,M,BLACK HISPANIC,1010270,246284,40.84263777600006,-73.90596032999997,POINT (-73.90596032999997 40.84263777600006)
3,225295736,03/07/2021,06:15:00,BROOKLYN,75,0,,False,25-44,M,WHITE HISPANIC,25-44,F,WHITE HISPANIC,1020492,187865,40.68225681500007,-73.86933111399996,POINT (-73.86933111399996 40.68225681500007)
4,227647465,05/01/2021,00:22:00,BRONX,46,0,,False,25-44,M,BLACK,25-44,M,BLACK,1011829,249524,40.85152588600005,-73.90031249299993,POINT (-73.90031249299993 40.85152588600005)
5,222848394,01/08/2021,05:50:00,MANHATTAN,23,2,MULTI DWELL - PUBLIC HOUS,False,25-44,M,WHITE HISPANIC,25-44,M,WHITE HISPANIC,998481,225704,40.78617829400008,-73.94861158899994,POINT (-73.94861158899994 40.78617829400008)
6,229034592,06/02/2021,14:30:00,BRONX,48,0,,False,,,,25-44,M,WHITE HISPANIC,1014004,247798,40.84678146700002,-73.89245825499995,POINT (-73.89245825499995 40.84678146700002)
7,225168411,03/04/2021,11:40:00,BRONX,49,0,,False,25-44,M,BLACK HISPANIC,25-44,M,BLACK HISPANIC,1021657,251013,40.85557657700008,-73.86477926499998,POINT (-73.86477926499998 40.85557657700008)
8,224173086,02/09/2021,19:50:00,QUEENS,113,0,PVT HOUSE,False,,,,25-44,M,BLACK,1052651,193294,40.696968027000025,-73.75332736499998,POINT (-73.75332736499998 40.696968027000025)
9,230232277,06/29/2021,15:32:00,MANHATTAN,34,0,HOSPITAL,False,,,,18-24,M,BLACK,1008527,257284,40.872834434000026,-73.91222001499993,POINT (-73.91222001499993 40.872834434000026)


#### Use openclean to profile the data set:

In [7]:
from openclean.profiling.column import DefaultColumnProfiler

#profiles = ds.profile(default_profiler=DefaultColumnProfiler)
profiles = ds_full.profile(default_profiler=DefaultColumnProfiler)

In [8]:
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
INCIDENT_KEY,1531,0,1195,0.780536,10.019048
OCCUR_DATE,1531,0,265,0.173089,7.764277
OCCUR_TIME,1531,0,637,0.416068,8.880947
BORO,1531,0,5,0.003266,1.997318
PRECINCT,1531,0,71,0.046375,5.522855
JURISDICTION_CODE,1531,1,3,0.001961,0.64698
LOCATION_DESC,1531,1045,14,0.028807,2.330563
STATISTICAL_MURDER_FLAG,1531,0,2,0.001306,0.730723
PERP_AGE_GROUP,1531,811,5,0.006944,1.604112
PERP_SEX,1531,811,2,0.002778,0.210842


In [9]:
# Print the most frequent data type for each column.

print('Schema\n------')
for col in ds_full.columns:
    p = profiles.column(col)
    print("  '{}' ({})".format(col, p['datatypes']['distinct'].most_common(1)[0][0]))

Schema
------
  'INCIDENT_KEY' (int)
  'OCCUR_DATE' (date)
  'OCCUR_TIME' (date)
  'BORO' (str)
  'PRECINCT' (int)
  'JURISDICTION_CODE' (int)
  'LOCATION_DESC' (str)
  'STATISTICAL_MURDER_FLAG' (str)
  'PERP_AGE_GROUP' (str)
  'PERP_SEX' (str)
  'PERP_RACE' (str)
  'VIC_AGE_GROUP' (str)
  'VIC_SEX' (str)
  'VIC_RACE' (str)
  'X_COORD_CD' (int)
  'Y_COORD_CD' (int)
  'Latitude' (float)
  'Longitude' (float)
  'New Georeferenced Column' (str)


## Profiling & Cleaning of each field in the data set

In [10]:
profiles.minmax('INCIDENT_KEY')

Unnamed: 0,min,max
int,222524732,234323835


In [11]:
profiles.minmax('OCCUR_DATE')

Unnamed: 0,min,max
date,2021-01-01,2021-09-30


In [12]:
from datetime import datetime
#datetime.strptime('2014-12-04', '%Y-%m-%d').date()


def validate_date(sourceValue):
    dummy_date = datetime.strptime("01/01/1000", '%m/%d/%Y').date()
    
    try:
        source_date = sourceValue
        start_date = datetime.strptime("01/01/2006", '%m/%d/%Y').date()
        end_date = datetime.strptime("12/31/2020", '%m/%d/%Y').date()
    
        if source_date == '':
            source_date = dummy_date
        
        source_date = datetime.strptime(sourceValue, '%m/%d/%Y').date()
        
        if source_date >= start_date and source_date <= end_date:
            return source_date
        else:
            return dummy_date
    except:
        return dummy_date
    

ds_full = ds_full.update('OCCUR_DATE', validate_date)

In [13]:
profiles.minmax('OCCUR_TIME')

Unnamed: 0,min,max
date,2021-12-12,2021-12-12 23:57:00


In [14]:
profiles.minmax('PRECINCT')

Unnamed: 0,min,max
int,5,122


In [15]:
profiles.column('PRECINCT').get('topValues')

[('44', 82),
 ('75', 78),
 ('42', 68),
 ('47', 66),
 ('40', 61),
 ('73', 61),
 ('48', 59),
 ('32', 51),
 ('34', 49),
 ('43', 49)]

In [16]:
profiles.column('LOCATION_DESC').get('topValues')

[('MULTI DWELL - PUBLIC HOUS', 250),
 ('MULTI DWELL - APT BUILD', 99),
 ('GROCERY/BODEGA', 38),
 ('COMMERCIAL BLDG', 28),
 ('PVT HOUSE', 16),
 ('BAR/NIGHT CLUB', 15),
 ('GAS STATION', 8),
 ('HOSPITAL', 7),
 ('HOTEL/MOTEL', 7),
 ('RESTAURANT/DINER', 6)]

In [17]:
ds_full.distinct('LOCATION_DESC')

Counter({'': 1045,
         'MULTI DWELL - PUBLIC HOUS': 250,
         'PVT HOUSE': 16,
         'HOSPITAL': 7,
         'MULTI DWELL - APT BUILD': 99,
         'GROCERY/BODEGA': 38,
         'GAS STATION': 8,
         'BAR/NIGHT CLUB': 15,
         'COMMERCIAL BLDG': 28,
         'DEPT STORE': 4,
         'HOTEL/MOTEL': 7,
         'BEAUTY/NAIL SALON': 5,
         'RESTAURANT/DINER': 6,
         'BANK': 2,
         'FAST FOOD': 1})

In [18]:
ds_full.distinct('STATISTICAL_MURDER_FLAG')

Counter({'false': 1218, 'true': 313})

In [20]:
def validate_crime_indicator(sourceValue):
    UNKNOWN = "3"
    
    try:
        crime_indicator_list = ["0", "1", "2"]
        
        if sourceValue in crime_indicator_list:
            return sourceValue
        else:
            return UNKNOWN
    except:
        return UNKNOWN

    
ds_full = ds_full.update('JURISDICTION_CODE', validate_crime_indicator)

In [21]:
ds_full.distinct('JURISDICTION_CODE')

Counter({'0': 1281, '2': 248, '1': 1, '3': 1})

In [22]:
profiles.minmax('X_COORD_CD')

Unnamed: 0,min,max
int,938074,1059372


In [23]:
profiles.minmax('Y_COORD_CD')

Unnamed: 0,min,max
int,148363,269635


In [24]:
profiles.column('PERP_AGE_GROUP').get('topValues')

[('25-44', 366), ('18-24', 242), ('<18', 76), ('45-64', 33), ('65+', 3)]

In [27]:
ds_full.distinct('PERP_AGE_GROUP')

Counter({'UNKNOWN': 811,
         '<18': 76,
         '25-44': 366,
         '18-24': 242,
         '45-64': 33,
         '65+': 3})

In [26]:
def validate_age_group(sourceValue):
    UNKNOWN = "UNKNOWN"
    
    try:
        values = ["25-44", "18-24", "45-64", "<18", "65+"]
        
        if sourceValue in values:
            return sourceValue
        else:
            return UNKNOWN
    except:
        return UNKNOWN

    
ds_full = ds_full.update('PERP_AGE_GROUP', validate_age_group)

In [28]:
profiles.column('PERP_RACE').get('topValues')

[('BLACK', 479),
 ('WHITE HISPANIC', 136),
 ('BLACK HISPANIC', 82),
 ('WHITE', 12),
 ('ASIAN / PACIFIC ISLANDER', 11)]

In [31]:
ds_full.distinct('PERP_RACE')

Counter({'UNKNOWN': 811,
         'BLACK': 479,
         'WHITE HISPANIC': 136,
         'BLACK HISPANIC': 82,
         'WHITE': 12,
         'ASIAN / PACIFIC ISLANDER': 11})

In [30]:
def validate_race(sourceValue):
    UNKNOWN = "UNKNOWN"
    
    try:
        values = ["WHITE HISPANIC", "BLACK", "BLACK HISPANIC", "WHITE", "ASIAN / PACIFIC ISLANDER"]
        
        if sourceValue in values:
            return sourceValue
        else:
            return UNKNOWN
    except:
        return UNKNOWN

    
ds_full = ds_full.update('PERP_RACE', validate_race)

In [32]:
profiles.column('PERP_SEX').get('topValues')

[('M', 696), ('F', 24)]

In [35]:
ds_full.distinct('PERP_SEX')

Counter({'U': 811, 'M': 696, 'F': 24})

In [34]:
def validate_sex(sourceValue):
    UNKNOWN = "U"
    
    try:
        values = ["M", "F"] 
        
        if sourceValue in values:
            return sourceValue
        else:
            return UNKNOWN
    except:
        return UNKNOWN

    
ds_full = ds_full.update('PERP_SEX', validate_sex)

In [36]:
profiles.minmax('Latitude')

Unnamed: 0,min,max
float,40.573906,40.906668


In [37]:
profiles.minmax('Longitude')

Unnamed: 0,min,max
float,-74.166325,-73.729056


In [39]:
profiles.minmax('New Georeferenced Column')

Unnamed: 0,min,max
str,POINT (-73.72905567199997 40.70526581200004),POINT (-74.16632541599995 40.61675307900003)


In [40]:
profiles.column('BORO').get('topValues')

[('BRONX', 536),
 ('BROOKLYN', 488),
 ('MANHATTAN', 259),
 ('QUEENS', 220),
 ('STATEN ISLAND', 28)]

In [41]:
ds_full.distinct('BORO')

Counter({'BROOKLYN': 488,
         'BRONX': 536,
         'MANHATTAN': 259,
         'QUEENS': 220,
         'STATEN ISLAND': 28})

In [42]:
profiles.column('VIC_AGE_GROUP').get('topValues')

[('25-44', 821),
 ('18-24', 461),
 ('<18', 126),
 ('45-64', 109),
 ('65+', 11),
 ('UNKNOWN', 3)]

In [43]:
ds_full.distinct('VIC_AGE_GROUP')

Counter({'<18': 126,
         '25-44': 821,
         '18-24': 461,
         '45-64': 109,
         'UNKNOWN': 3,
         '65+': 11})

In [44]:
ds_full = ds_full.update('VIC_AGE_GROUP', validate_age_group)

In [45]:
profiles.column('VIC_RACE').get('topValues')

[('BLACK', 1076),
 ('WHITE HISPANIC', 221),
 ('BLACK HISPANIC', 188),
 ('WHITE', 26),
 ('ASIAN / PACIFIC ISLANDER', 19),
 ('UNKNOWN', 1)]

In [46]:
ds_full.distinct('VIC_RACE')

Counter({'BLACK': 1076,
         'WHITE': 26,
         'BLACK HISPANIC': 188,
         'WHITE HISPANIC': 221,
         'ASIAN / PACIFIC ISLANDER': 19,
         'UNKNOWN': 1})

In [47]:
ds_full = ds_full.update('VIC_RACE', validate_race)

In [48]:
profiles.column('VIC_SEX').get('topValues')

[('M', 1369), ('F', 161), ('U', 1)]

In [43]:
ds_full.distinct('VIC_SEX')

Counter({'F': 2204, 'M': 21370, 'U': 11})

In [44]:
ds_full = ds_full.update('VIC_SEX', validate_sex)

In [45]:
SELECTED_COLUMNS = [\
    'INCIDENT_KEY',\
    'OCCUR_DATE',\
    'OCCUR_TIME',\
    'BORO',\
    'PRECINCT',\
    'JURISDICTION_CODE',\
    'LOCATION_DESC',\
    'STATISTICAL_MURDER_FLAG',\
    'PERP_AGE_GROUP',\
    'PERP_SEX',\
    'PERP_RACE',\
    'VIC_AGE_GROUP',\
    'VIC_RACE',\
    'VIC_SEX',\
    'X_COORD_CD',\
    'Y_COORD_CD',\
    'Latitude',\
    'Longitude',\
    'Lon_Lat',\
]

ds_full = ds_full.select(SELECTED_COLUMNS)

In [46]:
ds_full.head()

Unnamed: 0,INCIDENT_KEY,OCCUR_DATE,OCCUR_TIME,BORO,PRECINCT,JURISDICTION_CODE,LOCATION_DESC,STATISTICAL_MURDER_FLAG,PERP_AGE_GROUP,PERP_SEX,PERP_RACE,VIC_AGE_GROUP,VIC_RACE,VIC_SEX,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Lon_Lat
0,24050482,2006-08-27,05:35:00,BRONX,52,0,,True,UNKNOWN,U,UNKNOWN,25-44,BLACK HISPANIC,F,1017541.5625,255918.875,40.86905819000003,-73.87963173099998,POINT (-73.87963173099996 40.86905819000003)
1,77673979,2011-03-11,12:03:00,QUEENS,106,0,,False,UNKNOWN,U,UNKNOWN,65+,WHITE,M,1027543.0,186095.0,40.677366895000034,-73.84392019199998,POINT (-73.84392019199998 40.677366895000034)
2,203350417,2019-10-06,01:09:00,BROOKLYN,77,0,,False,UNKNOWN,U,UNKNOWN,18-24,BLACK,F,995325.0,185155.0,40.674885741000026,-73.96007501899999,POINT (-73.96007501899999 40.674885741000026)
3,80584527,2011-09-04,03:35:00,BRONX,40,0,,False,UNKNOWN,U,UNKNOWN,<18,BLACK,M,1007453.0,233952.0,40.80879780500004,-73.91618413199996,POINT (-73.91618413199996 40.80879780500004)
4,90843766,2013-05-27,21:16:00,QUEENS,100,0,,False,UNKNOWN,U,UNKNOWN,18-24,BLACK,M,1041267.375,157133.515625,40.597796249000055,-73.79468553799995,POINT (-73.79468553799995 40.597796249000055)
5,92393427,2013-09-01,04:17:00,BROOKLYN,67,0,,False,UNKNOWN,U,UNKNOWN,<18,BLACK,M,1001693.9375,170112.890625,40.63358818100005,-73.93715330699996,POINT (-73.93715330699996 40.63358818100005)
6,73057167,2010-06-05,21:16:00,BROOKLYN,77,0,,False,UNKNOWN,U,UNKNOWN,<18,BLACK,M,1001506.75,184055.65625,40.67185839800004,-73.93779204599997,POINT (-73.93779204599997 40.67185839800004)
7,211362213,2020-03-20,21:27:00,BROOKLYN,81,0,,False,UNKNOWN,U,UNKNOWN,25-44,BLACK,M,1001586.375,189839.03125,40.68773229400006,-73.93749012499995,POINT (-73.93749012499995 40.68773229400006)
8,137564752,2014-07-04,00:25:00,QUEENS,101,0,,False,UNKNOWN,U,UNKNOWN,18-24,BLACK,M,1051520.375,155790.40625,40.59403780700006,-73.757778692,POINT (-73.75777869199999 40.59403780700006)
9,147024011,2015-10-18,01:33:00,QUEENS,106,0,,False,UNKNOWN,U,UNKNOWN,18-24,BLACK,M,1038231.3125,184701.421875,40.67348312300004,-73.80539763499996,POINT (-73.80539763499996 40.67348312300004)


# VALIDATING DATA AFTER CLEANING

In [47]:
data_df = pd.read_csv("NYPD-Shooting_Incident_Data.csv")

In [61]:

pd.to_numeric(data_df["JURISDICTION_CODE"])

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
23580    0.0
23581    0.0
23582    0.0
23583    0.0
23584    0.0
Name: JURISDICTION_CODE, Length: 23585, dtype: float64

In [63]:
def validate_jurisdiction_code(data,column_name):
    data = data[data[column_name].between(0,2)]
    return data

In [64]:
data_df = validate_jurisdiction_code(data_df,'JURISDICTION_CODE')

In [82]:
def validate_location_desc(data,column_name):
    data[column_name].replace(np.nan, 'UNKNOWN', inplace=True)
    return data

In [83]:
data_df = validate_location_desc(data_df,'LOCATION_DESC')

In [91]:
def validate_perp(data,column_name):
    data[column_name].replace(np.nan, 'UNKNOWN', inplace=True)
    return data

In [92]:
data_df = validate_location_desc(data_df,'PERP_AGE_GROUP')

In [93]:
data_df = validate_location_desc(data_df,'PERP_SEX')

In [94]:
data_df = validate_location_desc(data_df,'PERP_RACE')

In [95]:
data_df

Unnamed: 0,INCIDENT_KEY,OCCUR_DATE,OCCUR_TIME,BORO,PRECINCT,JURISDICTION_CODE,LOCATION_DESC,STATISTICAL_MURDER_FLAG,PERP_AGE_GROUP,PERP_SEX,PERP_RACE,VIC_AGE_GROUP,VIC_SEX,VIC_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Lon_Lat
0,24050482,08/27/2006,05:35:00,BRONX,52,0.0,UNKNOWN,True,UNKNOWN,UNKNOWN,UNKNOWN,25-44,F,BLACK HISPANIC,1.017542e+06,255918.875000,40.869058,-73.879632,POINT (-73.87963173099996 40.86905819000003)
1,77673979,03/11/2011,12:03:00,QUEENS,106,0.0,UNKNOWN,False,UNKNOWN,UNKNOWN,UNKNOWN,65+,M,WHITE,1.027543e+06,186095.000000,40.677367,-73.843920,POINT (-73.84392019199998 40.677366895000034)
2,203350417,10/06/2019,01:09:00,BROOKLYN,77,0.0,UNKNOWN,False,UNKNOWN,UNKNOWN,UNKNOWN,18-24,F,BLACK,9.953250e+05,185155.000000,40.674886,-73.960075,POINT (-73.96007501899999 40.674885741000026)
3,80584527,09/04/2011,03:35:00,BRONX,40,0.0,UNKNOWN,False,UNKNOWN,UNKNOWN,UNKNOWN,<18,M,BLACK,1.007453e+06,233952.000000,40.808798,-73.916184,POINT (-73.91618413199996 40.80879780500004)
4,90843766,05/27/2013,21:16:00,QUEENS,100,0.0,UNKNOWN,False,UNKNOWN,UNKNOWN,UNKNOWN,18-24,M,BLACK,1.041267e+06,157133.515625,40.597796,-73.794686,POINT (-73.79468553799995 40.597796249000055)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23580,186329304,08/12/2018,19:50:00,BROOKLYN,84,0.0,UNKNOWN,False,25-44,M,BLACK,65+,M,BLACK,9.891547e+05,193164.265625,40.696875,-73.982314,POINT (-73.98231375199998 40.69687491500008)
23581,29277330,05/26/2007,04:57:00,BROOKLYN,81,0.0,UNKNOWN,False,UNKNOWN,UNKNOWN,UNKNOWN,25-44,M,BLACK,1.005218e+06,190528.000000,40.689615,-73.924393,POINT (-73.92439311199996 40.689615497000034)
23582,77443443,02/25/2011,01:12:00,BROOKLYN,81,0.0,MULTI DWELL - APT BUILD,False,18-24,M,BLACK,<18,M,BLACK,1.003999e+06,187611.031250,40.681612,-73.928798,POINT (-73.92879814299994 40.681611891000045)
23583,176027888,03/17/2018,00:46:00,BRONX,43,0.0,UNKNOWN,False,25-44,M,UNKNOWN,25-44,M,WHITE HISPANIC,1.018728e+06,242744.234375,40.832893,-73.875408,POINT (-73.87540796899998 40.83289318000004)


In [96]:
ds_full_jc = stream(data_df)
profiles = ds_full_jc.profile(default_profiler=DefaultColumnProfiler)
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
INCIDENT_KEY,23583,0,18562,0.787092,13.995405
OCCUR_DATE,23583,0,5054,0.214307,11.942653
OCCUR_TIME,23583,0,1401,0.059407,9.821714
BORO,23583,0,5,0.000212,1.976157
PRECINCT,23583,0,77,0.003265,5.574937
JURISDICTION_CODE,23583,0,3,0.000127,0.669795
LOCATION_DESC,23583,0,40,0.001696,2.099604
STATISTICAL_MURDER_FLAG,23583,0,2,8.5e-05,0.703174
PERP_AGE_GROUP,23583,0,9,0.000382,1.83766
PERP_SEX,23583,0,4,0.00017,1.331032


In [97]:
data_df.to_csv('filtered2.csv')