#### Import dependencies:

In [1]:
import sys #access to system parameters https://docs.python.org/3/library/sys.html
print("Python version: {}". format(sys.version))

import pandas as pd #collection of functions for data processing and analysis modeled after R dataframes with SQL like features
print("pandas version: {}". format(pd.__version__))

import matplotlib #collection of functions for scientific and publication-ready visualization
import matplotlib as plt
print("matplotlib version: {}". format(matplotlib.__version__))

import numpy as np #foundational package for scientific computing
print("NumPy version: {}". format(np.__version__))

import scipy as sp #collection of functions for scientific computing and advance mathematics
print("SciPy version: {}". format(sp.__version__)) 

import IPython
from IPython import display #pretty printing of dataframes in Jupyter notebook
print("IPython version: {}". format(IPython.__version__)) 

import sklearn #collection of machine learning algorithms
print("scikit-learn version: {}". format(sklearn.__version__))

#misc libraries
import random
import time
#ignore warnings
import warnings
warnings.filterwarnings('ignore')
print('-'*25)
import re
import os
import matplotlib.pyplot as plt
%matplotlib inline

Python version: 3.8.5 (default, Sep  4 2020, 02:22:02) 
[Clang 10.0.0 ]
pandas version: 1.1.3
matplotlib version: 3.3.2
NumPy version: 1.19.5
SciPy version: 1.5.2
IPython version: 7.19.0
scikit-learn version: 0.23.2
-------------------------


#### Download data file if it has not been downloaded already:

In [2]:
#url = 'https://data.cityofnewyork.us/api/views/qgea-i56i/rows.csv?accessType=DOWNLOAD'
fn_src = 'https://data.cityofnewyork.us/api/views/sv2w-rv3k/rows.csv?accessType=DOWNLOAD'
fn_dst = 'NYPD-Criminal-Court-Summons-Historic.csv'

# Standard-library module; provides the same `urllib.request` name that the
# Python 2/3 compatibility shim `six.moves.urllib` used to provide here.
import urllib.request

if os.path.isfile(fn_dst):
    print('File %s has already been downloaded' % fn_dst)
else:
    print('Fetching file %s[2.4GB]. This may take a while...' % fn_dst)
    # Download under a temporary name and rename only after the transfer has
    # finished. Otherwise an interrupted download would leave a truncated file
    # at fn_dst, and the isfile() check above would accept it on the next run.
    fn_tmp = fn_dst + '.part'
    try:
        urllib.request.urlretrieve(fn_src, fn_tmp)
        os.replace(fn_tmp, fn_dst)
    finally:
        # Only present here if the download or the rename failed.
        if os.path.exists(fn_tmp):
            os.remove(fn_tmp)
    print('File %s has been downloaded' % fn_dst)

File NYPD-Criminal-Court-Summons-Historic.csv has already been downloaded


#### Open a stream to the data file so we don't have to load the whole data set into main memory

In [3]:
from openclean.pipeline import stream
ds_full = stream(fn_dst)

In [4]:
print(f'The dataset contains {ds_full.count():,} rows.')

The dataset contains 5,336,264 rows.


In [5]:
ds_full.head()

Unnamed: 0,SUMMONS_KEY,SUMMONS_DATE,OFFENSE_DESCRIPTION,LAW_SECTION_NUMBER,LAW_DESCRIPTION,SUMMONS_CATEGORY_TYPE,AGE_GROUP,SEX,RACE,JURISDICTION_CODE,BORO,PRECINCT_OF_OCCUR,X_COORDINATE_CD,Y_COORDINATE_CD,Latitude,Longitude,Lon_Lat
0,192724412,01/24/2019,UNINSURED VEHICLE,319,VTL,VTL,UNKNOWN,,,0,NEW YORK,18,990060,217404,40.763406514000046,-73.97902801199997,POINT (-73.97902801199996 40.76340651400006)
1,193633599,02/15/2019,OTHER VTL,9999,VTL,VTL,UNKNOWN,,,0,BROOKLYN,78,991048,187513,40.68136223600004,-73.97549163099995,POINT (-73.97549163099997 40.68136223600004)
2,194790153,03/17/2019,MAKES UNREASONABLE NOISE,240.20(2),,,25-44,M,WHITE HISPANIC,0,BROOKLYN,72,984058,178998,40.65799306600008,-74.00069344599996,POINT (-74.00069344599996 40.657993066000074)
3,198874018,06/24/2019,OTHER SANITARY CODE,9999,ABC,OTHER ABC,UNKNOWN,,,0,QUEENS,109,1031014,217293,40.76298008300006,-73.83118971499994,POINT (-73.83118971499994 40.76298008300006)
4,191809027,01/02/2019,"MARIJUANA, POSSESSION OF",221.05,Penal Law,MARIJUANA,25-44,M,BLACK,0,BRONX,44,1003989,244111,40.83668979700008,-73.92866741799997,POINT (-73.92866741799997 40.836689797000076)
5,189573397,10/29/2018,RATES OF PEDICABS,20-260,,,25-44,M,WHITE,0,NEW YORK,14,986072,212412,40.74970647,-73.99342559,POINT (-73.99342559 40.74970647)
6,188834473,10/16/2018,OTHER TRAFFIC REG,9999,ABC,OTHER ABC,UNKNOWN,,,0,BROOKLYN,63,1005474,161744,40.61060908,-73.92356057,POINT (-73.92356057 40.61060908)
7,192166453,01/11/2019,EQUIPMENT,375,VTL,VTL,UNKNOWN,,,0,NEW YORK,1,982611,202413,40.72226170900007,-74.00591439799997,POINT (-74.00591439799996 40.722261709000065)
8,187391089,09/09/2018,PERMIT UNLICENSED,19-506(B),,,UNKNOWN,,,0,QUEENS,103,1040896,196247,40.70515635,-73.79569434,POINT (-73.79569434 40.70515635)
9,189477530,11/01/2018,RECKLESS DRIVING,1212,VTL,VTL,45-64,M,BLACK,2,BROOKLYN,77,1003508,185056,40.67459999,-73.93057482,POINT (-73.93057482 40.67459999)


#### Use openclean to profile the data set:

In [6]:
from openclean.profiling.column import DefaultColumnProfiler

#profiles = ds.profile(default_profiler=DefaultColumnProfiler)
profiles = ds_full.profile(default_profiler=DefaultColumnProfiler)

In [7]:
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
SUMMONS_KEY,5336264,0,5336248,0.999997,22.347392
SUMMONS_DATE,5336264,0,5479,0.001026748,12.132267
OFFENSE_DESCRIPTION,5336264,1833,18941,0.003550707,5.223593
LAW_SECTION_NUMBER,5336264,76877,6201,0.001179035,5.001091
LAW_DESCRIPTION,5336264,92140,27,5.14862e-06,2.213753
SUMMONS_CATEGORY_TYPE,5336264,97655,94,1.794369e-05,3.887649
AGE_GROUP,5336264,83920,222,4.226684e-05,1.885182
SEX,5336264,76814,4,7.605358e-07,0.755284
RACE,5336264,396655,9,1.822007e-06,0.568514
JURISDICTION_CODE,5336264,0,3,5.621911e-07,0.553216


In [8]:
# For every column, show the top entry of the profiler's 'distinct'
# data-type counter next to the column name.

print('Schema\n------')
for column_name in ds_full.columns:
    column_profile = profiles.column(column_name)
    distinct_type_counts = column_profile['datatypes']['distinct']
    dominant_type = distinct_type_counts.most_common(1)[0][0]
    print("  '{}' ({})".format(column_name, dominant_type))

Schema
------
  'SUMMONS_KEY' (int)
  'SUMMONS_DATE' (date)
  'OFFENSE_DESCRIPTION' (str)
  'LAW_SECTION_NUMBER' (str)
  'LAW_DESCRIPTION' (str)
  'SUMMONS_CATEGORY_TYPE' (str)
  'AGE_GROUP' (int)
  'SEX' (str)
  'RACE' (str)
  'JURISDICTION_CODE' (int)
  'BORO' (str)
  'PRECINCT_OF_OCCUR' (int)
  'X_COORDINATE_CD' (int)
  'Y_COORDINATE_CD' (int)
  'Latitude' (float)
  'Longitude' (float)
  'Lon_Lat' (str)


### Field Name & Description:
    'CMPLNT_NUM' (int)                 Complaint Number
    'CMPLNT_FR_DT' (date)              Complaint From Date
    'CMPLNT_FR_TM' (date)              Complaint From Time
    'CMPLNT_TO_DT' (date)              Complaint To Date
    'CMPLNT_TO_TM' (date)              Complaint To Time
    'ADDR_PCT_CD' (int)                Code of Precinct in which the Incident Occurred
    'RPT_DT' (date)                    Report Date
    'KY_CD' (int)                      "Key Code": Offense Classification Code (3 digits)
    'OFNS_DESC' (str)                  Offense Description
    'PD_CD' (int)                      PD Code of Offense. More granular than Key Code
    'PD_DESC' (str)                    PD Description of Offense.
    'CRM_ATPT_CPTD_CD' (str)           Whether Crime was Attempted or Completed (values: 'COMPLETED', 'ATTEMPTED')
    'LAW_CAT_CD' (str)                 Level of Offense (values: 'FELONY', 'VIOLATION', 'MISDEMEANOR')
    'BORO_NM' (str)                    Name of Borough in which Incident Occurred
    'LOC_OF_OCCUR_DESC' (str)          Description of where the incident occurred with respect to the premises
                                       (values:'FRONT OF', 'REAR OF', 'OUTSIDE', 'INSIDE', 'OPPOSITE OF')
    'PREM_TYP_DESC' (str)              Description of the type of premises in which the Incident Occurred
    'JURIS_DESC' (str)                 Description of Jurisdiction in which Incident Occurred
    'JURISDICTION_CODE' (int)          Jurisdiction Code
    'PARKS_NM' (str)                   Name of Park in which Incident Occurred, if Applicable
    'HADEVELOPT' (str)                 Name of NYCHA Housing Development in which Incident Occurred, if Applicable
    'HOUSING_PSA' (int)                Housing PSA
    'X_COORD_CD' (int)                 X-coordinate, New York State Plane Coordinate System
    'Y_COORD_CD' (int)                 Y-coordinate, New York State Plane Coordinate System
    'SUSP_AGE_GROUP' (int)             Age Group of Suspect
    'SUSP_RACE' (str)                  Race of Suspect
    'SUSP_SEX' (str)                   Sex of Suspect
    'TRANSIT_DISTRICT' (int)           Transit-District code
    'Latitude' (float)                 Global Latitude of Location where Incident Occurred
    'Longitude' (float)                Global Longitude of Location where Incident Occurred
    'Lat_Lon' (str)                    'Latitude' and 'Longitude' together
    'PATROL_BORO' (str)                Patrol Borough
    'STATION_NAME' (str)               Station Name
    'VIC_AGE_GROUP' (int)              Age Group of Victim
    'VIC_RACE' (str)                   Race of Victim
    'VIC_SEX' (str)                    Sex of Victim
    
    
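    (Note: some field descriptions were taken from https://www1.nyc.gov/assets/nypd/downloads/pdf/analysis_and_planning/incident_level_data_footnotes.pdf.
    Most field names listed above (e.g. 'CMPLNT_NUM', 'OFNS_DESC', 'BORO_NM') do not match the column names in the schema printed earlier
    (e.g. 'SUMMONS_KEY', 'OFFENSE_DESCRIPTION', 'BORO'); they appear to describe a different NYPD dataset and should be reconciled with the summons columns.)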
   

## Profiling & Cleaning of each field in the data set

In [9]:
profiles.minmax('SUMMONS_KEY')

Unnamed: 0,min,max
int,5542337,222637918


In [10]:
profiles.minmax('SUMMONS_DATE')

Unnamed: 0,min,max
date,2006-01-01,2020-12-31


In [11]:
from datetime import datetime
#datetime.strptime('2014-12-04', '%Y-%m-%d').date()


# Placeholder returned for missing, malformed or out-of-range dates.
_SUMMONS_DATE_PLACEHOLDER = datetime(1000, 1, 1).date()
# Inclusive range of accepted summons dates (matches the min/max reported by
# the column profile above).
_SUMMONS_DATE_MIN = datetime(2006, 1, 1).date()
_SUMMONS_DATE_MAX = datetime(2020, 12, 31).date()


def validate_date(sourceValue):
    """Parse an 'MM/DD/YYYY' string and check it against the accepted range.

    Parameters
    ----------
    sourceValue : str
        Raw cell value, expected in '%m/%d/%Y' format.

    Returns
    -------
    datetime.date
        The parsed date when it lies between 2006-01-01 and 2020-12-31
        (inclusive). Otherwise the placeholder date 1000-01-01 is returned;
        this covers empty strings, non-string values, unparseable text and
        out-of-range dates.
    """
    try:
        source_date = datetime.strptime(sourceValue, '%m/%d/%Y').date()
    except (TypeError, ValueError):
        # TypeError: value is not a string (e.g. None).
        # ValueError: empty string or text that does not match the format.
        return _SUMMONS_DATE_PLACEHOLDER

    if _SUMMONS_DATE_MIN <= source_date <= _SUMMONS_DATE_MAX:
        return source_date
    return _SUMMONS_DATE_PLACEHOLDER
    

ds_full = ds_full.update('SUMMONS_DATE', validate_date)

In [12]:
profiles.minmax('PRECINCT_OF_OCCUR')

Unnamed: 0,min,max
int,1,868


In [13]:
profiles.column('PRECINCT_OF_OCCUR').get('topValues')

[('40', 190864),
 ('75', 189021),
 ('73', 145704),
 ('79', 145004),
 ('115', 144382),
 ('90', 121689),
 ('52', 121206),
 ('44', 120065),
 ('14', 117273),
 ('34', 117101)]

In [14]:
profiles.minmax('LAW_SECTION_NUMBER')

Unnamed: 0,min,max
int,-20461,62320465
str,*,tax stam
float,1.03,4.04e+07
date,0104-02-11 00:00:00,4012-11-11 00:00:00


In [15]:
profiles.column('LAW_SECTION_NUMBER').get('topValues')

[('10-125', 1397991),
 ('9999', 441493),
 ('1-03', 233137),
 ('19-176', 214188),
 ('240.205A', 208198),
 ('221.05', 190464),
 ('140.05', 158032),
 ('CFR 49', 154846),
 ('153.09', 151386),
 ('240.20', 148692)]

In [16]:
profiles.column('OFFENSE_DESCRIPTION').get('topValues')

[('CONSUMPTION OF ALCOHOL', 1402056),
 ('URINATING IN PUBLIC', 294351),
 ('FAILURE TO COMPLY WITH SIGN', 259395),
 ('BICYCLE ON SIDEWALK', 219527),
 ('DISCON: OBSTRUCT PEDESTRIAN', 210327),
 ('DISORDERLY CONDUCT', 199382),
 ('OTHER ADMIN CODE', 187287),
 ('FEDERAL MOTOR VEH. SAFETY REG', 172954),
 ('MARIJUANA, POSSESSION OF', 167053),
 ('TRESPASS', 158354)]

In [17]:
ds_full.distinct('OFFENSE_DESCRIPTION')

Counter({'UNINSURED VEHICLE': 8249,
         'OTHER VTL': 43454,
         'MAKES UNREASONABLE NOISE': 985,
         'OTHER SANITARY CODE': 582,
         'MARIJUANA, POSSESSION OF': 167053,
         'RATES OF PEDICABS': 38,
         'OTHER TRAFFIC REG': 17740,
         'EQUIPMENT': 7084,
         'PERMIT UNLICENSED': 414,
         'RECKLESS DRIVING': 130913,
         'DISCON: OBSCENE LANGUAGE/GESTURE': 32339,
         'UNREGISTERED VEHICLE': 12198,
         'TRESPASS': 158354,
         'SALE TO A MINOR': 56,
         'IMPROPER ENTRANCE/EXIT': 722,
         'OTHER ABC': 26291,
         'HIGHWAY USE TAX (NYS)': 4177,
         'PUBLIC URINATION': 1926,
         'DEFECTIVE REFLECTOR': 290,
         'DISORDERLY PREMISE': 5291,
         'SMOKING IN TRAIN STATION': 2804,
         'OTHER TRANSIT REG': 4613,
         'REVOKED REGISTRATION': 12050,
         'OVERWEIGHT (VEH)': 999,
         'IMPROPER PLATES': 1554,
         'NON PAYMENT OF FARE (BUS)': 5255,
         'NON PAYMENT OF FARE (OTHER)'

In [20]:
# Print the most frequent values in column 'SUMMONS_CATEGORY_TYPE'

profiles.column('SUMMONS_CATEGORY_TYPE').get('topValues')

[('ALCOHOL', 1418360),
 ('DISORDERLY CONDUCT', 817343),
 ('PARK REGS', 310657),
 ('URINATING', 294578),
 ('VTL', 258055),
 ('BIKE', 254775),
 ('NYS TRANS', 227766),
 ('TLC', 220529),
 ('OTHER ADMIN CODE', 214518),
 ('MARIJUANA', 190431)]

In [21]:
ds_full.distinct('SUMMONS_CATEGORY_TYPE')

Counter({'VTL': 258055,
         '': 97655,
         'OTHER ABC': 34071,
         'MARIJUANA': 190431,
         'DISORDERLY CONDUCT': 817343,
         'TRESPASS': 157997,
         'ABC': 62956,
         'TLC': 220529,
         'TRAFFIC REGS': 12700,
         'SPITTING': 24760,
         'DOG': 52331,
         'LITTERNG': 28303,
         'KNIFE': 35496,
         'BIKE': 254775,
         'NYS TRANS': 227766,
         'ALCOHOL': 1418360,
         'PARK REGS': 310657,
         'OTHER ADMIN CODE': 214518,
         'FIREWORKS': 9482,
         'VENDING': 166964,
         'EXPOSURE': 9692,
         'KEY': 20725,
         'OPEN FIRE': 2826,
         'OTHER PENAL LAW': 13125,
         'NOISE': 133429,
         'AIR GUN': 1555,
         'LITTERING': 65832,
         'BUSINESS': 6733,
         'PANHANDLING': 6515,
         'URINATING': 294578,
         'HARASSMENT 2': 4658,
         'TOW TRUCK': 3044,
         'GAMBLING': 16154,
         'OTHER PARK REG': 31109,
         'POSTING ADVERTISEMENTS': 46

In [24]:
ds_full.distinct('LAW_DESCRIPTION')

Counter({'VTL': 256122,
         'UNKNOWN': 92140,
         'ABC': 97002,
         'Penal Law': 1224278,
         'Tax Law': 26632,
         'Administrative Code': 2648518,
         'Traffic Regs': 29206,
         'Health Code': 289392,
         'Park Regs': 369987,
         'NYS Transportation': 227276,
         'General Business Law': 16850,
         'NYS Workers Comp': 445,
         'Sanitary Code (NYS)': 1239,
         'TLC': 7129,
         'ACA': 5325,
         'Transit Regs': 11715,
         'Multiple Dwelling': 1106,
         'Education Law (NYS)': 2066,
         'Other': 22313,
         'Navigation Law': 5386,
         'Agriculture & Markets Law': 371,
         'Environmental Conservation Law': 310,
         'OTHER PARK REGS': 682,
         'DOG: UNLICENSED': 427,
         'BICYCLE NO BELL': 8,
         'Health Laws': 223,
         'ABC Laws': 99,
         'Traffic Rules': 17})

In [23]:
# Labels accepted unchanged by validate_law_cat_cd. Built once instead of on
# every call, since the function runs once per row of the stream.
# NOTE(review): 'OTHER PARK REGS', 'DOG: UNLICENSED' and 'BICYCLE NO BELL'
# look like offense descriptions rather than law names, and 'Health Laws' /
# 'ABC Laws' / 'Traffic Rules' look like variants of other entries. They are
# kept because the original list accepted them; confirm whether they should
# be remapped.
_KNOWN_LAW_DESCRIPTIONS = frozenset([
    'VTL',
    'ABC',
    'Penal Law',
    'Tax Law',
    'Administrative Code',
    'Traffic Regs',
    'Health Code',
    'Park Regs',
    'NYS Transportation',
    'General Business Law',
    'NYS Workers Comp',
    'Sanitary Code (NYS)',
    'TLC',
    'ACA',
    'Transit Regs',
    'Multiple Dwelling',
    'Education Law (NYS)',
    'Other',
    'Navigation Law',
    'Agriculture & Markets Law',
    'Environmental Conservation Law',
    'OTHER PARK REGS',
    'DOG: UNLICENSED',
    'BICYCLE NO BELL',
    'Health Laws',
    'ABC Laws',
    'Traffic Rules',
])


def validate_law_cat_cd(sourceValue):
    """Return the value if it is a known law description, else "UNKNOWN".

    Despite its name, this function is applied to the 'LAW_DESCRIPTION'
    column. Matching is exact and case-sensitive.

    Parameters
    ----------
    sourceValue : str
        Raw cell value.

    Returns
    -------
    str
        ``sourceValue`` when it is one of the accepted labels; "UNKNOWN" for
        anything else, including empty strings and non-string values.
    """
    UNKNOWN = "UNKNOWN"

    try:
        return sourceValue if sourceValue in _KNOWN_LAW_DESCRIPTIONS else UNKNOWN
    except TypeError:
        # Unhashable input (e.g. a list) cannot be one of the labels.
        return UNKNOWN

    
ds_full = ds_full.update('LAW_DESCRIPTION', validate_law_cat_cd)

In [25]:
ds_full.distinct('BORO')

Counter({'NEW YORK': 51983,
         'BROOKLYN': 1414313,
         'QUEENS': 879927,
         'BRONX': 1047302,
         'STATEN ISLAND': 146364,
         '': 564609,
         'MANHATTAN': 1231766})

In [26]:
# Accepted borough labels mapped to the label that is written out.
# 'NEW YORK' is the county name for Manhattan; the raw data uses both
# spellings, so they are merged into a single label here.
_BORO_CANONICAL = {
    "BRONX": "BRONX",
    "BROOKLYN": "BROOKLYN",
    "MANHATTAN": "MANHATTAN",
    "QUEENS": "QUEENS",
    "STATEN ISLAND": "STATEN ISLAND",
    "NEW YORK": "MANHATTAN",
}


def validate_boro_nm(sourceValue):
    """Normalise a borough name.

    Parameters
    ----------
    sourceValue : str
        Raw cell value. Matching is exact and case-sensitive.

    Returns
    -------
    str
        One of "BRONX", "BROOKLYN", "MANHATTAN", "QUEENS", "STATEN ISLAND"
        for a recognised input ("NEW YORK" is returned as "MANHATTAN");
        "UNKNOWN" for anything else, including empty strings and non-string
        values.
    """
    UNKNOWN = "UNKNOWN"

    try:
        return _BORO_CANONICAL.get(sourceValue, UNKNOWN)
    except TypeError:
        # Unhashable input (e.g. a list) cannot be a borough label.
        return UNKNOWN

    
ds_full = ds_full.update('BORO', validate_boro_nm)

In [27]:
profiles.column('JURISDICTION_CODE').get('topValues')

[('0', 4753370), ('2', 516972), ('1', 65922)]

In [28]:
ds_full.distinct('JURISDICTION_CODE')

Counter({'0': 4753370, '2': 516972, '1': 65922})

In [29]:
profiles.minmax('X_COORDINATE_CD')

Unnamed: 0,min,max
int,913367,1067226


In [30]:
profiles.minmax('Y_COORDINATE_CD')

Unnamed: 0,min,max
int,121131,272932


In [31]:
profiles.column('AGE_GROUP').get('topValues')

[('25-44', 2422557),
 ('18-24', 1318103),
 ('45-64', 1110025),
 ('<18', 218912),
 ('UNKNOWN', 109312),
 ('65+', 72618),
 ('819', 21),
 ('818', 19),
 ('933', 17),
 ('822', 17)]

In [34]:
ds_full.distinct('AGE_GROUP')

Counter({'UNKNOWN': 194049,
         '25-44': 2422557,
         '45-64': 1110025,
         '18-24': 1318103,
         '<18': 218912,
         '65+': 72618})

In [33]:
# Age-group labels accepted unchanged. Built once instead of on every call,
# since the function runs once per row of the stream.
_KNOWN_AGE_GROUPS = frozenset(["25-44", "18-24", "45-64", "<18", "65+", "UNKNOWN"])


def validate_age_group(sourceValue):
    """Return the value if it is a known age-group label, else "UNKNOWN".

    Stray values such as the numeric codes seen in the column profile
    (e.g. '819', '933') and empty strings are mapped to "UNKNOWN".

    Parameters
    ----------
    sourceValue : str
        Raw cell value. Matching is exact.

    Returns
    -------
    str
        ``sourceValue`` when it is one of "25-44", "18-24", "45-64", "<18",
        "65+" or "UNKNOWN"; otherwise "UNKNOWN".
    """
    UNKNOWN = "UNKNOWN"

    try:
        return sourceValue if sourceValue in _KNOWN_AGE_GROUPS else UNKNOWN
    except TypeError:
        # Unhashable input (e.g. a list) cannot be one of the labels.
        return UNKNOWN

    
ds_full = ds_full.update('AGE_GROUP', validate_age_group)

In [35]:
profiles.column('RACE').get('topValues')

[('UNKNOWN', 4542966),
 ('BLACK', 179554),
 ('WHITE HISPANIC', 101554),
 ('WHITE', 52804),
 ('BLACK HISPANIC', 33313),
 ('ASIAN / PACIFIC ISLANDER', 23993),
 ('AMERICAN INDIAN/ALASKAN NATIVE', 2446),
 ('OTHER', 1948),
 ('HISPANIC', 1031)]

In [38]:
ds_full.distinct('RACE')

Counter({'UNKNOWN': 4940652,
         'WHITE HISPANIC': 101554,
         'BLACK': 179554,
         'WHITE': 52804,
         'OTHER': 1948,
         'BLACK HISPANIC': 33313,
         'ASIAN / PACIFIC ISLANDER': 23993,
         'AMERICAN INDIAN/ALASKAN NATIVE': 2446})

In [37]:
# Race labels accepted unchanged. Built once instead of on every call, since
# the function runs once per row of the stream.
_KNOWN_RACES = frozenset([
    "UNKNOWN",
    "WHITE HISPANIC",
    "BLACK",
    "BLACK HISPANIC",
    "WHITE",
    "ASIAN / PACIFIC ISLANDER",
    "AMERICAN INDIAN/ALASKAN NATIVE",
    "OTHER",
])


def validate_race(sourceValue):
    """Return the value if it is a known race label, else "UNKNOWN".

    Parameters
    ----------
    sourceValue : str
        Raw cell value. Matching is exact and case-sensitive.

    Returns
    -------
    str
        ``sourceValue`` when it is one of the accepted labels; "UNKNOWN" for
        anything else (e.g. 'HISPANIC', which is not in the accepted list),
        including empty strings and non-string values.
    """
    UNKNOWN = "UNKNOWN"

    try:
        return sourceValue if sourceValue in _KNOWN_RACES else UNKNOWN
    except TypeError:
        # Unhashable input (e.g. a list) cannot be one of the labels.
        return UNKNOWN

    
ds_full = ds_full.update('RACE', validate_race)

In [39]:
profiles.column('SEX').get('topValues')

[('M', 4472662), ('F', 611055), ('D', 110248), ('U', 65485)]

In [42]:
ds_full.distinct('SEX')

Counter({'U': 142299, 'M': 4472662, 'F': 611055, 'D': 110248})

In [41]:
# Sex codes accepted unchanged. Built once instead of on every call, since
# the function runs once per row of the stream.
_KNOWN_SEX_CODES = frozenset(["M", "F", "U", "D"])


def validate_sex(sourceValue):
    """Return the value if it is a known sex code, else "U".

    Parameters
    ----------
    sourceValue : str
        Raw cell value. Matching is exact and case-sensitive.

    Returns
    -------
    str
        ``sourceValue`` when it is "M", "F", "U" or "D"; "U" for anything
        else, including empty strings and non-string values.
    """
    UNKNOWN = "U"

    try:
        return sourceValue if sourceValue in _KNOWN_SEX_CODES else UNKNOWN
    except TypeError:
        # Unhashable input (e.g. a list) cannot be one of the codes.
        return UNKNOWN

    
ds_full = ds_full.update('SEX', validate_sex)

In [43]:
profiles.minmax('Latitude')

Unnamed: 0,min,max
float,40.498905,40.915782


In [44]:
# Inclusive latitude bounds (decimal degrees) accepted by validate_lat.
_LAT_MIN = 40.0
_LAT_MAX = 41.0


def validate_lat(sourceValue):
    """Check that a latitude lies between 40 and 41 degrees (inclusive).

    The value is converted with ``float()`` before the comparison so that
    numeric text read from the CSV (e.g. '40.7634') is validated as a number.
    Comparing the raw string with a number raises ``TypeError``, which
    previously sent every string value to the placeholder.

    Parameters
    ----------
    sourceValue : str or float
        Raw cell value.

    Returns
    -------
    object
        ``sourceValue`` unchanged when it is a number within range; -999 for
        empty, non-numeric, NaN or out-of-range values.
    """
    UNKNOWN = -999

    try:
        latitude = float(sourceValue)
    except (TypeError, ValueError):
        # TypeError: None or another non-numeric object.
        # ValueError: empty string or non-numeric text.
        return UNKNOWN

    # NaN fails both comparisons and therefore falls through to UNKNOWN.
    if _LAT_MIN <= latitude <= _LAT_MAX:
        return sourceValue
    return UNKNOWN

    
ds_full = ds_full.update('Latitude', validate_lat)

In [45]:
profiles.minmax('Longitude')

Unnamed: 0,min,max
float,-74.254903,-73.700577


In [46]:
# Inclusive longitude bounds (decimal degrees) accepted by validate_long.
# The column profile reports values from -74.254903 to -73.700577, so the
# lower bound must reach below -74; a whole-degree box is used, in the same
# spirit as the 40..41 latitude check.
_LONG_MIN = -75.0
_LONG_MAX = -73.0


def validate_long(sourceValue):
    """Check that a longitude lies between -75 and -73 degrees (inclusive).

    The value is converted with ``float()`` before the comparison so that
    numeric text read from the CSV (e.g. '-73.9790') is validated as a
    number. Comparing the raw string with a number raises ``TypeError``,
    which previously sent every string value to the placeholder.

    Parameters
    ----------
    sourceValue : str or float
        Raw cell value.

    Returns
    -------
    object
        ``sourceValue`` unchanged when it is a number within range; -999 for
        empty, non-numeric, NaN or out-of-range values.
    """
    UNKNOWN = -999

    try:
        longitude = float(sourceValue)
    except (TypeError, ValueError):
        # TypeError: None or another non-numeric object.
        # ValueError: empty string or non-numeric text.
        return UNKNOWN

    # NaN fails both comparisons and therefore falls through to UNKNOWN.
    if _LONG_MIN <= longitude <= _LONG_MAX:
        return sourceValue
    return UNKNOWN

    
# Apply the longitude check to the 'Longitude' column. It was previously
# attached to 'Latitude', which left 'Longitude' unvalidated and ran the
# longitude range test against latitude values.
ds_full = ds_full.update('Longitude', validate_long)

In [47]:
profiles.minmax('Lon_Lat')

Unnamed: 0,min,max
str,POINT (-73.70057651099997 40.73903832800004),POINT (-74.25490278599993 40.50771206300004)


In [48]:
# Column names handed to ds_full.select() below. Line breaks inside the
# brackets need no backslash continuation.
SELECTED_COLUMNS = [
    # summons identification and offense details
    'SUMMONS_KEY',
    'SUMMONS_DATE',
    'OFFENSE_DESCRIPTION',
    'LAW_SECTION_NUMBER',
    'LAW_DESCRIPTION',
    'SUMMONS_CATEGORY_TYPE',
    # demographics
    'AGE_GROUP',
    'SEX',
    'RACE',
    # jurisdiction and location
    'JURISDICTION_CODE',
    'BORO',
    'PRECINCT_OF_OCCUR',
    'X_COORDINATE_CD',
    'Y_COORDINATE_CD',
    'Latitude',
    'Longitude',
    'Lon_Lat',
]

ds_full = ds_full.select(SELECTED_COLUMNS)

In [49]:
ds_full

<openclean.pipeline.DataPipeline at 0x7ffd7b340f40>

# VALIDATING DATA AFTER CLEANING

In [50]:
# Load the CSV into a pandas DataFrame.
# NOTE(review): this path is the same file name as fn_dst, i.e. the downloaded
# source file. The validate_* updates and the column selection were attached
# to the ds_full pipeline and do not appear to be applied to data_df; confirm
# whether the cleaned pipeline output was meant to be loaded here instead.
data_df = pd.read_csv("NYPD-Criminal-Court-Summons-Historic.csv")

In [53]:
data_df = data_df.dropna()

In [54]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4246146 entries, 4 to 5336262
Data columns (total 17 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   SUMMONS_KEY            int64  
 1   SUMMONS_DATE           object 
 2   OFFENSE_DESCRIPTION    object 
 3   LAW_SECTION_NUMBER     object 
 4   LAW_DESCRIPTION        object 
 5   SUMMONS_CATEGORY_TYPE  object 
 6   AGE_GROUP              object 
 7   SEX                    object 
 8   RACE                   object 
 9   JURISDICTION_CODE      int64  
 10  BORO                   object 
 11  PRECINCT_OF_OCCUR      int64  
 12  X_COORDINATE_CD        float64
 13  Y_COORDINATE_CD        float64
 14  Latitude               float64
 15  Longitude              float64
 16  Lon_Lat                object 
dtypes: float64(4), int64(3), object(10)
memory usage: 583.1+ MB


In [55]:
ds_full_jc = stream(data_df)
profiles = ds_full_jc.profile(default_profiler=DefaultColumnProfiler)
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
SUMMONS_KEY,4246146,0,4246141,0.9999988,22.01772
SUMMONS_DATE,4246146,0,5074,0.001194966,11.804854
OFFENSE_DESCRIPTION,4246146,0,291,6.853274e-05,4.873426
LAW_SECTION_NUMBER,4246146,0,248,5.840591e-05,4.702379
LAW_DESCRIPTION,4246146,0,24,5.652184e-06,2.097447
SUMMONS_CATEGORY_TYPE,4246146,0,66,1.554351e-05,3.803605
AGE_GROUP,4246146,0,213,5.016314e-05,1.829645
SEX,4246146,0,4,9.420307e-07,0.62291
RACE,4246146,0,8,1.884061e-06,0.531446
JURISDICTION_CODE,4246146,0,3,7.06523e-07,0.588972


In [56]:
# Write data_df to 'filtered4.csv' in the working directory. With pandas'
# default settings the DataFrame index is written as an extra first column.
data_df.to_csv('filtered4.csv')