#### Import dependencies:

In [1]:
import sys #access to system parameters https://docs.python.org/3/library/sys.html
print("Python version: {}". format(sys.version))

import pandas as pd #collection of functions for data processing and analysis modeled after R dataframes with SQL like features
print("pandas version: {}". format(pd.__version__))

import matplotlib #collection of functions for scientific and publication-ready visualization
import matplotlib as plt
print("matplotlib version: {}". format(matplotlib.__version__))

import numpy as np #foundational package for scientific computing
print("NumPy version: {}". format(np.__version__))

import scipy as sp #collection of functions for scientific computing and advance mathematics
print("SciPy version: {}". format(sp.__version__)) 

import IPython
from IPython import display #pretty printing of dataframes in Jupyter notebook
print("IPython version: {}". format(IPython.__version__)) 

import sklearn #collection of machine learning algorithms
print("scikit-learn version: {}". format(sklearn.__version__))

#misc libraries
import random
import time
#ignore warnings
import warnings
warnings.filterwarnings('ignore')
print('-'*25)
import re
import os
import matplotlib.pyplot as plt
%matplotlib inline

Python version: 3.8.5 (default, Sep  4 2020, 02:22:02) 
[Clang 10.0.0 ]
pandas version: 1.1.3
matplotlib version: 3.3.2
NumPy version: 1.19.5
SciPy version: 1.5.2
IPython version: 7.19.0
scikit-learn version: 0.23.2
-------------------------


#### Download data file if it has not been downloaded already:

In [2]:
#url = 'https://data.cityofnewyork.us/api/views/qgea-i56i/rows.csv?accessType=DOWNLOAD'
fn_src = 'https://data.cityofnewyork.us/api/views/mv4k-y93f/rows.csv?accessType=DOWNLOAD'
fn_dst = 'NYPD-YTD-Criminal-Summons-Summary-Dashboard-2.csv'

from six.moves import urllib

# Skip the download when the destination file is already present.
if os.path.isfile(fn_dst):
    print('File %s has already been downloaded' % fn_dst)
else:
    print('Fetching file %s[2.4GB]. This may take a while...' % fn_dst)
    # Download under a temporary name and rename only on success, so that an
    # interrupted transfer never leaves a truncated file at fn_dst that the
    # isfile() check above would later mistake for a complete download.
    fn_tmp = fn_dst + '.part'
    try:
        urllib.request.urlretrieve(fn_src, fn_tmp)
        os.replace(fn_tmp, fn_dst)
    finally:
        # After a successful rename this is a no-op; after a failure it
        # removes the partial file.
        if os.path.exists(fn_tmp):
            os.remove(fn_tmp)
    print('File %s has been downloaded' % fn_dst)

File NYPD-YTD-Criminal-Summons-Summary-Dashboard-2.csv has already been downloaded


#### Open a stream to the data file so we don't have to load the whole data set into main memory

In [3]:
from openclean.pipeline import stream
ds_full = stream(fn_dst)

In [4]:
print(f'The dataset contains {ds_full.count():,} rows.')

The dataset contains 35,297 rows.


In [5]:
ds_full.head()

Unnamed: 0,SUMMONS_KEY,SUMMONS_DATE,OFFENSE_DESCRIPTION,LAW_SECTION_NUMBER,LAW_DESCRIPTION,SUMMONS_CATEGORY_TYPE,AGE_GROUP,SEX,RACE,JURISDICTION_CODE,BORO,PRECINCT_OF_OCCUR,X_COORDINATE_CD,Y_COORDINATE_CD,Latitude,Longitude,New Georeferenced Column
0,234450913,09/30/2021,KNIVES; PUBLIC POSSESSION (BLADE 4 INCHES OR M...,10-133(B),,,25-44,M,BLACK,2,BRONX,40,,,,,
1,234279627,09/30/2021,EQUIPMENT,375,VTL,VTL,UNKNOWN,,,0,BROOKLYN,76,1029932.0,214337.0,40.75487228900005,-73.835115665,POINT (-73.835115665 40.75487228900005)
2,234275352,09/30/2021,EQUIPMENT,375,VTL,VTL,UNKNOWN,,,0,BROOKLYN,71,,,,,
3,234217681,09/29/2021,EQUIPMENT,375,VTL,VTL,UNKNOWN,,,0,BROOKLYN,66,982428.0,187606.0,40.68161992100005,-74.00657058999997,POINT (-74.00657058999997 40.68161992100005)
4,234260308,09/29/2021,RECKLESS DRIVING,1212,VTL,VTL,45-64,M,BLACK,2,BRONX,43,984014.0,185556.0,40.67599331900004,-74.00085226,POINT (-74.00085226 40.67599331900004)
5,234229955,09/28/2021,POSS. OF KNIFE,10-133,Administrative Code,KNIFE,25-44,M,BLACK,2,BRONX,42,,,,,
6,234083252,09/25/2021,RECKLESS DRIVING,1212,VTL,VTL,65+,M,BLACK,0,BROOKLYN,67,,,,,
7,233878429,09/21/2021,ALCOHOLIC BEVERAGE IN PUBLIC,10-125(2B),,,25-44,M,WHITE HISPANIC,0,BROOKLYN,72,,,,,
8,233817343,09/20/2021,MOTOR VEHICLE; ENGINE ON/KEY IN IGNITION (THER...,10-111(A),,,25-44,F,BLACK,0,QUEENS,105,,,,,
9,233755125,09/18/2021,UNLICENSED SALE OF ALCOHOL,100-1,ABC,ABC,25-44,F,BLACK,0,BROOKLYN,73,1016542.0,182634.0,40.66791419900005,-73.88359786099994,POINT (-73.88359786099994 40.66791419900005)


#### Use openclean to profile the data set:

In [6]:
from openclean.profiling.column import DefaultColumnProfiler

#profiles = ds.profile(default_profiler=DefaultColumnProfiler)
profiles = ds_full.profile(default_profiler=DefaultColumnProfiler)

In [7]:
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
SUMMONS_KEY,35297,0,35294,0.999915,15.107088
SUMMONS_DATE,35297,0,273,0.007734,7.953969
OFFENSE_DESCRIPTION,35297,0,388,0.010992,4.259884
LAW_SECTION_NUMBER,35297,0,347,0.009831,4.177027
LAW_DESCRIPTION,35297,8936,14,0.000531,1.981003
SUMMONS_CATEGORY_TYPE,35297,8936,44,0.001669,2.45958
AGE_GROUP,35297,0,6,0.00017,1.83839
SEX,35297,17436,3,0.000168,0.530843
RACE,35297,17437,8,0.000448,2.04068
JURISDICTION_CODE,35297,0,3,8.5e-05,0.476623


In [8]:
# Print the most frequent data type for each column.
# For every column of the stream, look up its profile and report the first
# entry returned by most_common(1) on the profile's distinct-datatype counts.

print('Schema\n------')
for col in ds_full.columns:
    p = profiles.column(col)  # profiling result for this column
    print("  '{}' ({})".format(col, p['datatypes']['distinct'].most_common(1)[0][0]))

Schema
------
  'SUMMONS_KEY' (int)
  'SUMMONS_DATE' (date)
  'OFFENSE_DESCRIPTION' (str)
  'LAW_SECTION_NUMBER' (str)
  'LAW_DESCRIPTION' (str)
  'SUMMONS_CATEGORY_TYPE' (str)
  'AGE_GROUP' (str)
  'SEX' (str)
  'RACE' (str)
  'JURISDICTION_CODE' (int)
  'BORO' (str)
  'PRECINCT_OF_OCCUR' (int)
  'X_COORDINATE_CD' (int)
  'Y_COORDINATE_CD' (int)
  'Latitude' (float)
  'Longitude' (float)
  'New Georeferenced Column' (str)


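### Field Name & Description:

Note: the field names listed below (`CMPLNT_NUM`, `OFNS_DESC`, ...) come from the NYPD complaint data dictionary and differ from the summons columns shown in the schema above (`SUMMONS_KEY`, `OFFENSE_DESCRIPTION`, ...).
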
    'CMPLNT_NUM' (int)                 Complaint Number
    'CMPLNT_FR_DT' (date)              Complaint From Date
    'CMPLNT_FR_TM' (date)              Complaint From Time
    'CMPLNT_TO_DT' (date)              Complaint To Date
    'CMPLNT_TO_TM' (date)              Complaint To Time
    'ADDR_PCT_CD' (int)                Code of Precinct in which the Incident Occurred
    'RPT_DT' (date)                    Report Date
    'KY_CD' (int)                      "Key Code": Offense Classification Code (3 digits)
    'OFNS_DESC' (str)                  Offense Description
    'PD_CD' (int)                      PD Code of Offense. More granular than Key Code
    'PD_DESC' (str)                    PD Description of Offense.
    'CRM_ATPT_CPTD_CD' (str)           Whether Crime was Attempted or Completed (values: 'COMPLETED', 'ATTEMPTED')
    'LAW_CAT_CD' (str)                 Level of Offense (values: 'FELONY', 'VIOLATION', 'MISDEMEANOR')
    'BORO_NM' (str)                    Name of Borough in which Incident Occurred
    'LOC_OF_OCCUR_DESC' (str)          Description of where the incident occurred with respect to the premises
                                       (values:'FRONT OF', 'REAR OF', 'OUTSIDE', 'INSIDE', 'OPPOSITE OF')
    'PREM_TYP_DESC' (str)              Description of the type of premises in which the Incident Occurred
    'JURIS_DESC' (str)                 Description of Jurisdiction in which Incident Occurred
    'JURISDICTION_CODE' (int)          Jurisdiction Code
    'PARKS_NM' (str)                   Name of Park in which Incident Occurred, if Applicable
    'HADEVELOPT' (str)                 Name of NYCHA Housing Development in which Incident Occurred, if Applicable
    'HOUSING_PSA' (int)                Housing PSA
    'X_COORD_CD' (int)                 X-coordinate, New York State Plane Coordinate System
    'Y_COORD_CD' (int)                 Y-coordinate, New York State Plane Coordinate System
    'SUSP_AGE_GROUP' (int)             Age Group of Suspect
    'SUSP_RACE' (str)                  Race of Suspect
    'SUSP_SEX' (str)                   Sex of Suspect
    'TRANSIT_DISTRICT' (int)           Transit-District code
    'Latitude' (float)                 Global Latitude of Location where Incident Occurred
    'Longitude' (float)                Global Longitude of Location where Incident Occurred
    'Lat_Lon' (str)                    'Latitude' and 'Longitude' together
    'PATROL_BORO' (str)                Patrol Borough
    'STATION_NAME' (str)               Station Name
    'VIC_AGE_GROUP' (int)              Age Group of Victim
    'VIC_RACE' (str)                   Race of Victim
    'VIC_SEX' (str)                    Sex of Victim
    
    
    (Note: some field descriptions were taken from https://www1.nyc.gov/assets/nypd/downloads/pdf/analysis_and_planning/incident_level_data_footnotes.pdf)
   

## Profiling & Cleaning of each field in the data set

In [9]:
profiles.minmax('SUMMONS_KEY')

Unnamed: 0,min,max
int,6754670,234590979


In [10]:
profiles.minmax('SUMMONS_DATE')

Unnamed: 0,min,max
date,2021-01-01,2021-09-30


In [11]:
from datetime import datetime
#datetime.strptime('2014-12-04', '%Y-%m-%d').date()


def validate_date(sourceValue):
    """Parse a summons date and keep it only if it lies in the expected window.

    Parameters
    ----------
    sourceValue : str
        Raw date text in ``MM/DD/YYYY`` form.

    Returns
    -------
    datetime.date
        The parsed date when it falls between 2021-01-01 and 2021-09-30
        (both inclusive). Otherwise the sentinel date 1000-01-01, which marks
        the value as missing, malformed, or out of range.
    """
    dummy_date = datetime(1000, 1, 1).date()
    start_date = datetime(2021, 1, 1).date()
    # September has 30 days. The previous end date "09/31/2021" could not be
    # parsed, which made every call fall into the error path and return the
    # sentinel regardless of the input.
    end_date = datetime(2021, 9, 30).date()

    try:
        source_date = datetime.strptime(sourceValue, '%m/%d/%Y').date()
    except (TypeError, ValueError):
        # Empty strings, unparseable text and non-string inputs land here.
        return dummy_date

    if start_date <= source_date <= end_date:
        return source_date
    return dummy_date
    

ds_full = ds_full.update('SUMMONS_DATE', validate_date)

In [12]:
profiles.minmax('PRECINCT_OF_OCCUR')

Unnamed: 0,min,max
int,1,123


In [13]:
profiles.column('PRECINCT_OF_OCCUR').get('topValues')

[('76', 7305),
 ('72', 1321),
 ('40', 1231),
 ('44', 1201),
 ('109', 933),
 ('45', 925),
 ('41', 916),
 ('52', 790),
 ('25', 748),
 ('47', 747)]

In [14]:
profiles.minmax('LAW_SECTION_NUMBER')

Unnamed: 0,min,max
str,1-03,GBL 45
int,45,9999
float,20.453,1050.9
date,3003-03-03 00:00:00,3003-03-03 00:00:00


In [15]:
profiles.column('LAW_SECTION_NUMBER').get('topValues')

[('CFR 49', 14047),
 ('221.05', 3689),
 ('10-111(A)', 2606),
 ('10-125(2B)', 1191),
 ('1212', 912),
 ('11-809', 855),
 ('9999', 835),
 ('375', 664),
 ('140.05', 621),
 ('1050.4(A)', 615)]

In [16]:
profiles.column('OFFENSE_DESCRIPTION').get('topValues')

[('FEDERAL MOTOR VEH. SAFETY REG', 14047),
 ('MARIJUANA, UNLAWFUL POSSESSION 2ND DEGREE', 3689),
 ('MOTOR VEHICLE; ENGINE ON/KEY IN IGNITION (THERE IS NO TIME REQUIREMENT)',
  2606),
 ('ALCOHOLIC BEVERAGE IN PUBLIC', 1191),
 ('RECKLESS DRIVING', 912),
 ('NO TAX STAMP', 855),
 ('EQUIPMENT', 664),
 ('TRESPASS', 621),
 ('NON PAYMENT OF FARE (OTHER)', 615),
 ('UNREGISTERED VEHICLE', 511)]

In [17]:
ds_full.distinct('OFFENSE_DESCRIPTION')

Counter({'KNIVES; PUBLIC POSSESSION (BLADE 4 INCHES OR MORE)': 89,
         'EQUIPMENT': 664,
         'RECKLESS DRIVING': 912,
         'POSS. OF KNIFE': 116,
         'ALCOHOLIC BEVERAGE IN PUBLIC': 1191,
         'MOTOR VEHICLE; ENGINE ON/KEY IN IGNITION (THERE IS NO TIME REQUIREMENT)': 2606,
         'UNLICENSED SALE OF ALCOHOL': 33,
         'FEDERAL MOTOR VEH. SAFETY REG': 14047,
         'IMPROPER PLATES': 198,
         'DISORDERLY CONDUCT': 357,
         'OTHER ABC': 151,
         'OVERWEIGHT (VEH)': 221,
         'NO TAX STAMP': 855,
         'OTHER VTL': 62,
         'TRESPASS': 621,
         'OTHER TAX LAW': 71,
         'FAIL TO CONTROL CROWD': 2,
         'UNREGISTERED VEHICLE': 511,
         'UNLICENSED OPERATOR': 91,
         'MARIJUANA, UNLAWFUL POSSESSION 2ND DEGREE': 3689,
         'ILLEGAL BOTTLE CLUB': 43,
         'AGG. UNLICENSED OPER.': 176,
         'DISORDERLY PREMISE': 39,
         'CONSUMPTION OF ALCOHOL': 430,
         'IMPROPER ENTRANCE/EXIT': 64,
         

In [18]:
# Print the most frequent values in column 'SUMMONS_CATEGORY_TYPE'

profiles.column('SUMMONS_CATEGORY_TYPE').get('topValues')

[('NYS TRANS', 14066),
 ('MARIJUANA', 3689),
 ('VTL', 3593),
 ('TLC', 1080),
 ('OTHER ABC', 658),
 ('TRESPASS', 621),
 ('DISORDERLY CONDUCT', 489),
 ('ALCOHOL', 443),
 ('ABC', 212),
 ('VENDING', 191)]

In [19]:
ds_full.distinct('SUMMONS_CATEGORY_TYPE')

Counter({'': 8936,
         'VTL': 3593,
         'KNIFE': 143,
         'ABC': 212,
         'NYS TRANS': 14066,
         'DISORDERLY CONDUCT': 489,
         'OTHER ABC': 658,
         'TLC': 1080,
         'TRESPASS': 621,
         'MARIJUANA': 3689,
         'ALCOHOL': 443,
         'AIR GUN': 103,
         'GAMBLING': 35,
         'PARK REGS': 47,
         'URINATING': 68,
         'FIREWORKS': 155,
         'VENDING': 191,
         'KEY': 117,
         'OPEN FIRE': 14,
         'NOISE': 174,
         'HARASSMENT 2': 20,
         'OTHER ADMIN CODE': 87,
         'LITTERING': 59,
         'BUSINESS': 45,
         'BIKE': 69,
         'TOW TRUCK': 7,
         'EXPOSURE': 8,
         'POSTING ADVERTISEMENTS': 8,
         'OTHER PARK REG': 5,
         'TRAFFIC REGS': 29,
         'OPEN HYDRANT': 5,
         'SANITARY CODE': 20,
         'SMOKING': 10,
         'OTHER': 2,
         'DOG': 8,
         'OTHER PENAL LAW': 23,
         'SPITTING': 20,
         'AMMO': 9,
         'AVIGATION

In [20]:
ds_full.distinct('LAW_DESCRIPTION')

Counter({'': 8936,
         'VTL': 3139,
         'Administrative Code': 2502,
         'ABC': 870,
         'NYS Transportation': 14066,
         'Penal Law': 5060,
         'Park Regs': 79,
         'Tax Law': 475,
         'General Business Law': 50,
         'Health Code': 64,
         'Traffic Regs': 29,
         'Sanitary Code (NYS)': 20,
         'Education Law (NYS)': 4,
         'TLC': 2,
         'NYS Workers Comp': 1})

In [21]:
def validate_law_cat_cd(sourceValue):
    """Keep a recognised law description; map anything else to ``"UNKNOWN"``.

    Parameters
    ----------
    sourceValue : object
        Candidate value for the ``LAW_DESCRIPTION`` column.

    Returns
    -------
    object
        ``sourceValue`` unchanged when it equals one of the accepted
        descriptions, otherwise the string ``"UNKNOWN"`` (this includes the
        empty string and values that cannot be compared).
    """
    UNKNOWN = "UNKNOWN"
    known_values = (
        'VTL',
        'Administrative Code',
        'ABC',
        'NYS Transportation',
        'Penal Law',
        'Park Regs',
        'Tax Law',
        'General Business Law',
        'Health Code',
        'Traffic Regs',
        'Sanitary Code (NYS)',
        'Education Law (NYS)',
        'TLC',
        'NYS Workers Comp',
    )

    try:
        return sourceValue if sourceValue in known_values else UNKNOWN
    except Exception:
        # A value whose equality comparison fails is treated as unknown.
        # Only Exception is caught so that KeyboardInterrupt / SystemExit
        # still stop a long-running pass over the data.
        return UNKNOWN

    
ds_full = ds_full.update('LAW_DESCRIPTION', validate_law_cat_cd)

In [22]:
ds_full.distinct('BORO')

Counter({'BRONX': 8136,
         'BROOKLYN': 15956,
         'QUEENS': 5797,
         'STATEN ISLAND': 655,
         'MANHATTAN': 4597,
         'NEW YORK': 156})

In [23]:
profiles.column('JURISDICTION_CODE').get('topValues')

[('0', 32510), ('2', 1562), ('1', 1225)]

In [24]:
ds_full.distinct('JURISDICTION_CODE')

Counter({'2': 1562, '0': 32510, '1': 1225})

In [25]:
profiles.minmax('X_COORDINATE_CD')

Unnamed: 0,min,max
int,914216,1066108


In [26]:
profiles.minmax('Y_COORDINATE_CD')

Unnamed: 0,min,max
int,122552,271578


In [27]:
profiles.column('AGE_GROUP').get('topValues')

[('UNKNOWN', 17500),
 ('25-44', 9317),
 ('18-24', 4678),
 ('45-64', 3054),
 ('<18', 494),
 ('65+', 254)]

In [28]:
ds_full.distinct('AGE_GROUP')

Counter({'25-44': 9317,
         'UNKNOWN': 17500,
         '45-64': 3054,
         '65+': 254,
         '18-24': 4678,
         '<18': 494})

In [29]:
profiles.column('RACE').get('topValues')

[('BLACK', 8707),
 ('WHITE HISPANIC', 4474),
 ('BLACK HISPANIC', 1764),
 ('WHITE', 1217),
 ('ASIAN / PACIFIC ISLANDER', 1217),
 ('UNKNOWN', 276),
 ('AMERICAN INDIAN/ALASKAN NATIVE', 143),
 ('OTHER', 62)]

In [30]:
ds_full.distinct('RACE')

Counter({'BLACK': 8707,
         '': 17437,
         'WHITE HISPANIC': 4474,
         'WHITE': 1217,
         'OTHER': 62,
         'BLACK HISPANIC': 1764,
         'UNKNOWN': 276,
         'ASIAN / PACIFIC ISLANDER': 1217,
         'AMERICAN INDIAN/ALASKAN NATIVE': 143})

In [31]:
def validate_race(sourceValue):
    """Keep a recognised race category; map anything else to ``"UNKNOWN"``.

    Parameters
    ----------
    sourceValue : object
        Candidate value for the ``RACE`` column.

    Returns
    -------
    object
        ``sourceValue`` unchanged when it equals one of the accepted
        categories, otherwise the string ``"UNKNOWN"`` (this includes the
        empty string and values that cannot be compared).
    """
    UNKNOWN = "UNKNOWN"
    known_values = (
        "UNKNOWN",
        "WHITE HISPANIC",
        "BLACK",
        "BLACK HISPANIC",
        "WHITE",
        "ASIAN / PACIFIC ISLANDER",
        "AMERICAN INDIAN/ALASKAN NATIVE",
        "OTHER",
    )

    try:
        return sourceValue if sourceValue in known_values else UNKNOWN
    except Exception:
        # A value whose equality comparison fails is treated as unknown.
        # Only Exception is caught so that KeyboardInterrupt / SystemExit
        # still stop a long-running pass over the data.
        return UNKNOWN

    
ds_full = ds_full.update('RACE', validate_race)

In [32]:
profiles.column('SEX').get('topValues')

[('M', 15830), ('F', 1978), ('U', 53)]

In [33]:
ds_full.distinct('SEX')

Counter({'M': 15830, '': 17436, 'F': 1978, 'U': 53})

In [34]:
def validate_sex(sourceValue):
    """Keep a recognised sex code; map anything else to ``"U"``.

    Parameters
    ----------
    sourceValue : object
        Candidate value for the ``SEX`` column.

    Returns
    -------
    object
        ``sourceValue`` unchanged when it is ``"M"``, ``"F"`` or ``"U"``,
        otherwise ``"U"`` (this includes the empty string and values that
        cannot be compared).
    """
    UNKNOWN = "U"
    known_values = ("M", "F", "U")

    try:
        return sourceValue if sourceValue in known_values else UNKNOWN
    except Exception:
        # A value whose equality comparison fails is treated as unknown.
        # Only Exception is caught so that KeyboardInterrupt / SystemExit
        # still stop a long-running pass over the data.
        return UNKNOWN

    
ds_full = ds_full.update('SEX', validate_sex)

In [35]:
profiles.minmax('Latitude')

Unnamed: 0,min,max
float,40.502784,40.912057


In [36]:
def validate_lat(sourceValue):
    """Accept a latitude between 40 and 41 degrees; flag anything else.

    Parameters
    ----------
    sourceValue : float or str
        Latitude as a number or as numeric text.

    Returns
    -------
    object
        ``sourceValue`` unchanged when its numeric value lies in
        ``[40, 41]``; otherwise the sentinel ``-999`` (empty, non-numeric,
        NaN, or out-of-range input).
    """
    UNKNOWN = -999

    # Convert before comparing: comparing text such as "40.75" with an int
    # raises TypeError, which previously sent every textual value to the
    # sentinel.
    try:
        latitude = float(sourceValue)
    except (TypeError, ValueError):
        return UNKNOWN

    # NaN fails both comparisons and therefore maps to the sentinel as well.
    if 40 <= latitude <= 41:
        return sourceValue
    return UNKNOWN

    
ds_full = ds_full.update('Latitude', validate_lat)

In [37]:
profiles.minmax('Longitude')

Unnamed: 0,min,max
float,-74.251831,-73.704627


In [38]:
def validate_long(sourceValue):
    """Accept a longitude between -75 and -73 degrees; flag anything else.

    Parameters
    ----------
    sourceValue : float or str
        Longitude as a number or as numeric text.

    Returns
    -------
    object
        ``sourceValue`` unchanged when its numeric value lies in
        ``[-75, -73]``; otherwise the sentinel ``-999`` (empty, non-numeric,
        NaN, or out-of-range input).
    """
    UNKNOWN = -999
    # The lower bound used to be -74, which rejected legitimate values: the
    # profiled minimum longitude of this data set is about -74.25. The window
    # is widened to -75 so everything accepted before is still accepted.
    min_longitude = -75
    max_longitude = -73

    # Convert before comparing: comparing text such as "-73.83" with an int
    # raises TypeError, which previously sent every textual value to the
    # sentinel.
    try:
        longitude = float(sourceValue)
    except (TypeError, ValueError):
        return UNKNOWN

    # NaN fails both comparisons and therefore maps to the sentinel as well.
    if min_longitude <= longitude <= max_longitude:
        return sourceValue
    return UNKNOWN

    
# The longitude validator must run on the 'Longitude' column; running it on
# 'Latitude' would mark every latitude as out of range.
ds_full = ds_full.update('Longitude', validate_long)

In [39]:
profiles.minmax('New Georeferenced Column')

Unnamed: 0,min,max
str,POINT (-73.70462716899993 40.735370798000076),POINT (-74.25183113799993 40.50278363800004)


In [40]:
# Columns to keep, in output order. No line-continuation backslashes are
# needed because the list literal is enclosed in brackets.
SELECTED_COLUMNS = [
    # identification and offense details
    'SUMMONS_KEY',
    'SUMMONS_DATE',
    'OFFENSE_DESCRIPTION',
    'LAW_SECTION_NUMBER',
    'LAW_DESCRIPTION',
    'SUMMONS_CATEGORY_TYPE',
    # demographics
    'AGE_GROUP',
    'SEX',
    'RACE',
    # jurisdiction and location
    'JURISDICTION_CODE',
    'BORO',
    'PRECINCT_OF_OCCUR',
    'X_COORDINATE_CD',
    'Y_COORDINATE_CD',
    'Latitude',
    'Longitude',
    'New Georeferenced Column',
]

ds_full = ds_full.select(SELECTED_COLUMNS)

In [41]:
ds_full

<openclean.pipeline.DataPipeline at 0x7fcf98b7db80>

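# VALIDATING DATA AFTER CLEANING

Note: the cells below re-read the CSV file from disk into a pandas DataFrame, so the `ds_full.update(...)` validators defined above are not reflected in `data_df`; missing values are handled again here with pandas.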

In [42]:
data_df = pd.read_csv("NYPD-YTD-Criminal-Summons-Summary-Dashboard-2.csv")

In [43]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35297 entries, 0 to 35296
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   SUMMONS_KEY               35297 non-null  int64  
 1   SUMMONS_DATE              35297 non-null  object 
 2   OFFENSE_DESCRIPTION       35297 non-null  object 
 3   LAW_SECTION_NUMBER        35297 non-null  object 
 4   LAW_DESCRIPTION           26361 non-null  object 
 5   SUMMONS_CATEGORY_TYPE     26361 non-null  object 
 6   AGE_GROUP                 35297 non-null  object 
 7   SEX                       17861 non-null  object 
 8   RACE                      17860 non-null  object 
 9   JURISDICTION_CODE         35297 non-null  int64  
 10  BORO                      35297 non-null  object 
 11  PRECINCT_OF_OCCUR         35297 non-null  int64  
 12  X_COORDINATE_CD           35116 non-null  float64
 13  Y_COORDINATE_CD           35116 non-null  float64
 14  Latitu

In [44]:
ds_full_jc = stream(data_df)
profiles = ds_full_jc.profile(default_profiler=DefaultColumnProfiler)
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
SUMMONS_KEY,35297,0,35294,0.999915,15.107088
SUMMONS_DATE,35297,0,273,0.007734,7.953969
OFFENSE_DESCRIPTION,35297,0,388,0.010992,4.259884
LAW_SECTION_NUMBER,35297,0,347,0.009831,4.177027
LAW_DESCRIPTION,35297,8936,14,0.000531,1.981003
SUMMONS_CATEGORY_TYPE,35297,8936,44,0.001669,2.45958
AGE_GROUP,35297,0,6,0.00017,1.83839
SEX,35297,17436,3,0.000168,0.530843
RACE,35297,17437,8,0.000448,2.04068
JURISDICTION_CODE,35297,0,3,8.5e-05,0.476623


In [45]:
data_df['SEX'].unique()

array(['M', nan, 'F', 'U'], dtype=object)

In [46]:
def validate_SEX(data, column_name, fill_value='U'):
    """Replace missing values in one column of ``data`` with a placeholder.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. The named column is overwritten on this object.
    column_name : str
        Column whose NaN entries are replaced.
    fill_value : str, optional
        Placeholder written in place of NaN. Defaults to ``'U'``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, returned for convenient reassignment.
    """
    # Assign the result back instead of calling replace(..., inplace=True) on
    # data[column_name]: the in-place form operates on an intermediate Series
    # and does not write through to ``data`` under pandas copy-on-write.
    data[column_name] = data[column_name].replace(np.nan, fill_value)
    return data

In [47]:
data_df = validate_SEX(data_df,'SEX')

In [48]:
data_df['RACE'].unique()

array(['BLACK', nan, 'WHITE HISPANIC', 'WHITE', 'OTHER', 'BLACK HISPANIC',
       'UNKNOWN', 'ASIAN / PACIFIC ISLANDER',
       'AMERICAN INDIAN/ALASKAN NATIVE'], dtype=object)

In [49]:
def validate_RACE(data, column_name, fill_value='UNKNOWN'):
    """Replace missing values in one column of ``data`` with a placeholder.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. The named column is overwritten on this object.
    column_name : str
        Column whose NaN entries are replaced.
    fill_value : str, optional
        Placeholder written in place of NaN. Defaults to ``'UNKNOWN'``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, returned for convenient reassignment.
    """
    # Assign the result back instead of calling replace(..., inplace=True) on
    # data[column_name]: the in-place form operates on an intermediate Series
    # and does not write through to ``data`` under pandas copy-on-write.
    data[column_name] = data[column_name].replace(np.nan, fill_value)
    return data

In [50]:
data_df = validate_RACE(data_df,'RACE')

In [51]:
data_df['LAW_DESCRIPTION'].unique()

array([nan, 'VTL', 'Administrative Code', 'ABC', 'NYS Transportation',
       'Penal Law', 'Park Regs', 'Tax Law', 'General Business Law',
       'Health Code', 'Traffic Regs', 'Sanitary Code (NYS)',
       'Education Law (NYS)', 'TLC', 'NYS Workers Comp'], dtype=object)

In [52]:
def validate_LAW_DESCRIPTION(data, column_name, fill_value='UNKNOWN'):
    """Replace missing values in one column of ``data`` with a placeholder.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. The named column is overwritten on this object.
    column_name : str
        Column whose NaN entries are replaced.
    fill_value : str, optional
        Placeholder written in place of NaN. Defaults to ``'UNKNOWN'``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, returned for convenient reassignment.
    """
    # Assign the result back instead of calling replace(..., inplace=True) on
    # data[column_name]: the in-place form operates on an intermediate Series
    # and does not write through to ``data`` under pandas copy-on-write.
    data[column_name] = data[column_name].replace(np.nan, fill_value)
    return data
data_df = validate_LAW_DESCRIPTION(data_df,'LAW_DESCRIPTION')

In [53]:
data_df['SUMMONS_CATEGORY_TYPE'].unique()

array([nan, 'VTL', 'KNIFE', 'ABC', 'NYS TRANS', 'DISORDERLY CONDUCT',
       'OTHER ABC', 'TLC', 'TRESPASS', 'MARIJUANA', 'ALCOHOL', 'AIR GUN',
       'GAMBLING', 'PARK REGS', 'URINATING', 'FIREWORKS', 'VENDING',
       'KEY', 'OPEN FIRE', 'NOISE', 'HARASSMENT 2', 'OTHER ADMIN CODE',
       'LITTERING', 'BUSINESS', 'BIKE', 'TOW TRUCK', 'EXPOSURE',
       'POSTING ADVERTISEMENTS', 'OTHER PARK REG', 'TRAFFIC REGS',
       'OPEN HYDRANT', 'SANITARY CODE', 'SMOKING', 'OTHER', 'DOG',
       'OTHER PENAL LAW', 'SPITTING', 'AMMO', 'AVIGATION', 'PANHANDLING',
       'LITTERNG', 'EDUCATION LAW', 'LOITERING', 'DUMPING',
       'IN PUBLIC UNDER DRUGS'], dtype=object)

In [54]:
def validate_SUMMONS_CATEGORY_TYPE(data, column_name, fill_value='UNKNOWN'):
    """Replace missing values in one column of ``data`` with a placeholder.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. The named column is overwritten on this object.
    column_name : str
        Column whose NaN entries are replaced.
    fill_value : str, optional
        Placeholder written in place of NaN. Defaults to ``'UNKNOWN'``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, returned for convenient reassignment.
    """
    # Assign the result back instead of calling replace(..., inplace=True) on
    # data[column_name]: the in-place form operates on an intermediate Series
    # and does not write through to ``data`` under pandas copy-on-write.
    data[column_name] = data[column_name].replace(np.nan, fill_value)
    return data
data_df = validate_SUMMONS_CATEGORY_TYPE(data_df,'SUMMONS_CATEGORY_TYPE')

In [55]:
data_df = data_df.dropna()

In [60]:
data_df = data_df.drop_duplicates(subset='SUMMONS_KEY',keep="first")

In [61]:
ds_full_jc = stream(data_df)
profiles = ds_full_jc.profile(default_profiler=DefaultColumnProfiler)
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
SUMMONS_KEY,35113,0,35113,1.0,15.099718
SUMMONS_DATE,35113,0,273,0.007775,7.951915
OFFENSE_DESCRIPTION,35113,0,388,0.01105,4.258857
LAW_SECTION_NUMBER,35113,0,347,0.009882,4.176296
LAW_DESCRIPTION,35113,0,15,0.000427,2.296244
SUMMONS_CATEGORY_TYPE,35113,0,45,0.001282,2.653053
AGE_GROUP,35113,0,6,0.000171,1.838216
SEX,35113,0,3,8.5e-05,1.253582
RACE,35113,0,8,0.000228,1.974212
JURISDICTION_CODE,35113,0,3,8.5e-05,0.476827


In [62]:
data_df.to_csv('filtered6.csv')