#### Import dependencies:

In [25]:
import sys #access to system parameters https://docs.python.org/3/library/sys.html
print("Python version: {}". format(sys.version))

import pandas as pd #collection of functions for data processing and analysis modeled after R dataframes with SQL like features
print("pandas version: {}". format(pd.__version__))

import matplotlib #collection of functions for scientific and publication-ready visualization
# `plt` is the conventional alias for the pyplot interface; aliasing the
# top-level package as `plt` would make calls such as plt.plot() fail.
import matplotlib.pyplot as plt
print("matplotlib version: {}". format(matplotlib.__version__))

import numpy as np #foundational package for scientific computing
print("NumPy version: {}". format(np.__version__))

import scipy as sp #collection of functions for scientific computing and advance mathematics
print("SciPy version: {}". format(sp.__version__)) 

import IPython
from IPython import display #pretty printing of dataframes in Jupyter notebook
print("IPython version: {}". format(IPython.__version__)) 

import sklearn #collection of machine learning algorithms
print("scikit-learn version: {}". format(sklearn.__version__))

#misc libraries
import random
import time
#ignore warnings
import warnings
warnings.filterwarnings('ignore')
print('-'*25)
import re
import os
import matplotlib.pyplot as plt
%matplotlib inline

Python version: 3.8.5 (default, Sep  4 2020, 02:22:02) 
[Clang 10.0.0 ]
pandas version: 1.1.3
matplotlib version: 3.3.2
NumPy version: 1.19.5
SciPy version: 1.5.2
IPython version: 7.19.0
scikit-learn version: 0.23.2
-------------------------


#### Download data file if it has not been downloaded already:

In [26]:
# Previously used source URL, kept for reference only (not used below):
#url = 'https://data.cityofnewyork.us/api/views/qgea-i56i/rows.csv?accessType=DOWNLOAD'
fn_src = 'https://data.cityofnewyork.us/api/views/bqiq-cu78/rows.csv?accessType=DOWNLOAD'
fn_dst = 'NYPD-Hate-Crimes.csv'

# The standard library is sufficient on Python 3; no need for six.moves.
import urllib.request

if os.path.isfile(fn_dst):
    print('File %s has already been downloaded' % fn_dst)
else:
    print('Fetching file %s. This may take a while...' % fn_dst)
    # Download to a temporary name first so an interrupted transfer never
    # leaves a truncated file that the isfile() check above would accept.
    fn_tmp = fn_dst + '.part'
    try:
        urllib.request.urlretrieve(fn_src, fn_tmp)
        os.replace(fn_tmp, fn_dst)
    finally:
        # Only present if the download or the rename failed.
        if os.path.exists(fn_tmp):
            os.remove(fn_tmp)
    print('File %s has been downloaded' % fn_dst)

File NYPD-Hate-Crimes.csv has already been downloaded


#### Open a stream to the data file so we don't have to load the whole data set into main memory

In [27]:
# `stream` is openclean's entry point for building a data pipeline over a file.
from openclean.pipeline import stream
# Pipeline over the downloaded CSV; per the section heading above, this is
# used so the whole data set does not have to be loaded into main memory.
ds_full = stream(fn_dst)

In [28]:
print(f'The dataset contains {ds_full.count():,} rows.')

The dataset contains 1,181 rows.


In [29]:
ds_full.head()

Unnamed: 0,Full Complaint ID,Complaint Year Number,Month Number,Record Create Date,Complaint Precinct Code,Patrol Borough Name,County,Law Code Category Description,Offense Description,PD Code Description,Bias Motive Description,Offense Category,Arrest Date,Arrest Id
0,201904612204817,2019,2,02/08/2019,46,PATROL BORO BRONX,BRONX,FELONY,FELONY ASSAULT,"ASSAULT 2,1,UNCLASSIFIED",ANTI-MALE HOMOSEXUAL (GAY),Sexual Orientation,02/08/2019,B31678218
1,201904812229517,2019,3,03/09/2019,48,PATROL BORO BRONX,BRONX,MISDEMEANOR,ASSAULT 3 & RELATED OFFENSES,ASSAULT 3,ANTI-WHITE,Race/Color,03/09/2019,B31682790
2,201904812226617,2019,3,03/08/2019,48,PATROL BORO BRONX,BRONX,MISDEMEANOR,ASSAULT 3 & RELATED OFFENSES,ASSAULT 3,ANTI-WHITE,Race/Color,03/09/2019,B31682806
3,201904812231317,2019,3,03/10/2019,48,PATROL BORO BRONX,BRONX,MISDEMEANOR,ASSAULT 3 & RELATED OFFENSES,ASSAULT 3,ANTI-WHITE,Race/Color,03/09/2019,B31682806
4,201904212405517,2019,5,05/11/2019,42,PATROL BORO BRONX,BRONX,FELONY,ROBBERY,"ROBBERY,POCKETBOOK/CARRIED BAG",ANTI-MUSLIM,Religion/Religious Practice,05/10/2019,B31692174
5,201904212405517,2019,5,05/11/2019,42,PATROL BORO BRONX,BRONX,FELONY,ROBBERY,"ROBBERY,POCKETBOOK/CARRIED BAG",ANTI-MUSLIM,Religion/Religious Practice,05/10/2019,B31692175
6,201904212405517,2019,5,05/11/2019,42,PATROL BORO BRONX,BRONX,FELONY,ROBBERY,"ROBBERY,POCKETBOOK/CARRIED BAG",ANTI-MUSLIM,Religion/Religious Practice,05/10/2019,B31692176
7,201904012711317,2019,6,06/01/2019,40,PATROL BORO BRONX,BRONX,FELONY,RAPE,RAPE 1,ANTI-WHITE,Race/Color,06/14/2019,B31697030
8,201904912495217,2019,7,07/09/2019,49,PATROL BORO BRONX,BRONX,FELONY,FELONY ASSAULT,"ASSAULT 2,1,UNCLASSIFIED",ANTI-HISPANIC,Ethnicity/National Origin/Ancestry,07/21/2019,B31702150
9,201904712851617,2019,8,08/05/2019,47,PATROL BORO BRONX,BRONX,MISDEMEANOR,ASSAULT 3 & RELATED OFFENSES,"MENACING,UNCLASSIFIED",ANTI-TRANSGENDER,Gender,08/05/2019,B31704341


#### Use openclean to profile the data set:

In [30]:
from openclean.profiling.column import DefaultColumnProfiler

# Profile the full stream with openclean's default column profiler.
# (An earlier variant of this line profiled a `ds` object that is not defined in this notebook.)
profiles = ds_full.profile(default_profiler=DefaultColumnProfiler)

In [31]:
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
Full Complaint ID,1181,0,1102,0.933108,10.055231
Complaint Year Number,1181,0,3,0.00254,1.555348
Month Number,1181,0,12,0.010161,3.529311
Record Create Date,1181,0,609,0.515665,8.973195
Complaint Precinct Code,1181,0,76,0.064352,5.960441
Patrol Borough Name,1181,0,8,0.006774,2.830716
County,1181,0,5,0.004234,1.984774
Law Code Category Description,1181,0,4,0.003387,1.093351
Offense Description,1181,0,17,0.014395,2.64196
PD Code Description,1181,0,50,0.042337,3.476055


In [32]:
# For every column, report the data type that occurs most often among its
# distinct values, as recorded by the profiler.

print('Schema\n------')
for column_name in ds_full.columns:
    type_counts = profiles.column(column_name)['datatypes']['distinct']
    # most_common(1) yields a single (type_label, count) pair.
    dominant_type, _ = type_counts.most_common(1)[0]
    print("  '{}' ({})".format(column_name, dominant_type))

Schema
------
  'Full Complaint ID' (int)
  'Complaint Year Number' (int)
  'Month Number' (int)
  'Record Create Date' (date)
  'Complaint Precinct Code' (int)
  'Patrol Borough Name' (str)
  'County' (str)
  'Law Code Category Description' (str)
  'Offense Description' (str)
  'PD Code Description' (str)
  'Bias Motive Description' (str)
  'Offense Category' (str)
  'Arrest Date' (date)
  'Arrest Id' (str)


### Field Name & Description:
    'CMPLNT_NUM' (int)                 Complaint Number
    'CMPLNT_FR_DT' (date)              Complaint From Date
    'CMPLNT_FR_TM' (date)              Complaint From Time
    'CMPLNT_TO_DT' (date)              Complaint To Date
    'CMPLNT_TO_TM' (date)              Complaint To Time
    'ADDR_PCT_CD' (int)                Code of Precinct in which the Incident Occurred
    'RPT_DT' (date)                    Report Date
    'KY_CD' (int)                      "Key Code": Offense Classification Code (3 digits)
    'OFNS_DESC' (str)                  Offense Description
    'PD_CD' (int)                      PD Code of Offense. More granular than Key Code
    'PD_DESC' (str)                    PD Description of Offense.
    'CRM_ATPT_CPTD_CD' (str)           Whether Crime was Attempted or Completed (values: 'COMPLETED', 'ATTEMPTED')
    'LAW_CAT_CD' (str)                 Level of Offense (values: 'FELONY', 'VIOLATION', 'MISDEMEANOR')
    'BORO_NM' (str)                    Name of Borough in which Incident Occurred
    'LOC_OF_OCCUR_DESC' (str)          Description of where the incident occurred with respect to the premises
                                       (values:'FRONT OF', 'REAR OF', 'OUTSIDE', 'INSIDE', 'OPPOSITE OF')
    'PREM_TYP_DESC' (str)              Description of the type of premises in which the Incident Occurred
    'JURIS_DESC' (str)                 Description of Jurisdiction in which Incident Occurred
    'JURISDICTION_CODE' (int)          Jurisdiction Code
    'PARKS_NM' (str)                   Name of Park in which Incident Occurred, if Applicable
    'HADEVELOPT' (str)                 Name of NYCHA Housing Development in which Incident Occurred, if Applicable
    'HOUSING_PSA' (int)                Housing PSA
    'X_COORD_CD' (int)                 X-coordinate, New York State Plane Coordinate System
    'Y_COORD_CD' (int)                 Y-coordinate, New York State Plane Coordinate System
    'SUSP_AGE_GROUP' (int)             Age Group of Suspect
    'SUSP_RACE' (str)                  Race of Suspect
    'SUSP_SEX' (str)                   Sex of Suspect
    'TRANSIT_DISTRICT' (int)           Transit-District code
    'Latitude' (float)                 Global Latitude of Location where Incident Occurred
    'Longitude' (float)                Global Longitude of Location where Incident Occurred
    'Lat_Lon' (str)                    'Latitude' and 'Longitude' together
    'PATROL_BORO' (str)                Patrol Borough
    'STATION_NAME' (str)               Station Name
    'VIC_AGE_GROUP' (int)              Age Group of Victim
    'VIC_RACE' (str)                   Race of Victim
    'VIC_SEX' (str)                    Sex of Victim
    
    
    (Note: some field descriptions were taken from https://www1.nyc.gov/assets/nypd/downloads/pdf/analysis_and_planning/incident_level_data_footnotes.pdf)
   

## Profiling & Cleaning of each field in the data set

In [33]:
profiles.minmax('Full Complaint ID')

Unnamed: 0,min,max
int,201900112120417,202112312274817


In [34]:
profiles.minmax('Complaint Year Number')

Unnamed: 0,min,max
int,2019,2021


In [35]:
profiles.minmax('Complaint Precinct Code')

Unnamed: 0,min,max
int,1,123


In [36]:
profiles.column('Complaint Precinct Code').get('topValues')

[('14', 43),
 ('90', 40),
 ('66', 37),
 ('13', 35),
 ('70', 33),
 ('18', 33),
 ('19', 32),
 ('71', 30),
 ('61', 28),
 ('1', 28)]

In [37]:
profiles.column('Law Code Category Description').get('topValues')

[('MISDEMEANOR', 606),
 ('FELONY', 559),
 ('VIOLATION', 15),
 ('INVESTIGATION', 1)]

In [38]:
ds_full.distinct('Law Code Category Description')

Counter({'FELONY': 559,
         'MISDEMEANOR': 606,
         'VIOLATION': 15,
         'INVESTIGATION': 1})

In [39]:
# Print the most frequent values in column 'PD Code Description'

profiles.column('PD Code Description').get('topValues')

[('AGGRAVATED HARASSMENT 1', 282),
 ('CRIMINAL MISCHIEF 4TH, GRAFFIT', 193),
 ('ASSAULT 3', 161),
 ('AGGRAVATED HARASSMENT 2', 152),
 ('ASSAULT 2,1,UNCLASSIFIED', 135),
 ('MENACING,UNCLASSIFIED', 39),
 ('MISCHIEF,CRIMINAL,    UNCL 2ND', 34),
 ('CRIMINAL MISCHIEF,UNCLASSIFIED 4', 34),
 ('CRIMINAL MIS 2 & 3', 22),
 ('ROBBERY,OPEN AREA UNCLASSIFIED', 12)]

In [40]:
ds_full.distinct('PD Code Description')

Counter({'ASSAULT 2,1,UNCLASSIFIED': 135,
         'ASSAULT 3': 161,
         'ROBBERY,POCKETBOOK/CARRIED BAG': 5,
         'RAPE 1': 1,
         'MENACING,UNCLASSIFIED': 39,
         'AGGRAVATED HARASSMENT 1': 282,
         'ROBBERY,OPEN AREA UNCLASSIFIED': 12,
         'AGGRAVATED HARASSMENT 2': 152,
         'MISCHIEF,CRIMINAL,    UNCL 2ND': 34,
         'BURGLARY,UNCLASSIFIED,NIGHT': 5,
         'MISCHIEF, CRIMINAL 4, OF MOTOR': 11,
         'CRIMINAL MISCHIEF,UNCLASSIFIED 4': 34,
         'TRESPASS 3, CRIMINAL': 1,
         'MURDER,UNCLASSIFIED': 1,
         'CRIMINAL MISCHIEF 4TH, GRAFFIT': 193,
         'WEAPONS POSSESSION 3': 1,
         'LARCENY,GRAND FROM PERSON,PERSONAL ELECTRONIC DEVICE(SNATCH)': 4,
         'CRIMINAL MIS 2 & 3': 22,
         'ROBBERY,CAR JACKING': 1,
         'ROBBERY,DWELLING': 2,
         'MISCHIEF, CRIMINAL 3 & 2, OF M': 9,
         'ROBBERY,PERSONAL ELECTRONIC DEVICE': 8,
         'LARCENY,GRAND FROM PERSON,UNCL': 10,
         'ROBBERY,COMMERCIAL UNCLA

In [41]:
profiles.column('Offense Description').get('topValues')

[('CRIMINAL MISCHIEF & RELATED OF', 310),
 ('MISCELLANEOUS PENAL LAW', 292),
 ('ASSAULT 3 & RELATED OFFENSES', 204),
 ('OFF. AGNST PUB ORD SENSBLTY &', 152),
 ('FELONY ASSAULT', 136),
 ('ROBBERY', 37),
 ('GRAND LARCENY', 17),
 ('HARRASSMENT 2', 15),
 ('BURGLARY', 6),
 ('SEX CRIMES', 3)]

In [42]:
ds_full.distinct('Offense Description')

Counter({'FELONY ASSAULT': 136,
         'ASSAULT 3 & RELATED OFFENSES': 204,
         'ROBBERY': 37,
         'RAPE': 1,
         'MISCELLANEOUS PENAL LAW': 292,
         'OFF. AGNST PUB ORD SENSBLTY &': 152,
         'CRIMINAL MISCHIEF & RELATED OF': 310,
         'BURGLARY': 6,
         'CRIMINAL TRESPASS': 1,
         'MURDER & NON-NEGL. MANSLAUGHTE': 2,
         'DANGEROUS WEAPONS': 1,
         'GRAND LARCENY': 17,
         'SEX CRIMES': 3,
         'HARRASSMENT 2': 15,
         'INVESTIGATIONS/COMPLAINTS ONLY': 1,
         'FRAUDS': 1,
         'PETIT LARCENY': 2})

In [43]:
ds_full.distinct('Offense Category')

Counter({'Sexual Orientation': 166,
         'Race/Color': 321,
         'Religion/Religious Practice': 596,
         'Ethnicity/National Origin/Ancestry': 40,
         'Gender': 50,
         'Unclassified': 6,
         'Age': 1,
         'Disability': 1})

In [46]:
ds_full.distinct('Patrol Borough Name')

Counter({'PATROL BORO BRONX': 95,
         'PATROL BORO BKLYN SOUTH': 218,
         'PATROL BORO BKLYN NORTH': 185,
         'PATROL BORO MAN NORTH': 160,
         'PATROL BORO MAN SOUTH': 261,
         'PATROL BORO QUEENS NORTH': 146,
         'PATROL BORO QUEENS SOUTH': 69,
         'PATROL BORO STATEN ISLAND': 47})

In [45]:
# Whitelist of accepted patrol borough names, built once instead of on every call.
_VALID_PATROL_BOROS = frozenset([
    "PATROL BORO BRONX",
    "PATROL BORO BKLYN SOUTH",
    "PATROL BORO BKLYN NORTH",
    "PATROL BORO MAN NORTH",
    "PATROL BORO MAN SOUTH",
    "PATROL BORO QUEENS NORTH",
    "PATROL BORO QUEENS SOUTH",
    "PATROL BORO STATEN ISLAND",
])


def validate_boro_nm(sourceValue):
    """Return ``sourceValue`` if it is a known patrol borough name, else 'UNKNOWN'.

    Parameters
    ----------
    sourceValue : object
        Raw cell value from the 'Patrol Borough Name' column.

    Returns
    -------
    object
        ``sourceValue`` unchanged when it exactly matches one of the accepted
        names; otherwise the string ``'UNKNOWN'``.
    """
    UNKNOWN = "UNKNOWN"

    try:
        return sourceValue if sourceValue in _VALID_PATROL_BOROS else UNKNOWN
    except TypeError:
        # Unhashable values (lists, dicts, ...) can never be a valid name.
        # Catching only TypeError avoids swallowing KeyboardInterrupt and
        # unrelated errors the way a bare `except:` would.
        return UNKNOWN

    
# Apply validate_boro_nm to the 'Patrol Borough Name' column of the pipeline.
# NOTE(review): the pandas section further down re-reads the CSV into data_df
# and exports that frame, so this update does not appear to reach the
# exported file — confirm whether it should.
ds_full = ds_full.update('Patrol Borough Name', validate_boro_nm)

In [48]:
ds_full.distinct('County')

Counter({'BRONX': 95,
         'KINGS': 403,
         'NEW YORK': 421,
         'QUEENS': 215,
         'RICHMOND': 47})

In [49]:
profiles.column('Bias Motive Description').get('topValues')

[('ANTI-JEWISH', 536),
 ('ANTI-ASIAN', 164),
 ('ANTI-MALE HOMOSEXUAL (GAY)', 142),
 ('ANTI-BLACK', 108),
 ('ANTI-WHITE', 49),
 ('ANTI-TRANSGENDER', 30),
 ('ANTI-MUSLIM', 29),
 ('ANTI-CATHOLIC', 24),
 ('ANTI-OTHER ETHNICITY', 18),
 ('ANTI-HISPANIC', 15)]

In [50]:
ds_full.distinct('Bias Motive Description')

Counter({'ANTI-MALE HOMOSEXUAL (GAY)': 142,
         'ANTI-WHITE': 49,
         'ANTI-MUSLIM': 29,
         'ANTI-HISPANIC': 15,
         'ANTI-TRANSGENDER': 30,
         'ANTI-JEWISH': 536,
         'ANTI-ASIAN': 164,
         'ANTI-BLACK': 108,
         'ANTI-FEMALE HOMOSEXUAL (LESBIAN)': 14,
         'ANTI-ARAB': 7,
         'ANTI-CATHOLIC': 24,
         'ANTI-GENDER NON-CONFORMING': 6,
         'ANTI-FEMALE': 14,
         'ANTI-LGBT (MIXED GROUP)': 11,
         'ANTI-MULTI-RACIAL GROUPS': 3,
         'ANTI-OTHER ETHNICITY': 18,
         '60 YRS AND OLDER': 1,
         'ANTI-HINDU': 2,
         'ANTI-BUDDHIST': 2,
         'ANTI-JEHOVAHS WITNESS': 1,
         'ANTI-PHYSICAL DISABILITY': 1,
         'ANTI-OTHER RELIGION': 2,
         'ANTI-RELIGIOUS PRACTICE GENERALLY': 2})

In [53]:
profiles.column('Arrest Id').get('topValues')

[('B31682806', 2),
 ('B31678218', 1),
 ('B31682790', 1),
 ('B31692174', 1),
 ('B31692175', 1),
 ('B31692176', 1),
 ('B31697030', 1),
 ('B31702150', 1),
 ('B31704341', 1),
 ('B31714935', 1)]

In [54]:
ds_full.distinct('Arrest Id')

Counter({'B31678218': 1,
         'B31682790': 1,
         'B31682806': 2,
         'B31692174': 1,
         'B31692175': 1,
         'B31692176': 1,
         'B31697030': 1,
         'B31702150': 1,
         'B31704341': 1,
         'B31714935': 1,
         'B31714939': 1,
         'B31722870': 1,
         'B32676594': 1,
         'B32682489': 1,
         'B32682490': 1,
         'B32682492': 1,
         'B32684534': 1,
         'B32690509': 1,
         'B32690513': 1,
         'B32692369': 1,
         'B32698112': 1,
         'B32700799': 1,
         'B32701468': 1,
         'B33672940': 1,
         'B33678693': 1,
         'B33679800': 1,
         'B33682164': 1,
         'B33683675': 1,
         'B33683676': 1,
         'B33683677': 1,
         'B33683679': 1,
         'B33683680': 1,
         'B33683681': 1,
         'B33683682': 1,
         'B33683684': 1,
         'B33683686': 1,
         'B33683687': 1,
         'B33683688': 1,
         'B33683690': 1,
         'B33683691': 1,


In [55]:
# Columns kept in the pipeline, in output order. Line continuations are
# implicit inside brackets, so no trailing backslashes are needed.
SELECTED_COLUMNS = [
    # complaint identification and timing
    'Full Complaint ID',
    'Complaint Year Number',
    'Month Number',
    'Record Create Date',
    # location
    'Complaint Precinct Code',
    'Patrol Borough Name',
    'County',
    # offense classification
    'Law Code Category Description',
    'Offense Description',
    'PD Code Description',
    'Bias Motive Description',
    'Offense Category',
    # arrest details
    'Arrest Date',
    'Arrest Id',
]


ds_full = ds_full.select(SELECTED_COLUMNS)

In [56]:
ds_full

<openclean.pipeline.DataPipeline at 0x7f82b6d305e0>

# VALIDATING DATA AFTER CLEANING

In [57]:
# Load the downloaded CSV into pandas. Reuse fn_dst (set in the download cell)
# so the file name is defined in exactly one place.
data_df = pd.read_csv(fn_dst)

In [58]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1181 entries, 0 to 1180
Data columns (total 14 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   Full Complaint ID              1181 non-null   int64 
 1   Complaint Year Number          1181 non-null   int64 
 2   Month Number                   1181 non-null   int64 
 3   Record Create Date             1181 non-null   object
 4   Complaint Precinct Code        1181 non-null   int64 
 5   Patrol Borough Name            1181 non-null   object
 6   County                         1181 non-null   object
 7   Law Code Category Description  1181 non-null   object
 8   Offense Description            1181 non-null   object
 9   PD Code Description            1181 non-null   object
 10  Bias Motive Description        1181 non-null   object
 11  Offense Category               1181 non-null   object
 12  Arrest Date                    397 non-null    object
 13  Arr

In [59]:
ds_full_jc = stream(data_df)
profiles = ds_full_jc.profile(default_profiler=DefaultColumnProfiler)
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
Full Complaint ID,1181,0,1102,0.933108,10.055231
Complaint Year Number,1181,0,3,0.00254,1.555348
Month Number,1181,0,12,0.010161,3.529311
Record Create Date,1181,0,609,0.515665,8.973195
Complaint Precinct Code,1181,0,76,0.064352,5.960441
Patrol Borough Name,1181,0,8,0.006774,2.830716
County,1181,0,5,0.004234,1.984774
Law Code Category Description,1181,0,4,0.003387,1.093351
Offense Description,1181,0,17,0.014395,2.64196
PD Code Description,1181,0,50,0.042337,3.476055


In [60]:
data_df['Arrest Date'].unique()

array(['02/08/2019', '03/09/2019', '05/10/2019', '06/14/2019',
       '07/21/2019', '08/05/2019', '10/22/2019', '12/28/2019',
       '02/06/2020', '03/28/2020', '04/29/2020', '07/07/2020',
       '08/06/2020', '10/08/2020', '11/03/2020', '11/10/2020',
       '01/08/2021', '03/04/2021', '03/14/2021', '04/09/2021',
       '05/01/2021', '09/06/2021', '01/06/2019', '01/14/2019',
       '01/16/2019', '01/30/2019', '02/27/2019', '03/11/2019',
       '03/29/2019', '04/02/2019', '04/06/2019', '04/05/2019',
       '04/07/2019', '04/08/2019', '05/15/2019', '05/17/2019',
       '06/06/2019', '06/11/2019', '06/19/2019', '06/22/2019',
       '06/25/2019', '07/13/2019', '07/24/2019', '07/26/2019',
       '07/28/2019', '08/09/2019', '08/21/2019', '08/29/2019',
       '09/12/2019', '09/15/2019', '09/22/2019', '10/21/2019',
       '10/30/2019', '11/03/2019', '11/05/2019', '11/13/2019',
       '11/23/2019', '12/13/2019', '12/26/2019', '12/27/2019',
       '01/01/2020', '01/21/2020', '01/29/2020', '02/11

In [61]:
def validate_Arrest_Date(data,column_name):
    """Fill missing values in ``data[column_name]`` with the string 'UNKNOWN'.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. The column is reassigned on this frame, so the
        caller's object is updated (as before) and also returned.
    column_name : str
        Name of the column whose missing values are filled.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the column filled.
    """
    # Assign the result back instead of calling an inplace method on the
    # column selection: the chained inplace form warns on recent pandas and
    # has no effect once copy-on-write is enabled.
    data[column_name] = data[column_name].fillna('UNKNOWN')
    return data
data_df = validate_Arrest_Date(data_df,'Arrest Date')

In [62]:
data_df['Arrest Id'].unique()

array(['B31678218', 'B31682790', 'B31682806', 'B31692174', 'B31692175',
       'B31692176', 'B31697030', 'B31702150', 'B31704341', 'B31714935',
       'B31714939', 'B31722870', 'B32676594', 'B32682489', 'B32682490',
       'B32682492', 'B32684534', 'B32690509', 'B32690513', 'B32692369',
       'B32698112', 'B32700799', 'B32701468', 'B33672940', 'B33678693',
       'B33679800', 'B33682164', 'B33683675', 'B33683676', 'B33683677',
       'B33683679', 'B33683680', 'B33683681', 'B33683682', 'B33683684',
       'B33683686', 'B33683687', 'B33683688', 'B33683690', 'B33683691',
       'B33695947', 'K31673178', 'K31674720', 'K31675023', 'K31677618',
       'K31677619', 'K31677806', 'K31679592', 'K31682839', 'K31685143',
       'K31688657', 'K31689355', 'K31689358', 'K31690178', 'K31690179',
       'K31690180', 'K31690185', 'K31690190', 'K31690422', 'K31690691',
       'K31697392', 'K31697975', 'K31701421', 'K31702285', 'K31702902',
       'K31703750', 'K31703762', 'K31704373', 'K31704696', 'K317

In [63]:
def validate_Arrest_id(data,column_name):
    """Fill missing values in ``data[column_name]`` with the string 'UNKNOWN'.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. The column is reassigned on this frame, so the
        caller's object is updated (as before) and also returned.
    column_name : str
        Name of the column whose missing values are filled.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the column filled.
    """
    # Assign the result back instead of calling an inplace method on the
    # column selection: the chained inplace form warns on recent pandas and
    # has no effect once copy-on-write is enabled.
    data[column_name] = data[column_name].fillna('UNKNOWN')
    return data
data_df = validate_Arrest_id(data_df,'Arrest Id')

In [64]:
ds_full_jc = stream(data_df)
profiles = ds_full_jc.profile(default_profiler=DefaultColumnProfiler)
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
Full Complaint ID,1181,0,1102,0.933108,10.055231
Complaint Year Number,1181,0,3,0.00254,1.555348
Month Number,1181,0,12,0.010161,3.529311
Record Create Date,1181,0,609,0.515665,8.973195
Complaint Precinct Code,1181,0,76,0.064352,5.960441
Patrol Borough Name,1181,0,8,0.006774,2.830716
County,1181,0,5,0.004234,1.984774
Law Code Category Description,1181,0,4,0.003387,1.093351
Offense Description,1181,0,17,0.014395,2.64196
PD Code Description,1181,0,50,0.042337,3.476055


In [71]:
# Keep a single row per 'Full Complaint ID'.
# NOTE(review): the ds_full.head() output earlier in this notebook shows one
# complaint ID repeated with different 'Arrest Id' values, so this also
# discards those additional arrest records — confirm that is intended.
data_df = data_df.drop_duplicates(subset='Full Complaint ID')

In [72]:
ds_full_jc = stream(data_df)
profiles = ds_full_jc.profile(default_profiler=DefaultColumnProfiler)
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
Full Complaint ID,1102,0,1102,1.0,10.105909
Complaint Year Number,1102,0,3,0.002722,1.554723
Month Number,1102,0,12,0.010889,3.530658
Record Create Date,1102,0,609,0.552632,9.004818
Complaint Precinct Code,1102,0,76,0.068966,5.964229
Patrol Borough Name,1102,0,8,0.00726,2.830209
County,1102,0,5,0.004537,1.981963
Law Code Category Description,1102,0,4,0.00363,1.096697
Offense Description,1102,0,17,0.015426,2.593761
PD Code Description,1102,0,50,0.045372,3.424492


In [73]:
# Export the de-duplicated frame.
# NOTE(review): to_csv writes the DataFrame index as an unnamed first column
# by default, and after drop_duplicates that index has gaps — consider
# index=False if downstream readers do not need it.
data_df.to_csv('filtered7.csv')