# NYC Crime Data Cleaning
- Subselecting columns of the initial large file
- Remove nans, clean datetimes and save to a new file for analysis.

In [18]:
import pandas as pd
import datetime as dt

In [19]:
path = 'C:/Users/Zaca/Documents/Datasets/nyc/'

In [20]:
# To save memory:
# I have already pre-preared a text file containing the columns I think are most relevant.
selected_cols = pd.read_csv(path + 'crime_selected_cols.txt')
selected_cols

Unnamed: 0,name,description,rename
0,CMPLNT_FR_DT,Exact date of occurrence for the reported event,date
1,CMPLNT_FR_TM,Exact time of occurrence for the reported event,time
2,ADDR_PCT_CD,The precinct in which the incident occurred,precinct
3,KY_CD,Three digit offense classification code,class_code
4,OFNS_DESC,Description of offense corresponding with key ...,description
5,LAW_CAT_CD,Level of offense: felony misdemeanor violation,level
6,BORO_NM,The name of the borough in which the incident ...,borough
7,PREM_TYP_DESC,Specific description of premises (grocery stor...,premises
8,Lat_Lon,Geospatial Location Point (latitude and Longit...,geo


In [47]:
crime = pd.read_csv(path + 'nypd_historic.csv', usecols=selected_cols['name'])

In [48]:
crime

Unnamed: 0,CMPLNT_FR_DT,CMPLNT_FR_TM,ADDR_PCT_CD,KY_CD,OFNS_DESC,LAW_CAT_CD,BORO_NM,PREM_TYP_DESC,Lat_Lon
0,04/10/2008,19:10:00,73.0,341,PETIT LARCENY,MISDEMEANOR,BROOKLYN,STREET,"(40.669413836, -73.91260308)"
1,06/03/2007,15:23:00,28.0,236,DANGEROUS WEAPONS,MISDEMEANOR,MANHATTAN,STREET,"(40.801978284, -73.945511151)"
2,02/16/2010,20:50:00,102.0,105,ROBBERY,FELONY,QUEENS,GROCERY/BODEGA,"(40.699990268, -73.830977746)"
3,11/10/2009,16:35:00,79.0,341,PETIT LARCENY,MISDEMEANOR,BROOKLYN,FOOD SUPERMARKET,"(40.681004729, -73.955034577)"
4,04/11/2006,09:30:00,123.0,112,THEFT-FRAUD,FELONY,STATEN ISLAND,COMMERCIAL BUILDING,
...,...,...,...,...,...,...,...,...,...
6847939,08/23/2015,02:00:00,75.0,340,FRAUDS,MISDEMEANOR,BROOKLYN,STREET,"(40.672269995, -73.875569231)"
6847940,06/15/2013,11:30:00,32.0,578,HARRASSMENT 2,VIOLATION,MANHATTAN,RESIDENCE - PUBLIC HOUSING,"(40.829882139, -73.9367581)"
6847941,07/12/2012,14:00:00,49.0,106,FELONY ASSAULT,FELONY,BRONX,STREET,"(40.846592354, -73.852913364)"
6847942,04/15/2012,19:00:00,112.0,341,PETIT LARCENY,MISDEMEANOR,QUEENS,STREET,"(40.728720422, -73.853512684)"


In [49]:
# change column names
crime.columns = selected_cols['rename']

In [50]:
# re-checking the size of our dataset
crime.shape

(6847944, 9)

In [51]:
# look at dtypes
crime.dtypes

rename
date            object
time            object
precinct       float64
class_code       int64
description     object
level           object
borough         object
premises        object
geo             object
dtype: object

In [52]:
# change to appropriate dtypes
crime.date = pd.to_datetime(crime.date, errors='coerce')

In [53]:
crime.time = pd.to_datetime(crime.time, errors='coerce').dt.hour

In [54]:
# I have tons of data, might as well just drop nas.
crime.isna().sum()
crime.dropna(inplace=True)

In [55]:
# change time to int
crime.time = crime.time.astype('int64')

In [56]:
# clean / filter data by complete years
crime = crime[(crime.date > '01-01-2007') & (crime.date < '01-01-2019')]

In [57]:
# transform precinct column to int
crime['precinct'] = crime.precinct.astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [58]:
# transform geolocations from string to float
crime['geoloc'] = crime['geo'].str.replace('(','').str.replace(')', '').str.split(', ')
crime['latitude'] = crime['geoloc'].apply(lambda x: x[0]).astype('float64')
crime['longitude'] = crime['geoloc'].apply(lambda x: x[1]).astype('float64')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [59]:
# drop columns no longer needed
crime.drop(labels=['geo', 'geoloc'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [61]:
# save to file
crime.to_csv(path + 'nypd_historic_07-18.csv')