# Seattle 911 Cleaning

In [None]:
import pandas as pd
import numpy as np
from scipy import stats

In [None]:
# Load the raw ("dirty") export and preview the first rows.
raw_csv_path = 'Seattle_Police_Department_911_Dirty.csv'
df = pd.read_csv(raw_csv_path)
df.head()

In [None]:
# Missing-value count per column, smallest first.
df.isnull().sum().sort_values()

#### CAD CDW ID

In [None]:
# 6 rows have a missing CAD CDW ID: fill each with a random 9-digit surrogate id (100000000 <= id < 1000000000)

# df['CAD CDW ID'].nunique()
# df['CAD CDW ID'].fillna(np.random.randint(100000, 1000000, 1))

def rand_id(x):
    """Return one random surrogate id in the range [100000000, 1000000000).

    ``x`` is ignored; the parameter only exists so the function can be handed
    to ``Series.apply``, which passes each element positionally.
    """
    lowest, upper_bound = 100000000, 1000000000
    drawn = np.random.randint(lowest, upper_bound, size=1)
    return drawn[0]
    
# Give every row without a CAD CDW ID its own random surrogate id.
id_missing = df['CAD CDW ID'].isna()
df.loc[id_missing, 'CAD CDW ID'] = df.loc[id_missing, 'CAD CDW ID'].apply(rand_id)
# df.loc[df['CAD CDW ID'] > 100000000]

#### CAD Event Number

In [None]:
# CAD Event Number: take the 3rd and 4th digits of 'General Offense Number', multiply by 1000, then append the remaining digits (from the 5th onward) of 'General Offense Number'
def GONtoCEN(a):
    """Derive a CAD Event Number string from a General Offense Number.

    The value is stringified; characters 3-4 are read as an integer and
    multiplied by 1000, and everything from the 5th character onward is
    appended unchanged.  Raises ValueError when characters 3-4 are not digits.
    """
    digits = str(a)
    scaled_prefix = int(digits[2:4]) * 1000
    remainder = digits[4:]
    return str(scaled_prefix) + remainder

# Coerce both number columns to numeric; any value that cannot be parsed
# becomes NaN and is treated as missing from here on.
df['General Offense Number'] = pd.to_numeric(df['General Offense Number'], errors='coerce', downcast='integer')
df['CAD Event Number'] = pd.to_numeric(df['CAD Event Number'], errors='coerce', downcast='integer')

df_copy = df.copy()

# Rebuild a CAD Event Number only where a General Offense Number exists to
# derive it from: GONtoCEN raises ValueError when handed NaN.
fillable = df_copy['CAD Event Number'].isna() & df_copy['General Offense Number'].notna()
derived = df_copy.loc[fillable, 'General Offense Number'].apply(GONtoCEN)

# Convert the derived strings to numbers *before* assigning, so no str objects
# are ever written into the numeric column.
df_copy.loc[fillable, 'CAD Event Number'] = pd.to_numeric(derived, errors='coerce')
df_copy['CAD Event Number'] = pd.to_numeric(df_copy['CAD Event Number'], errors='coerce', downcast='integer')
df = df_copy

#### General Offense Number

In [None]:
# df[df['General Offense Number'].isna()][['CAD Event Number', 'General Offense Number']].head(20)

def CENtoGEN(a):
    """Derive a General Offense Number string from a CAD Event Number.

    The result is ``'20'`` + the first two characters of the stringified
    input + everything from the 6th character onward, i.e. the three
    characters after the two-character prefix are dropped.

    NOTE(review): this assumes the 11-digit layout produced by ``GONtoCEN``
    for two-digit prefixes >= 10; shorter values are not zero-padded.
    """
    # Use the argument itself; a hard-coded sample value here previously made
    # every call return the same number.
    a = str(a)
    b = '20' + a[:2] + a[5:]
    return b

df_copy = df.copy()

# Rebuild missing General Offense Numbers from the CAD Event Number.
gon_missing = df_copy['General Offense Number'].isna()
rebuilt = df_copy.loc[gon_missing, 'CAD Event Number'].apply(CENtoGEN)

# Convert to numbers before assigning so no str objects are written into the
# numeric column; values that cannot be parsed stay NaN.
df_copy.loc[gon_missing, 'General Offense Number'] = pd.to_numeric(rebuilt, errors='coerce')
df_copy['General Offense Number'] = pd.to_numeric(df_copy['General Offense Number'], errors='coerce', downcast='integer')
df = df_copy

In [None]:
# Note: If Series or Index does not contain NaN values the resultant dtype will be bool, otherwise, an object dtype
# s1.str.contains('og', na=False, regex=True)

#### Event Clearance Code

In [None]:
# Reference table: every distinct, fully populated combination of clearance
# code, description, subgroup and group that occurs in the data.
clearance_columns = ['Event Clearance Code',
                     'Event Clearance Description',
                     'Event Clearance SubGroup',
                     'Event Clearance Group']

df_code_reference = (df.dropna(subset=clearance_columns)[clearance_columns]
                       .drop_duplicates()
                       .reset_index(drop=True))

# df_code_reference.set_index('Event Clearance Code', inplace=True)
df_code_reference.head(1000)

In [None]:
# 39 rows have no Event Clearance Code.  Try to recover it by left-joining the
# code reference table on the other three descriptor columns.
descriptor_keys = ['Event Clearance Description',
                   'Event Clearance SubGroup',
                   'Event Clearance Group']

df_temp = df.copy().merge(df_code_reference, on=descriptor_keys, how='left')

# The join can emit more than one row per incident when the reference holds
# several matches; keep the first row for each CAD CDW ID.
df_temp.drop_duplicates(subset=['CAD CDW ID'], inplace=True)

# '_x' is the incident's own code, '_y' the one looked up in the reference.
code_missing = df_temp['Event Clearance Code_x'].isna()
df_temp.loc[code_missing, 'Event Clearance Code_x'] = df_temp.loc[code_missing, 'Event Clearance Code_y']

df_temp = (df_temp.drop(columns=['Event Clearance Code_y'])
                  .rename(columns={'Event Clearance Code_x': 'Event Clearance Code'}))


In [None]:
# Missing-value count per column after the code lookup.
df_temp.isnull().sum()

In [None]:
# df_temp.loc[df_temp['Event Clearance Code'].isna()]
# Three rows still have no Event Clearance Code; set them by hand,
# keyed on CAD CDW ID.
manual_codes = {15750: 250, 15753: 430, 15756: 245}
for cad_cdw_id, clearance_code in manual_codes.items():
    df_temp.loc[df_temp['CAD CDW ID'] == cad_cdw_id, ['Event Clearance Code']] = clearance_code
df = df_temp

#### Event Clearance Description, Event Clearance SubGroup, Event Clearance Group

In [None]:
# Normalise Description / SubGroup / Group from the code reference table,
# looked up by Event Clearance Code.
df_temp = pd.merge(df.copy(),
                   df_code_reference,
                   on='Event Clearance Code',
                   how='left')

# The join can emit more than one row per incident when a code has several
# reference rows; keep the first row for each CAD CDW ID.
df_temp.drop_duplicates(subset=['CAD CDW ID'], inplace=True)

descriptor_columns = ['Event Clearance Group',
                      'Event Clearance SubGroup',
                      'Event Clearance Description']

for column in descriptor_columns:
    # '_x' is the incident's own value, '_y' the reference value.  Overwrite
    # only where the reference actually supplied a value, so a code that is
    # absent from the reference table does not wipe an existing descriptor
    # with NaN.
    has_reference = df_temp[column + '_y'].notnull()
    df_temp.loc[has_reference, column + '_x'] = df_temp.loc[has_reference, column + '_y']

df_temp.drop([column + '_y' for column in descriptor_columns], axis=1, inplace=True)
df_temp.rename(columns={column + '_x': column for column in descriptor_columns}, inplace=True)

df = df_temp

In [None]:
# Row/column count, then the missing-value count per column.
row_count, column_count = df.shape
print((row_count, column_count))
df.isnull().sum()

In [None]:
# Store the identifier/code columns as 64-bit integers.
integer_columns = ['CAD CDW ID',
                   'CAD Event Number',
                   'General Offense Number',
                   'Event Clearance Code']
df = df.astype(dict.fromkeys(integer_columns, 'int64'))

#### Event Clearance Date

In [None]:
# Parse the clearance timestamps; values that cannot be parsed become NaT.
df['Event Clearance Date'] = pd.to_datetime(df['Event Clearance Date'], errors='coerce')

# CAD CDW ID 24335 is given a fixed 2010-01-01 date (presumably the row the
# coercion above left as NaT - verify against the data).
# pd.Timestamp replaces the pd.datetime alias, which was removed in pandas 2.0.
df.loc[df['CAD CDW ID'] == 24335, 'Event Clearance Date'] = pd.Timestamp(2010, 1, 1)

In [None]:
# Set the year to 2010 for every date whose year is <= 2000 or > 2020
# (per the original note, 2010 is chosen based on the General Offense Number).

df_temp = df.copy()
clearance_years = df_temp['Event Clearance Date'].map(lambda stamp: stamp.year)
year_out_of_range = (clearance_years <= 2000) | (clearance_years > 2020)

corrected_dates = df_temp.loc[year_out_of_range, 'Event Clearance Date'].apply(
    lambda stamp: stamp.replace(year=2010))
df_temp.loc[year_out_of_range, 'Event Clearance Date'] = corrected_dates
df = df_temp

In [None]:
# Missing-value count per column after the date fixes.
df.isnull().sum()

#### Location Data

In [None]:
# The 'Incident Location' column is not kept: drop it in place.
df.drop(columns=['Incident Location'], inplace=True)


In [None]:
# Location columns still to be cleaned:
#   Hundred Block Location, District/Sector, Zone/Beat, Census Tract
# Number of distinct values in the first three.
location_columns = ['Hundred Block Location', 'District/Sector', 'Zone/Beat']
df[location_columns].nunique()

In [None]:
# Turn placeholder strings into real missing values.
# - np.nan is used because the np.NaN alias was removed in NumPy 2.0.
# - The result is assigned back to the column: calling replace(..., inplace=True)
#   on a column selection is not guaranteed to write through to df.
df['District/Sector'] = df['District/Sector'].replace('99', np.nan)
df['Zone/Beat'] = df['Zone/Beat'].replace('99', np.nan)
df['Census Tract'] = df['Census Tract'].replace('NULL', np.nan)

df.isna().sum().sort_values()

In [None]:
# Renumber the rows 0..n-1, discarding the old index, then display the frame.
df = df.reset_index(drop=True)
df

#### Drop Insufficient Columns

In [None]:
# Columns dropped under the "Drop Insufficient Columns" heading.
insufficient_columns = ['Initial Type Description',
                        'Initial Type Subgroup',
                        'Initial Type Group',
                        'At Scene Time']
df.drop(columns=insufficient_columns, inplace=True)

In [None]:
from geopy.geocoders import Nominatim #Free
# from geopy.geocoders import GoogleV3  #Paid
from geopy.extra.rate_limiter import RateLimiter

# geolocator = GoogleV3(api_key=google_key)
geolocator = Nominatim(user_agent="seattle_911_cleaning",timeout=1)

# Smoke test: reverse-geocode a single incident.  Latitude and longitude must
# come from the SAME row, otherwise the looked-up point is not a real
# incident location.
sample_row = df.iloc[12]
location = geolocator.reverse('%f,%f' % (sample_row['Latitude'], sample_row['Longitude']))

# reverse() returns None when the service has no result for the point.
print(location.raw.get('address') if location is not None else None)

# Hundred Block Location:'House number' Block of 'road'

#### Fill Address from Coordinates for missing Hundred Block Location

In [None]:
# df_copy[['Hundred Block Location', 'address']].head(12)

geolocator = Nominatim(user_agent="seattle_911_cleaning",timeout = 2)
# Rate-limit the lookups: at least 2 seconds between consecutive calls.
reverse = RateLimiter(geolocator.reverse, min_delay_seconds = 2)

def geodecode(la, ln):
    """Reverse-geocode a latitude/longitude pair through the rate limiter.

    Returns whatever the rate-limited ``reverse`` call returns for the
    "lat,lon" string, requested in English; this can be None when no result
    is available.
    """
    return reverse((str(la)+','+str(ln)), language='en')

df_copy = df.copy()
df_copy['address'] = None

# Rows whose Hundred Block Location is missing or lacks the word BLOCK.
needs_address = ~df_copy['Hundred Block Location'].str.contains('BLOCK', na=False)

# Write straight into df_copy (not into a filtered slice of it) so the
# looked-up addresses are stored on the frame that is used afterwards.
for index, row in df_copy.loc[needs_address].iterrows():
    if row[['Latitude', 'Longitude']].isna().any():
        # No coordinates to look up; leave the address empty.
        continue
    location = geodecode(row['Latitude'], row['Longitude'])
    if location is not None:
        df_copy.at[index, 'address'] = location.address

# See all rows where Hundred Block Location doesn't contain the word BLOCK.
# This inspection has to run after 'address' exists on the rebuilt df_copy.
df_copy.loc[needs_address, ['Hundred Block Location', 'address']]

In [None]:
# Wherever the Hundred Block Location lacks the word BLOCK (or is missing),
# replace it with the geocoded address.
lacks_block = ~df_copy['Hundred Block Location'].str.contains('BLOCK', na=False)
df_copy.loc[lacks_block, 'Hundred Block Location'] = df_copy.loc[lacks_block, 'address']

In [None]:
# Remaining missing values per column, smallest first.
df_copy.isnull().sum().sort_values()

### Leave 
Census Tract                     18
<br>
District/Sector                  31
<br>
Zone/Beat                        43

In [None]:
# df_copy.loc[df_copy['District/Sector'].isna() & df_copy['Zone/Beat'].isna() & df_copy['District/Sector'].isna()]

In [None]:
# The helper 'address' column is no longer needed: drop it in place.
df_copy.drop(columns=['address'], inplace=True)

In [None]:
# Write the cleaned data set to disk without the row index.
clean_csv_path = 'Seattle_Police_Department_911_Clean.csv'
df_copy.to_csv(clean_csv_path, index=False)