In [1]:
# NYPD Preparation and Exploration

## Preparation
# 1 - remove columns not relevant to exercise
# 2 - Remove all data outside of range 2015-01-01 to 2019-12-31
# 3 - Remove rows with NaN, (null), UNKNOWN values
# 4 - map values like hour, severity score, offense super categories

# Output: ../data/NYPD_Complaint_Data_Historic_Cleaned_Reduced.csv

## Exploration

In [2]:
# check for required file

import os
from pathlib import Path

FILE_PATH = '../data/NYPD_Complaint_Data_Historic.csv'

file_path = Path(FILE_PATH)

# Check if the file exists
if file_path.exists():
    print(f"File found: {file_path}")
else:
    print(f"File not found: {file_path}. Please ensure the file is downloaded correctly.")



File found: ../data/NYPD_Complaint_Data_Historic.csv


In [None]:
# Load the dataset
import pandas as pd
import numpy as np

initial_df = pd.read_csv(FILE_PATH)

initial_df.info()

  initial_df = pd.read_csv(FILE_PATH)


In [None]:
# Columns to drop
columns_to_drop = ['LAW_CAT_CD', 'CMPLNT_NUM', 'CMPLNT_TO_DT', 'CMPLNT_TO_TM', 'PD_CD', 'PD_DESC', 'CRM_ATPT_CPTD_CD', 
                   'JURIS_DESC', 'JURISDICTION_CODE', 'PARKS_NM', 'HADEVELOPT', 'HOUSING_PSA', 'LOC_OF_OCCUR_DESC', 
                   'RPT_DT', 'X_COORD_CD', 'Y_COORD_CD', 'TRANSIT_DISTRICT', 'Lat_Lon', 'KY_CD', 'ADDR_PCT_CD', 'STATION_NAME',
                   'SUSP_AGE_GROUP', 'SUSP_RACE', 'SUSP_SEX', 'PATROL_BORO']

# Drop the columns
initial_df = initial_df.drop(columns=columns_to_drop)
initial_df.info()

In [None]:
initial_df.head()

In [None]:
# Remove older data
# range 2015-01-01 to 2019-12-31 is non pandemic years
# Alignment with Other Datasets

initial_df['CMPLNT_FR_DT'] = pd.to_datetime(initial_df['CMPLNT_FR_DT'], errors='coerce')

# Filter out records outside the range 2015-01-01 to 2019-12-31
initial_df = initial_df[
    (initial_df['CMPLNT_FR_DT'] >= pd.Timestamp('2015-01-01')) &
    (initial_df['CMPLNT_FR_DT'] <= pd.Timestamp('2019-12-31'))
]

print(f"Filtered data size for records between 2015 and 2019: {initial_df.shape[0]}")



initial_df = initial_df.drop(columns=['CMPLNT_FR_DT'])

initial_df.head()

In [None]:
# Replace '(null)' with NaN
initial_df.replace('(null)', np.nan, inplace=True)

# Replace 'UNKNOWN' with NaN
initial_df.replace('UNKNOWN', np.nan, inplace=True)

# Count the number of NaN values per column
nan_counts = initial_df.isna().sum()

# Print the NaN counts
print(nan_counts)

initial_df = initial_df.dropna()

# Count the number of NaN values per column
nan_counts = initial_df.isna().sum()

# Print the NaN counts
print(nan_counts)

In [None]:
print(f"Cleaned data size: {initial_df.shape}")

In [None]:
# Extract the hour from the time column using .loc to avoid the warning
initial_df.loc[:, 'Hour'] = initial_df['CMPLNT_FR_TM'].str.split(':').str[0].astype(float)
initial_df = initial_df.drop(columns=['CMPLNT_FR_TM'])

# Map Severity Score
# Map Crimes to a severity
# Read the CSV file into a DataFrame
severity_df = pd.read_csv('../mappings/offense_severity_scores.csv')

severity_dict = severity_df.set_index('Offense')['Severity_Score'].to_dict()
initial_df['OFNS_DESC_Severity_Score'] = initial_df['OFNS_DESC'].map(severity_dict)

# Print the updated DataFrame to verify
initial_df.head()

In [None]:
crime_categories = {
    "Violent Crimes": [
        "FELONY ASSAULT", "ROBBERY", "RAPE", "HOMICIDE-NEGLIGENT, UNCLASSIFIED", 
        "MURDER & NON-NEGL. MANSLAUGHTER", "HOMICIDE-NEGLIGENT-VEHICLE", "SEX CRIMES", 
        "ASSAULT 3 & RELATED OFFENSES", "KIDNAPPING & RELATED OFFENSES", "KIDNAPPING", 
        "KIDNAPPING AND RELATED OFFENSES"
    ],
    "Theft and Larceny": [
        "GRAND LARCENY", "PETIT LARCENY", "THEFT-FRAUD", "GRAND LARCENY OF MOTOR VEHICLE", 
        "PETIT LARCENY OF MOTOR VEHICLE", "THEFT OF SERVICES", "OFFENSES INVOLVING FRAUD"
    ],
    "Property Crimes": [
        "BURGLARY", "ARSON", "CRIMINAL MISCHIEF & RELATED OFFENSES", "POSSESSION OF STOLEN PROPERTY", 
        "MISCELLANEOUS PENAL LAW", "CRIMINAL TRESPASS", "UNAUTHORIZED USE OF A VEHICLE"
    ],
    "Drug and Alcohol-Related Offenses": [
        "INTOXICATED & IMPAIRED DRIVING", "INTOXICATED/IMPAIRED DRIVING", "DANGEROUS DRUGS", 
        "UNDER THE INFLUENCE OF DRUGS", "LOITERING FOR DRUG PURPOSES"
    ],
    "Public Order and Administrative Offenses": [
        "DISORDERLY CONDUCT", "LOITERING", "LOITERING/GAMBLING (CARDS, DICE)", 
        "OFFENSES AGAINST PUBLIC ADMINISTRATION", "ADMINISTRATIVE CODE", "ADMINISTRATIVE CODES", 
        "NEW YORK CITY HEALTH CODE", "AGRICULTURE & MKTS LAW-UNCLASSIFIED", "OTHER STATE LAWS", 
        "OTHER STATE LAWS (NON PENAL LAW)"
    ],
    "Fraud and Financial Crimes": [
        "FORGERY", "FRAUDS", "FRAUDULENT ACCOSTING", "ANTICIPATORY OFFENSES"
    ],
    "Weapons and Dangerous Offenses": [
        "DANGEROUS WEAPONS", "UNLAWFUL POSS. WEAP. ON SCHOOL"
    ],
    "Family and Personal Offenses": [
        "OFFENSES AGAINST THE PERSON", "OFFENSES AGAINST MARRIAGE UNCL", 
        "OFFENSES RELATED TO CHILDREN", "CHILD ABANDONMENT/NON SUPPORT"
    ],
    "Negligence and Careless Acts": [
        "ENDAN WELFARE INCOMP", "DISRUPTION OF A RELIGIOUS SERVICE"
    ],
    "Miscellaneous and Specific Offenses": [
        "GAMBLING", "FORTUNE TELLING", "ALCOHOLIC BEVERAGE CONTROL LAW", "JOSTLING", 
        "OTHER OFFENSES RELATED TO THEFT", "OFFENSES AGAINST PUBLIC SAFETY", 
        "OTHER TRAFFIC INFRACTION", "NYS LAWS-UNCLASSIFIED VIOLATION", 
        "NYS LAWS-UNCLASSIFIED FELONY", "ABORTION", "FELONY SEX CRIMES", 
        "CANNABIS RELATED OFFENSES", "HARRASSMENT 2"
    ]
}


import pandas as pd


# Function to categorize each offense
def categorize_crime(offense):
    for category, crimes in crime_categories.items():
        if offense in crimes:
            return category
    return "Other"  # for crimes not listed in the dictionary

# Apply the categorization to the 'OFNS_DESC' column
initial_df['Crime_Category'] = initial_df['OFNS_DESC'].apply(categorize_crime)

# Check the first few rows to verify the new column
initial_df.head()

In [None]:
# Count the number of NaN values per column
nan_counts = initial_df.isna().sum()

# Print the NaN counts
print(nan_counts)

In [None]:
import pandas as pd
from pathlib import Path

file_path = '../data/NYPD_Complaint_Data_Historic_Cleaned_Reduced_Mapped.csv'

initial_df.to_csv(file_path, index=False)
print(f"Data saved to {file_path}")


In [None]:
# Store data to MongoDB for later merging with Property Data
from pymongo import MongoClient

MONGO_URI = 'mongodb://localhost:27017/'
DB_NAME = 'crime_database'
COLLECTION_NAME = 'crime_data'

def populate_db():

    # Combine LATITUDE and LONGITUDE into GeoJSON format
    initial_df['location'] = initial_df.apply(lambda row: {'type': 'Point', 'coordinates': [row['Longitude'], row['Latitude']]}, axis=1)

    # Convert the DataFrame to a dictionary format
    data = initial_df.to_dict(orient='records')

    # Establish a connection to MongoDB
    client = MongoClient(MONGO_URI)

    # Specify the database and collection
    db = client[DB_NAME]
    collection = db[COLLECTION_NAME]

    # Drop the collection if it exists to start fresh
    collection.drop()

    # Insert the data into the collection
    collection.insert_many(data)

    # Create a geospatial index on the location field
    collection.create_index([('location', '2dsphere')])

    print("Data has been successfully inserted into MongoDB with geospatial indexing.")

populate_db()