# CMPD Car Theft Project

## By: Connor, Waqas, Issam, Rishabh
---




### Importing required libraries and data

In [2]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
!pip uninstall pillow
!pip install pillow
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

ImportError: DLL load failed while importing _imaging: The specified module could not be found.

In [None]:
# Loading the yearly incident files (2011-2016), one DataFrame per year
yearly_incident_files = (
    "2011_Incident.csv",
    "2012_Incident.csv",
    "2013_Incident.csv",
    "2014_Incident.csv",
    "2015_Incident.csv",
    "2016_Incident.csv",
)

# Read the files in order and bind each result to its per-year name
(
    incident_2011,
    incident_2012,
    incident_2013,
    incident_2014,
    incident_2015,
    incident_2016,
) = (pd.read_csv(csv_name) for csv_name in yearly_incident_files)


## Exploratory Data Analysis (EDA)

Merging all CMPD Data (2011-2016)

In [None]:
# Stack the six yearly incident tables into one frame with a fresh 0..n-1 index
yearly_incidents = [
    incident_2011,
    incident_2012,
    incident_2013,
    incident_2014,
    incident_2015,
    incident_2016,
]
vehicle_df = pd.concat(yearly_incidents, ignore_index=True)


In [None]:
# Preview the first rows of the merged 2011-2016 incident table
vehicle_df.head()

In [None]:
# Look at the two coordinate columns on their own
vehicle_df[['X_Coordinate','Y_Coordinate']]

In [None]:
# Summarise the merged table: column dtypes and non-null counts
vehicle_df.info()

In [None]:
# List every column name so we can decide which ones to keep
print(vehicle_df.columns)

We need to combine the 'Direction', 'Street_Name' and 'Street_Type' columns into a single column called 'Street_Address'.

In [None]:
# Combining Direction, Street_Name, Street_Type into Street_Address
# Plain `+` turns the whole address into NaN as soon as one part is missing, and those
# rows would then be thrown away by the later dropna(). Fill missing parts with ''
# before joining instead.
address_parts = ['Direction', 'Street_Name', 'Street_Type']
vehicle_df['Street_Address'] = (
    vehicle_df[address_parts]
    .fillna('')
    .astype(str)
    .agg(' '.join, axis=1)
    # split/join collapses repeated spaces and trims the ends, so a blank part leaves no
    # stray whitespace; an address with no parts at all becomes '' and is removed by the
    # empty-string cleanup further down
    .str.split()
    .str.join(' ')
)


We can get rid of all the extra columns that are not very relevant to our project.


In [None]:
# Keep only the columns this project uses: address, zip code, coordinates, offence class
columns_to_keep = [
    'Street_Address',
    'Zipcode',
    'X_Coordinate',
    'Y_Coordinate',
    'NIBRS_Hi_Class',
]

vehicle_df = vehicle_df.loc[:, columns_to_keep]

In [None]:
# Preview the table after trimming it down to the columns of interest
vehicle_df.head()

In [None]:
# Displaying unique values in the 'NIBRS_Hi_Class' column
# (shows the labels available to filter on in the next step)
print(vehicle_df['NIBRS_Hi_Class'].unique())


In [None]:
# filtering by 'Motor Vehicle Theft'
# Strip stray whitespace from the class labels. assign() returns a new frame rather than
# writing into a slice of the earlier column selection.
vehicle_df = vehicle_df.assign(NIBRS_Hi_Class=vehicle_df['NIBRS_Hi_Class'].str.strip())

# na=False counts rows with a missing class as "no match"; without it the mask contains
# NaN and boolean indexing raises ValueError.
is_vehicle_theft = vehicle_df['NIBRS_Hi_Class'].str.contains(
    'Motor Vehicle Theft', case=False, na=False
)
vehicle_df = vehicle_df[is_vehicle_theft]

vehicle_df.head(15)

We have some empty values under 'Zipcode', 'X_Coordinate', and 'Y_Coordinate' that we have to deal with:
 * Most of them are empty strings or whitespace.

* Ideally, we would use the X & Y coordinates to fill in the missing zip codes, or fill in the missing information from the street address. However, this would take some time and we are on a bit of a time crunch, so we chose to remove rows with missing data entirely.

* Although this gives us less data to work with, we'd rather have accurate data that will work well with modeling than tons of information that will cause problems.




In [None]:
# Treat empty / whitespace-only strings as missing so a single dropna() removes both
# NaN rows and blank-string rows (vectorised, instead of a Python lambda on every row)
vehicle_df = vehicle_df.replace(r'^\s*$', np.nan, regex=True).dropna()

# The coordinate columns contained blank strings, so they may have been read as text.
# Convert them to numbers so the later .mean() and model fitting work on numeric values;
# anything that still cannot be parsed becomes NaN and is dropped.
coordinate_cols = ['X_Coordinate', 'Y_Coordinate']
vehicle_df = vehicle_df.assign(
    **{col: pd.to_numeric(vehicle_df[col], errors='coerce') for col in coordinate_cols}
).dropna(subset=coordinate_cols)

# Creating a combined csv
vehicle_df.to_csv('Vehicle_Theft.csv', index=False)

# Checking the cleaned DataFrame: report its size, then show a sample as the cell output
# (head() must be the last expression to be rendered)
print(vehicle_df.shape)
vehicle_df.head(15)

## Visualizations

In [None]:
import folium
import geopandas as gpd

In [None]:
# Centre the base map on the mean theft location, passed as [Y, X]
# NOTE(review): folium expects latitude/longitude in degrees — confirm the
# X_Coordinate / Y_Coordinate columns are in that unit and not a projected system.
map_center = [vehicle_df['Y_Coordinate'].mean(), vehicle_df['X_Coordinate'].mean()]
m = folium.Map(location=map_center, zoom_start=10)


In [None]:
# Drop one marker per theft onto the map, using the offence class as the hover tooltip
theft_points = zip(
    vehicle_df['Y_Coordinate'],
    vehicle_df['X_Coordinate'],
    vehicle_df['NIBRS_Hi_Class'],
)
for y_coord, x_coord, offence in theft_points:
    folium.Marker([y_coord, x_coord], tooltip=offence).add_to(m)

# Write the finished map to an HTML file
m.save('vehicle_theft_map.html')

## Machine Learning Model (In Progress)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Features: the two coordinate columns (Y first, then X)
feature_cols = ['Y_Coordinate', 'X_Coordinate']
X = vehicle_df[feature_cols]

# Target variable: the zip code of each incident
y = vehicle_df['Zipcode']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a random forest on the training split (seeded so results are repeatable)
clf = RandomForestClassifier(random_state=42)
clf.fit(X=X_train, y=y_train)

In [None]:
# Score the held-out split: predicted labels, overall accuracy, per-class report
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
classification_rep = classification_report(y_true=y_test, y_pred=y_pred)

In [None]:
# Show the accuracy, then the full per-class breakdown, one item per line
print(
    f"Accuracy: {accuracy}",
    "Classification Report:",
    classification_rep,
    sep="\n",
)

In [None]:
# Predictive Mapping
# Load the shapefile whose features act as the grid of locations to score
# (swap in a different shapefile here if needed)
# NOTE(review): this points at the .shx index file; shapefiles are normally opened
# through the .shp — confirm the reader accepts this path.
grid = gpd.read_file('Census_Tracts_2020.shx')


In [None]:
# Use the model to predict likelihood of vehicle theft for each location in the grid
# clf was fitted on columns named 'Y_Coordinate' / 'X_Coordinate', so present the grid
# coordinates under those names (same order) to satisfy scikit-learn's feature-name check.
# NOTE(review): assumes grid has 'Latitude' / 'Longitude' columns in the same units as the
# training coordinates — confirm against the shapefile's attribute table.
grid_features = grid[['Latitude', 'Longitude']].rename(
    columns={'Latitude': 'Y_Coordinate', 'Longitude': 'X_Coordinate'}
)

# FIXME: clf is a classifier whose target is 'Zipcode', so column 1 of predict_proba is the
# probability of the second label in clf.classes_, not a theft likelihood. The model needs
# a theft / no-theft (or theft-count) target before this column means what its name says.
grid['predicted_theft_likelihood'] = clf.predict_proba(grid_features)[:, 1]


In [None]:
# Heatmaps and Visualizations
# Colour each grid feature by its predicted value on a single 10x10 inch axes
fig, ax = plt.subplots(figsize=(10, 10))
grid.plot(
    column='predicted_theft_likelihood',
    cmap='coolwarm',
    linewidth=0.8,
    legend=True,
    ax=ax,
)
ax.set_title("Predicted Likelihood of Vehicle Theft")
plt.show()