# CMPD Car Theft Project

## By: Connor , Waqas, Issam , Rishabh
---




### Project Scope: To predict the 'Risk' an area has for vehicle theft



### Importing required libraries and data

In [75]:
# Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [76]:
# Loading the 2011-2016 CMPDData

# 2011 Incident Data
incident_2011 = pd.read_csv("2011_Incident.csv")

# 2012 Incident Data
incident_2012 = pd.read_csv("2012_Incident.csv")

# 2013 Incident Data
incident_2013 = pd.read_csv("2013_Incident.csv")

# 2014 Incident Data
incident_2014 = pd.read_csv("2014_Incident.csv")

# 2015 Incident Data
incident_2015 = pd.read_csv("2015_Incident.csv")

# 2016 Incident Data
incident_2016 = pd.read_csv("2016_Incident.csv")


## Exploratory Data Analysis (EDA)

Merging all CMPD Data (2011-2016)

In [77]:
# merging all the data sets into one
all_incident_df = pd.concat([incident_2011, incident_2012, incident_2013, incident_2014, incident_2015, incident_2016], ignore_index=True)


In [78]:
all_incident_df.head()

Unnamed: 0,Complaint_No,Block_No,Direction,Street_Name,Street_Type,Suffix,Apt_No,City,State,Zipcode,...,Clearance_Status,Clearance_Date,Case_Status,Reporting_Agency,Follow_up_Section,NIBRS_Hi_Class,Incident_From_Time,Incident_to_Time,Unnamed: 28,Unnamed: 29
0,20110101000308,4425,,EDDLEMAN,RD,,,CHARLOTTE,NC,28208.0,...,Normal Clearance - Cleared by Arrest ...,1/1/2011,Close/Cleared,Charlotte-Mecklenburg Police Department,27-FREEDOM DIVISION ...,Drug Equipment Violations ...,,,,
1,20110101000700,2228,,BEATTIES FORD,RD,,,CHARLOTTE,NC,28216.0,...,Normal Clearance - Cleared by Arrest ...,1/1/2011,Close/Cleared,Charlotte-Mecklenburg Police Department,02-METRO DIVISION ...,Aggravated Assault ...,,,,
2,20110101001104,2300,N,TRYON,ST,,,CHARLOTTE,NC,,...,Open ...,,Inactive,Charlotte-Mecklenburg Police Department,02-METRO DIVISION ...,Damage/Vandalism Of Property ...,,,,
3,20110101001302,4027,,QUAIL GLENN,CT,,K,CHARLOTTE,NC,28226.0,...,Open ...,,Inactive,Charlotte-Mecklenburg Police Department,22-SOUTH DIVISION ...,Burglary/B&E ...,,,,
4,20110101002401,150,S,COLLEGE,ST,,,CHARLOTTE,NC,28202.0,...,Normal Clearance - Cleared by Arrest ...,1/1/2011,Close/Cleared,Charlotte-Mecklenburg Police Department,01-CENTRAL DIVISION ...,Affray ...,,,,


In [79]:
all_incident_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 566789 entries, 0 to 566788
Data columns (total 30 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Complaint_No         566789 non-null  int64  
 1   Block_No             566789 non-null  object 
 2   Direction            566789 non-null  object 
 3   Street_Name          566789 non-null  object 
 4   Street_Type          566788 non-null  object 
 5   Suffix               566789 non-null  object 
 6   Apt_No               566789 non-null  object 
 7   City                 566789 non-null  object 
 8   State                566789 non-null  object 
 9   Zipcode              566789 non-null  object 
 10  X_Coordinate         566789 non-null  object 
 11  Y_Coordinate         566789 non-null  object 
 12  Division             566789 non-null  object 
 13  Reported_Date        566789 non-null  object 
 14  Location_Desc        566788 non-null  float64
 15  Location_Type    

In [80]:
print(all_incident_df.columns)

Index(['Complaint_No', 'Block_No', 'Direction', 'Street_Name', 'Street_Type',
       'Suffix', 'Apt_No', 'City', 'State', 'Zipcode', 'X_Coordinate',
       'Y_Coordinate', 'Division', 'Reported_Date', 'Location_Desc',
       'Location_Type', 'Incident_From_Date', 'Incident_to_date', 'Place1',
       'Place2', 'Clearance_Status', 'Clearance_Date', 'Case_Status',
       'Reporting_Agency', 'Follow_up_Section', 'NIBRS_Hi_Class',
       ' Incident_From_Time', 'Incident_to_Time', 'Unnamed: 28',
       'Unnamed: 29'],
      dtype='object')


We need to combine 'Direction', 'Street Name' and 'Street Type' into 1 column called 'Street Address'

In [81]:
# Combining Direction, Street_Name, Street_Type into Street_Address
all_incident_df.loc[:, 'Street_Address'] = all_incident_df['Direction'] + ' ' + all_incident_df['Street_Name'] + ' ' + all_incident_df['Street_Type']


We can get rid of all the extra columns that are not very relevant to our project.


In [82]:
# Adjusting the columns we want in our dataset
columns_to_keep = ['Street_Address','Zipcode','X_Coordinate','Y_Coordinate','NIBRS_Hi_Class']

all_incident_df = all_incident_df[columns_to_keep]

In [83]:
all_incident_df.head()

Unnamed: 0,Street_Address,Zipcode,X_Coordinate,Y_Coordinate,NIBRS_Hi_Class
0,EDDLEMAN RD,28208.0,1433698,551674,Drug Equipment Violations ...
1,BEATTIES FORD RD,28216.0,1446676,556631,Aggravated Assault ...
2,N TRYON ST,,1456850,549120,Damage/Vandalism Of Property ...
3,QUAIL GLENN CT,28226.0,1453488,502939,Burglary/B&E ...
4,S COLLEGE ST,28202.0,1449765,542246,Affray ...


In [84]:
# Displaying unique values in the 'NIBRS_Hi_Class' column
print(all_incident_df['NIBRS_Hi_Class'].unique())


['Drug Equipment Violations                                                                           '
 'Aggravated Assault                                                                                  '
 'Damage/Vandalism Of Property                                                                        '
 'Burglary/B&E                                                                                        '
 'Affray                                                                                              '
 'All Other Offenses                                                                                  '
 'Simple Assault                                                                                      '
 'Missing Person                                                                                      '
 'Drug/Narcotic Violations                                                                            '
 'Indecent Exposure                                             

In [85]:
# filtering by 'Motor Vehicle Theft'
all_incident_df['NIBRS_Hi_Class'] = all_incident_df['NIBRS_Hi_Class'].str.strip()
vehicle_df = all_incident_df[all_incident_df['NIBRS_Hi_Class'].str.contains('Motor Vehicle Theft', case=False)]

vehicle_df.head(15)

Unnamed: 0,Street_Address,Zipcode,X_Coordinate,Y_Coordinate,NIBRS_Hi_Class
84,INTERURBAN AV,28208.0,1431508,554267,Motor Vehicle Theft
89,SPRINGMONT LN,28208.0,1427405,557580,Motor Vehicle Theft
100,SCOTT FUTRELL DR,28208.0,1425917,546914,Motor Vehicle Theft
103,LISBON LN,28269.0,1463794,564750,Motor Vehicle Theft
110,PERIMETER STATION DR,28269.0,1447340,585770,Motor Vehicle Theft
113,JORDANS POND LN,28214.0,1423483,566774,Motor Vehicle Theft
148,E INDEPENDENCE BV,28212.0,1477862,522932,Motor Vehicle Theft
149,N CALDWELL ST,28202.0,1451678,542269,Motor Vehicle Theft
160,E 7 TH ST,,1451249,542901,Motor Vehicle Theft
165,MAGNOLIA HILL DR,28205.0,1470918,543000,Motor Vehicle Theft


We have some empty values under 'Zipcode' , 'X_Coordinate' , and 'Y_Coordinate' that we have to deal with:
 * Most of them are empty strings and whitespaces.

* Idealy we could use the X & Y coordinates to fill in the missing zip codes. Or fill in the missing information by using the street address. However this would take some time and we are on a bit of a time crunch, so we chose to remove rows entirely with missing data.

* Although this gives us less data to work with, we'd rather have accurate data that will work well with modeling than tons of information that will cause problems.




In [86]:
# Remove rows with NaN values
vehicle_df = vehicle_df.dropna()

# Remove rows with empty strings
vehicle_df = vehicle_df[~vehicle_df.apply(lambda row: row.str.strip().eq('').any(), axis=1)]

# Checking the cleaned DataFrame
print(vehicle_df)
vehicle_df.head(15)

# Creating a combined csv
vehicle_df.to_csv('Vehicle_Theft.csv', index=False)

                                        Street_Address Zipcode X_Coordinate  \
84         INTERURBAN                               AV   28208      1431508   
89         SPRINGMONT                               LN   28208      1427405   
100        SCOTT FUTRELL                            DR   28208      1425917   
103        LISBON                                   LN   28269      1463794   
110        PERIMETER STATION                        DR   28269      1447340   
...                                                ...     ...          ...   
566624     BEATTIES FORD                            RD   28216      1446074   
566626     NEVILLE ABBEY                            DR   28262      1483602   
566631     BREEZEWOOD                               DR   28262      1479544   
566668     BENNETTSVILLE                            LN   28262      1463389   
566755     HERRIN                                   AV   28205      1464791   

       Y_Coordinate       NIBRS_Hi_Class  
84      

In [87]:
# Group data by street address and calculate the count of vehicle theft incidents per street
street_theft_counts = vehicle_df['Street_Address'].value_counts().reset_index()
street_theft_counts.columns = ['Street_Address', 'Theft_Count']

# Displaying the counts for each address
print(street_theft_counts.head)

<bound method NDFrame.head of                                       Street_Address  Theft_Count
0     N  TRYON                                    ST          215
1     E  INDEPENDENCE                             BV          129
2        CENTRAL                                  AV          125
3        SOUTH                                    BV          113
4        ALBEMARLE                                RD          110
...                                              ...          ...
3215     ARDREY STEAD                             CT            1
3216     WATERS TRAIL                             DR            1
3217     MALIBU                                   DR            1
3218     WESLEY VILLAGE                           RD            1
3219     HETHERSETT                               LN            1

[3220 rows x 2 columns]>


## Machine Learning Model (In Progress)

In [88]:
vehicle_df = pd.read_csv("Vehicle_Theft.csv")
vehicle_df.head()

Unnamed: 0,Street_Address,Zipcode,X_Coordinate,Y_Coordinate,NIBRS_Hi_Class
0,INTERURBAN AV,28208,1431508,554267,Motor Vehicle Theft
1,SPRINGMONT LN,28208,1427405,557580,Motor Vehicle Theft
2,SCOTT FUTRELL DR,28208,1425917,546914,Motor Vehicle Theft
3,LISBON LN,28269,1463794,564750,Motor Vehicle Theft
4,PERIMETER STATION DR,28269,1447340,585770,Motor Vehicle Theft


In [89]:
vehicle_df.shape

(10957, 5)

In [90]:
# Group by Street Adress and calculate the total number of theft incidents
theft_counts = vehicle_df.groupby('Street_Address')['NIBRS_Hi_Class'].count().reset_index()
theft_counts.rename(columns={'NIBRS_Hi_Class': 'Theft_Incident_Count'}, inplace=True)

#print(theft_counts.head(10))

print(theft_counts.head(15).sort_values(by='Theft_Incident_Count', ascending=False))


                                    Street_Address  Theft_Incident_Count
10     ACADEMY                                  ST                     8
4      ABBEYDALE                                DR                     6
14     ADMIRAL                                  AV                     4
1                                            77 IN                     3
2      A                                        AV                     2
6      ABELWOOD                                 RD                     2
7      ABERCROMBY                               ST                     2
0      26TH                                     ST                     1
3      ABBEY COURT                              DR                     1
5      ABBOTSBURY                               CT                     1
8      ABERDEEN                                 ST                     1
9      ABERDEEN GLEN                            PL                     1
11     ACADIAN WOODS                            DR 

In [91]:
# finding mean theft count
theft_counts.mean()

Theft_Incident_Count    3.402795
dtype: float64

In [92]:
# Calculate vehicle theft per capita (assuming a default population value)
# This value is based on 2016 Charlotte Population
default_population = 843989
theft_counts['Vehicle_Theft_Per_Capita'] = theft_counts['Theft_Incident_Count'] / default_population

In [93]:
# Define a threshold for classifying as 'HighRisk' (this can be later adjusted)
# Adjust this threshold as needed
threshold = 4

In [94]:
# Create the 'HighRisk' column based on the threshold
theft_counts['HighRisk'] = theft_counts['Vehicle_Theft_Per_Capita'] > threshold

In [95]:
# Merge the 'HighRisk' column back into original vehicle_df
vehicle_df = vehicle_df.merge(theft_counts[['Street_Address', 'HighRisk']], on='Street_Address', how='left')

vehicle_df['HighRisk'] = vehicle_df['HighRisk'].astype(int)

# vehicle_df.head(20)
vehicle_df.head(15).sort_values(by='Street_Address', ascending=False)


Unnamed: 0,Street_Address,Zipcode,X_Coordinate,Y_Coordinate,NIBRS_Hi_Class,HighRisk
9,S COLLEGE ST,28202,1449355,542073,Motor Vehicle Theft,0
7,N CALDWELL ST,28202,1451678,542269,Motor Vehicle Theft,0
6,E INDEPENDENCE BV,28212,1477862,522932,Motor Vehicle Theft,0
11,THRIFTWOOD DR,28208,1431910,553890,Motor Vehicle Theft,0
1,SPRINGMONT LN,28208,1427405,557580,Motor Vehicle Theft,0
12,SETHS DR,28269,1461640,570178,Motor Vehicle Theft,0
2,SCOTT FUTRELL DR,28208,1425917,546914,Motor Vehicle Theft,0
10,SCALEYBARK RD,28209,1443247,526701,Motor Vehicle Theft,0
4,PERIMETER STATION DR,28269,1447340,585770,Motor Vehicle Theft,0
8,MAGNOLIA HILL DR,28205,1470918,543000,Motor Vehicle Theft,0


In [96]:
high_risk_rows = vehicle_df[vehicle_df['HighRisk'] == 1]
print(high_risk_rows)

Empty DataFrame
Columns: [Street_Address, Zipcode, X_Coordinate, Y_Coordinate, NIBRS_Hi_Class, HighRisk]
Index: []


In [97]:
# Save the updated dataset
vehicle_df.to_csv('your_updated_dataset.csv', index=False)

## Model training and testing

In [98]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

In [99]:
# Extract the first part of the Zipcode (assuming Zipcodes are in format 'XXXXX-XXXX' or 'XXXXX')
vehicle_df['Zipcode'] = vehicle_df['Zipcode'].str.extract(r'^(\d{5})', expand=False)

In [100]:
# Define features (location data) and target variable
# 'Latitude'= Y and 'Longitude'= X
features = ['Y_Coordinate', 'X_Coordinate']
target = 'HighRisk'  # Create a binary 'HighRisk' column where 1 indicates high risk and 0 indicates low risk


In [101]:
# Split data into training and testing sets 80 : 20
X = vehicle_df[features]
y = vehicle_df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [102]:
# Checking if data has been split correctly
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (8765, 2)
Shape of X_test: (2192, 2)
Shape of y_train: (8765,)
Shape of y_test: (2192,)


In [103]:
# Create and train Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

In [104]:
# Make predictions
y_pred = clf.predict(X_test)

In [105]:
# Add a "HighClass" column based on your threshold
vehicle_df['Risk_Prediction'] = clf.predict(X)  # Assuming clf is your trained model

vehicle_df

Unnamed: 0,Street_Address,Zipcode,X_Coordinate,Y_Coordinate,NIBRS_Hi_Class,HighRisk,Risk_Prediction
0,INTERURBAN AV,28208,1431508,554267,Motor Vehicle Theft,0,0
1,SPRINGMONT LN,28208,1427405,557580,Motor Vehicle Theft,0,0
2,SCOTT FUTRELL DR,28208,1425917,546914,Motor Vehicle Theft,0,0
3,LISBON LN,28269,1463794,564750,Motor Vehicle Theft,0,0
4,PERIMETER STATION DR,28269,1447340,585770,Motor Vehicle Theft,0,0
...,...,...,...,...,...,...,...
10952,BEATTIES FORD RD,28216,1446074,561136,Motor Vehicle Theft,0,0
10953,NEVILLE ABBEY DR,28262,1483602,593837,Motor Vehicle Theft,0,0
10954,BREEZEWOOD DR,28262,1479544,586486,Motor Vehicle Theft,0,0
10955,BENNETTSVILLE LN,28262,1463389,569930,Motor Vehicle Theft,0,0


In [106]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [107]:
# Print accuracy and classification report
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_rep)

Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2192

    accuracy                           1.00      2192
   macro avg       1.00      1.00      1.00      2192
weighted avg       1.00      1.00      1.00      2192

