<a href="https://colab.research.google.com/github/sujialagar/crime-prediction-on-various-occurance/blob/main/Final_project04.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Import libraries

import pandas as pd
from datetime import datetime

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [4]:
# import data
data = pd.read_csv('/content/Crime_Data_from_2020_to_Present.csv')
data.head()

Unnamed: 0,division_number,date_reported,date_occurred,area,area_name,reporting_district,part,crime_code,crime_description,modus_operandi,...,status,status_description,crime_code_1,crime_code_2,crime_code_3,crime_code_4,location,cross_street,latitude,longitude
0,10304468,2020-01-08,2020-01-08 22:30:00,3,Southwest,377,2,624,BATTERY - SIMPLE ASSAULT,0444 0913,...,AO,Adult Other,624.0,,,,1100 W 39TH PL,,34.0141,-118.2978
1,190101086,2020-01-02,2020-01-01 03:30:00,1,Central,163,2,624,BATTERY - SIMPLE ASSAULT,0416 1822 1414,...,IC,Invest Cont,624.0,,,,700 S HILL ST,,34.0459,-118.2545
2,200110444,2020-04-14,2020-02-13 12:00:00,1,Central,155,2,845,SEX OFFENDER REGISTRANT OUT OF COMPLIANCE,1501,...,AA,Adult Arrest,845.0,,,,200 E 6TH ST,,34.0448,-118.2474
3,191501505,2020-01-01,2020-01-01 17:30:00,15,N Hollywood,1543,2,745,VANDALISM - MISDEAMEANOR ($399 OR UNDER),0329 1402,...,IC,Invest Cont,745.0,998.0,,,5400 CORTEEN PL,,34.1685,-118.4019
4,191921269,2020-01-01,2020-01-01 04:15:00,19,Mission,1998,2,740,"VANDALISM - FELONY ($400 & OVER, ALL CHURCH VA...",0329,...,IC,Invest Cont,740.0,,,,14400 TITUS ST,,34.2198,-118.4468


In [5]:
data.shape

(852950, 27)

In [6]:
# Data Cleaning Process

# Checking for missing values in each column
missing_values = data.isnull().sum()


In [7]:
# Checking for duplicate entries
duplicate_rows = data.duplicated().sum()

In [8]:
# Analyzing the data types of each column to see if any conversions are needed
data_types = data.dtypes

missing_values, duplicate_rows, data_types

(division_number             0
 date_reported               0
 date_occurred               0
 area                        0
 area_name                   0
 reporting_district          0
 part                        0
 crime_code                  0
 crime_description           0
 modus_operandi         118311
 victim_age                  0
 victim_sex             112606
 victim_descent         112614
 premise_code               10
 premise_description       518
 weapon_code            556202
 weapon_description     556202
 status                      0
 status_description          0
 crime_code_1               11
 crime_code_2           790429
 crime_code_3           850837
 crime_code_4           852888
 location                    0
 cross_street           717289
 latitude                    0
 longitude                   0
 dtype: int64,
 0,
 division_number          int64
 date_reported           object
 date_occurred           object
 area                     int64
 area_name      

In [9]:
# Replace missing values
data.fillna("Unknown", inplace=True)

# Convert date columns to datetime
data['date_reported'] = pd.to_datetime(data['date_reported'])
data['date_occurred'] = pd.to_datetime(data['date_occurred'])

  data.fillna("Unknown", inplace=True)


In [10]:
data.isnull().sum()

Unnamed: 0,0
division_number,0
date_reported,0
date_occurred,0
area,0
area_name,0
reporting_district,0
part,0
crime_code,0
crime_description,0
modus_operandi,0


In [11]:
# Replacing invalid ages (0 and negative values) with NaN
data['victim_age'] = data['victim_age'].apply(lambda x: x if x > 0 else None)
data['victim_age'].replace(0, pd.NA, inplace=True)

# Create a histogram for the distribution of victim age
fig = px.histogram(data, x='victim_age', nbins=30, color_discrete_sequence=['dodgerblue'])

# Updating layout for the plot
fig.update_layout(
    title='Distribution of Victim Age',
    xaxis_title='Victim Age',
    yaxis_title='Frequency',
    bargap=0.2,
    template='plotly_white'
)

# Display the plot
fig.show()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['victim_age'].replace(0, pd.NA, inplace=True)


In [12]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Extract year from the date column
data['year'] = data['date_occurred'].dt.year

# Count crimes per year
yearly_crime_counts = data['year'].value_counts().sort_index()

# Create the figure
fig = go.Figure()

# Add bar plot for yearly crime distribution
fig.add_trace(go.Bar(x=yearly_crime_counts.index,
                     y=yearly_crime_counts.values,
                     marker_color='dodgerblue',
                     name="Yearly Crime Count"))

# Update layout
fig.update_layout(
    title="Yearly Crime Distribution",
    xaxis_title="Year",
    yaxis_title="Number of Crimes",
    template="plotly_white",
    height=500,
    width=600
)

# Show plot
fig.show()


In [13]:
# Extracting hour from the 'date_occurred' column
data['hour'] = data['date_occurred'].dt.hour

# Counting crimes by each hour of the day
hourly_crime_counts = data['hour'].value_counts().sort_index()

# Plotting the distribution of crimes by hour
fig = px.bar(x=hourly_crime_counts.index, y=hourly_crime_counts.values, labels={'x': 'Hour of the Day (0-23)', 'y': 'Number of Crimes'}, color_discrete_sequence=['dodgerblue'])

# Updating layout for the plot
fig.update_layout(
    title='Crime Distribution by Hour of the Day',
    template='plotly_white',
    showlegend=False
)

fig.update_xaxes(
    tickmode='array',
    tickvals=list(range(24)),
    ticktext=[str(hour) for hour in range(24)]
)

# Display the plot
fig.show()

In [14]:
# Top 10 most common crime descriptions (excluding 'Unknown')
top_crimes = data[data['crime_description'] != 'Unknown']['crime_description'].value_counts().head(10)

# Top 10 most common weapons (excluding 'Unknown', 'UNKNOWN WEAPON/OTHER WEAPON')
top_weapons = data[~data['weapon_description'].isin(['Unknown', 'UNKNOWN WEAPON/OTHER WEAPON'])]['weapon_description'].value_counts().head(10)

# Setting up the figure with two subplots
fig = make_subplots(rows=2, cols=1, subplot_titles=('Top 10 Crime Descriptions', 'Top 10 Weapons Used in Crimes'))

# Horizontal bar chart for top 10 crime descriptions
fig.add_trace(
    go.Bar(x=top_crimes.values, y=top_crimes.index, orientation='h', marker_color='dodgerblue'),
    row=1, col=1
)

# Horizontal bar chart for top 10 weapons used
fig.add_trace(
    go.Bar(x=top_weapons.values, y=top_weapons.index, orientation='h', marker_color='coral'),
    row=2, col=1
)

# Update layout for the charts
fig.update_layout(
    height=800,
    showlegend=False,
    template='plotly_white',
    title_text="Top 10 Crime Descriptions and Weapons Used in Crimes"
)

# Inverting y-axis for both plots to display the highest value at the top
fig.update_yaxes(autorange="reversed", row=1, col=1)
fig.update_yaxes(autorange="reversed", row=2, col=1)

# Update x-axis titles
fig.update_xaxes(title_text="Number of Cases", row=1, col=1)
fig.update_xaxes(title_text="Number of Cases", row=2, col=1)

# Display the plot
fig.show()

In [15]:
# Add case_solved column
data['case_solved'] = data['status_description'].apply(lambda x: 'Not solved' if x == 'Invest Cont' else 'Solved')

# Data preparation for victim_sex plot
sex_solved_counts = data.groupby(['victim_sex', 'case_solved']).size().unstack()
sex_solved_percent = sex_solved_counts.div(sex_solved_counts.sum(axis=1), axis=0) * 100

# Data preparation for crime_description plot
crime_solved_counts = data.groupby(['crime_description', 'case_solved']).size().unstack()
crime_solved_percent = crime_solved_counts.div(crime_solved_counts.sum(axis=1), axis=0) * 100
crime_solved_percent_sorted = crime_solved_percent.sort_values(by='Solved', ascending=False)

# Data preparation for area_name plot
area_solved_counts = data.groupby(['area_name', 'case_solved']).size().unstack()
area_solved_percent = area_solved_counts.div(area_solved_counts.sum(axis=1), axis=0) * 100
area_solved_percent_sorted = area_solved_percent.sort_values(by='Solved', ascending=False)

fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Percentage of Cases by Victim Sex and Resolution Status',
                    'Percentage of Cases by Area Name and Resolution Status',
                    'Percentage of Cases by Crime Description and Resolution Status'),
    specs=[[{"type": "bar"}, {"type": "bar"}],
           [{"type": "bar", "colspan": 2}, None]],
    horizontal_spacing=0.15, vertical_spacing=0.2
)

# Plot for Victim Sex
fig.add_trace(
    go.Bar(x=sex_solved_percent.index, y=sex_solved_percent['Solved'], name='Solved', marker_color='dodgerblue'),
    row=1, col=1
)
fig.add_trace(
    go.Bar(x=sex_solved_percent.index, y=sex_solved_percent['Not solved'], name='Not solved', marker_color='salmon'),
    row=1, col=1
)

# Plot for Area Name
fig.add_trace(
    go.Bar(x=area_solved_percent_sorted.index, y=area_solved_percent_sorted['Solved'], name='Solved', marker_color='dodgerblue', showlegend=False),
    row=1, col=2
)
fig.add_trace(
    go.Bar(x=area_solved_percent_sorted.index, y=area_solved_percent_sorted['Not solved'], name='Not solved', marker_color='salmon', showlegend=False),
    row=1, col=2
)

# Plot for Crime Description
fig.add_trace(
    go.Bar(x=crime_solved_percent_sorted.index, y=crime_solved_percent_sorted['Solved'], name='Solved', marker_color='dodgerblue', showlegend=False),
    row=2, col=1
)
fig.add_trace(
    go.Bar(x=crime_solved_percent_sorted.index, y=crime_solved_percent_sorted['Not solved'], name='Not solved', marker_color='salmon', showlegend=False),
    row=2, col=1
)

# Update layout for the charts
fig.update_layout(
    height=800,
    barmode='stack',
    title_text="Case Resolution Status by Victim Sex, Area Name, and Crime Description",
    template='plotly_white'
)

# Update y-axis titles
fig.update_yaxes(title_text="Percentage of Cases (%)", row=1, col=1)
fig.update_yaxes(title_text="Percentage of Cases (%)", row=1, col=2)
fig.update_yaxes(title_text="Percentage of Cases (%)", row=2, col=1)

fig.update_xaxes(showticklabels=False, row=2, col=1)

fig.show()

In [16]:
#hotspots location
district_crime_counts = data.groupby('area').agg(
    latitude=('latitude', 'mean'), longitude=('longitude', 'mean'), counts=('area', 'size')
).reset_index()

# Map Plotting
fig = px.scatter_mapbox(district_crime_counts,
                        lat="latitude",
                        lon="longitude",
                        size="counts",
                        color="counts",
                        hover_name="area",
                        color_continuous_scale="jet",
                        hover_data=["counts", "latitude", "longitude",],
                        zoom=9,
                        height=750, width=1_200,
                        title="Map of Crime Counts by District")
fig.update_layout(mapbox_style="open-street-map")
fig.show()


In [17]:
# Calculate reported_delay and days_after_reported
today = datetime.now()
data['reported_delay'] = (data['date_reported'] - data['date_occurred']).dt.days
data['reported_delay'] = data['reported_delay'].apply(lambda x: x if x >= 0 else 0)
data['days_after_reported'] = (today - data['date_reported']).dt.days

In [18]:
data[['date_reported', 'date_occurred', 'reported_delay', 'days_after_reported']].head()


Unnamed: 0,date_reported,date_occurred,reported_delay,days_after_reported
0,2020-01-08,2020-01-08 22:30:00,0,1886
1,2020-01-02,2020-01-01 03:30:00,0,1892
2,2020-04-14,2020-02-13 12:00:00,60,1789
3,2020-01-01,2020-01-01 17:30:00,0,1893
4,2020-01-01,2020-01-01 04:15:00,0,1893


In [19]:
# Preparing the target variable and converting it to binary
target = 'case_solved'
le = LabelEncoder()
data[target] = le.fit_transform(data[target])

# Features for the model
features = ['area', 'crime_code', 'victim_sex', 'victim_descent', 'weapon_code', 'hour', 'reported_delay', 'days_after_reported']

# Explicitly convert all categorical features to strings
for feature in ['victim_sex', 'victim_descent', 'weapon_code']:
    data[feature] = data[feature].astype(str)

# Encoding categorical features
label_encoders = {}
for feature in ['victim_sex', 'victim_descent', 'weapon_code']:
    le = LabelEncoder()
    data[feature] = le.fit_transform(data[feature])
    label_encoders[feature] = le

# Display the first few rows of the data to verify the encoding
data[features].head()

Unnamed: 0,area,crime_code,victim_sex,victim_descent,weapon_code,hour,reported_delay,days_after_reported
0,3,624,1,2,61,22,0,1886
1,1,624,3,7,62,3,0,1892
2,1,845,5,19,79,12,60,1789
3,15,745,1,18,79,17,0,1893
4,19,740,5,19,79,4,0,1893


In [20]:
# Preparing the data
X = data[features]  # Features
y = data[target]    # Target

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [21]:
# Standardizing the features (important for logistic regression)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [22]:
# Logistic Regression Model
log_reg = LogisticRegression()
log_reg.fit(X_train_scaled, y_train)


In [23]:
# Making predictions and evaluating the models
log_reg_pred = log_reg.predict(X_test_scaled)
print("Logistic Regression Accuracy:", accuracy_score(y_test, log_reg_pred))
# You can also print out classification reports for more detailed performance analysis
print("\nLogistic Regression Classification Report:\n", classification_report(y_test, log_reg_pred))

Logistic Regression Accuracy: 0.7897024053774157

Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.97      0.88    204840
           1       0.32      0.05      0.08     51045

    accuracy                           0.79    255885
   macro avg       0.56      0.51      0.48    255885
weighted avg       0.71      0.79      0.72    255885



In [24]:
# Random Forest Classifier Model
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train)


In [25]:
# Making predictions and evaluating the models
rf_clf_pred = rf_clf.predict(X_test)
print("Random Forest Classifier Accuracy:", accuracy_score(y_test, rf_clf_pred))
# You can also print out classification reports for more detailed performance analysis
print("\nRandom Forest Classifier Classification Report:\n", classification_report(y_test, rf_clf_pred))



Random Forest Classifier Accuracy: 0.8289387810930692

Random Forest Classifier Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.93      0.90    204840
           1       0.60      0.42      0.49     51045

    accuracy                           0.83    255885
   macro avg       0.73      0.68      0.70    255885
weighted avg       0.81      0.83      0.82    255885



In [31]:
#data loading
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from datetime import datetime  # Importing datetime for calculations

# Load the dataset
data = pd.read_csv('/content/Crime_Data_from_2020_to_Present.csv')

# Handle missing values
data.fillna("Unknown", inplace=True)

# Encode categorical variables
label_encoders = {}
# Using correct column names for crime and weapon descriptions
categorical_features = ['area', 'crime_description', 'victim_sex', 'victim_descent', 'weapon_description']
for feature in categorical_features:
    le = LabelEncoder()
    data[feature] = le.fit_transform(data[feature])
    label_encoders[feature] = le

# Creating the missing features
data['date_occurred'] = pd.to_datetime(data['date_occurred'])
data['time_of_crime'] = data['date_occurred'].dt.hour
# Assuming 'date_reported' and 'date_occurred' columns are datetime objects
data['date_reported'] = pd.to_datetime(data['date_reported'])
data['reported_delay'] = (data['date_reported'] - data['date_occurred']).dt.days
data['reported_delay'] = data['reported_delay'].apply(lambda x: x if x >= 0 else 0)
today = datetime.now()
data['days_after_reported'] = (today - data['date_reported']).dt.days

# Define features and target, now including the newly created features
features = ['area', 'crime_description', 'victim_sex', 'victim_descent', 'weapon_description', 'time_of_crime', 'reported_delay', 'days_after_reported']
X = data[features]
# Instead of 'case_solved', use 'status_description' and map values to create 'case_solved'
data['case_solved'] = data['status_description'].apply(lambda x: 0 if x == 'Invest Cont' else 1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'Unknown' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.



In [33]:
#save model
import joblib

# Save the models
joblib.dump(log_reg, 'log_reg_model.pkl')
joblib.dump(rf_clf, 'rf_clf_model.pkl')

# Save the preprocessing objects
joblib.dump(label_encoders, 'label_encoders.pkl')
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

In [42]:
from google.colab import files
files.download("log_reg_model.pkl")  # Or 'rf_clf_model.pkl'

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [50]:
from google.colab import files
files.download("rf_clf_model.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [49]:
import joblib
from google.colab import files

# Assuming 'scaler' is your StandardScaler object
# Save the scaler
joblib.dump(scaler, 'scaler.pkl')

# Download the scaler file
files.download('scaler.pkl')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [44]:
import joblib

# Create a dictionary to hold all components
crime_prediction_model = {
    'log_reg_model': log_reg,
    'rf_clf_model': rf_clf,
    'label_encoders': label_encoders,
    'scaler': scaler
}

# Save the dictionary to a single file
joblib.dump(crime_prediction_model, 'crime_prediction_model.pkl')

['crime_prediction_model.pkl']

In [46]:
from google.colab import files
files.download("crime_prediction_model.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>