In [None]:
# Library imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter
import folium
from folium.plugins import MarkerCluster
from folium.plugins import HeatMap
from folium.plugins import HeatMapWithTime
import statsmodels.api as sm
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import scipy.stats as stats
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from prophet import Prophet
from prophet.diagnostics import cross_validation, performance_metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the crime records from the CSV file into the working DataFrame
df = pd.read_csv(filepath_or_buffer='data.csv')

In [None]:
# Preview the top of the dataset; the final expression is rendered as the cell output
print("First 5 rows of the dataset:")
df.head(5)

In [None]:
# Report column dtypes and non-null counts for every column
print("\nDataframe Info:", end="\n")
df.info()

In [None]:
# Count missing entries per column
print("\nMissing Values Summary:")
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Parse both timestamp columns to datetime; values that cannot be parsed become NaT.
# 'Date' relies on format inference, 'Updated On' is parsed with an explicit 12-hour format.
updated_on_format = '%m/%d/%Y %I:%M:%S %p'
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df['Updated On'] = pd.to_datetime(df['Updated On'], errors='coerce', format=updated_on_format)

In [None]:
# Show the first rows of the two parsed timestamp columns
print("Date conversion check:")
print(df.loc[:, ['Date', 'Updated On']].head())

In [None]:
# Report (rows, columns) before any rows are removed
shape_before_cleaning = df.shape
print("Shape of dataset:", shape_before_cleaning)

In [None]:
# Drop every row that has a missing value in any column
df = df.dropna(axis=0, how='any')

In [None]:
# Report (rows, columns) once incomplete rows have been removed
shape_after_dropna = df.shape
print("Shape of dataset after dropping missing values:", shape_after_dropna)

In [None]:
# Remove columns that the analysis does not use; names absent from the frame are skipped
x = ['ID', 'Case Number', 'IUCR', 'Description', 'FBI Code', 'Beat', 'Location']
df.drop(labels=x, axis='columns', inplace=True, errors='ignore')

In [None]:
# Report (rows, columns) after the unused columns have been removed
shape_after_column_drop = df.shape
print("Shape of dataset after dropping unecessary columns:", shape_after_column_drop)

In [None]:
# Derive calendar features from the parsed 'Date' column
date_parts = df['Date'].dt
df['Month'] = date_parts.month
df['Day'] = date_parts.day
df['Hour'] = date_parts.hour
df['DayOfWeek'] = date_parts.dayofweek  # Monday=0 ... Sunday=6

In [None]:
# Derive 'Area Type' from 'Location Description': string descriptions containing
# "RESIDENCE" (case-insensitive) become "Residential"; everything else, including
# non-string values, becomes "Non-Residential"
df['Area Type'] = df['Location Description'].map(
    lambda x: "Non-Residential" if not (isinstance(x, str) and "RESIDENCE" in x.upper()) else "Residential"
)

In [None]:
# Confirm the column set and dimensions after preprocessing
preprocessed_columns = list(df.columns)
print("Preprocessed DataFrame columns:", preprocessed_columns)
print("Preprocessed DataFrame shape:", df.shape)

In [None]:
# Preview the preprocessed dataset; the final expression is rendered as the cell output
print("First 5 rows of the preprocessed dataset:")
df.head(5)

## EDA Question 1
## What are the most frequent crime types in Chicago, and how do they correlate with location (residential vs. commercial areas)?

In [None]:
# Frequency table of crime types, most frequent first, with columns
# 'Primary Type' and 'Count'
crime_counts = (
    df['Primary Type']
    .value_counts()
    .rename_axis('Primary Type')
    .reset_index(name='Count')
)

In [None]:
# Number of records for every (Primary Type, Area Type) pair, in long format
crime_by_area = (
    df.groupby(by=['Primary Type', 'Area Type'])
    .size()
    .rename('Count')
    .reset_index()
)

In [None]:
# Bar chart of the ten most frequent crime types
top_10 = crime_counts.head(10)

plt.figure(figsize=(12,6))
ax = sns.barplot(x='Primary Type', y='Count', hue='Primary Type', data=top_10,
                 palette='viridis', dodge=False)
ax.set_title('Top 10 Most Frequent Crime Types in Chicago')
ax.set_xlabel('Primary Crime Type')
ax.set_ylabel('Number of Crimes')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Grouped bar chart: the top 10 crime types split by Area Type
top_10_types = top_10['Primary Type'].tolist()
is_top_type = crime_by_area['Primary Type'].isin(top_10_types)
subset = crime_by_area.loc[is_top_type]

plt.figure(figsize=(12,6))
area_ax = sns.barplot(x='Primary Type', y='Count', hue='Area Type', data=subset, palette='magma')
area_ax.set_title('Top 10 Crime Types by Area Type (Residential vs Non-Residential)')
area_ax.set_xlabel('Primary Crime Type')
area_ax.set_ylabel('Number of Crimes')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
def extract_words(text):
    """Return the upper-cased word tokens of ``text``, in order of appearance.

    A token is a maximal run of regex word characters (``\\w``): letters,
    digits and underscore. ``text`` must be a string.
    """
    word_pattern = re.compile(r'\w+')
    return word_pattern.findall(text.upper())

# Collect every word token from the string-valued 'Location Description' entries
# (non-string entries are skipped)
all_words = [
    token
    for entry in df['Location Description']
    if isinstance(entry, str)
    for token in extract_words(entry)
]

# Tally the tokens and keep the 50 most frequent
word_freq = Counter(all_words)
top_words = word_freq.most_common(50)

print("Top 50 words in 'Location Description':")
for token, occurrences in top_words:
    print(f"{token}: {occurrences}")

In [None]:
# Residential keywords chosen from the most frequent 'Location Description' words.
# Tokenising is done by the `extract_words` helper defined in the word-frequency
# cell above; it is deliberately not re-defined here, because a second definition
# would silently shadow the first.
residential_keywords = ["RESIDENCE", "APARTMENT", "RESIDENTIAL", "RESID", "HOME", "HOUSE"]

def classify_location_by_words(desc):
    """Classify a location description as residential or not.

    Parameters
    ----------
    desc : object
        A 'Location Description' value. Only strings are tokenised.

    Returns
    -------
    str
        "Residential" if any whole-word token of ``desc`` is in
        ``residential_keywords``, "Non-Residential" for any other string,
        and "Unknown" when ``desc`` is not a string (e.g. NaN).
    """
    if not isinstance(desc, str):
        return "Unknown"
    # Whole-token matching: a keyword must appear as a complete word,
    # not merely as a substring of a longer word.
    words = set(extract_words(desc))
    if any(keyword in words for keyword in residential_keywords):
        return "Residential"
    return "Non-Residential"

# Re-derive 'Area Type' using the keyword-based classifier (replaces the earlier column)
df['Area Type'] = df['Location Description'].map(classify_location_by_words)

# Show how many records fall into each class
print("Updated Area Type Distribution:")
area_type_distribution = df['Area Type'].value_counts()
print(area_type_distribution)

In [None]:
# Rebuild the frequency table and redraw the top-10 bar chart on a log-scaled y axis
crime_counts = (
    df['Primary Type']
    .value_counts()
    .rename_axis('Primary Type')
    .reset_index(name='Count')
)
top_10 = crime_counts.head(10)

plt.figure(figsize=(12,6))
ax = sns.barplot(x='Primary Type', y='Count', hue='Primary Type', data=top_10,
                 palette='viridis', dodge=False)
ax.set_yscale("log")  # logarithmic y axis
ax.set_title('Top 10 Most Frequent Crime Types in Chicago (Log Scale)')
ax.set_xlabel('Primary Crime Type')
ax.set_ylabel('Number of Crimes (Log Scale)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
def time_of_day(hour):
    """Label an hour of the day: "Daytime" for 6 <= hour < 18, otherwise "Nighttime"."""
    if 6 <= hour and hour < 18:
        return "Daytime"
    return "Nighttime"

# Label every record as Daytime/Nighttime from its 'Hour'
df['TimeOfDay'] = df['Hour'].map(time_of_day)

In [None]:
# Show how many records fall into each time-of-day class
print("Time of Day Distribution:")
time_of_day_distribution = df['TimeOfDay'].value_counts()
print(time_of_day_distribution)

In [None]:
# Grouped bar chart: the top 10 crime types split by Daytime/Nighttime
top_10_types = top_10['Primary Type'].tolist()
subset_time = df.loc[df['Primary Type'].isin(top_10_types)]
time_grouped = (
    subset_time.groupby(by=['Primary Type', 'TimeOfDay'])
    .size()
    .rename('Count')
    .reset_index()
)

plt.figure(figsize=(12,6))
time_ax = sns.barplot(x='Primary Type', y='Count', hue='TimeOfDay', data=time_grouped, palette='coolwarm')
time_ax.set_title('Top 10 Crime Types by Time of Day (Daytime vs. Nighttime)')
time_ax.set_xlabel('Primary Crime Type')
time_ax.set_ylabel('Number of Crimes')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Cast 'District' to integers so that it is usable as the heatmap's row labels
df['District'] = df['District'].astype(int)

# District x Month table of record counts; combinations with no records are filled with 0
district_month = df.pivot_table(
    values='Primary Type',
    index='District',
    columns='Month',
    aggfunc='count',
    fill_value=0,
)

plt.figure(figsize=(12,8))
sns.heatmap(district_month, annot=True, fmt='d', cmap='YlGnBu')
plt.title('Heatmap of Crime Counts by District and Month')
plt.xlabel('Month')
plt.ylabel('District')
plt.tight_layout()
plt.show()

In [None]:
# Frequency table of crime types (columns: 'Primary Type', 'Count')
crime_counts = df['Primary Type'].value_counts().reset_index()
crime_counts.columns = ['Primary Type', 'Count']

# Names of the ten most frequent crime types
top_10_crimes = crime_counts.head(10)['Primary Type'].tolist()

# Keep only records whose type is one of the top ten
df_top10 = df[df['Primary Type'].isin(top_10_crimes)]

# Plot a random subset to avoid overplotting. The sample size is capped at the
# number of available rows, because DataFrame.sample raises ValueError when n
# exceeds the population (sampling is without replacement).
df_sample = df_top10.sample(n=min(10000, len(df_top10)), random_state=42)

# Scatter plot of the sampled records by longitude/latitude, coloured by crime type
plt.figure(figsize=(12,10))
scatter = sns.scatterplot(data=df_sample, x='Longitude', y='Latitude',
                          hue='Primary Type', palette='tab10', alpha=0.5)

plt.title('Spatial Distribution of Top 10 Crime Types in Chicago')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
# Anchor the legend outside the axes so that it does not cover the points
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
plt.tight_layout()
plt.show()

In [None]:
# Frequency table of crime types and the names of the ten most frequent ones
crime_counts = df['Primary Type'].value_counts().reset_index()
crime_counts.columns = ['Primary Type', 'Count']
top_10_crimes = crime_counts.head(10)['Primary Type'].tolist()

# Keep only records whose type is one of the top ten
df_top10 = df[df['Primary Type'].isin(top_10_crimes)]

# Map a random subset to keep the HTML small. The sample size is capped at the
# number of available rows, because DataFrame.sample raises ValueError when n
# exceeds the population (sampling is without replacement).
df_sample = df_top10.sample(n=min(10000, len(df_top10)), random_state=42)

# Base map centred on Chicago's approximate coordinates
map_center = [41.8781, -87.6298]
crime_map = folium.Map(location=map_center, zoom_start=11)

# Cluster layer that groups nearby markers
marker_cluster = MarkerCluster().add_to(crime_map)

# One marker per sampled record, with a popup describing the crime
for idx, row in df_sample.iterrows():
    lat = row['Latitude']
    lon = row['Longitude']
    crime_type = row['Primary Type']
    # Named `tod_label` rather than `time_of_day`: cell variables are notebook
    # globals, and `time_of_day` is the classifier function that builds the
    # 'TimeOfDay' column. Rebinding it to a string would break re-running that cell.
    tod_label = row['TimeOfDay']
    date_str = row['Date'].strftime("%Y-%m-%d %H:%M:%S")

    popup_text = f"Crime: {crime_type}<br>Time: {tod_label}<br>Date: {date_str}"

    folium.Marker(
        location=[lat, lon],
        popup=popup_text,
        icon=folium.Icon(color="blue", icon="info-sign")
    ).add_to(marker_cluster)

# Write the map to a standalone HTML file
crime_map.save("chicago_crime_interactive_map.html")

# Display the map as the cell output
crime_map

In [None]:
# [latitude, longitude] pairs for every record in the frame
heat_data = df[['Latitude', 'Longitude']].to_numpy().tolist()

# Base map centred on Chicago's approximate coordinates
map_all = folium.Map(zoom_start=11, location=[41.8781, -87.6298])

# Heat layer over all points (radius and blur control the smoothing)
all_points_layer = HeatMap(heat_data, radius=10, blur=15, max_zoom=1)
all_points_layer.add_to(map_all)

# Write the map to a standalone HTML file
map_all.save("chicago_crime_heatmap.html")

# Display the heatmap as the cell output
map_all

In [None]:
# Frequency table of crime types and the names of the ten most frequent ones
crime_counts = (
    df['Primary Type']
    .value_counts()
    .rename_axis('Primary Type')
    .reset_index(name='Count')
)
top_10_crimes = crime_counts.head(10)['Primary Type'].tolist()

# Base map centred on Chicago's approximate coordinates
map_top10 = folium.Map(zoom_start=11, location=[41.8781, -87.6298])

# One toggleable heat layer per top crime type
for crime in top_10_crimes:
    df_crime = df.loc[df['Primary Type'] == crime]
    heat_data = df_crime[['Latitude', 'Longitude']].to_numpy().tolist()

    # Each layer lives in its own feature group, named after the crime type
    fg = folium.FeatureGroup(name=crime)
    fg.add_child(HeatMap(heat_data, radius=10, blur=15, max_zoom=1))
    map_top10.add_child(fg)

# Layer control lets the viewer switch individual crime types on and off
folium.LayerControl().add_to(map_top10)

# Write the map to HTML and display it as the cell output
map_top10.save("chicago_top10_crime_types_heatmap.html")
map_top10

## EDA Question 2

In [None]:
# Count of records per crime type, most frequent first
unique_crime_types = df['Primary Type'].value_counts()

print("Crime Types Sorted by Frequency:", end="\n")
print(unique_crime_types)

In [None]:
# 'Primary Type' values that count as violent or property crime; anything else is "Other"
violent_types = {"BATTERY", "ASSAULT", "ROBBERY", "CRIM SEXUAL ASSAULT", "CRIMINAL SEXUAL ASSAULT", "HOMICIDE", "KIDNAPPING"}
property_types = {"THEFT", "BURGLARY", "MOTOR VEHICLE THEFT", "CRIMINAL DAMAGE", "ARSON", "DECEPTIVE PRACTICE"}

def categorize_crime(primary_type):
    """Map a 'Primary Type' string to "Violent", "Property" or "Other".

    The lookup is case-insensitive: the value is upper-cased before being
    compared with ``violent_types`` and ``property_types``.
    """
    p_type = primary_type.upper()
    for label, members in (("Violent", violent_types), ("Property", property_types)):
        if p_type in members:
            return label
    return "Other"

# Label every record with its crime category
df['Crime Category'] = df['Primary Type'].map(categorize_crime)

# Show how many records fall into each category
print("\nCrime Category Distribution:")
category_distribution = df['Crime Category'].value_counts()
print(category_distribution)

In [None]:
# Records that are either Violent or Property crimes
filtered_counts = df.loc[df['Crime Category'].isin(['Violent', 'Property'])]

# Number of records per (Month, Crime Category)
monthly_counts = (
    filtered_counts.groupby(by=['Month', 'Crime Category'])
    .size()
    .rename('Count')
    .reset_index()
)

# Line chart of the counts by calendar month, one line per category
plt.figure(figsize=(10,6))
monthly_ax = sns.lineplot(x='Month', y='Count', hue='Crime Category', data=monthly_counts, marker='o')
monthly_ax.set_title('Monthly Crime Counts by Category (Violent vs. Property)')
monthly_ax.set_xlabel('Month')
monthly_ax.set_ylabel('Number of Crimes')
plt.xticks(range(1, 13))
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Records that are either Violent or Property crimes
df_vp = df[df['Crime Category'].isin(['Violent', 'Property'])]


def _year_month_counts(frame, category):
    """Return a Year x Month table of record counts for one crime category (missing cells are 0)."""
    in_category = frame[frame['Crime Category'] == category]
    return in_category.groupby(['Year', 'Month']).size().unstack(fill_value=0)


def _show_year_month_heatmap(table, cmap, title):
    """Draw an annotated heatmap of a Year x Month count table and show it."""
    plt.figure(figsize=(12,8))
    sns.heatmap(table, annot=True, fmt='d', cmap=cmap)
    plt.title(title)
    plt.xlabel('Month')
    plt.ylabel('Year')
    plt.show()


# Violent crimes
violent_pivot = _year_month_counts(df_vp, 'Violent')
_show_year_month_heatmap(violent_pivot, 'Reds', 'Heatmap of Violent Crimes by Year and Month')

# Property crimes
property_pivot = _year_month_counts(df_vp, 'Property')
_show_year_month_heatmap(property_pivot, 'Blues', 'Heatmap of Property Crimes by Year and Month')

In [None]:
# Number of Violent/Property records per (Hour, Crime Category)
hourly_counts = (
    df_vp.groupby(by=['Hour', 'Crime Category'])
    .size()
    .rename('Count')
    .reset_index()
)

plt.figure(figsize=(12,6))
hourly_ax = sns.lineplot(x='Hour', y='Count', hue='Crime Category', data=hourly_counts, marker='o')
hourly_ax.set_title('Hourly Crime Counts by Category (Violent vs. Property)')
hourly_ax.set_xlabel('Hour of Day')
hourly_ax.set_ylabel('Number of Crimes')
plt.xticks(range(0,24))
plt.grid(True)
plt.show()

In [None]:
# Violent records, narrowed to three selected crime types
violent_sub = df.loc[df['Crime Category'] == 'Violent']
is_selected_type = violent_sub['Primary Type'].isin(['BATTERY', 'ASSAULT', 'ROBBERY'])
selected_violent = violent_sub.loc[is_selected_type]

# Number of records per (Hour, Primary Type)
hourly_violent = (
    selected_violent.groupby(by=['Hour', 'Primary Type'])
    .size()
    .rename('Count')
    .reset_index()
)

plt.figure(figsize=(12,6))
violent_ax = sns.lineplot(x='Hour', y='Count', hue='Primary Type', data=hourly_violent, marker='o')
violent_ax.set_title('Hourly Patterns for Selected Violent Crime Types')
violent_ax.set_xlabel('Hour of Day')
violent_ax.set_ylabel('Number of Crimes')
plt.xticks(range(0, 24))
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Number of Violent/Property records per (DayOfWeek, Crime Category); 0=Monday, 6=Sunday
dow_counts = (
    df_vp.groupby(by=['DayOfWeek', 'Crime Category'])
    .size()
    .rename('Count')
    .reset_index()
)

plt.figure(figsize=(12,6))
dow_ax = sns.lineplot(x='DayOfWeek', y='Count', hue='Crime Category', data=dow_counts, marker='o')
dow_ax.set_title('Day-of-Week Crime Counts by Category (Violent vs. Property)')
dow_ax.set_xlabel('Day of Week (0=Monday, 6=Sunday)')
dow_ax.set_ylabel('Number of Crimes')
plt.xticks(range(0,7))
plt.grid(True)
plt.show()

In [None]:
# Flag each record as Weekend (Saturday=5, Sunday=6) or Weekday
df['IsWeekend'] = df['DayOfWeek'].map(lambda dow: 'Weekday' if dow not in [5, 6] else 'Weekend')

# Violent records only
violent_data = df.loc[df['Crime Category'] == 'Violent']

# Number of violent records per (Hour, IsWeekend)
hourly_week = (
    violent_data.groupby(by=['Hour', 'IsWeekend'])
    .size()
    .rename('Count')
    .reset_index()
)

plt.figure(figsize=(12,6))
week_ax = sns.lineplot(x='Hour', y='Count', hue='IsWeekend', data=hourly_week, marker='o')
week_ax.set_title('Hourly Violent Crime Counts: Weekday vs. Weekend')
week_ax.set_xlabel('Hour of Day')
week_ax.set_ylabel('Number of Violent Crimes')
plt.xticks(range(0, 24))
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Build the monthly counts per (Area Type, Crime Category) that this chart plots.
# No cell in the notebook creates `res_vs_non_monthly`, so without this step the
# cell raises NameError on a fresh kernel.
# NOTE(review): the table is restricted to Violent/Property records to match the
# neighbouring charts — confirm that 'Other' crimes were meant to be excluded.
res_vs_non_monthly = (
    df[df['Crime Category'].isin(['Violent', 'Property'])]
    .groupby(['Month', 'Area Type', 'Crime Category'])
    .size()
    .reset_index(name='Count')
)

# Single series label per (Area Type, Crime Category) combination, used as the hue
res_vs_non_monthly['Combo'] = res_vs_non_monthly['Area Type'] + '_' + res_vs_non_monthly['Crime Category']

plt.figure(figsize=(12,6))
sns.lineplot(
    data=res_vs_non_monthly,
    x='Month',
    y='Count',
    hue='Combo',
    marker='o'
)
plt.title('Monthly Crime Counts by Combined Area Type and Crime Category')
plt.xlabel('Month')
plt.ylabel('Number of Crimes')
plt.xticks(range(1, 13))
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Build the monthly property-crime counts per (Area Type, TimeOfDay) that this
# chart plots. No cell in the notebook creates `monthly_area_time`, so without
# this step the cell raises NameError on a fresh kernel.
# NOTE(review): restricted to Property records because the chart title and y-label
# refer to property crimes — confirm this matches the intended analysis.
monthly_area_time = (
    df[df['Crime Category'] == 'Property']
    .groupby(['Month', 'Area Type', 'TimeOfDay'])
    .size()
    .reset_index(name='Count')
)

# Single series label per (Area Type, TimeOfDay) combination, used as the hue
monthly_area_time['AreaTime'] = monthly_area_time['Area Type'] + '_' + monthly_area_time['TimeOfDay']

plt.figure(figsize=(12,6))
sns.lineplot(
    data=monthly_area_time,
    x='Month',
    y='Count',
    hue='AreaTime',
    marker='o'  # no `style` mapping, which keeps the legend to one entry per series
)
plt.title('Monthly Property Crime Counts by Combined Area Type and Time of Day')
plt.xlabel('Month')
plt.ylabel('Number of Property Crimes')
plt.xticks(range(1,13))
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Property records only
property_data = df.loc[df['Crime Category'] == 'Property']

# Property-crime count per calendar day, then tag each day with its month
calendar_day = property_data['Date'].dt.date
daily_counts = property_data.groupby(calendar_day).size().reset_index(name='DailyCount')
daily_counts['Date'] = pd.to_datetime(daily_counts['Date'])
daily_counts['Month'] = daily_counts['Date'].dt.month

# One list of daily counts per month: these are the groups compared by the ANOVA
groups = [
    month_rows['DailyCount'].tolist()
    for _, month_rows in daily_counts.groupby('Month')
]

# One-way ANOVA across the monthly groups
f_stat, p_value = stats.f_oneway(*groups)
print("ANOVA test for daily property crime counts by month: F =", f_stat, ", p =", p_value)

In [None]:
# Rebuild the per-day property-crime counts and tag each day with its month
property_data = df.loc[df['Crime Category'] == 'Property']
calendar_day = property_data['Date'].dt.date
daily_counts = property_data.groupby(calendar_day).size().reset_index(name='DailyCount')
daily_counts['Date'] = pd.to_datetime(daily_counts['Date'])
daily_counts['Month'] = daily_counts['Date'].dt.month

# Tukey's HSD: pairwise comparison of daily counts between months at alpha = 0.05
tukey = pairwise_tukeyhsd(
    endog=daily_counts['DailyCount'],
    groups=daily_counts['Month'],
    alpha=0.05,
)
print(tukey)

In [None]:
# Split the daily property counts into two periods: before 2010, and 2010 onwards
daily_counts['Year'] = daily_counts['Date'].dt.year
pre_2010 = daily_counts.loc[daily_counts['Year'] < 2010]
post_2010 = daily_counts.loc[daily_counts['Year'] >= 2010]

# One list of daily counts per month, separately for each period
groups_pre = [rows['DailyCount'].tolist() for _, rows in pre_2010.groupby('Month')]
groups_post = [rows['DailyCount'].tolist() for _, rows in post_2010.groupby('Month')]

# One-way ANOVA across months within each period
f_stat_pre, p_value_pre = stats.f_oneway(*groups_pre)
f_stat_post, p_value_post = stats.f_oneway(*groups_post)

print("Pre-2010: F =", f_stat_pre, ", p =", p_value_pre)
print("Post-2010: F =", f_stat_post, ", p =", p_value_post)

## Predictive Modeling

In [None]:
# Number of records per calendar day, keyed by (Year, Month, Day)
daily_counts = (
    df.groupby(by=['Year', 'Month', 'Day'])
    .size()
    .rename('Crime_Count')
    .reset_index()
)

# Assemble a datetime from the three parts and derive the weekday (Monday=0, Sunday=6)
daily_counts['Date'] = pd.to_datetime(daily_counts[['Year','Month','Day']])
daily_counts['DayOfWeek'] = daily_counts['Date'].dt.dayofweek

# Calendar features predict the daily count
features = ['Year', 'Month', 'Day', 'DayOfWeek']
X = daily_counts[features]
y = daily_counts['Crime_Count']

# Random 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the XGBoost regressor on the training portion
model_xgb = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
model_xgb.fit(X_train, y_train)

# Score the held-out portion with mean squared error
y_pred = model_xgb.predict(X_test)
mse_xgb = mean_squared_error(y_test, y_pred)
print("XGBoost Mean Squared Error:", mse_xgb)

In [None]:
# Order the daily series chronologically and take the counts as a single-column array
daily_counts = daily_counts.sort_values(by='Date')
data = daily_counts['Crime_Count'].to_numpy().reshape(-1, 1)

# Min-max scale the counts to the range [0, 1]; the fitted scaler is kept for inverting later
scaler = MinMaxScaler()
scaler.fit(data)
data_scaled = scaler.transform(data)

def create_sequences(data, seq_length=7):
    """Slice a series into sliding windows and their next-step targets.

    For each start position ``i`` in ``range(len(data) - seq_length)`` the
    input window is ``data[i:i + seq_length]`` and the target is
    ``data[i + seq_length]``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(windows, targets)``. Both arrays are empty when ``data`` has no
        more than ``seq_length`` elements.
    """
    n_windows = len(data) - seq_length
    windows = [data[start:start + seq_length] for start in range(n_windows)]
    targets = [data[start + seq_length] for start in range(n_windows)]
    return np.array(windows), np.array(targets)

# Windows of 7 consecutive scaled values, each paired with the value that follows
seq_length = 7
X_seq, y_seq = create_sequences(data_scaled, seq_length)

# Chronological 80/20 split: the first 80% of windows train, the rest test
split_index = int(0.8 * len(X_seq))
X_train_seq, y_train_seq = X_seq[:split_index], y_seq[:split_index]
X_test_seq, y_test_seq = X_seq[split_index:], y_seq[split_index:]

# Single LSTM layer (50 units, ReLU) followed by a one-unit dense output
model_lstm = Sequential()
for layer in (
    LSTM(50, activation='relu', input_shape=(seq_length, 1)),
    Dense(1),
):
    model_lstm.add(layer)
model_lstm.compile(optimizer='adam', loss='mse')

# Train, with 10% of the training windows used for validation
model_lstm.fit(
    X_train_seq,
    y_train_seq,
    epochs=20,
    batch_size=32,
    validation_split=0.1,
)

# Predict the test windows and map predictions and targets back to raw counts
predictions = model_lstm.predict(X_test_seq)
predictions_inv = scaler.inverse_transform(predictions)
y_test_inv = scaler.inverse_transform(y_test_seq)

# Mean squared error in the original units
squared_errors = (predictions_inv - y_test_inv)**2
mse_lstm = np.mean(squared_errors)
print("LSTM Mean Squared Error:", mse_lstm)

In [None]:
# Number of records per calendar day, reshaped to the two columns that are
# passed to Prophet: 'ds' (date) and 'y' (count)
daily_counts = df.groupby(df['Date'].dt.date).size().reset_index(name='Crime_Count')
date_source = 'index' if 'index' in daily_counts.columns else 'Date'
daily_counts['ds'] = pd.to_datetime(daily_counts[date_source])
daily_counts = daily_counts.rename(columns={'Crime_Count': 'y'})

# Keep only the two model columns, in chronological order
daily_counts = daily_counts[['ds', 'y']].sort_values('ds')

# Fit the Prophet model
# NOTE(review): daily_seasonality=True is applied to a series with one row per
# day — confirm that a within-day component is actually wanted here.
model_prophet = Prophet(daily_seasonality=True)
model_prophet.fit(daily_counts)

# Predict over the history plus 30 further days
future = model_prophet.make_future_dataframe(periods=30)
forecast = model_prophet.predict(future)

# Forecast plot
fig1 = model_prophet.plot(forecast)
plt.title("Daily Crime Count Forecast using Prophet")
plt.xlabel("Date")
plt.ylabel("Crime Count")
plt.show()

# Component plots of the fitted model
fig2 = model_prophet.plot_components(forecast)
plt.show()

In [None]:
# Cross-validate the fitted Prophet model: 730 days of initial training data,
# a new cutoff every 180 days, and a 365-day forecast horizon per cutoff.
# `cross_validation` and `performance_metrics` come from the import cell at the
# top of the notebook, so they are not re-imported here.
df_cv = cross_validation(model_prophet, initial='730 days', period='180 days', horizon='365 days')

# Summarise the forecast errors and print the metrics table
df_metrics = performance_metrics(df_cv)
print(df_metrics)

In [None]:
# Cross-validation settings:
# - initial: length of the first training window (3 years = 1095 days)
# - period: spacing between successive cutoff dates
# - horizon: how far past each cutoff to forecast
cv_settings = {'initial': '1095 days', 'period': '180 days', 'horizon': '365 days'}
df_cv = cross_validation(model_prophet, **cv_settings)

# Error metrics computed from the cross-validation results; show the first rows
df_performance = performance_metrics(df_cv)
print(df_performance.head())

In [None]:
# Last date present in the training data: the forecast starts after it
last_date = daily_counts['ds'].max()

# Forecast plot with a dashed vertical line at the end of the historical data
fig = model_prophet.plot(forecast)
plt.title("Forecast of Daily Crime Counts in Chicago")
plt.xlabel("Date")
plt.ylabel("Daily Crime Count")
plt.axvline(x=last_date, linestyle='--', color='red', label='Forecast Start')
plt.legend()
plt.show()

In [None]:
# Component plots for the current forecast
fig_components = model_prophet.plot_components(fcst=forecast)
plt.show()

In [None]:
# Point forecast and uncertainty bounds for the last 30 rows (the forecast period)
forecast_columns = ['ds', 'yhat', 'yhat_lower', 'yhat_upper']
future_forecast = forecast.loc[:, forecast_columns].tail(30)
print(future_forecast)

In [None]:
# Extend the prediction window to 730 days (two years) past the history
future = model_prophet.make_future_dataframe(periods=730)
forecast = model_prophet.predict(future)

# Last date present in the training data: the forecast starts after it
last_date = daily_counts['ds'].max()

# Extended forecast plot with a dashed vertical line at the end of the historical data
fig = model_prophet.plot(forecast)
plt.title("Forecast of Daily Crime Counts in Chicago")
plt.xlabel("Date")
plt.ylabel("Daily Crime Count")
plt.axvline(x=last_date, linestyle='--', color='red', label='Forecast Start')
plt.legend()
plt.show()

# Final ten rows of the forecast: point estimate and bounds
print(forecast.loc[:, ['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail(10))

In [None]:
# Work on a copy so that the encoded columns are not added to `df`
clf_df = df.copy()

# Integer-encode the two categorical inputs, keeping each fitted encoder
le_primary = LabelEncoder()
le_area = LabelEncoder()
clf_df['Primary_Type_enc'] = le_primary.fit_transform(clf_df['Primary Type'])
clf_df['Area_Type_enc'] = le_area.fit_transform(clf_df['Area Type'])

# Model inputs and the target to predict
features = [
    'Primary_Type_enc',
    'Domestic',
    'District',
    'Month',
    'Hour',
    'DayOfWeek',
    'Area_Type_enc',
]
target = 'Arrest'

X = clf_df[features]
y = clf_df[target].astype(int)  # target cast to integers

# Random 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the XGBoost classifier on the training portion
model_xgb_clf = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss',
)
model_xgb_clf.fit(X_train, y_train)

# Score the held-out portion: accuracy on hard labels, ROC AUC on the
# predicted probability of the positive class
y_pred = model_xgb_clf.predict(X_test)
positive_proba = model_xgb_clf.predict_proba(X_test)[:,1]
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, positive_proba)
print("Accuracy:", accuracy)
print("ROC AUC:", roc_auc)
print(classification_report(y_test, y_pred))

# Feature importances reported by the fitted model, largest first
importances = model_xgb_clf.feature_importances_
importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
print("\nFeature Importances:")
print(importance_df)

In [None]:
# Horizontal bar chart of the classifier's feature importances
plt.figure(figsize=(10,6))
plt.barh(y=importance_df['Feature'], width=importance_df['Importance'])
plt.title('Feature Importances for Predicting Arrest')
plt.xlabel('Importance Score')
plt.gca().invert_yaxis()  # first row (highest importance) at the top
plt.show()

In [None]:
# Per crime type: number of records and number of arrests
arrest_rate_df = df.groupby('Primary Type').agg(
    total_crimes = ('Arrest', 'count'),
    arrests = ('Arrest', 'sum')
).reset_index()

# Arrest rate as a percentage of the records for that crime type
arrest_rate_df['Arrest_Rate'] = (arrest_rate_df['arrests'] / arrest_rate_df['total_crimes']) * 100

# Highest arrest rate first
arrest_rate_df.sort_values(by='Arrest_Rate', ascending=False, inplace=True)

# Show the table
print(arrest_rate_df[['Primary Type', 'total_crimes', 'arrests', 'Arrest_Rate']])

# Horizontal bar chart of the arrest rate per crime type.
# `hue` is set to the categorical variable, as in the notebook's other palette
# bar charts: seaborn 0.13+ deprecates passing `palette` without `hue`.
plt.figure(figsize=(14,8))
rate_ax = sns.barplot(data=arrest_rate_df, x='Arrest_Rate', y='Primary Type',
                      hue='Primary Type', palette='viridis', dodge=False)
# The hue repeats the y-axis labels, so remove the legend if seaborn drew one.
if rate_ax.get_legend() is not None:
    rate_ax.get_legend().remove()
plt.xlabel("Arrest Rate (%)")
plt.ylabel("Primary Crime Type")
plt.title("Arrest Rate by Crime Type")
plt.tight_layout()
plt.show()