In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)

IMPORTING DATA

In [None]:
# Raw string: the Windows path contains backslashes, and '\D' / '\C' are invalid
# escape sequences in a normal string literal (a warning on recent Python versions).
# The resulting path is unchanged.
data = pd.read_csv(r'E:\Downloads\Crime_Data_from_2020_to_Present.csv')


In [None]:
# Call head() to preview the first rows; `print(data.head)` only printed the
# bound method object without calling it.
data.head()

In [None]:
data.dtypes

1. Both the date reported and the date occurred are stored as strings. Converting them to datetime objects (in case we are going to use the data for predictive modeling).
2. The date reported and the date occurred both carry a dummy time (12:00:00 AM) along with the date, which isn't useful, so removing the time from the date occurred column.
3. The time the crime occurred is stored in a separate column; converting that from military format to standard format.


In [None]:
# Parse the 'MM/DD/YYYY hh:mm:ss AM/PM' strings into timestamps ...
occurred_on = pd.to_datetime(data['DATE OCC'], format='%m/%d/%Y %I:%M:%S %p')

# ... and keep only the calendar date (the time part is a placeholder).
data['DATE OCC'] = occurred_on.dt.date

# Show the converted column
print(data['DATE OCC'])


In [None]:
# converted the string time to datetime object with only the date leaving the dummy time information

type(data['DATE OCC'][0])

As the time occurred is stored in military time, there are bound to be values with only one or two digits. For example, if a crime occurred at 12:01 AM, its military time is 0001, which might cause problems when converting from military to standard format.

In [None]:


# Frequency of TIME OCC values whose text form has fewer than three characters
is_short_time = data['TIME OCC'].astype(str).str.len() < 3
data.loc[is_short_time, 'TIME OCC'].value_counts()

1. As I thought, all the single- and double-digit values represent the minutes past 12 AM. If we look carefully, the numbers do not exceed 59, as they represent minutes.
2. So we need to convert the `data['TIME OCC']` column from integer to string and pad the values with zeroes, so that we can convert the military time to standard format without any errors.

In [None]:
# Military times are stored as integers, so leading zeros are lost
# (0001 is stored as 1, 0130 as 130). Convert to text and left-pad EVERY value
# to four digits. Padding only values shorter than three characters would leave
# three-digit times such as '130' untouched, and '%H%M' parses '130' as 13:00
# instead of 01:30.
data['TIME OCC'] = data['TIME OCC'].astype(str).str.zfill(4)


In [None]:
# Military time (HHMM) -> 12-hour clock text such as '01:30 PM'.
# First parse the HHMM strings, then render them as 'hh:mm AM/PM'.
parsed_time = pd.to_datetime(data['TIME OCC'], format='%H%M')
data['TIME OCC'] = parsed_time.dt.strftime('%I:%M %p')

# Show the converted column
print(data['TIME OCC'])


In [None]:
# Merge the occurrence date and time into a single timestamp column.
# DATE OCC is turned back into a datetime column first.
data['DATE OCC'] = pd.to_datetime(data['DATE OCC'])

# Text of the form 'YYYY-MM-DD hh:mm AM/PM' ...
date_and_time_text = data['DATE OCC'].dt.strftime('%Y-%m-%d').str.cat(data['TIME OCC'], sep=' ')

# ... parsed into the combined timestamp
data['DATETIME_OCC'] = pd.to_datetime(date_and_time_text, format='%Y-%m-%d %I:%M %p')

# Show the combined column
print(data['DATETIME_OCC'])

In [None]:
data.head()

Can we make use of Mocodes ? 

For the analysis I am trying to perform, Mocodes wouldn't be of much help: a crime can have any motive, and a motive can lead to any kind of crime. To make it easier, I will be using the Crm Code description, which gives us an idea of what the crime is about. For now I will be discarding the Mocodes column, as some observations have 1 value and some have as many as 10. If I were to one-hot encode (OHE) it, the data would be too sparse.

In [None]:
data['Mocodes'].value_counts()

`Mocodes` have 295296 unique values, which would be difficult to identify the category the crimes come under

In [None]:
data['Crm Cd Desc'].value_counts()

But `Crm Cd Desc` has only 139 unique values, which makes it a lot easier to categorize the crimes.

In [None]:
data['Premis Desc'].value_counts()

In [None]:
data['Premis Cd'].value_counts()

In [None]:
data['Weapon Desc'].value_counts()

Leaving the columns with description as it's easier to figure out rather than the codes

In [None]:
data.columns

In [None]:
data['Cross Street'].isna().value_counts()

`Cross Street` has mostly null values so dropping it.

In [None]:
data['LOCATION'].value_counts()

In [None]:
data['AREA NAME'].value_counts()

In [None]:
data['Rpt Dist No'].value_counts()

1. Dropping `DATE OCC` and `TIME OCC`, as we have a combined column in standard format.
2. Dropping crime codes 1, 2, 3 and 4, as `Crm Cd` has the code for the crime committed and the other crime codes are less serious.


In [None]:

# Columns that are not needed for the analysis
columns_to_drop = [
    'DR_NO', 'Part 1-2', 'Premis Cd', 'Weapon Used Cd',
    'Status Desc', 'Cross Street', 'Crm Cd 1', 'Crm Cd 2',
    'Crm Cd 3', 'Crm Cd 4', 'Crm Cd', 'AREA', 'DATE OCC', 'TIME OCC', 'Mocodes',
]
data = data.drop(columns=columns_to_drop)

In [None]:
data.columns

In [None]:
data.head()

In [None]:
data[data['Weapon Desc'].isna()]

1. There are a lot of observations where the weapon used is not known. For some crimes a weapon doesn't make sense at all. For example, theft of identity can happen in any way, either offline or online. We will fill the NaNs with the string `Unknown`.

In [None]:
# filling the missing weapon values as unknown
# as the same description already exists, I am using the same text
data['Weapon Desc'] = data['Weapon Desc'].fillna('UNKNOWN WEAPON/OTHER WEAPON')

In [None]:
data['Weapon Desc'].isna().value_counts()

In [None]:
data.head()

Why do we have Age as zero ?

In [None]:
# How many rows have a victim age of zero (True) versus any other age (False)
data['Vict Age'].eq(0).value_counts()

In [None]:
data[data['Vict Age'] == 0]

In [None]:
# Crime descriptions of the rows whose victim age is zero
data.loc[data['Vict Age'] == 0, 'Crm Cd Desc'].value_counts()

In [None]:
data['Vict Sex'] = data['Vict Sex'].fillna('X')

In [None]:
# X indicates unknown 
data['Vict Descent'] = data['Vict Descent'].fillna('X')

In [None]:
data['Vict Descent'].value_counts()

What should we do about the Age being 0 ? Most likely 0 would be representing the Age as Unknown

In [None]:
data[data['Vict Age'] == 0]

In [None]:
data.isnull().sum()

We have 550 observations that do not have the premise where the crime occurred. Let's fill them with `Unknown`.

In [None]:
data['Premis Desc'] = data['Premis Desc'].fillna('Unknown')

In [None]:
data.isnull().sum()

In [None]:
# some records have 0 for coordinates, will fill them in the later part when plotting maps.
data[data['LON'] == 0]

In [None]:
data.head()

1. LA is divided into 21 geographical areas
2. Each area has sub divisions called districts
3. Also have locations which are present in the districts.
4. LAT, LON to pinpoint the crime location (nearest 100th block to preserve privacy)

# Analysis

## **Temporal Analysis**


### What is the trend in the number of reported crimes over time (Date Rptd)?

In [None]:
data.dtypes

In [None]:
# Calendar month and year of each occurrence, used to group crimes per month of each year
occurred_at = data['DATETIME_OCC'].dt
data['month'] = occurred_at.month
data['year'] = occurred_at.year

In [None]:
# Only keep 2020 to 2023, as those years are complete. Later cells assume exactly
# these four years, so filter on the range instead of excluding 2024 alone
# (which would let any later year through).
# .copy() makes the result an independent frame, so the columns added to `data`
# further down do not trigger SettingWithCopyWarning.
data = data[data['year'].between(2020, 2023)].copy()

In [None]:
crime_count = data.groupby(['year', 'month'])['DATETIME_OCC'].count().reset_index()


In [None]:
# renaming the column to crimes

crime_count = crime_count.rename(columns={'DATETIME_OCC':'crimes'})

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

months = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']

plt.figure(figsize=(14, 8))
# Line plot: one line per year, months along the x-axis
ax = sns.lineplot(data=crime_count, x="month", y="crimes", hue="year", palette='tab10', linewidth=2.5)

# Axis labels and title
ax.set_xlabel('Month', fontsize=16)
ax.set_ylabel('Number of Crimes', fontsize=16)
ax.set_title('Crimes by Month for each year (2020-2023)', fontsize=16)

# Month abbreviations instead of the numbers 1-12
plt.xticks(range(1, 13), months, fontsize=16)
plt.yticks(fontsize=16)
plt.tight_layout()
# Legend placed outside the axes, to the right
plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0, fontsize=16)
sns.despine()
plt.show()

In [None]:
# Filter the dataset for the years 2020 to 2023
crime_data_2020 = data[data['year'] == 2020]
crime_data_2021 = data[data['year'] == 2021]
crime_data_2022 = data[data['year'] == 2022]
crime_data_2023 = data[data['year'] == 2023]

# Calculate the total number of crimes for each year
total_crimes_2020 = len(crime_data_2020)
total_crimes_2021 = len(crime_data_2021)
total_crimes_2022 = len(crime_data_2022)
total_crimes_2023 = len(crime_data_2023)

# Total number of crimes by year
years = [2020, 2021, 2022, 2023]
total_crimes = [total_crimes_2020, total_crimes_2021, total_crimes_2022, total_crimes_2023]

# Create the line plot
plt.figure(figsize=(12, 8))
ax = sns.lineplot(x=years, y=total_crimes, marker='o', linewidth=2.5)

# Add labels and title
plt.xlabel('Year')
plt.ylabel('Total Number of Crimes')
plt.xticks(range(min(years), max(years) + 1, 1))
# Annotate every point with its total. The loop variable must not be called
# `crime_count`: that name holds the monthly DataFrame built earlier, and
# reusing it here would replace the DataFrame with an integer.
for plot_year, yearly_total in zip(years, total_crimes):
    plt.annotate(f'{yearly_total}', (plot_year, yearly_total), textcoords="offset points", xytext=(0,10), ha='center')
plt.title('Total Number of Crimes by Year (2020-2023)')


plt.show()

In [None]:
"""import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 8))

# Create the horizontal bar plot with Seaborn
ax = sns.barplot(x="crimes", y="month", hue="year", data=crime_count, orient='h')

# Add labels to the bars
for container in ax.containers:
    ax.bar_label(container, fmt='%.0f', padding=3,fontsize=8)

months = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']

plt.xlabel('Number of Crimes', fontsize=12)
plt.yticks(range(len(months)), months)
plt.ylabel('Month', fontsize=12)
plt.title('Monthly Crime Trends Over the Years', fontsize=12)
plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)
plt.tight_layout()
sns.despine()
plt.show()
"""

In [None]:
crime_count

In [None]:
#### Is there any seasonality in the occurrence of crimes based on the date of occurrence (DATE OCC)?

In [None]:
!pip install --upgrade holidays

In [None]:
from datetime import date 
import holidays as hds

In [None]:
# getting holiday for California state, USA
us_ca_holidays = hds.country_holidays('US', subdiv='CA')

In [None]:
# Define function to check if date is a holiday
def is_holiday(date):
    """Return True when the calendar day of `date` is contained in `us_ca_holidays`.

    `date` must provide a `.date()` method (the function calls it to drop the
    time part before the lookup).
    """
    calendar_day = date.date()
    return calendar_day in us_ca_holidays

# Apply function to each row in DATETIME_OCC column
data['is_holiday'] = data['DATETIME_OCC'].apply(is_holiday)

In [None]:
# Define a function to check if a given date is a Saturday or Sunday
def is_weekend(date):
    """Return True when `date` falls on a Saturday or Sunday.

    `date` must provide a `.weekday()` method where Monday is 0 and Sunday is 6.
    """
    saturday, sunday = 5, 6
    return date.weekday() in (saturday, sunday)

# Apply the function to each date in the DATETIME_OCC column and create a new column
data['is_weekend'] = data['DATETIME_OCC'].apply(is_weekend)

In [None]:
# Week number and weekday name of every occurrence, then a table of crime counts
# with one row per (year, Week) and one column per weekday.
# NOTE(review): isocalendar().week is the ISO-8601 week number, while `year` is the
# calendar year taken from dt.year. Around New Year the two can disagree (early
# January dates may carry week 52/53), so those rows land at the far right of the
# calendar year's line - confirm whether that is acceptable for these plots.
data['Week'] = data['DATETIME_OCC'].dt.isocalendar().week
data['DayOfWeek'] = data['DATETIME_OCC'].dt.day_name()
weekly_crime_counts = data.groupby(['year', 'Week', 'DayOfWeek']).size().unstack(fill_value=0)

# Plot a graph for each day of the week for every year separately
days_of_week = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
for day in days_of_week:
    # One figure per weekday, one line per year
    plt.figure(figsize=(10, 6))
    for year in weekly_crime_counts.index.levels[0]:
        # Rows of this year only; the remaining index is the week number
        year_data = weekly_crime_counts.loc[year]
        plt.plot(year_data.index, year_data[day], label=year)
    plt.title(f'Crimes on {day}s')
    plt.xlabel('Week')
    plt.ylabel('Number of Crimes')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [None]:
import numpy as np
# Group by year and day of the week, and count crimes
crime_counts = data.groupby(['year', 'DayOfWeek']).size().unstack(fill_value=0)
# Put the weekday columns in calendar order (unstack leaves them sorted by name)
crime_counts = crime_counts[['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']]
# Plot grouped bar plot: one group per year, one bar per weekday
num_years = len(crime_counts.index)
num_days = len(crime_counts.columns)

bar_width = 0.8 / num_days  # Each group of bars spans 0.8 of the slot for its year

plt.figure(figsize=(12, 6))

for i, day in enumerate(crime_counts.columns):
    # Shift every weekday's bars so the group is centred on the year's tick
    positions = np.arange(num_years) + (i - num_days / 2 + 0.5) * bar_width
    plt.bar(positions, crime_counts[day], width=bar_width, label=day)

plt.title('Total Crimes by Day of the Week for Each Year', fontsize=12)
plt.xlabel('Year', fontsize=12)
plt.ylabel('Total Number of Crimes', fontsize=12)
plt.xticks(np.arange(num_years), crime_counts.index)
plt.tight_layout()
sns.despine()
# Single legend, placed outside the axes (a second plt.legend() call would only
# replace the first one)
plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)
plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Assuming 'data' is your DataFrame containing crime data

# Group by year and day of the week, and count crimes
# Crime counts with one row per year and one column per weekday
crime_counts = data.groupby(['year', 'DayOfWeek']).size().unstack(fill_value=0)
# unstack() leaves the weekday columns sorted by name; put them in calendar order
# so the bars and legend match the other weekday charts in this notebook.
crime_counts = crime_counts[['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']]

# Total crimes for each day of the week, summed over all years
total_crimes_per_day_per_year = crime_counts.sum(axis=0)

# Plot grouped bar chart
num_years = len(crime_counts.index)
num_days = len(crime_counts.columns)
bar_width = 0.8 / num_days  # Each group of bars spans 0.8 of the slot for its year

plt.figure(figsize=(12, 8))

for i, day in enumerate(crime_counts.columns):
    # Offsets decrease with i, so within each year's group the first weekday is
    # drawn at the highest y position
    positions = np.arange(num_years) + (num_days / 2 - i - 0.5) * bar_width
    bars = plt.barh(positions, crime_counts[day], height=bar_width, label=day)

    # Write the count just to the right of each bar
    for bar in bars:
        width = bar.get_width()
        plt.text(width + 200, bar.get_y() + bar.get_height()/2 , f'{int(width)}', ha='left', va='center')

plt.title('Crimes by Day of the Week for Each Year', fontsize=12)
plt.ylabel('Year', fontsize=12)
plt.xlabel('Number of Crimes', fontsize=12)
plt.yticks(np.arange(num_years), crime_counts.index)
plt.legend()
plt.tight_layout()
sns.despine()
plt.show()


In [None]:
data.groupby(['year', 'DayOfWeek']).size().unstack(fill_value=0)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Assuming 'data' is your DataFrame containing crime data

# Crime counts with one row per year and one column per weekday (calendar order)
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
crime_counts = data.groupby(['year', 'DayOfWeek']).size().unstack(fill_value=0)
crime_counts = crime_counts[weekday_order]
# Totals per weekday, summed over the years
total_crimes_per_day_per_year = crime_counts.sum(axis=0)

# Dimensions of the table
num_years = len(crime_counts.index)
num_days = len(crime_counts.columns)

plt.figure(figsize=(12, 8))

# One line per weekday, years along the x-axis
for day in crime_counts.columns:
    plt.plot(crime_counts.index, crime_counts[day], label=day, linewidth=2.5)

plt.title('Crimes by Day of the Week for Each Year (2020-2023)', fontsize=16)
plt.xlabel('Year', fontsize=16)
plt.ylabel('Number of Crimes', fontsize=16)
plt.xticks(crime_counts.index, fontsize=16)
plt.yticks(fontsize=16)
plt.legend(fontsize=16)
plt.tight_layout()
sns.despine()
plt.show()


In [None]:
crime_counts[['Monday', 'Tuesday', 'Wednesday','Thursday','Friday','Saturday','Sunday']]

In [None]:
# crime_counts.iloc[0].to_list()
# One list of weekday counts per year, in the row order of crime_counts.
# Taken from the table itself instead of a fixed range(4), so it follows
# however many years the table contains.
crime_by_year_by_days = crime_counts.values.tolist()

In [None]:
crime_by_year_by_days

In [None]:
# creating a separate dataframe for holiday crime data
crime_on_holidays = data[data['is_holiday'] == True]

In [None]:
crime_on_holidays = crime_on_holidays.sort_values(by='DATETIME_OCC')

In [None]:
crime_on_holidays['date_occ'] = crime_on_holidays['DATETIME_OCC'].dt.date

In [None]:
crime_on_holidays = crime_on_holidays['date_occ'].value_counts().to_frame(name='crimes').reset_index().sort_values(by='date_occ')

In [None]:
# The holiday list only contains days such as New Year's Day, Thanksgiving, etc.
# It doesn't include Saturdays and Sundays.

plt.figure(figsize=(12,8))
# Bar plot with one bar per holiday date
ax = sns.barplot(x="date_occ", y="crimes",data=crime_on_holidays)

# Print the count above every bar
for container in ax.containers:
    ax.bar_label(container,  rotation=50)

# The x-axis holds the holiday dates (the label used to say 'Hours of the day')
plt.xlabel('Date', fontsize=12)
plt.ylabel('Number of Crimes', fontsize=12)
plt.title('Number of Crimes - US Holidays', fontsize=12)
plt.xticks(rotation = 45, ha='right')
plt.yticks(fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Assuming 'data' is your DataFrame containing crime data

# Number of crimes per holiday date, most affected dates first
crime_on_holidays = data[data['is_holiday'] == True]
crime_on_holidays = crime_on_holidays.sort_values(by='DATETIME_OCC')
crime_on_holidays['date_occ'] = crime_on_holidays['DATETIME_OCC'].dt.date
crime_on_holidays = crime_on_holidays['date_occ'].value_counts().to_frame(name='crimes').reset_index().sort_values(by='crimes', ascending=False)

# Counts in plotting order. Kept under its own name: `crime_counts` is the
# year-by-weekday DataFrame used by earlier cells, and rebinding it to a list
# here would break those cells when they are re-run.
holiday_crime_totals = crime_on_holidays['crimes'].tolist()

# Highlight dates with more than 800 crimes in red
colors = ['red' if count > 800 else 'skyblue' for count in holiday_crime_totals]

# Plot
plt.figure(figsize=(12, 8))

# Horizontal bar plot with Seaborn
ax = sns.barplot(x="crimes", y="date_occ", data=crime_on_holidays, palette=colors)

# Write the count just to the right of each bar
for i, bar in enumerate(ax.containers[0]):
    # Offset label position slightly to avoid overlapping with bars
    x_pos = bar.get_width() + 10  # Adjust offset as needed
    plt.text(x_pos, bar.get_y() + bar.get_height() / 2, holiday_crime_totals[i],
             ha='left', va='center', fontsize=10)

# Keep the x tick labels horizontal
plt.xticks(rotation=0, ha='right')

# Adjust labels and title
plt.xlabel('Number of Crimes', fontsize=14)
plt.ylabel('Date', fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=9)
plt.title('Number of Crimes during US Holidays', fontsize=14)

# Layout
plt.tight_layout()
sns.despine()
plt.show()


In [None]:
data.columns

In [None]:
import pandas as pd

# Assuming 'data' is your DataFrame containing the crime data

# Number of crimes for every (year, weekday, holiday flag) combination
average_crimes_by_dayofweek = (
    data.groupby(['year', 'DayOfWeek', 'is_holiday'])
    .size()
    .reset_index(name='count')
)

# Print the per-weekday counts of each year (2020-2023), holidays first
for year in range(2020, 2024):
    year_data = average_crimes_by_dayofweek.loc[average_crimes_by_dayofweek['year'] == year]

    print(f"Year {year}:")
    for holiday_status, holiday_label in ((True, 'Holiday'), (False, 'Not a Holiday')):
        holiday_data = year_data.loc[year_data['is_holiday'] == holiday_status]
        print(f"\n{holiday_label}:")
        print(holiday_data[['DayOfWeek', 'count']])


In [None]:
import pandas as pd

# Per-year frames of weekday crime counts, collected in the fixed order
# [2020 holiday, 2020 non-holiday, 2021 holiday, 2021 non-holiday, ...].
holiday_data = []
# Number of crimes (a count, not yet an average) per year, weekday and holiday flag
average_crimes_by_dayofweek = data.groupby(['year', 'DayOfWeek', 'is_holiday']).size().to_frame(name='count').reset_index()

# Filter for each year (2020-2023) separately
for year in range(2020, 2024):
    year_data = average_crimes_by_dayofweek[average_crimes_by_dayofweek['year'] == year]

    print(f"Year {year}:")
    # Holiday rows first, then non-holiday rows: even list positions hold
    # holiday frames, odd positions hold non-holiday frames.
    for holiday_status in [True, False]:
        holiday_data.append(year_data[year_data['is_holiday'] == holiday_status])


In [None]:
# Even positions of holiday_data hold holiday frames, odd positions hold
# non-holiday frames. Give the 'count' column a distinct name in each kind so
# the two can be merged side by side.
# rename() returns a new frame that is stored back in the list; this avoids
# mutating filtered slices in place and works for any list length.
for position, frame in enumerate(holiday_data):
    count_column = 'count_holiday' if position % 2 == 0 else 'count_not_holiday'
    holiday_data[position] = frame.rename(columns={'count': count_column})
    

In [None]:
merged_df_2020 = pd.merge(holiday_data[0], holiday_data[1], on=['year', 'DayOfWeek'], how='outer')

In [None]:
# Turn the 2020 counts into per-day averages.
# NOTE(review): day_count is a hard-coded number of holidays per weekday,
# presumably aligned to the row order of the merged frame - confirm both the
# values for this year and the alignment.
merged_df_2020['day_count'] = [3,4,2,0,1,1,2]
merged_df_2020['count_holiday'] = merged_df_2020['count_holiday'] / merged_df_2020['day_count']
# NOTE(review): assumes every weekday occurs 52 times in the year - confirm.
merged_df_2020['daycount1'] = [52,52,52,52,52,52,52]
merged_df_2020['daycount1'] = merged_df_2020['daycount1'] - merged_df_2020['day_count']
merged_df_2020['count_not_holiday'] = merged_df_2020['count_not_holiday'] / merged_df_2020['daycount1']
# Weekdays without a holiday average have NaN here; show them as 0. Assign the
# result back: fillna(..., inplace=True) on a selected column is chained
# assignment, which pandas deprecates and which stops updating the frame
# under copy-on-write.
merged_df_2020['count_holiday'] = merged_df_2020['count_holiday'].fillna(0)

In [None]:

merged_df_2020 = merged_df_2020.reindex([1,5,6,4,0,2,3])

In [None]:
# Holiday and non-holiday counts of 2021 side by side, one row per weekday
merged_df_2021 = pd.merge(holiday_data[2], holiday_data[3], on=['year', 'DayOfWeek'], how='outer')
# NOTE(review): day_count is a hard-coded number of holidays per weekday and is
# the same list used for the other years, presumably aligned to the row order
# of the merged frame - confirm the values for 2021 and the alignment.
merged_df_2021['day_count'] = [3,4,2,0,1,1,2]
merged_df_2021['count_holiday'] = merged_df_2021['count_holiday'] / merged_df_2021['day_count']
# NOTE(review): assumes every weekday occurs 52 times in the year - confirm.
merged_df_2021['daycount1'] = [52,52,52,52,52,52,52]
merged_df_2021['daycount1'] = merged_df_2021['daycount1'] - merged_df_2021['day_count']
merged_df_2021['count_not_holiday'] = merged_df_2021['count_not_holiday'] / merged_df_2021['daycount1']
# Assign the filled column back: fillna(..., inplace=True) on a selected column
# is chained assignment, which pandas deprecates and which stops updating the
# frame under copy-on-write.
merged_df_2021['count_holiday'] = merged_df_2021['count_holiday'].fillna(0)
# Reorder the rows by label into the order used for plotting
merged_df_2021 = merged_df_2021.reindex([1,5,6,4,0,2,3])

In [None]:
merged_df_2021

In [None]:
# Holiday and non-holiday counts of 2022 side by side, one row per weekday
merged_df_2022 = pd.merge(holiday_data[4], holiday_data[5], on=['year', 'DayOfWeek'], how='outer')
# NOTE(review): day_count is a hard-coded number of holidays per weekday and is
# the same list used for the other years, presumably aligned to the row order
# of the merged frame - confirm the values for 2022 and the alignment.
merged_df_2022['day_count'] = [3,4,2,0,1,1,2]
merged_df_2022['count_holiday'] = merged_df_2022['count_holiday'] / merged_df_2022['day_count']
# NOTE(review): assumes every weekday occurs 52 times in the year - confirm.
merged_df_2022['daycount1'] = [52,52,52,52,52,52,52]
merged_df_2022['daycount1'] = merged_df_2022['daycount1'] - merged_df_2022['day_count']
merged_df_2022['count_not_holiday'] = merged_df_2022['count_not_holiday'] / merged_df_2022['daycount1']
# Assign the filled column back: fillna(..., inplace=True) on a selected column
# is chained assignment, which pandas deprecates and which stops updating the
# frame under copy-on-write.
merged_df_2022['count_holiday'] = merged_df_2022['count_holiday'].fillna(0)
# Reorder the rows by label into the order used for plotting
merged_df_2022 = merged_df_2022.reindex([1,5,6,4,0,2,3])

In [None]:
merged_df_2022

In [None]:
# Holiday and non-holiday counts of 2023 side by side, one row per weekday
merged_df_2023 = pd.merge(holiday_data[6], holiday_data[7], on=['year', 'DayOfWeek'], how='outer')
# NOTE(review): day_count is a hard-coded number of holidays per weekday and is
# the same list used for the other years, presumably aligned to the row order
# of the merged frame - confirm the values for 2023 and the alignment.
merged_df_2023['day_count'] = [3,4,2,0,1,1,2]
merged_df_2023['count_holiday'] = merged_df_2023['count_holiday'] / merged_df_2023['day_count']
# NOTE(review): assumes every weekday occurs 52 times in the year - confirm.
merged_df_2023['daycount1'] = [52,52,52,52,52,52,52]
merged_df_2023['daycount1'] = merged_df_2023['daycount1'] - merged_df_2023['day_count']
merged_df_2023['count_not_holiday'] = merged_df_2023['count_not_holiday'] / merged_df_2023['daycount1']
# Assign the filled column back: fillna(..., inplace=True) on a selected column
# is chained assignment, which pandas deprecates and which stops updating the
# frame under copy-on-write.
merged_df_2023['count_holiday'] = merged_df_2023['count_holiday'].fillna(0)
# Reorder the rows by label into the order used for plotting
merged_df_2023 = merged_df_2023.reindex([1,5,6,4,0,2,3])

In [None]:
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import seaborn as sns

# Line colours, shared by the plotted lines and the figure legend so that the
# legend always matches what is drawn.
HOLIDAY_COLOR = '#0a6eee'
NORMAL_DAY_COLOR = '#ff9e00'


def plot_subplots(x, y, data, ax, year, color=None):
    """Draw one line on an existing subplot and style that subplot.

    Parameters
    ----------
    x, y : str
        Column names in `data` for the x and y values.
    data : DataFrame
        Frame holding the columns named by `x` and `y`.
    ax : matplotlib Axes
        Subplot to draw on. Calling the function again with the same `ax`
        adds a second line to it.
    year : str
        Year shown in the subplot title.
    color : optional
        Line colour; when omitted the default colour cycle is used.

    The function draws only on `ax`. It does not open a figure of its own,
    which would leave one empty figure behind for every call.
    """
    sns.lineplot(x=x, y=y, data=data, ax=ax, linewidth=2.5, color=color)

    ax.set_title(f'Average crime during Holidays vs Normal days {year} ', fontsize=14)
    # Per-subplot axis labels are blanked; the figure carries shared labels
    ax.set_xlabel(' ', fontsize=15)
    ax.set_ylabel(' ', fontsize=15)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)


# 2x2 grid, one subplot per year; constrained_layout handles the spacing
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8), sharex=False,sharey=True, constrained_layout = True)

frames_by_year = {
    '2020': merged_df_2020,
    '2021': merged_df_2021,
    '2022': merged_df_2022,
    '2023': merged_df_2023,
}
# axes.flat walks the grid row by row: 2020, 2021 on top, 2022, 2023 below
for year_ax, (year_label, year_frame) in zip(axes.flat, frames_by_year.items()):
    plot_subplots(x='DayOfWeek', y='count_holiday', data=year_frame, ax=year_ax,
                  year=year_label, color=HOLIDAY_COLOR)
    plot_subplots(x='DayOfWeek', y='count_not_holiday', data=year_frame, ax=year_ax,
                  year=year_label, color=NORMAL_DAY_COLOR)

# Shared labels and title for the whole figure
fig.supxlabel('Day of the week', fontsize=16)
fig.supylabel('Crime count', fontsize=16)
fig.suptitle('Average crime during Holidays vs Normal days (2020-2023)', fontsize=16)

# Figure-level legend using the same colours as the lines
red_patch = mpatches.Patch(color=NORMAL_DAY_COLOR, label='Normal days')
blue_patch = mpatches.Patch(color=HOLIDAY_COLOR, label='Holidays')
fig.legend(handles=[red_patch, blue_patch], loc='upper right', bbox_to_anchor=(1, 1.05), fontsize=14)
# Show the plots
plt.show()






In [None]:
import seaborn as sns
import matplotlib.pyplot as plt



# Plotting
plt.figure(figsize=(8, 6))
sns.lineplot(x='DayOfWeek', y='count_holiday', data=merged_df_2020, color='red', label='Crime on holidays')
sns.lineplot(x='DayOfWeek', y='count_not_holiday', data=merged_df_2020, color='blue', label='Crime on normal days')

# Adding labels and title
plt.xlabel('Days of Week')
plt.ylabel('Crime count')
plt.title('Average crime per day on Holidays vs Normal days ')
plt.legend()

# Displaying the plot
plt.show()


In [None]:
holiday_data

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Creating a DataFrame for seaborn
import pandas as pd
# NOTE(review): `categories`, `values1` and `values2` are not defined in any
# cell visible above; unless they are defined elsewhere, this cell raises
# NameError on a fresh "Restart & Run All". It looks like a generic stacked-bar
# template - confirm whether it is still needed.
df = pd.DataFrame({'Categories': categories, 'Blue Bars': values1, 'Red Bars': values2})

# Plotting
plt.figure(figsize=(8, 6))
sns.barplot(x='Categories', y='Blue Bars', data=df, color='blue', label='Blue Bars')
# The second series is drawn with bottom=values1, presumably to stack it on the
# first series - TODO confirm.
sns.barplot(x='Categories', y='Red Bars', data=df, color='red', bottom=values1, label='Red Bars')

# Adding labels and title
plt.xlabel('Categories')
plt.ylabel('Values')
plt.title('Stacked Bar Plot')
plt.legend()

# Displaying the plot
plt.show()


1. TODO: write about the irregular spikes, which fall on New Year's Day, and about how in 2023 the crimes also continued on the following day.

#### At what times of day (TIME OCC) do different types of crimes (Crm Cd Desc) tend to occur most frequently?

In [None]:
# Bucket the hour of each occurrence into six four-hour periods.
# right=False makes every bucket include its start hour and exclude its end hour.
hour_bin_edges = [0, 4, 8, 12, 16, 20, 24]
hour_bin_labels = [
    'Late Night (12AM - 4AM)',
    'Early Morning (4AM - 8AM)',
    'Morning (8AM - 12PM)',
    'Afternoon (12PM - 4PM)',
    'Evening (4PM - 8PM)',
    'Night (8PM - 12AM)',
]
data['hour_of_day'] = pd.cut(
    data['DATETIME_OCC'].dt.hour,
    bins=hour_bin_edges,
    labels=hour_bin_labels,
    right=False,
)

In [None]:
# Group by time category and count the number of crimes
crimes_by_time_category = data.groupby('hour_of_day').size().to_frame(name='crimes').sort_values(by='crimes', ascending=False).reset_index()

In [None]:
crimetime = crimes_by_time_category.reset_index(drop=True)

In [None]:

plt.figure(figsize=(12,8))
# Create the bar plot with Seaborn
ax = sns.barplot(x="hour_of_day", y="crimes",data=crimes_by_time_category)


for container in ax.containers:
    ax.bar_label(container, fontsize=12)

plt.xlabel('Hours of the day', fontsize=12)
plt.ylabel('Number of Crimes', fontsize=12)
plt.title('Number of Crimes - hours of the day', fontsize=12)
plt.xticks(fontsize=12, ha='right', rotation=45)
plt.yticks(fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Counts and labels come from crimes_by_time_category (one row per time-of-day
# period, already sorted by number of crimes) instead of hand-copied numbers,
# so the chart always reflects the current data.
# This cell must run before crimes_by_time_category is rebound to the filtered
# frame further down in the notebook.
graph_data = crimes_by_time_category['crimes'].to_numpy()
labels = crimes_by_time_category['hour_of_day'].astype(str).tolist()

# Create a DataFrame
new_data = {'Time of the day': labels, 'Number of crimes': graph_data}
df = pd.DataFrame(new_data)

# Horizontal bar plot, one bar per period
plt.figure(figsize=(12, 8))
ax = sns.barplot(data=df, y='Time of the day', x='Number of crimes', palette='tab10')

# Print the count at the end of every bar
for container in ax.containers:
    ax.bar_label(container, fontsize=14)

# Add labels and title
plt.ylabel('Time of the day', fontsize=14)
plt.xlabel('Number of crimes', fontsize=14)
plt.title('Crimes during different time of the day', fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
sns.despine()
# Show plot
plt.show()


In [None]:
crime_by_hour_type = data.groupby(['hour_of_day','Crm Cd Desc']).size()

In [None]:
# hours_list = ['Late Night (12AM - 4AM)', 'Early Morning (4AM - 8AM)', 'Morning (8AM - 12PM)', 'Afternoon (12PM - 4PM)', 'Evening (4PM - 8PM)', 'Night (8PM - 12AM)']

def _top_five(period_label):
    """Five most frequent crime types of one time-of-day period.

    Returns a frame with the crime description and a 'count' column,
    largest count first.
    """
    return (
        crime_by_hour_type[period_label]
        .sort_values(ascending=False)
        .head()
        .to_frame(name='count')
        .reset_index()
    )


crime_late_night = _top_five('Late Night (12AM - 4AM)')
crime_early_morning = _top_five('Early Morning (4AM - 8AM)')
crime_morning = _top_five('Morning (8AM - 12PM)')
crime_afternoon = _top_five('Afternoon (12PM - 4PM)')
crime_evening = _top_five('Evening (4PM - 8PM)')
crime_night = _top_five('Night (8PM - 12AM)')

### Top 5 types of crime during different hours of the day

In [None]:
def print_crime_by_hour(data_in,title_text):
    """Show a bar chart of crime counts per crime type.

    Parameters
    ----------
    data_in : DataFrame
        Frame with a "Crm Cd Desc" column (bar categories) and a "count"
        column (bar heights).
    title_text : str
        Title of the chart.
    """
    plt.figure(figsize=(20,16))
    ax = sns.barplot(data=data_in, x="Crm Cd Desc", y="count")

    # Print the count above every bar
    for bars in ax.containers:
        ax.bar_label(bars, fontsize=14)

    ax.set_xlabel('Type of Crimes', fontsize=14)
    ax.set_ylabel('Number of Crimes', fontsize=14)
    ax.set_title(title_text, fontsize=15)
    # Tilt the crime descriptions so they fit under the bars
    plt.xticks(fontsize=14, rotation=45, ha='right')
    plt.yticks(fontsize=14)
    plt.tight_layout()
    plt.show()

Converting the bar plots for the different times of the day into a multi-bar plot. 
Crime types to consider (theft of identity, burglary, vehicle stolen, vandalism felony (400 and over), intimate partner simple assault, battery - simple assault, theft plain - petty, burglary from vehicle, assault with deadly weapon (aggravated)).

To do that get the top 5 crime types from each hour of the day and find unique types

In [None]:
# crimes_by_time_category
# finding the unique crime types from each time category to plot multibar plot

# Top-5 frames of every time-of-day period, in chronological order
crime_df_list = [
    crime_late_night,
    crime_early_morning,
    crime_morning,
    crime_afternoon,
    crime_evening,
    crime_night,
]
# One list of crime descriptions per period
unique_crime_types = [list(period_frame['Crm Cd Desc']) for period_frame in crime_df_list]

In [None]:
# Flatten the list of lists
# Concatenate the per-period lists into one flat list
flattened_list = []
for period_crimes in unique_crime_types:
    flattened_list.extend(period_crimes)

# Keep every crime type once
unique_items = list(set(flattened_list))

print(unique_items)

In [None]:
crimes_by_time_category = data[data['Crm Cd Desc'].isin(unique_items)]

In [None]:
temp = crimes_by_time_category.groupby(['hour_of_day','Crm Cd Desc']).size().to_frame(name='count').reset_index()

In [None]:

plt.figure(figsize=(12, 8))

# One line per crime type: time-of-day period on the x-axis, number of crimes on the y-axis
ax = sns.lineplot(x='hour_of_day', y='count', hue='Crm Cd Desc', data=temp, palette='tab10',linewidth=2.5)

# Axis labels match the plotted variables (they were swapped before:
# the x-axis is the time category, the y-axis the count)
plt.xlabel('Time Category')
plt.ylabel('Number of Crimes')
plt.title('Count of Crime Types by Time Category')

# Show plot
plt.legend(title='Crime Type')
plt.tight_layout()
sns.despine()
plt.show()

In [None]:
crimes_by_time_category.groupby(['hour_of_day','Crm Cd Desc']).size().unstack(fill_value=0)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Assuming 'data' is your DataFrame containing crime data

# Count crimes per (hour of day, crime type): rows are hours, columns are crime types
grouped_data = crimes_by_time_category.groupby(['hour_of_day', 'Crm Cd Desc']).size().unstack(fill_value=0)
bar_width = 0.91


def sort_within_group(group):
    """Return one column of counts sorted from largest to smallest."""
    return group.sort_values(ascending=False)


# Applied column by column.
# NOTE(review): the sorted columns are recombined on the hour_of_day index, so the
# row order probably ends up unchanged -- confirm whether this step is still needed.
sorted_data = grouped_data.apply(sort_within_group)

# Horizontal grouped bars: the y axis is the frame's index (hour of day) and each
# colour is one column (crime type).
ax = sorted_data.plot(kind='barh', figsize=(12, 8), colormap='tab10', width=bar_width)
plt.xlabel('Number of Crimes', fontsize=12)
plt.ylabel('Hour of Day', fontsize=12)
plt.title('Crimes by Time of Day and Type ', fontsize=12)

# Single legend call: entries in reversed order, titled with what the colours encode
handles, labels = ax.get_legend_handles_labels()
handles = handles[::-1]
labels = labels[::-1]
plt.legend(handles, labels, title='Crime Type')

# Write each bar's count just past the end of the bar
for bar in ax.patches:
    ax.text(bar.get_width() + 200, bar.get_y() + 0.01, str(round(bar.get_width(), 2)), color='black', fontsize=9)

plt.tight_layout()
sns.despine()
plt.show()


In [None]:
# print_crime_by_hour(crime_late_night, "Top 5 types of crimes during Late Night (12AM - 4AM)")

In [None]:
# print_crime_by_hour(crime_early_morning, "Top 5 types of crimes during Early Morning (4AM - 8AM)")

In [None]:
# print_crime_by_hour(crime_morning, "Top 5 types of crimes during Morning (8AM - 12PM)")

In [None]:
# print_crime_by_hour(crime_afternoon, "Top 5 types of crimes during Afternoon (12PM - 4PM)")

In [None]:
# print_crime_by_hour(crime_evening, "Top 5 types of crimes during Evening (4PM - 8PM)")

In [None]:
# print_crime_by_hour(crime_night, "Top 5 types of crimes during Night (8PM - 12AM)")

### How has the frequency of specific types of crimes evolved over the years? (e.g., Are certain crimes becoming more or less common?)

In [None]:
# Number of incidents per (year, crime type)
grouped_data = (
    data.groupby(['year', 'Crm Cd Desc'])
    .size()
    .reset_index(name='count')
)

# Rank crime types inside each year, 1 = most frequent; ties keep first-seen order
counts_by_year = grouped_data.groupby('year')['count']
grouped_data['rank'] = counts_by_year.rank(method='first', ascending=False)

# Order rows by year, then by rank within the year
sorted_data = grouped_data.sort_values(['year', 'rank'])


In [None]:
crime_counts = data.groupby('Crm Cd Desc').size()

# The five most frequent crime types over the whole dataset
top_crimes = crime_counts.nlargest(5)

# Keep only those crime types, restricted to the years 2020 to 2023
filtered_data = data[(data['Crm Cd Desc'].isin(top_crimes.index)) & (data['year'].between(2020, 2023))]

# Yearly count per crime type, largest counts first
grouped_filtered_data = filtered_data.groupby(['year', 'Crm Cd Desc']).size().reset_index(name='count').sort_values(ascending = False, by='count')

# One line per crime type: x is the year, y is that year's count
plt.figure(figsize=(12, 8))
ax = sns.lineplot(data=grouped_filtered_data, x='year', y='count', hue='Crm Cd Desc', palette = 'tab10',linewidth=2.5)
plt.title('Top 5 Crimes and their frequencies from 2020 to 2023', fontsize=12)
# Axis labels follow the plotted columns (they were previously swapped)
plt.xlabel('Year', fontsize=12)
plt.ylabel('Number of Crimes', fontsize=12)
plt.legend(title='Crime Type', bbox_to_anchor=(1.05, 1), loc='upper left')
# Show whole years only on the x axis
plt.xticks( [2020,2021,2022,2023])
plt.tight_layout()
sns.despine()
plt.show()

In [None]:
grouped_filtered_data

In [None]:
# adding this column to specifically find out number of non-holiday and holiday days
data['date_occ'] = data['DATETIME_OCC'].dt.date

In [None]:
# Overall frequency of every crime description
crime_counts = data['Crm Cd Desc'].value_counts()

# Names of the ten most frequent crime types
top_10_crimes = crime_counts.nlargest(10).index.tolist()

# Incidents of those crime types, split by the is_holiday flag
is_top_10 = data['Crm Cd Desc'].isin(top_10_crimes)
crime_holiday_counts = (
    data[is_top_10]
    .groupby(['Crm Cd Desc', 'is_holiday'])
    .size()
    .reset_index(name='count')
)

# Wide layout: one row per crime type, one column per is_holiday value
pivot_crime_counts = crime_holiday_counts.pivot(index='Crm Cd Desc', columns='is_holiday', values='count').fillna(0)

# Distinct calendar days observed in each period
total_non_holiday_days = data[data['is_holiday'] == False]['date_occ'].nunique()
total_holiday_days = data[data['is_holiday'] == True]['date_occ'].nunique()

# Average incidents per day in each period
avg_non_holiday = pivot_crime_counts[False] / total_non_holiday_days
avg_holiday = pivot_crime_counts[True] / total_holiday_days
pivot_crime_counts['avg_daily_non_holiday'] = avg_non_holiday
pivot_crime_counts['avg_daily_holiday'] = avg_holiday

# Relative difference of the holiday average against the non-holiday average, in percent
pivot_crime_counts['percentage_change'] = ((avg_holiday - avg_non_holiday) / avg_non_holiday) * 100

# Largest increase first
sorted_crimes = pivot_crime_counts.sort_values(by='percentage_change', ascending=False)

# Split into crime types that rise and crime types that fall on holidays
rises_on_holidays = sorted_crimes['percentage_change'] > 0
falls_on_holidays = sorted_crimes['percentage_change'] < 0
increased_crimes = sorted_crimes[rises_on_holidays]
decreased_crimes = sorted_crimes[falls_on_holidays]

# Bar chart of the percentage change per crime type
plt.figure(figsize=(12, 8))
ax = sns.barplot(data=sorted_crimes, x=sorted_crimes.index, y='percentage_change', palette='coolwarm')
for bars in ax.containers:
    ax.bar_label(bars,  fmt='%.2f%%', label_type='edge', fontsize=14)
plt.title('Percentage Change in Average Daily Crimes during US holidays',fontsize=14)
plt.xlabel('Crime Type',fontsize=14)
plt.ylabel('Percentage Change',fontsize=14)
plt.xticks(rotation=20, ha='right',fontsize=14)
plt.yticks(fontsize=14)
# Dashed reference line at zero separates increases from decreases
plt.axhline(0, color='gray', linestyle='--')
plt.tight_layout()
sns.despine()
plt.show()



In [None]:
sorted_crimes

## Spatial Analysis

#### Which areas (AREA NAME) have the highest and lowest crime rates?

Also plot the crimes by area and for each month and see the crime growth rate in each area.

Plot crimes by time blocks (categorize times in day into specific blocks) so we can see in which time block more crimes happen. OR in what type of crimes are more popular in what time blocks

In [None]:

crimes_by_area = data.groupby('AREA NAME').size().to_frame(name='count').reset_index()

In [None]:
crimes_by_area = crimes_by_area.sort_values(by='count', ascending=False)

In [None]:
plt.figure(figsize=(12,8))
# One bar per area, in the row order of crimes_by_area
ax = sns.barplot(data=crimes_by_area, x="AREA NAME", y="count")

# Bars above the threshold are drawn red, all others light blue
threshold = 50000
for bar in ax.patches:
    bar.set_color('red' if bar.get_height() > threshold else 'lightblue')

# Print the count on top of every bar
for bars in ax.containers:
    ax.bar_label(bars, fontsize=10)

plt.title('Number of Crimes by Area (2020-2023)', fontsize=12)
plt.xlabel('Area Name', fontsize=12)
plt.ylabel('Number of Crimes', fontsize=12)
plt.xticks(rotation=25,ha='right', fontsize=12)
plt.tight_layout()
sns.despine()
plt.show()

1. Why are the crimes highest in a specific area? 

### Is there a correlation between the type of crime and its location (LAT, LON)?

### Can we identify hotspots of specific types of crimes using spatial clustering techniques?

In [None]:
data.groupby('Crm Cd Desc').size().sort_values(ascending=False).head(15).index.to_list()

In [None]:
"""import requests
import json

area_names = list(data['AREA NAME'].unique())

# Base URL of the API endpoint
base_url = 'https://maps.lacity.org/arcgis/rest/services/Permits/BOE_Permits_Geocoder/MapServer/402/query?'

# Iterate through area names
for area in area_names:
    # Capitalize the area name and replace spaces with '+'
    formatted_area = area.upper().replace(' ', '+')
    
    # Construct the API call URL for the current area
    api_url = f'{base_url}text={formatted_area}&geometryType=esriGeometryEnvelope&outFields=REPDIST%2CAPREC&returnGeometry=true&f=geojson'
    
    # Make API call to retrieve GeoJSON data for the current area
    response = requests.get(api_url)
    
    # Check if the request was successful
    if response.status_code == 200:
        # Parse JSON response
        geojson_data = response.json()
        
        # Save GeoJSON data to a file
        with open(f'{area}.geojson', 'w') as f:
            json.dump(geojson_data, f)
            
        print(f'GeoJSON data for {area} saved successfully.')
    else:
        print(f'Failed to retrieve GeoJSON data for {area}. Status code: {response.status_code}')
"""

In [None]:
"""# RERUNNING as the api accepts different names for west la : WEST LOS ANGELES and N HOLLYWOOD : NORTH HOLLYWOOD
import requests
import json

area_names = ['NORTH HOLLYWOOD','WEST LOS ANGELES']

# Base URL of the API endpoint
base_url = 'https://maps.lacity.org/arcgis/rest/services/Permits/BOE_Permits_Geocoder/MapServer/402/query?'

# Iterate through area names
for area in area_names:
    # Capitalize the area name and replace spaces with '+'
    formatted_area = area.upper().replace(' ', '+')
    
    # Construct the API call URL for the current area
    api_url = f'{base_url}text={formatted_area}&geometryType=esriGeometryEnvelope&outFields=REPDIST%2CAPREC&returnGeometry=true&f=geojson'
    
    # Make API call to retrieve GeoJSON data for the current area
    response = requests.get(api_url)
    
    # Check if the request was successful
    if response.status_code == 200:
        # Parse JSON response
        geojson_data = response.json()
        
        # Save GeoJSON data to a file
        with open(f'{area}.geojson', 'w') as f:
            json.dump(geojson_data, f)
            
        print(f'GeoJSON data for {area} saved successfully.')
    else:
        print(f'Failed to retrieve GeoJSON data for {area}. Status code: {response.status_code}')
"""

**Zipcodes in the heatmap below should not be taken at face value, because a single LAPD reporting district (Rpt Dist No) can fall into multiple zipcodes (use caution).**

In [None]:
# base_map.save('crimeheatmap.html')


**plotting the crime hotspot change per each quarter**

In [None]:
# data.head(5).to_csv('final_sample_data.csv', index=False)

In [None]:
# creating lists of areas with respective dist nos
 
null_locations = data[data['LAT'] == 0][['AREA NAME','Rpt Dist No']]

In [None]:
null_locations.nunique()

**we have 751 records with null values for LAT, LON**

In [None]:
# creating a list of area names 

area_list = list(data['AREA NAME'].unique())

In [None]:
# creating empty lists using areanames
def create_empty_lists(list_names):
    """Map every name in ``list_names`` to its own new empty list.

    Args:
        list_names: Iterable of hashable names to use as dictionary keys.

    Returns:
        A dict with one key per distinct name, each holding a separate empty list.
    """
    return {name: [] for name in list_names}

# Call the function and pass list names as arguments
list_dict = create_empty_lists(area_list)

In [None]:
# assign the district that belong to respective areas

for area_name in list_dict:
    # Reporting districts of the zero-coordinate records that belong to this area
    in_area = null_locations['AREA NAME'] == str(area_name)
    list_dict[area_name] = list(null_locations.loc[in_area, 'Rpt Dist No'])
    

In [None]:
import json
import random
from shapely.geometry import shape, Polygon, Point

generated_locations = {}

def assign_random_point(geojson_data, district_list):
    """
    Pick one random point inside each listed district and store it in the
    module-level ``generated_locations`` dict as ``district -> (lat, lon)``.

    Args:
        geojson_data: Parsed GeoJSON dict; each feature carries a 'REPDIST'
            property and a geometry.
        district_list: District numbers to look up by 'REPDIST'.

    Raises:
        ValueError: If a matched geometry is not a Polygon, or if no sampled
            point lands inside the polygon within the allowed attempts.

    Districts missing from the GeoJSON get the placeholder (0, 0) and a
    printed warning.
    """
    # REPDIST -> shapely geometry, built once for fast lookup
    district_polygons = {
        feature['properties']['REPDIST']: shape(feature['geometry'])
        for feature in geojson_data['features']
    }

    for district in district_list:
        chosen_polygon = district_polygons.get(district)
        if chosen_polygon is None:
            print(f"Warning: District {district} not found in GeoJSON data.")
            generated_locations[district] = (0,0)
            continue

        if not isinstance(chosen_polygon, Polygon):
            raise ValueError("Invalid GeoJSON data: Not a Polygon geometry")

        min_x, min_y, max_x, max_y = chosen_polygon.bounds

        # Rejection sampling inside the bounding box:
        # one initial draw plus up to 100 retries.
        for _ in range(101):
            random_lon = random.uniform(min_x, max_x)
            random_lat = random.uniform(min_y, max_y)
            if chosen_polygon.contains(Point(random_lon, random_lat)):
                break
        else:
            raise ValueError("Failed to generate point within district boundary after retries")

        # Stored as (lat, lon)
        generated_locations[district] = (random_lat, random_lon)


In [None]:
# Generate one random location per district for every area, using assign_random_point.
# Districts that are not found in the area's GeoJSON get the placeholder (0, 0).
for area in area_list:
    # Districts of this area that have records with zero coordinates
    district_list = list_dict[area]
    # District boundaries for this area are read from '<area>.geojson_new' in the working directory
    with open(f'{area}.geojson_new', 'r') as f:
        geojson_data = json.load(f)
    # Results are written into the module-level generated_locations dict
    assign_random_point(geojson_data, district_list)


In [None]:
missing_data = data[(data['LON'] == 0)][['LAT', 'LON','Rpt Dist No']]

In [None]:
# Replace the zero coordinates in missing_data with the generated point of each row's district
for index in missing_data.index:
    dist_no = missing_data.loc[index, 'Rpt Dist No']
    # Districts without a generated point keep the (0, 0) placeholder
    lat, lon = generated_locations.get(dist_no, (0, 0))
    missing_data.at[index, 'LAT'] = lat
    missing_data.at[index, 'LON'] = lon


In [None]:
# changing the LAT and LON in the main dataframe

# Copy the filled coordinates back into the matching rows of the main frame
# (rows are addressed by missing_data's index labels, in the same order)
coordinate_columns = ['LAT', 'LON']
data.loc[missing_data.index, coordinate_columns] = missing_data[coordinate_columns].to_numpy()

In [None]:
# plotting main heatmap
heat_map_data = data[['LAT','LON']].value_counts().to_frame().reset_index()

In [None]:
import random
import os
import folium
from folium.plugins import HeatMap
import json
import branca.colormap as cm
import geopandas as gpd

# Raw string: the Windows path contains backslashes that are not valid escape sequences
police_stations = gpd.read_file(r"E:\Downloads\LAPD_Police_Stations_9102059882219472073.geojson")

# One row per distinct (LAT, LON) pair with its number of incidents
df_folium = pd.DataFrame({'Lat':heat_map_data['LAT'],'Long':heat_map_data['LON'],'Count':heat_map_data['count']})


def generateBaseMap(loc, zoom=10, tiles='OpenStreetMap', crs='ESPG2263'):
    """Return a folium map centred on ``loc``.

    ``crs`` is kept so existing calls still work, but it is not used.
    """
    return folium.Map(location=loc,
                      control_scale=True,
                      zoom_start=zoom,
                      tiles=tiles)


base_map = generateBaseMap([34.052235, -118.243683])

# Colour legend shown on the map
linear = cm.LinearColormap(["blue", "lime", "yellow","orange","red"], vmin=0.1, vmax=1)
linear.caption = "Crime Density"

# Directory containing the per-area GeoJSON boundary files
geojson_dir = "C:\\Users\\badam\\STA6704"

# Areas whose district outlines are drawn in black instead of the default colour
highlighted_prefixes = ('Mission', 'Foothill', 'Hollenbeck')

# Add every '*.geojson_new' file in the directory as its own outline layer
for filename in os.listdir(geojson_dir):
    if not filename.endswith('.geojson_new'):
        continue

    with open(os.path.join(geojson_dir, filename), 'r') as f:
        geojson_data = json.load(f)

    layer_style = {'weight': 0.7}
    if filename.startswith(highlighted_prefixes):
        layer_style['color'] = 'black'

    folium.GeoJson(geojson_data, name=filename.split('.')[0], style=layer_style,
                   tooltip=folium.GeoJsonTooltip(fields=['APREC', 'REPDIST', 'ZIPCODE'],
                                                 aliases=['Area Name', 'LAPD Dist No', 'Zipcode'])).add_to(base_map)

# Heatmap points: one [lat, lon] pair per distinct location (unweighted)
map_values1 = df_folium[['Lat','Long']]
map_data = map_values1.values.tolist()

hm = HeatMap(map_data,gradient={0.1: 'blue', 0.3: 'lime', 0.5: 'yellow', 0.7: 'orange', 1: 'red'},
                min_opacity=0.08,
                max_opacity=0.9,
                radius=25,
                use_local_extrema=True)

base_map.add_child(linear)
base_map.add_child(hm)


In [None]:
hm.save('safeplaces.html')

In [None]:
def _location_counts_for_year(year):
    """Count incidents per distinct (LAT, LON) pair among the records of one year."""
    in_year = data['year'] == year
    return data.loc[in_year, ['LAT', 'LON']].value_counts().to_frame().reset_index()


heatmap_data_2020 = _location_counts_for_year(2020)
heatmap_data_2021 = _location_counts_for_year(2021)
heatmap_data_2022 = _location_counts_for_year(2022)
heatmap_data_2023 = _location_counts_for_year(2023)

In [None]:
def plot_heatmap(datain, geojson_dir="C:\\Users\\badam\\STA6704",
                 highlighted_areas=('77th Street', 'Pacific', 'Central', 'Mission', 'Hollenbeck', 'Foothill')):
    """Build a folium map with district outlines and a crime heatmap.

    Args:
        datain: DataFrame with 'LAT' and 'LON' columns; every row becomes one
            (unweighted) heatmap point.
        geojson_dir: Directory scanned for '*.geojson_new' boundary files.
        highlighted_areas: File-name prefixes whose outlines are drawn in black.
            Matching is case-sensitive.

    Returns:
        The folium.Map holding the outline layers, the colour legend and the heatmap.
    """
    base_map = folium.Map(location=[34.052235, -118.243683],
                          control_scale=True,
                          zoom_start=10,
                          tiles='OpenStreetMap')

    # Colour legend shown on the map
    linear = cm.LinearColormap(["blue", "lime", "yellow","orange","red"], vmin=0.1, vmax=1)
    linear.caption = "Crime Density"

    highlighted_prefixes = tuple(highlighted_areas)

    # Add every '*.geojson_new' file in the directory as its own outline layer
    for filename in os.listdir(geojson_dir):
        if not filename.endswith('.geojson_new'):
            continue

        with open(os.path.join(geojson_dir, filename), 'r') as f:
            geojson_data = json.load(f)

        layer_style = {'weight': 0.7}
        if filename.startswith(highlighted_prefixes):
            layer_style['color'] = 'black'

        folium.GeoJson(geojson_data, name=filename.split('.')[0], style=layer_style,
                       tooltip=folium.GeoJsonTooltip(fields=['APREC', 'REPDIST', 'ZIPCODE'],
                                                     aliases=['Area Name', 'LAPD Dist No', 'Zipcode'])).add_to(base_map)

    # One [lat, lon] pair per input row
    map_data = datain[['LAT', 'LON']].values.tolist()

    hm = HeatMap(map_data,gradient={0.1: 'blue', 0.3: 'lime', 0.5: 'yellow', 0.7: 'orange', 1: 'red'},
                    min_opacity=0.08,
                    max_opacity=0.9,
                    radius=25,
                    use_local_extrema=True)

    base_map.add_child(linear)
    base_map.add_child(hm)
    return base_map

In [None]:
heat_map = plot_heatmap(heatmap_data_2020)
heat_map

In [None]:
heat_map = plot_heatmap(heatmap_data_2021)
heat_map

In [None]:
heat_map = plot_heatmap(heatmap_data_2022)
heat_map

In [None]:
heat_map = plot_heatmap(heatmap_data_2023)
heat_map

In [None]:
import matplotlib.pyplot as plt
from PIL import Image

# Saved heatmap screenshots, one per year
image1 = Image.open('C:\\Users\\badam\\OneDrive - University of Central Florida\\STA 6704 write up graphs\\hm2020.png')
image2 = Image.open('C:\\Users\\badam\\OneDrive - University of Central Florida\\STA 6704 write up graphs\\hm2021.png')
image3 = Image.open('C:\\Users\\badam\\OneDrive - University of Central Florida\\STA 6704 write up graphs\\hm2022.png')
image4 = Image.open('C:\\Users\\badam\\OneDrive - University of Central Florida\\STA 6704 write up graphs\\hm2023.png')

# 2x2 grid, filled row by row: 2020, 2021 on top and 2022, 2023 below
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(16, 8))

panels = zip(axes.flat, (image1, image2, image3, image4), (2020, 2021, 2022, 2023))
for panel_ax, panel_image, panel_year in panels:
    panel_ax.imshow(panel_image)
    panel_ax.set_title(f'Crime hotspots - {panel_year}')
    panel_ax.axis('off')

plt.tight_layout()
plt.show()


In [None]:
import random
import os
import folium
from folium.plugins import HeatMap
import json
import branca.colormap as cm
import geopandas as gpd

# Raw string: the Windows path contains backslashes that are not valid escape sequences
police_stations = gpd.read_file(r"E:\Downloads\LAPD_Police_Stations_9102059882219472073.geojson")

# One row per distinct (LAT, LON) pair with its number of incidents
df_folium = pd.DataFrame({'Lat':heat_map_data['LAT'],'Long':heat_map_data['LON'],'Count':heat_map_data['count']})

# Scale counts by the largest absolute count, so the busiest location has weight 1
df_folium['weight'] = df_folium['Count'] / df_folium['Count'].abs().max()


def generateBaseMap(loc, zoom=10, tiles='OpenStreetMap', crs='ESPG2263'):
    """Return a folium map centred on ``loc``.

    ``crs`` is kept so existing calls still work, but it is not used.
    """
    return folium.Map(location=loc,
                      control_scale=True,
                      zoom_start=zoom,
                      tiles=tiles)


base_map = generateBaseMap([34.052235, -118.243683])

# Colour legend shown on the map
linear = cm.LinearColormap(["blue", "lime", "yellow","orange","red"], vmin=0.1, vmax=1)
linear.caption = "Crime Density"

# Directory containing the per-area GeoJSON boundary files
geojson_dir = "C:\\Users\\badam\\STA6704"

# Add every '*.geojson_new' file in the directory as its own outline layer
for filename in os.listdir(geojson_dir):
    if not filename.endswith('.geojson_new'):
        continue

    with open(os.path.join(geojson_dir, filename), 'r') as f:
        geojson_data = json.load(f)

    folium.GeoJson(geojson_data, name=filename.split('.')[0],style={'weight': 0.7 },tooltip=folium.GeoJsonTooltip(fields=['APREC','REPDIST','ZIPCODE'], aliases=['Area Name','LAPD Dist No','Zipcode'])).add_to(base_map)

# Heatmap points: [lat, lon, weight] per distinct location
map_values1 = df_folium[['Lat','Long','weight']]
map_data = map_values1.values.tolist()

hm = HeatMap(map_data,gradient={0.1: 'blue', 0.3: 'lime', 0.5: 'yellow', 0.7: 'orange', 1: 'red'},
                min_opacity=0.08,
                max_opacity=0.9,
                radius=25,
                use_local_extrema=False)

base_map.add_child(linear)
base_map.add_child(hm)


In [None]:
# Independent copy of the location/time columns, so columns added to heatmap_time
# later are not written into a slice of `data`
heatmap_time = data[['LAT','LON','month','year']].copy()

In [None]:
# Calendar month (1-12) -> quarter label ('Q1'..'Q4')
MONTH_TO_QUARTER = {month: f'Q{(month - 1) // 3 + 1}' for month in range(1, 13)}


def get_quarter(row):
    """Return the quarter label for ``row['month']``, or None if the month is not 1-12."""
    return MONTH_TO_QUARTER.get(row['month'])


# Vectorised lookup on the 'month' column instead of a Python call per row;
# months outside 1-12 become missing values.
heatmap_time['quarter'] = heatmap_time['month'].map(MONTH_TO_QUARTER)

heatmap_time

In [None]:
heatmap_time_grp = heatmap_time.groupby(['year', 'quarter'])

In [None]:
# Build one frame of weighted locations per (year, quarter) group
result_list = []
# heatmap_time_grp is grouped by ['year', 'quarter'], so `year` here is the group key for that pair
for year, group in heatmap_time_grp:
    # Number of incidents per distinct (LAT, LON) pair within this group
    group = group[['LAT','LON']].value_counts().to_frame().reset_index()
    # Scale counts so the busiest location of the group has weight 1
    group['weight'] = group['count'] / group['count'].max()
    print(year,group)
    result_list.append(group)

In [None]:
for result_df in result_list:
    print(result_df)

In [None]:
# heatmap_time['date_occ'] = heatmap_time['date_occ'].apply(lambda x: x.strftime("%Y-%m-%d"))

In [None]:
for year, group in heatmap_time_grp:
    print(group)

In [None]:
"""import folium
from folium.plugins import HeatMapWithTime
import pandas as pd
from itertools import chain

# Assuming result_list contains DataFrames with columns 'LAT', 'LON', and 'weight'
def generateBaseMap(loc, zoom=10, tiles='OpenStreetMap', crs='ESPG2263'):
    return folium.Map(location=loc,
                   control_scale=True, 
                   zoom_start=zoom,
                   tiles=tiles)
  
base_map = generateBaseMap([34.052235, -118.243683] )

linear = cm.LinearColormap(["blue", "lime", "yellow","orange","red"], vmin=0.1, vmax=1)
linear.caption = "Crime Density"

quarter_data = []
initial_index = ['Q1 2020', 'Q2 2020', 'Q3 2020','Q4 2020',
         'Q1 2021','Q2 2021','Q3 2021','Q4 2021',
         'Q1 2022','Q2 2022', 'Q3 2022', 'Q4 2022',
         'Q1 2023', 'Q2 2023', 'Q3 2023', 'Q4 2023',
         'Q1 2024']
final_index = []

for i, result_df in enumerate(result_list):
    # Extract latitude, longitude, and weight from the DataFrame
    data_points = list(zip(result_df['LAT'], result_df['LON'], result_df['weight']))
    
    # Append the data points to the data list
    quarter_data.append(data_points)
    
for x in range(len(quarter_data)) : 
    final_index.append( [initial_index[x]] * len(quarter_data[x]))
    
final_index = list(chain.from_iterable(final_index))    
    
quarter_data = list(chain.from_iterable(quarter_data))

quarter_data = [list(t) for t in quarter_data]

# hm = HeatMapWithTime(quarter_data, index=final_index, auto_play=True, max_opacity=0.3)

hm = HeatMapWithTime(quarter_data,index=final_index, auto_play=False,
                     gradient={0.1: 'blue', 0.3: 'lime', 0.5: 'yellow', 0.7: 'orange', 1: 'red'}, 
                     min_opacity=0.3, 
                     max_opacity=0.9, 
                     radius=100,
                     use_local_extrema=False)

base_map.add_child(linear)

base_map.add_child(hm)
"""

In [None]:
map_data

In [None]:
# NOTE(review): the only definition of `quarter_data` visible in this section sits inside
# the disabled (triple-quoted) HeatMapWithTime cell, so this cell fails on a fresh kernel.
# It is rebuilt here from result_list in the same flattened [lat, lon, weight] form -- confirm.
quarter_data = [
    [lat, lon, weight]
    for result_df in result_list
    for lat, lon, weight in zip(result_df['LAT'], result_df['LON'], result_df['weight'])
]
list_of_lists = [list(t) for t in quarter_data]

In [None]:
list_of_lists

In [None]:
import geopandas as gpd
from shapely.geometry import Point

# ZIP code boundary polygons (raw string: the Windows path contains backslashes
# that are not valid escape sequences)
zipcode_geojson = r"E:\Downloads\LA_County_ZIP_Codes.geojson"

# One point per record, built as (LON, LAT) = (x, y)
geometry = [Point(xy) for xy in zip(data["LON"], data["LAT"])]

# Crime records as a GeoDataFrame in WGS84
data_geodataframe = gpd.GeoDataFrame(data, geometry=geometry, crs="EPSG:4326")

# Left spatial join: attach the ZIP polygon that contains each point.
# `predicate` replaces the deprecated `op` keyword.
zipcode_polygons = gpd.read_file(zipcode_geojson)
joined = gpd.sjoin(data_geodataframe, zipcode_polygons, how="left", predicate="within")

# A point inside more than one polygon produces several joined rows with the same
# index label; keep the first match so the result aligns one-to-one with `data`.
joined = joined[~joined.index.duplicated(keep="first")]

# The ZIP polygons' 'ZIPCODE' property becomes the 'ZIP_CODE' column of the main frame
data["ZIP_CODE"] = joined["ZIPCODE"]

# Records that fall inside no ZIP polygon are marked explicitly
data.loc[data["ZIP_CODE"].isna(), "ZIP_CODE"] = "NOT FOUND"

# Keep the original dataframe or use the GeoDataFrame (data_geodataframe) depending on your needs.


In [None]:
"""import glob
import geopandas as gpd
from shapely.geometry import Point

# Path to directory containing GeoJSON files (replace with actual path)
directory_path = "C:\\Users\\badam\\STA6704"

# Path to GeoJSON containing zipcode boundaries
zipcode_geojson = "E:\Downloads\LA_County_ZIP_Codes.geojson"

# Function to process a single GeoJSON file
def process_geojson(filename):
    # Read the existing GeoJSON data
    area_data = gpd.read_file(filename)

    zipcodes = gpd.read_file(zipcode_geojson)

    # Perform a spatial join to find zipcode for each district
    joined = gpd.sjoin(area_data, zipcodes, how="left", predicate="intersects")
    
    if joined.empty:
        print("No zipcodes found for any district.")
    else:
        # Assuming "ZIP_CODE" is the property with zipcode information in your zipcode data
        area_data["ZIPCODE"] = joined['ZIPCODE'].reset_index(drop=True)
    
    filename = filename + '_new'
    # Write the updated GeoJSON with zipcodes (optional)
    area_data.to_file(f"{filename}", driver="GeoJSON")  # Uncomment to write

    

# Iterate through all GeoJSON files in the directory
for filename in glob.glob(f"{directory_path}/*.geojson"):
    process_geojson(filename)

print("Zipcodes added to GeoJSON files (if write option was uncommented).")
"""

Try adding a choropleth map, either by reporting district or by zipcode.

Why do we have LAT, LON for some crimes as (0,0), most probably missing. Let's try figuring it out. 

we shall narrow down the location based on the area name , and rpt dist no

In [None]:
data['AREA NAME'].unique()

In [None]:
data[data['LAT'] == 0]

## **Victim Analysis**

### What are the demographics (Vict Age, Vict Sex, Vict Descent) of crime victims?

In [None]:
# setting records with 'H' as gender with X as unknown
data.loc[data['Vict Sex'] == 'H', 'Vict Sex'] = 'X'

In [None]:
# setting records with '-' as gender with X as unknown
data.loc[data['Vict Sex'] == '-', 'Vict Sex'] = 'X'

In [None]:
# replace the H with X as unknown, should have only 3 genders. M,F,X
crimes_by_sex = data.groupby('Vict Sex').size().to_frame(name='count').reset_index()

In [None]:
crimes_by_sex = crimes_by_sex.sort_values(ascending=False,by='count')

In [None]:
plt.figure(figsize=(10,6))
# One bar per victim-sex code, in the row order of crimes_by_sex
ax = sns.barplot(x="Vict Sex", y="count",data=crimes_by_sex, width=0.4)

for container in ax.containers:
    ax.bar_label(container, fontsize=14)

plt.xlabel('Victim Sex', fontsize = 14)
plt.ylabel('Number of Crimes', fontsize = 14)
plt.title('Number of Crimes by Sex', fontsize = 14)

# Derive the tick labels from the codes in crimes_by_sex instead of assuming three
# bars in a fixed Male/Female/unknown order; unmapped codes are shown as they are.
# NOTE(review): assumes the bars follow the row order of crimes_by_sex -- confirm.
sex_labels = {'M': 'Male', 'F': 'Female', 'X': 'unknown'}
tick_labels = [sex_labels.get(code, code) for code in crimes_by_sex['Vict Sex']]
plt.xticks(range(len(tick_labels)), tick_labels, fontsize=14)
plt.yticks(fontsize=14)
sns.despine()
plt.tight_layout()
plt.show()

In [None]:
# setting the ages less than 0 to 0. because it might be an error.
data.loc[data['Vict Age'] < 0, 'Vict Age'] = 0

In [None]:
crimes_by_age = data.groupby('Vict Age').size().to_frame(name='count').reset_index()

In [None]:
# creating an empty column for age group

# Start with an empty label, then fill each age range through .loc.
# Chained indexing (frame['col'][mask] = value) can assign into a temporary object
# and, under pandas Copy-on-Write, leaves the frame unchanged.
crimes_by_age['Age group'] = ''

vict_age = crimes_by_age['Vict Age']

crimes_by_age.loc[vict_age == 0, 'Age group'] = 'Unknown (0)'
crimes_by_age.loc[(0 < vict_age) & (vict_age < 3), 'Age group'] = 'Infants (1-2)'
crimes_by_age.loc[(2 < vict_age) & (vict_age < 6), 'Age group'] = 'Early Children (3-5)'
crimes_by_age.loc[(5 < vict_age) & (vict_age < 13), 'Age group'] = 'School Children (6-12)'
crimes_by_age.loc[(12 < vict_age) & (vict_age < 20), 'Age group'] = 'Teenager (13-19)'
crimes_by_age.loc[(19 < vict_age) & (vict_age < 35), 'Age group'] = 'Young Adult (20-34)'
crimes_by_age.loc[(34 < vict_age) & (vict_age < 65), 'Age group'] = 'Middle Adult (35-64)'
crimes_by_age.loc[64 < vict_age, 'Age group'] = 'Seniors (65+)'

In [None]:
crimes_by_agegroup = crimes_by_age.groupby('Age group')['count'].sum().to_frame(name='crimes').reset_index()
                                                          

In [None]:
crimes_by_agegroup = crimes_by_agegroup.sort_values(by='crimes', ascending=False)

In [None]:
plt.figure(figsize=(12,8))
# One bar per age group, in the row order of crimes_by_agegroup
ax = sns.barplot(data=crimes_by_agegroup, x="Age group", y="crimes")

# Print the number of crimes on top of every bar
for bars in ax.containers:
    ax.bar_label(bars, fontsize=14)

plt.title('Number of Crimes by Age group (2020-2023)', fontsize=14)
plt.xlabel('Age group', fontsize=14)
plt.ylabel('Number of Crimes', fontsize=14)
plt.xticks(rotation=25, ha='right', fontsize=14)
plt.yticks(fontsize=14)
sns.despine()
plt.tight_layout()
plt.show()

### Do certain age groups or demographics experience specific types of crimes more frequently?

In [None]:
data['Age group'] = ''

In [None]:
# setting age group based in the age

# Label each record through .loc. Chained indexing (data['col'][mask] = value) can
# assign into a temporary object and, under pandas Copy-on-Write, leaves `data` unchanged.
vict_age = data['Vict Age']

data.loc[vict_age == 0, 'Age group'] = 'Unknown (0)'
data.loc[(0 < vict_age) & (vict_age < 3), 'Age group'] = 'Infants (1-2)'
data.loc[(2 < vict_age) & (vict_age < 6), 'Age group'] = 'Early Children (3-5)'
data.loc[(5 < vict_age) & (vict_age < 13), 'Age group'] = 'School Children (6-12)'
data.loc[(12 < vict_age) & (vict_age < 20), 'Age group'] = 'Teenager (13-19)'
data.loc[(19 < vict_age) & (vict_age < 35), 'Age group'] = 'Young Adult (20-34)'
data.loc[(34 < vict_age) & (vict_age < 65), 'Age group'] = 'Middle Adult (35-64)'
data.loc[64 < vict_age, 'Age group'] = 'Seniors (65+)'


In [None]:
crimes_types_by_age_group = data.groupby(['Age group','Crm Cd Desc']).size() 

In [None]:
def _top_three_crimes(age_group):
    """Return the three most frequent crime types of one age group as a frame
    with the columns 'Crm Cd Desc' and 'count'."""
    counts = crimes_types_by_age_group[age_group]
    return counts.sort_values(ascending=False).head(3).to_frame(name='count').reset_index()


crimes_on_age_unknown = _top_three_crimes('Unknown (0)')
crimes_on_infants = _top_three_crimes('Infants (1-2)')
crimes_on_early_children = _top_three_crimes('Early Children (3-5)')
crimes_on_school_children = _top_three_crimes('School Children (6-12)')
crimes_on_teenager = _top_three_crimes('Teenager (13-19)')
crimes_on_young_adult = _top_three_crimes('Young Adult (20-34)')
crimes_on_middle_adult = _top_three_crimes('Middle Adult (35-64)')
crimes_on_seniors = _top_three_crimes('Seniors (65+)')

In [None]:
def print_crimes(data_in,title_text):
    """Show a bar chart of crime counts per crime type.

    data_in    -- frame with a 'Crm Cd Desc' column (bar categories) and a
                  'count' column (bar heights).
    title_text -- text used as the chart title.
    """
    text_size = 18
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.barplot(data=data_in, x="Crm Cd Desc", y="count", ax=ax)

    # Annotate every bar with its value.
    for bars in ax.containers:
        ax.bar_label(bars, fontsize=text_size)

    ax.set_xlabel('Type of Crimes', fontsize=text_size)
    ax.set_ylabel('Number of Crimes', fontsize=text_size)
    ax.set_title(title_text, fontsize=text_size)
    plt.xticks(fontsize=text_size, rotation=15, ha='right')
    plt.yticks(fontsize=text_size)
    sns.despine()
    plt.tight_layout()
    plt.show()

In [None]:
crimes_on_age_unknown

In [None]:
# NOTE(review): `data_list` and `index` are not defined in any nearby cell; this
# looks like a leftover debugging cell that depends on earlier kernel state —
# confirm, and delete it if it is not needed for the narrative.
data_list[index]

In [None]:
print_crimes(crimes_on_age_unknown, "Top 3 type of crimes on people of unknown age")

In [None]:
print_crimes(crimes_on_infants, "Top 3 type of crimes on Infants (1-2)")

In [None]:
print_crimes(crimes_on_early_children, "Top 3 type of crimes on Early Children (3-5)")

In [None]:
print_crimes(crimes_on_school_children, "Top 3 type of crimes on School Children (6-12)")

In [None]:
print_crimes(crimes_on_teenager, "Top 3 type of crimes on Teenager (13-19)")

In [None]:
print_crimes(crimes_on_young_adult, "Top 3 type of crimes on Young Adult (20-34)")

In [None]:
print_crimes(crimes_on_middle_adult, "Top 3 type of crimes on Middle Adult (35-64)")

In [None]:
print_crimes(crimes_on_seniors, "Top 3 type of crimes on Seniors (65+)")

### Do certain age groups or demographics experience specific types of crimes more frequently?

In [None]:
# Incident counts per (victim descent code, crime description) pair; the
# top-3 tables for individual descent codes are sliced out of this Series.
crimes_types_by_descent = (
    data
    .groupby(by=['Vict Descent', 'Crm Cd Desc'])
    .size()
)

In [None]:
# Three most frequent crime types for the descent codes 'W', 'B' and 'H',
# each as a frame with 'Crm Cd Desc' and 'count' columns.
crimes_on_white, crimes_on_black, crimes_on_hispanic = (
    crimes_types_by_descent[descent_code]
    .sort_values(ascending=False)
    .head(3)
    .to_frame(name='count')
    .reset_index()
    for descent_code in ('W', 'B', 'H')
)

In [None]:
print_crimes(crimes_on_white, "Top 3 Crime types on White descent")

In [None]:
print_crimes(crimes_on_black, "Top 3 Crime types on Black descent")

In [None]:
print_crimes(crimes_on_hispanic, "Top 3 Crime types on Hispanic/Latin/Mexican descent")

-------------------------------------------------------------------------------------------------------

In [None]:
# Fold the '-' placeholder into 'X' so both denote an unknown descent.
data['Vict Descent'] = data['Vict Descent'].mask(data['Vict Descent'] == '-', 'X')

In [None]:
# Total number of incidents per victim descent code, as a two-column frame
# ('Vict Descent', 'crimes').
crimes_by_descent = (
    data
    .groupby('Vict Descent')
    .size()
    .reset_index(name='crimes')
)

In [None]:
# Readable label for each victim-descent code.
# Mapping by code (instead of assigning a positional list) keeps the labels
# correct when a code is missing from the data, when a new code appears, or
# when this cell is re-run after `crimes_by_descent` has been re-sorted.
DESCENT_LABELS = {
    'A': 'Other Asian',
    'B': 'Black',
    'C': 'Chinese',
    'D': 'Cambodian',
    'F': 'Filipino',
    'G': 'Guamanian',
    'H': 'Hispanic/Latin/Mexican',
    'I': 'American Indian/Alaskan Native',
    'J': 'Japanese',
    'K': 'Korean',
    'L': 'Laotian',
    'O': 'Other',
    'P': 'Pacific Islander',
    'S': 'Samoan',
    'U': 'Hawaiian',
    'V': 'Vietnamese',
    'W': 'White',
    'X': 'Unknown',
    'Z': 'Asian Indian',
}

# Codes without a known label fall back to the raw code so no row ends up unlabeled.
crimes_by_descent['Vict_desc_ext'] = (
    crimes_by_descent['Vict Descent']
    .map(DESCENT_LABELS)
    .fillna(crimes_by_descent['Vict Descent'])
)

In [None]:
# Order the descent groups from most to fewest incidents for plotting.
crimes_by_descent = crimes_by_descent.sort_values('crimes', ascending=False)


In [None]:
import numpy as np

# Bar chart: number of incidents for every victim descent label.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=crimes_by_descent, x="Vict_desc_ext", y="crimes", palette='tab10', ax=ax)

# Annotate every bar with its value.
for bars in ax.containers:
    ax.bar_label(bars)

ax.set_xlabel('Victim Descent', fontsize=12)
ax.set_ylabel('Number of Crimes', fontsize=12)
ax.set_title('Number of Crimes - Victim Descent', fontsize=12)
plt.xticks(rotation=25, ha='right')
plt.yticks(fontsize=12)
plt.tight_layout()
sns.despine()
plt.show()

### Is there any correlation between victim demographics and the location or time of occurrence of crimes?

Also combine this with the other dataset on city / sheriff police stations and their locations, and analyze whether the existing police stations are overloaded with cases because there are too few of them. Do we need any new police stations? If so, where?

In [None]:
# Incident counts per (victim descent code, hour of day) pair.
crimes_by_descent_hod = (
    data
    .groupby(by=['Vict Descent', 'hour_of_day'])
    .size()
)

In [None]:
# Hourly incident counts for descent code 'W' as a ('hour_of_day', 'count') frame.
crimes_by_descent_hod_W = crimes_by_descent_hod['W'].reset_index(name='count')

In [None]:
# Hourly incident counts for descent code 'B' as a ('hour_of_day', 'count') frame.
crimes_by_descent_hod_B = crimes_by_descent_hod['B'].reset_index(name='count')

In [None]:
# Hourly incident counts for descent code 'H' as a ('hour_of_day', 'count') frame.
crimes_by_descent_hod_H = crimes_by_descent_hod['H'].reset_index(name='count')

In [None]:
# Hourly incident counts for descent code 'X' as a ('hour_of_day', 'count') frame.
crimes_by_descent_hod_unknown = crimes_by_descent_hod['X'].reset_index(name='count')

In [None]:
def print_crimes_hod(data_in,title_text):
    """Show a bar chart of crime counts for each hour of the day.

    data_in    -- frame with an 'hour_of_day' column (bar categories) and a
                  'count' column (bar heights).
    title_text -- text used as the chart title.
    """
    text_size = 14
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.barplot(data=data_in, x="hour_of_day", y="count", ax=ax)

    # Annotate every bar with its value.
    for bars in ax.containers:
        ax.bar_label(bars, fontsize=text_size)

    ax.set_xlabel('Different hours of the day', fontsize=text_size)
    ax.set_ylabel('Number of Crimes', fontsize=text_size)
    ax.set_title(title_text, fontsize=15)
    plt.xticks(fontsize=text_size, rotation=45, ha='right')
    plt.yticks(fontsize=text_size)
    plt.tight_layout()
    plt.show()

In [None]:
print_crimes_hod(crimes_by_descent_hod_W, "Crimes on White descent during different hours of the day")

In [None]:
print_crimes_hod(crimes_by_descent_hod_B, "Crimes on Black descent during different hours of the day")

In [None]:
print_crimes_hod(crimes_by_descent_hod_H, "Crimes on Hispanic/Latin/Mexican descent during different hours of the day")

In [None]:
print_crimes_hod(crimes_by_descent_hod_unknown, "Crimes on Unknown descent during different hours of the day")

In [None]:
# Cross-tabulate incident counts: one row per 'AREA NAME', one column per
# 'Vict Descent' code; (area, descent) pairs that never occur are filled with 0.
temp = data.groupby(['AREA NAME','Vict Descent']).size().sort_values(ascending=False).unstack(fill_value=0)

In [None]:
temp

In [None]:
# Per-area incident counts for descent code 'B', highest first ('AREA NAME', 'count').
crimes_black = temp['B'].sort_values(ascending=False).reset_index(name='count')

In [None]:
# Per-area incident counts for descent code 'W', highest first ('AREA NAME', 'count').
crimes_white = temp['W'].sort_values(ascending=False).reset_index(name='count')

In [None]:
# Per-area incident counts for descent code 'H', highest first ('AREA NAME', 'count').
crimes_hispanic = temp['H'].sort_values(ascending=False).reset_index(name='count')

In [None]:
# Per-area incident counts for descent code 'X', highest first ('AREA NAME', 'count').
crimes_unknown = temp['X'].sort_values(ascending=False).reset_index(name='count')

In [None]:


# Horizontal bar chart: incidents with victims of Black descent, per area.
# Areas whose count exceeds `threshold` are drawn in red.
#
# `threshold` is assigned here so the cell runs on a fresh kernel (it was
# previously read before any cell had defined it). 7000 is the cut-off the
# combined 2x2 figure later in the notebook uses for this group.
threshold = 7000

plt.figure(figsize=(12, 8))
# Create the bar plot with Seaborn
colors =  np.where(crimes_black['count'] > threshold, 'red', 'skyblue')
ax = sns.barplot(y="AREA NAME", x="count", data=crimes_black, palette=colors,orient='h')

# Add labels to the bars
for container in ax.containers:
    ax.bar_label(container)

plt.title('Crimes on Victims from Black Descent across different areas of LA (2020-2023)', fontsize=12)
# The bars are horizontal: counts run along x, area names along y.
plt.xlabel('Number of Crimes', fontsize=12)
plt.ylabel('Area Name', fontsize=12)
plt.xticks(rotation=25, ha='right')
plt.yticks(fontsize=12)
plt.tight_layout()
sns.despine()
plt.show()


In [None]:


# Horizontal bar chart: incidents with victims of Hispanic/Latin/Mexican
# descent, per area. Areas whose count exceeds `threshold` are drawn in red.
#
# `threshold` is assigned here so the cell does not depend on a value left
# over from another cell. 15000 is the cut-off the combined 2x2 figure later
# in the notebook uses for this group.
threshold = 15000

plt.figure(figsize=(12, 8))
colors =  np.where(crimes_hispanic['count'] > threshold, 'red', 'skyblue')
ax = sns.barplot(y="AREA NAME", x="count", data=crimes_hispanic, palette=colors,orient='h')

# Add labels to the bars
for container in ax.containers:
    ax.bar_label(container)

plt.title('Crimes on Victims from Hispanic/Latin/Mexican Descent across different areas of LA (2020-2023)', fontsize=12)
# The bars are horizontal: counts run along x, area names along y.
plt.xlabel('Number of Crimes', fontsize=12)
plt.ylabel('Area Name', fontsize=12)
plt.xticks(rotation=25, ha='right')
plt.yticks(fontsize=12)
plt.tight_layout()
sns.despine()
plt.show()


In [None]:


# Horizontal bar chart: incidents with victims of White descent, per area.
# Areas whose count exceeds `threshold` are drawn in red.
#
# The cut-off is assigned before its first use so the cell works on a fresh
# kernel and does not pick up a value left over from another cell.
threshold = 6000

plt.figure(figsize=(12, 8))
ax = sns.barplot(y="AREA NAME", x="count", data=crimes_white, color='lightblue', orient='h')

# Set colors based on count values.
# The bars are horizontal, so a bar's count is its *width*; its height is only
# the bar thickness and would never exceed the threshold.
for bar in ax.patches:
    if bar.get_width() > threshold:
        bar.set_color('red')
    else:
        bar.set_color('lightblue')

# Add labels to the bars
for container in ax.containers:
    ax.bar_label(container)

plt.title('Crimes on Victims from White Descent across different areas of LA (2020-2023)', fontsize=12)
# The bars are horizontal: counts run along x, area names along y.
plt.xlabel('Number of Crimes', fontsize=12)
plt.ylabel('Area Name', fontsize=12)
plt.xticks(rotation=25, ha='right')
plt.yticks(fontsize=12)
plt.tight_layout()
sns.despine()
plt.show()


In [None]:


# Vertical bar chart: incidents with victims of unknown descent, per area.
plt.figure(figsize=(12, 8))
ax = sns.barplot(data=crimes_unknown, x="AREA NAME", y="count", color='lightblue')

# Highlight every area whose count (the bar height) exceeds the cut-off.
threshold = 11000
for bar in ax.patches:
    bar.set_color('red' if bar.get_height() > threshold else 'lightblue')

# Annotate every bar with its value.
for bars in ax.containers:
    ax.bar_label(bars)

ax.set_title('Crimes on Victims from Unknown Descent across different areas of LA (2020-2023)', fontsize=12)
ax.set_ylabel('Number of Crimes', fontsize=12)
ax.set_xlabel('Area Name', fontsize=12)
plt.xticks(rotation=25, ha='right')
plt.yticks(fontsize=12)
plt.tight_layout()
sns.despine()
plt.show()


In [None]:

# 2x2 grid: one horizontal bar chart of incidents per area for each
# victim-descent group.
# The last three panels previously referenced template placeholders
# (`x`, `y2`, `y3`, `y4`) that no nearby cell defines; they now plot the
# per-descent frames built in the cells above.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8), sharex=False)

panels = [
    (axes[0, 0], crimes_unknown, 'Unknown', 'lightblue'),
    (axes[0, 1], crimes_black, 'Black', 'lightgreen'),
    (axes[1, 0], crimes_hispanic, 'Hispanic/Latin/Mexican', 'salmon'),
    (axes[1, 1], crimes_white, 'White', 'lightcoral'),
]

for panel_ax, panel_data, descent, bar_color in panels:
    sns.barplot(ax=panel_ax, y="AREA NAME", x="count", data=panel_data, color=bar_color, orient='h')
    # Annotate every bar with its value.
    for container in panel_ax.containers:
        panel_ax.bar_label(container)
    panel_ax.set_title(f'Crimes on Victims from {descent} Descent across different areas of LA (2020-2023)', fontsize=12)
    sns.despine(ax=panel_ax)

# Adjust layout
plt.tight_layout()

# Show the plots
plt.show()

In [None]:

import matplotlib.pyplot as plt
import seaborn as sns

def plot_subplots(x, y, data, ax, threshold, descent):
    """Draw one per-area bar chart on an existing axes.

    Parameters
    ----------
    x, y : str
        Column names passed to ``sns.barplot`` for the two axes.
    data : pandas.DataFrame
        Frame to plot; must contain a 'count' column, which is compared
        against `threshold` to pick each bar's colour.
    ax : matplotlib.axes.Axes
        Axes to draw on. The function does not create a figure of its own:
        opening one here would leave an empty figure per call and make it the
        current figure, so later ``plt.*`` calls would miss the real grid.
    threshold : number
        Rows whose 'count' exceeds this value are drawn in red, the rest in
        sky blue.
    descent : str
        Group name inserted into the panel title.
    """
    colors =  np.where(data['count'] > threshold, 'red', 'skyblue')
    # Create the bar plot with Seaborn
    sns.barplot(x=x, y=y, data=data, palette=colors, ax=ax)

    # Add labels to the bars
    for container in ax.containers:
        ax.bar_label(container)

    ax.set_title(f'Crimes on Victims from {descent} Descent across different areas of LA', fontsize=12)
    # Per-panel axis labels are blanked; the figure-level labels name the axes.
    ax.set_xlabel(' ', fontsize=12)
    ax.set_ylabel(' ', fontsize=12)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)


# `constrained_layout=True` handles the spacing, so no `subplots_adjust` /
# `tight_layout` call follows: both are incompatible with the constrained
# layout engine.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8), sharex=False, constrained_layout = True)
plot_subplots(x='count', y='AREA NAME',data=crimes_black,ax=axes[0,0], descent = 'Black', threshold=7000)
plot_subplots(x='count', y='AREA NAME',data=crimes_hispanic,ax=axes[0,1], descent = "Hispanic", threshold=15000)
plot_subplots(x='count', y='AREA NAME',data=crimes_white,ax=axes[1,0], descent = "White", threshold=6000)
plot_subplots(x='count', y='AREA NAME',data=crimes_unknown,ax=axes[1,1], descent = "Unknown", threshold=11000)

fig.supxlabel('Number of Crimes', fontsize=14)
fig.supylabel('Area Name', fontsize=14)
fig.suptitle('Crimes on Victims from various Descents across different areas of LA', fontsize=14)
# Show the plots
plt.show()






## **Crime Classification Analysis**

### What are the most common types of crimes (Crm Cd Desc) reported in the dataset?

In [None]:
# The ten most frequently reported crime descriptions with their incident
# counts, as a ('Crm Cd Desc', 'count') frame ordered from most to least frequent.
top_ten_crimes = (
    data
    .groupby('Crm Cd Desc')
    .size()
    .sort_values(ascending=False)
    .head(10)
    .reset_index(name='count')
)

In [None]:
# Bar chart of the ten most frequently reported crime types.
plt.figure(figsize=(12,8))
# Create the bar plot with Seaborn
ax = sns.barplot(x="Crm Cd Desc", y="count", data=top_ten_crimes)

# Annotate every bar with its value.
for container in ax.containers:
    ax.bar_label(container, fontsize=10)

plt.xlabel('Types of Crime', fontsize= 12)
plt.ylabel('Number of Crimes', fontsize= 12)
plt.title('Top 10 types of crimes in Los Angeles (2020-2023)', fontsize= 12)
plt.xticks(rotation= 25, ha='right')
# Run the layout pass only after the tick labels are rotated, so the space
# the rotated labels need is taken into account.
plt.tight_layout()
sns.despine()
plt.show()

### Are there any patterns or trends in the occurrence of specific types of crimes over time or across different areas? 

In [None]:
# Only the incidents whose crime description is exactly 'VEHICLE - STOLEN'.
is_vehicle_stolen = data['Crm Cd Desc'].eq('VEHICLE - STOLEN')
crime_data_vehicle_stolen_only = data.loc[is_vehicle_stolen]

In [None]:
"""import folium

# Group data by time intervals (e.g., months)
crime_data_vehicle_stolen_only['month_year'] = crime_data_vehicle_stolen_only['DATETIME_OCC'].dt.to_period('M')
grouped_data = crime_data_vehicle_stolen_only.groupby('month_year')

# Initialize map centered at Los Angeles
latitude_center = 34.052235
longitude_center = -118.243683

# Initialize map centered at a location
m = folium.Map(location=[latitude_center, longitude_center], zoom_start=10)

# Iterate through each time interval
for name, group in grouped_data:
    # Create a feature group for each time interval
    feature_group = folium.FeatureGroup(name=name.strftime('%Y-%m'))
    
    # Add markers for crime incident locations
    for index, row in group.iterrows():
        folium.Marker([row['LAT'], row['LON']], popup=row['Crm Cd Desc']).add_to(feature_group)
    
    # Add feature group to map
    feature_group.add_to(m)

# Add layer control to toggle between time intervals
folium.LayerControl().add_to(m)

# Display the map
m"""

In [None]:
"""import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt
import plotly.express as px

# Assuming 'crime_data' is your DataFrame with columns 'latitude', 'longitude', 'timestamp', and 'crime_type'
# Convert 'timestamp' column to datetime if it's not already in datetime format
# crime_data['timestamp'] = pd.to_datetime(crime_data['timestamp'])

# Create a GeoDataFrame from the DataFrame
geometry = [Point(xy) for xy in zip(crime_data_vehicle_stolen_only['LON'], crime_data_vehicle_stolen_only['LAT'])]
crime_gdf = gpd.GeoDataFrame(crime_data_vehicle_stolen_only, geometry=geometry)

# Group data by crime type
grouped_data = crime_gdf.groupby('Crm Cd Desc')

# Create a scatter plot with Plotly Express
fig = px.scatter_mapbox(crime_gdf, lat='LAT', lon='LON', color='Crm Cd Desc',
                        hover_data=['DATETIME_OCC'], zoom=10)

# Customize map layout
fig.update_layout(mapbox_style="carto-positron",
                  mapbox_zoom=10,
                  mapbox_center={"lat": 34.052235, "lon": -118.243683},
                  margin={"r":0,"t":0,"l":0,"b":0})

# Show interactive map
fig.show()"""

## **Police Response Analysis**

### How long does it typically take for crimes to be reported after they occur (Date Rptd - DATE OCC)

In [None]:
# calculate the average across all crime types

data.columns

In [None]:
# Parse the report-date strings (month/day/year plus a 12-hour clock time)
# into datetime values.
data['Date Rptd'] = pd.to_datetime(
    data['Date Rptd'],
    format='%m/%d/%Y %I:%M:%S %p',
)

In [None]:
# Delay between occurrence and report for every incident, as timedeltas.
time_diff = data['Date Rptd'].sub(data['DATETIME_OCC'])

In [None]:
time_diff.mean()

Averaged across all crime types, a crime in LA is reported about 10 days and 11 hours after it occurs.

### Is there any correlation between the time it takes to report a crime and its severity or type?

In [None]:
# Delay between occurrence and report for every incident, in hours
# (total seconds / 3600).
# NOTE(review): if `Date Rptd` carries no time of day while `DATETIME_OCC`
# does, incidents reported on the day they occurred come out negative and are
# removed by the filter below — confirm that this is intended.
data['reporting_time'] = (data['Date Rptd'] - data['DATETIME_OCC']).dt.total_seconds() / 3600

# Keep only the incidents with a non-negative delay.
reporting_df = data.loc[data['reporting_time'] >= 0]

# Mean delay per crime type, then ordered from the shortest delay upwards.
average_reporting_time = (
    reporting_df
    .groupby('Crm Cd Desc')['reporting_time']
    .mean()
    .reset_index()
)
sorted_crime_data = average_reporting_time.sort_values(by='reporting_time')

# Horizontal bars for the ten crime types with the shortest mean delay (hours).
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=sorted_crime_data.head(10), y='Crm Cd Desc', x='reporting_time', palette='coolwarm', orient='h', ax=ax)

for bars in ax.containers:
    ax.bar_label(bars, fmt='%.2f', fontsize=16)

ax.set_title('Top 10 Crime types with Lowest Reporting Time (2020-2023)', fontsize=16)
ax.set_ylabel('Crime Type', fontsize=14)
ax.set_xlabel('Reporting Time (hours)', fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.tight_layout()
sns.despine()
plt.show()


In [None]:
# Delay between occurrence and report for every incident, in whole days
# (the `.dt.days` component of the timedelta). This overwrites the
# 'reporting_time' column with a value in days.
data['reporting_time'] = (data['Date Rptd'] - data['DATETIME_OCC']).dt.days

# Keep only the incidents with a non-negative delay.
reporting_df = data.loc[data['reporting_time'] >= 0]

# Mean delay per crime type, then ordered from the shortest delay upwards.
average_reporting_time = (
    reporting_df
    .groupby('Crm Cd Desc')['reporting_time']
    .mean()
    .reset_index()
)
sorted_crime_data = average_reporting_time.sort_values(by='reporting_time')

# Horizontal bars for the ten crime types with the longest mean delay (days).
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=sorted_crime_data.tail(10), y='Crm Cd Desc', x='reporting_time', palette='coolwarm', orient='h', ax=ax)

for bars in ax.containers:
    ax.bar_label(bars, fmt='%.2f', fontsize=16)

ax.set_title('Top 10 Crime types with Highest Reporting Time (2020-2023)', fontsize=16)
ax.set_ylabel('Crime Type', fontsize=14)
ax.set_xlabel('Reporting Time (days)', fontsize=14)
plt.xticks( fontsize=14)
plt.yticks( fontsize=14)
plt.tight_layout()
sns.despine()
plt.show()

### Any relation between type of crime and arrests ? 

For example: for which types of crime are the perpetrators arrested most often, and for which types are the perpetrators still free (i.e. the investigation is still ongoing)?

In [None]:
data['Status'].value_counts() # as of Feb 22, 2024

In [None]:
# Incidents whose 'Status' is 'IC' (treated in this notebook as still under investigation).
investigation_df = data.loc[data['Status'].eq('IC')]

In [None]:
# Total number of incidents per crime description, most frequent first.
crime_categories = (
    data
    .groupby('Crm Cd Desc')
    .size()
    .sort_values(ascending=False)
)

In [None]:
crime_categories[crime_categories > 10000].head(10)

In [None]:
crime_categories[crime_categories > 10000].tail(10)

In [None]:
# Incidents whose 'Status' is anything other than 'IC'.
arrested_df = data.loc[data['Status'].ne('IC')]

In [None]:
# Number of incidents with a status other than 'IC', per crime description.
# Computed straight from `data` so the cell is idempotent: the previous form
# re-grouped `arrested_df` in place and failed when the cell was run twice,
# because the name then already held the grouped Series.
arrested_df = data[data['Status'] != 'IC'].groupby('Crm Cd Desc').size()

In [None]:
# Solved-case counts for the frequent crime types, i.e. those with more than
# 10,000 registered cases in `crime_categories`.
# The labels are derived from the data instead of a hand-copied list, so the
# selection stays correct when the dataset is refreshed; crime types without
# any solved case show 0 instead of raising a KeyError.
frequent_crime_types = crime_categories[crime_categories > 10000].index
arrested_df.reindex(frequent_crime_types, fill_value=0)

The block above shows the number of solved cases for the top 20 crime types, i.e. those with more than 10k registered cases.

In [None]:
# Total cases for each crime type
total_cases = data.groupby('Crm Cd Desc').size()

# Cases with a status other than 'IC' count as solved.
# Re-align to every crime type so that a type without any solved case gets 0
# rather than going missing (which would turn its count and percentage into NaN).
cases_solved = (
    data[data['Status'] != 'IC']
    .groupby('Crm Cd Desc')
    .size()
    .reindex(total_cases.index, fill_value=0)
)

# Calculate percentage of cases solved
percentage_solved = (cases_solved / total_cases) * 100

# Create a DataFrame with the results (indexed by crime description)
result_df = pd.DataFrame({
    'Total_Cases': total_cases,
    'Cases_Solved': cases_solved,
    'Percentage_Solved': percentage_solved
})

# Sort the DataFrame based on percentage of cases solved, highest first
result_df = result_df.sort_values(by='Percentage_Solved', ascending=False)

# Plot the ten crime types with the highest share of solved cases
plt.figure(figsize=(12, 8))
ax = sns.barplot(y=result_df.head(10).index, x='Percentage_Solved', data=result_df.head(10), palette='coolwarm', orient='h')

for container in ax.containers :
    ax.bar_label(container, fmt='%.2f%%',fontsize=16 )

plt.title('Top 10 Crime Types by Highest Percentage of Cases Solved (2020-2023)',fontsize=16)
plt.ylabel('Crime Type',fontsize=14)
plt.xlabel('Percentage of Cases Solved',fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.tight_layout()
sns.despine()
plt.show()


In [None]:
# Ten crime types with the lowest share of solved cases, among the types that
# have at least one solved case.
# `result_df` is sorted by 'Percentage_Solved' in descending order, so the
# lowest values sit at the end. Types without any solved case (NaN or 0 in
# 'Cases_Solved') are filtered out explicitly, replacing the positional
# `tail(20).head(12)` slice, which assumed a fixed number of such rows and
# drew 12 bars under a "Top 10" title.
has_solved_cases = result_df['Cases_Solved'].fillna(0) > 0
lowest_solved = result_df[has_solved_cases].tail(10)

# Plotting
plt.figure(figsize=(12, 8))
ax = sns.barplot(y=lowest_solved.index, x='Percentage_Solved', data=lowest_solved,
                 palette='coolwarm', orient='h')

for container in ax.containers :
    ax.bar_label(container, fmt='%.2f%%',fontsize=16)

plt.title('Top 10 Crime Types by Lowest Percentage of Cases Solved (2020-2023)',fontsize=16)
plt.ylabel('Crime Type',fontsize=14)
plt.xlabel('Percentage of Cases Solved',fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
sns.despine()
plt.tight_layout()
plt.show()
