In [1]:
!pip install plotly



In [2]:
import calendar
import datetime
import numpy as np
import pandas as pd

import plotly.express as px
import plotly.offline as pyo
import plotly.graph_objects as go

In [3]:
df = pd.read_csv('/content/NYPD_Shooting.csv')
df.head()

Unnamed: 0,INCIDENT_KEY,OCCUR_DATE,OCCUR_TIME,BORO,LOC_OF_OCCUR_DESC,PRECINCT,JURISDICTION_CODE,LOC_CLASSFCTN_DESC,LOCATION_DESC,STATISTICAL_MURDER_FLAG,...,PERP_SEX,PERP_RACE,VIC_AGE_GROUP,VIC_SEX,VIC_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Lon_Lat
0,244608249,05/05/2022,00:10:00,MANHATTAN,INSIDE,14,0.0,COMMERCIAL,VIDEO STORE,True,...,M,BLACK,25-44,M,BLACK,986050.0,214231.0,40.754692,-73.9935,POINT (-73.9935 40.754692)
1,247542571,07/04/2022,22:20:00,BRONX,OUTSIDE,48,0.0,STREET,(null),True,...,(null),(null),18-24,M,BLACK,1016802.0,250581.0,40.854402,-73.88233,POINT (-73.88233 40.854402)
2,84967535,05/27/2012,19:35:00,QUEENS,,103,0.0,,,False,...,,,18-24,M,BLACK,1048632.0,198262.0,40.710634,-73.767773,POINT (-73.76777349199995 40.71063412500007)
3,202853370,09/24/2019,21:00:00,BRONX,,42,0.0,,,False,...,M,UNKNOWN,25-44,M,BLACK,1014493.0,242565.0,40.832417,-73.890714,POINT (-73.89071440599997 40.832416753000075)
4,27078636,02/25/2007,21:00:00,BROOKLYN,,83,0.0,,,False,...,M,BLACK,25-44,M,BLACK,1009149.375,190104.703125,40.688443,-73.910219,POINT (-73.91021857399994 40.68844345900004)


In [4]:
df = df.drop(
    labels=['INCIDENT_KEY', 'JURISDICTION_CODE', 'PERP_RACE', 'Lon_Lat',
            'PERP_SEX', 'X_COORD_CD', 'Y_COORD_CD', 'PERP_AGE_GROUP'],
    axis=1)
df = df.replace(to_replace='(null)', value=np.nan)
df.head()

Unnamed: 0,OCCUR_DATE,OCCUR_TIME,BORO,LOC_OF_OCCUR_DESC,PRECINCT,LOC_CLASSFCTN_DESC,LOCATION_DESC,STATISTICAL_MURDER_FLAG,VIC_AGE_GROUP,VIC_SEX,VIC_RACE,Latitude,Longitude
0,05/05/2022,00:10:00,MANHATTAN,INSIDE,14,COMMERCIAL,VIDEO STORE,True,25-44,M,BLACK,40.754692,-73.9935
1,07/04/2022,22:20:00,BRONX,OUTSIDE,48,STREET,,True,18-24,M,BLACK,40.854402,-73.88233
2,05/27/2012,19:35:00,QUEENS,,103,,,False,18-24,M,BLACK,40.710634,-73.767773
3,09/24/2019,21:00:00,BRONX,,42,,,False,25-44,M,BLACK,40.832417,-73.890714
4,02/25/2007,21:00:00,BROOKLYN,,83,,,False,25-44,M,BLACK,40.688443,-73.910219


In [5]:
def get_month():
  months = []

  for date in df['OCCUR_DATE']:
    month_num = int(date.split("/")[0])
    months.append(calendar.month_name[month_num])

  return months

def time_ofday():

  def time_in_range(start, end, current):
    """Returns whether current is in the range [start, end]"""
    return start <= current <= end

  time = []
  time_dict = {
      "midnight": {
          "start": datetime.time(0, 0, 0),
          "end": datetime.time(3, 59, 59)
      },
      "early morning": {
          "start": datetime.time(4, 0, 0),
          "end": datetime.time(7, 59, 59)
      },
      "morning": {
          "start": datetime.time(8, 0, 0),
          "end": datetime.time(11, 59, 59)
      },
      "afternoon": {
          "start": datetime.time(12, 0, 0),
          "end": datetime.time(16, 59, 59)
      },
      "evening": {
          "start": datetime.time(17, 0, 0),
          "end": datetime.time(20, 59, 59)
      },
      "late evening": {
          "start": datetime.time(21, 0, 0),
          "end": datetime.time(23, 59, 59)
      }
  }

  for _, occur_time in enumerate(df['OCCUR_TIME']):
    for key, value in time_dict.items():
      if time_in_range(value.get("start"), value.get("end"), occur_time):
        time.append(key.title())

  return time

In [6]:
# convert occur_time to datetime object
df['OCCUR_TIME'] = pd.to_datetime(df['OCCUR_TIME'],format= '%H:%M:%S' ).dt.time
df['OCCUR_TIME'] = time_ofday()
df['OCCUR_DATE'] = get_month()

In [7]:
# victim age group modification
age_map = {
    '<18': '0-18',
    '18-24': '18-25',
    '25-44': '25-45',
    '45-64': '45-65',
    '65+': '65-100',
    'UNKNOWN': 'UNKNOWN'
}
df['VIC_AGE_GROUP'] = df['VIC_AGE_GROUP'].map(age_map)

In [8]:
df = df[df['VIC_AGE_GROUP'] != 'UNKNOWN']
df = df[df['VIC_RACE'] != 'UNKNOWN']
df = df[df['VIC_SEX'] != 'U']
df.head()

Unnamed: 0,OCCUR_DATE,OCCUR_TIME,BORO,LOC_OF_OCCUR_DESC,PRECINCT,LOC_CLASSFCTN_DESC,LOCATION_DESC,STATISTICAL_MURDER_FLAG,VIC_AGE_GROUP,VIC_SEX,VIC_RACE,Latitude,Longitude
0,May,Midnight,MANHATTAN,INSIDE,14,COMMERCIAL,VIDEO STORE,True,25-45,M,BLACK,40.754692,-73.9935
1,July,Late Evening,BRONX,OUTSIDE,48,STREET,,True,18-25,M,BLACK,40.854402,-73.88233
2,May,Evening,QUEENS,,103,,,False,18-25,M,BLACK,40.710634,-73.767773
3,September,Late Evening,BRONX,,42,,,False,25-45,M,BLACK,40.832417,-73.890714
4,February,Late Evening,BROOKLYN,,83,,,False,25-45,M,BLACK,40.688443,-73.910219


In [9]:
regions = df['BORO'].value_counts(sort=False).to_dict()
region_count = {
    'region': [ele for ele in list(regions.keys())],
    'count': [ele for ele in list(regions.values())]
}
region_df = pd.DataFrame(data=region_count)
region_df

Unnamed: 0,region,count
0,MANHATTAN,3752
1,BRONX,8341
2,QUEENS,4248
3,BROOKLYN,11298
4,STATEN ISLAND,804


In [10]:
area_count = px.bar(
    region_df, x='region', y='count',
    labels={'region': 'Region name', 'count': 'Number of victims'}
)
area_count.update_layout(
    title_text='Shooting incidents reported in each region of NYC',
    title_x=0.5
)
pyo.iplot(area_count, filename='Regionwise_Shooting_barplot')

In [11]:
area_pie = px.pie(
    data_frame=region_df,
    names = 'region',
    values = 'count'
)
area_pie.update_layout(
    title_text = 'Proportion of shooting cases reported in each region of NYC',
    title_x = 0.5
)
pyo.iplot(area_pie, filename='Regionwise_Shooting_pie')

# Monthly and daily analysis

**Useful but unimplemented**: [Line plot as plt.graph_object](https://plotly.com/python/line-charts/)

In [12]:
month_to_num = {
    'January': 1, 'February': 2, 'March': 3,
    'April': 4, 'May': 5, 'June': 6, 'July': 7,
    'August': 8, 'September': 9, 'October': 10,
    'November': 11, 'December': 12
}

month_dict = df['OCCUR_DATE'].value_counts(sort=False).to_dict()
month_data = {'months': [ele for ele in list(month_dict.keys())],
              'count': [ele for ele in list(month_dict.values())]}
month_df = pd.DataFrame(month_data)
month_df['month_num'] = month_df['months'].map(month_to_num)
month_df = month_df.sort_values(by='month_num')
month_df = month_df.drop(labels=['month_num'], axis=1)
month_df

Unnamed: 0,months,count
10,January,1802
3,February,1441
11,March,1788
6,April,2063
0,May,2673
4,June,2948
1,July,3382
8,August,3249
2,September,2663
9,October,2366


In [13]:
monthly_plot = px.line(
    data_frame=month_df,
    x='months',
    y='count',
    markers=True,
    labels = {
        'months': 'Month of the year',
        'count': 'Number of victims'
    }
)
monthly_plot.update_layout(
    title_text='Monthly trend in reported shooting incidents over 10 years',
    title_x = 0.5,
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=True),
)
pyo.iplot(monthly_plot, filename='MonthlyTrend')

In [14]:
time_to_num = {
    'Early Morning': 1, 'Morning': 2, 'Afternoon': 3,
    'Evening': 4, 'Late Evening': 5, 'Midnight': 6
}

time_dict = df['OCCUR_TIME'].value_counts(sort=False).to_dict()
time_data = {'time': [ele for ele in list(time_dict.keys())],
              'count': [ele for ele in list(time_dict.values())]}
time_df = pd.DataFrame(time_data)
time_df['time_num'] = time_df['time'].map(time_to_num)
time_df = time_df.sort_values(by='time_num')
time_df = time_df.drop(labels=['time_num'], axis=1)
time_df

Unnamed: 0,time,count
3,Early Morning,2853
5,Morning,1215
4,Afternoon,4014
2,Evening,5721
1,Late Evening,6716
0,Midnight,7924


In [15]:
time_plot = px.line(
    data_frame=time_df,
    x='time',
    y='count',
    markers=True,
    labels = {
        'time': 'Time of the day',
        'count': 'Number of victims'
    }
)
time_plot.update_layout(
    title_text='Daily trend in reported shooting incidents over 10 years',
    title_x = 0.5,
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=True),
)
pyo.iplot(time_plot, filename='DailyTrend')

# Age groups

In [16]:
age_to_num = {
    '0-18': 1, '18-25': 2, '25-45': 3,
    '45-65': 4, '65-100': 5
}
age_groups = df['VIC_AGE_GROUP'].value_counts(sort=False).to_dict()
age_data = {'age': [ele for ele in list(age_groups.keys())],
            'count': [ele for ele in list(age_groups.values())]}
age_df = pd.DataFrame(age_data)
age_df['age_num'] = age_df['age'].map(age_to_num)
age_df = age_df.sort_values(by='age_num')
age_df = age_df.drop(labels=['age_num'], axis=1)
age_df.head()

Unnamed: 0,age,count
4,0-18,2950
1,18-25,10360
0,25-45,12952
2,45-65,1975
3,65-100,205


In [17]:
age_radial = px.line_polar(
    age_df,
    r="count",
    theta="age",
    markers=True,
    line_close=True,
    start_angle=60
)
pyo.iplot(age_radial, filename='Radial')

# Ethnicity Analysis

In [18]:
victim_race = df['VIC_RACE'].value_counts(sort=False).to_dict()
victim_race = {
    "race": [ele for ele in list(victim_race.keys())],
    "count": [ele for ele in list(victim_race.values())]
}
df_victim = pd.DataFrame(victim_race)
df_victim

Unnamed: 0,race,count
0,BLACK,20213
1,WHITE,714
2,WHITE HISPANIC,4275
3,BLACK HISPANIC,2791
4,ASIAN / PACIFIC ISLANDER,439
5,AMERICAN INDIAN/ALASKAN NATIVE,11


In [19]:
annotation = [
    dict(
        xref='paper', yref='paper',
        x=0.5, y=0.5,
        xanchor= 'center',
        yanchor='middle',
        text= '<b>RACE OF VICTIMS</b>',
        font=dict(family="Arial", size=12),
        showarrow=False,
  )
]
ethnicity_pie = go.Figure(
    data = [
        go.Pie(
        labels=df_victim['race'].tolist(),
        values=df_victim['count'].tolist(),
        direction ='clockwise',
        hole=.8,
        sort=False
    )]
)
ethnicity_pie.update_layout(annotations=annotation)
pyo.iplot(ethnicity_pie, filename='Ethnicity_analysis_pie')

# Locality of reported incidents

In [20]:
df_new = df.loc[:, ['BORO', 'LOC_CLASSFCTN_DESC']].dropna()
pivot = df_new.groupby(['BORO', 'LOC_CLASSFCTN_DESC']).size().unstack(fill_value=0)

# Reset index to convert the pivot table to a regular DataFrame
pivot_table = pivot.reset_index().drop(['STREET', 'OTHER', 'HOUSING'], axis=1)

# Melt the DataFrame to long format
df_locality = pivot_table.melt(id_vars='BORO', var_name='LOC_CLASSFCTN_DESC', value_name='Count')
# Print the melted DataFrame
df_locality

Unnamed: 0,BORO,LOC_CLASSFCTN_DESC,Count
0,BRONX,COMMERCIAL,56
1,BROOKLYN,COMMERCIAL,75
2,MANHATTAN,COMMERCIAL,42
3,QUEENS,COMMERCIAL,31
4,STATEN ISLAND,COMMERCIAL,3
5,BRONX,DWELLING,65
6,BROOKLYN,DWELLING,99
7,MANHATTAN,DWELLING,18
8,QUEENS,DWELLING,46
9,STATEN ISLAND,DWELLING,14


In [21]:
# Create the stacked bar plot
fig = px.bar(
    df_locality,
    x='BORO', y='Count',
    color='LOC_CLASSFCTN_DESC',
    labels={
        'BORO': 'Regions of NYC',
        'Count': 'Number of incidents reported'
    },
    barmode='stack'
)
fig.update_layout(
    title_text='Incidents reported in different localities in different regions',
    title_x=0.5,
    legend=dict(
        orientation="h",
        yanchor="bottom",
        title="Locality classification",
        y=-0.2,
        xanchor="center",
        x=0.5,
        traceorder="grouped"
    ),
)
# Show the plot
pyo.iplot(fig, filename='Locality_classification_incident_count')

# Age group of male and female victims

In [22]:
df_age = df.loc[:, ['VIC_AGE_GROUP', 'VIC_SEX']].dropna()
df_age = df_age.groupby(['VIC_AGE_GROUP', 'VIC_SEX']).size().unstack(fill_value=0)
pivot_table = df_age.reset_index()
df_age = pivot_table.melt(id_vars='VIC_AGE_GROUP', var_name='VIC_SEX', value_name='Count')
df_age

Unnamed: 0,VIC_AGE_GROUP,VIC_SEX,Count
0,0-18,F,427
1,18-25,F,826
2,25-45,F,1072
3,45-65,F,366
4,65-100,F,63
5,0-18,M,2523
6,18-25,M,9534
7,25-45,M,11880
8,45-65,M,1609
9,65-100,M,142


In [23]:
# Create the stacked bar plot
fig = px.bar(
    df_age,
    x='VIC_AGE_GROUP', y='Count',
    color='VIC_SEX',
    labels={
        'VIC_AGE_GROUP': 'Age group of victims',
        'Count': 'Number of victims'
    },
    barmode='stack'
)
fig.update_layout(
    {
        "paper_bgcolor": "rgba(0, 0, 0, 0)",
        "plot_bgcolor": "rgba(0, 0, 0, 0)",
    },
    title_text='Gender wise representation of each age group',
    title_x=0.5,
)
# Show the plot
pyo.iplot(fig, filename='Age_Gender_count')

# Bonus plots

Histogram of the PRECINCT column

In [24]:
hist = px.histogram(
    df, x='PRECINCT',
    labels={
        'PRECINCT': 'Numeric values of PRECINCT',
        'count': 'Frequency'
    }
)
hist.update_layout(
    title_text='Plot of PRECINCT column values',
    title_x=0.5
)
pyo.iplot(hist, filename='Histogram')

Sankey diagram for classification and description

[Reference](https://stackoverflow.com/a/72764554)

In [25]:
hierarchy = df[['LOC_CLASSFCTN_DESC', 'LOCATION_DESC']].dropna()
hierarchy = hierarchy[hierarchy['LOC_CLASSFCTN_DESC']!='OTHER']
hierarchy.loc[hierarchy.LOCATION_DESC.str.contains("STORE"),'LOCATION_DESC'] = "STORE"
hierarchy.loc[hierarchy.LOCATION_DESC.str.contains("HOUS"),'LOCATION_DESC'] = "HOUSE"
hierarchy.loc[hierarchy.LOCATION_DESC.str.contains("CLUB"),'LOCATION_DESC'] = "CLUB"
hierarchy = hierarchy.groupby(['LOC_CLASSFCTN_DESC', 'LOCATION_DESC']).size().unstack(fill_value=0)
pivot_table = hierarchy.reset_index()
loc_df = pivot_table.melt(id_vars='LOC_CLASSFCTN_DESC', var_name='LOCATION_DESC', value_name='Count')
loc_df = loc_df[loc_df['Count']!=0]
loc_df

Unnamed: 0,LOC_CLASSFCTN_DESC,LOCATION_DESC,Count
0,COMMERCIAL,BEAUTY/NAIL SALON,6
4,STREET,BEAUTY/NAIL SALON,8
5,COMMERCIAL,CLUB,40
9,STREET,CLUB,45
10,COMMERCIAL,COMMERCIAL BLDG,6
13,PARKING LOT,COMMERCIAL BLDG,7
14,STREET,COMMERCIAL BLDG,26
15,COMMERCIAL,DRY CLEANER/LAUNDRY,1
20,COMMERCIAL,FAST FOOD,1
23,PARKING LOT,FAST FOOD,1


In [26]:
classifications = list(set(loc_df['LOC_CLASSFCTN_DESC'].tolist()))
descriptions = list(set(loc_df['LOCATION_DESC'].tolist()))
classification_dict = {}
description_dict = {}
classification_idx = 0

for idx, classification in enumerate(classifications):
  classification_dict.update({classification: idx})
  classification_idx = idx

for idx, description in enumerate(descriptions):
  description_dict.update({description: classification_idx+1+idx})

print(classification_dict)
print(description_dict)

{'DWELLING': 0, 'STREET': 1, 'COMMERCIAL': 2, 'PARKING LOT': 3, 'HOUSING': 4}
{'SMALL MERCHANT': 5, 'FAST FOOD': 6, 'GAS STATION': 7, 'COMMERCIAL BLDG': 8, 'HOTEL/MOTEL': 9, 'SUPERMARKET': 10, 'CLUB': 11, 'GROCERY/BODEGA': 12, 'DRY CLEANER/LAUNDRY': 13, 'HOUSE': 14, 'BEAUTY/NAIL SALON': 15, 'MULTI DWELL - APT BUILD': 16, 'RESTAURANT/DINER': 17, 'STORE': 18, 'HOSPITAL': 19}


In [27]:
# add source and target columns to dataframe
def get_source_target(df):
  source_list, target_list = [], []
  for row in df['LOC_CLASSFCTN_DESC']:
    source_list.append(classification_dict.get(row))

  for row in df['LOCATION_DESC']:
    target_list.append(description_dict.get(row))

  return source_list, target_list

In [28]:
loc_df['Source'], loc_df['Target'] = get_source_target(df=loc_df)
loc_df = loc_df.drop_duplicates()
loc_df

Unnamed: 0,LOC_CLASSFCTN_DESC,LOCATION_DESC,Count,Source,Target
0,COMMERCIAL,BEAUTY/NAIL SALON,6,2,15
4,STREET,BEAUTY/NAIL SALON,8,1,15
5,COMMERCIAL,CLUB,40,2,11
9,STREET,CLUB,45,1,11
10,COMMERCIAL,COMMERCIAL BLDG,6,2,8
13,PARKING LOT,COMMERCIAL BLDG,7,3,8
14,STREET,COMMERCIAL BLDG,26,1,8
15,COMMERCIAL,DRY CLEANER/LAUNDRY,1,2,13
20,COMMERCIAL,FAST FOOD,1,2,6
23,PARKING LOT,FAST FOOD,1,3,6


In [29]:
sankey_node = dict(
    label = classifications+descriptions,
    line = dict(color = "black", width = 0.5),
    pad = 15,
    thickness = 20,
    x = [0.001, 0.001, 0.001, 0.001, 0.001],
    y = [0.001, 75/285, 160/285, 190/285, 0.001, 75/285, 130/285, 215/285],
    color = ["#305CA3", "#C9304E", "#C1DAF1", "#F7DC70", "#5CA330"]
)

sankey_link = dict(
    source = loc_df['Source'],
    target = loc_df['Target'],
    value = loc_df['Count'],
)

sankey = go.Figure(data=[go.Sankey(
    arrangement='snap',
    node = sankey_node,
    link = sankey_link
    )
])
sankey.update_layout(
    title_text="Location Categories and Descriptions as a Sankey Diagram",
    title_x=0.5,
    font_size=10
)
pyo.iplot(sankey, filename='Sankey')

# Plotting on a map

1. [Guide using GeoPandas](https://plotly.com/python/scattermapbox/)
2. [All kind of maps](https://plotly.com/python/maps/)

In [34]:
px.set_mapbox_access_token(token="pk.eyJ1IjoiZGl2aS0wMDciLCJhIjoiY2x3dnRlMnU4MG5vOTJycXprMG1pOGZ3dCJ9.WULsINqBNsYBvpo11qP7kg")


In [35]:
murder_df = df[df['STATISTICAL_MURDER_FLAG']][['BORO', 'Latitude', 'Longitude']].dropna()
murder_df

Unnamed: 0,BORO,Latitude,Longitude
0,MANHATTAN,40.754692,-73.993500
1,BRONX,40.854402,-73.882330
6,QUEENS,40.673306,-73.789887
9,BRONX,40.860463,-73.865561
11,QUEENS,40.769744,-73.909872
...,...,...,...
28544,BROOKLYN,40.634916,-73.932963
28549,QUEENS,40.700826,-73.881490
28551,BROOKLYN,40.695772,-73.979528
28557,BRONX,40.903785,-73.850098


In [37]:
shooting_map = px.scatter_mapbox(
    murder_df, lat="Latitude", lon="Longitude", color='BORO',
    color_continuous_scale=px.colors.cyclical.IceFire, size_max=2, zoom=10
)
shooting_map.update_layout(
    title_text="Map depicting shooting incidents in NYC",
    title_x=0.5,
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=-0.1,
        xanchor="center",
        x=0.5,
        traceorder="grouped",
        title="REGION",
        bgcolor="yellow"
    )
)
pyo.iplot(shooting_map, filename='Map')