In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

import plotly.plotly as plotly
import plotly.graph_objs as go

### This notebook uses a dataset from Kaggle (https://www.kaggle.com/jameslko/gun-violence-data) that contains gun violence data in America from 2013-2018. The entries for 2013 are incomplete, and only data up to March of 2018 are included. Not every gun incident is in the dataset; notably, the Las Vegas shooting is missing. Because that specific incident is so important, it was added to the dataset manually.

### Four analyses were performed. The first two look at different ways to portray gun victim totals by mapping them out per state. The next one looks at where the more violent incidents occurred, and the last one plots the total number of victims per month to see if there is a pattern in gun violence in America.

#### Downloading the data and adding Vegas statistics to the dataset

In [2]:
# Load the gun-violence incident dataset from the local CSV file.
data = pd.read_csv('./gun-violence-data.csv')

In [3]:
# The dataset does not contain the 2017 Las Vegas shooting, so one row is added
# by hand. The values are positional and must follow the column order of the
# CSV (incident_id, date, state, city_or_county, address, n_killed, n_injured,
# ...); fields with no value available are left as None.
LAS_VEGAS_INCIDENT_ID = 999999999  # sentinel id used for the manually added row

lasvegas = [LAS_VEGAS_INCIDENT_ID, '2017-10-01', 'Nevada', 'Las Vegas', None, 59, 411, None, None, None, None,
            None, None, None, 36.095, None, -115.171667, 47, None, None, None, None, None, None,
            None, None, None, None, None]

# Only append when the row is not there yet, so re-running this cell does not
# add the incident a second time (which would double-count its 470 victims).
if not (data['incident_id'] == LAS_VEGAS_INCIDENT_ID).any():
    data.loc[len(data)] = lasvegas

In [4]:
# Show the last rows to confirm the manually added Las Vegas record is present.
data.tail()

Unnamed: 0,incident_id,date,state,city_or_county,address,n_killed,n_injured,incident_url,source_url,incident_url_fields_missing,...,participant_age,participant_age_group,participant_gender,participant_name,participant_relationship,participant_status,participant_type,sources,state_house_district,state_senate_district
239673,1083139,2018-03-31,Louisiana,Natchitoches,247 Keyser Ave,1,0,http://www.gunviolencearchive.org/incident/108...,http://www.ksla.com/story/37854648/man-wanted-...,False,...,1::21,0::Adult 18+||1::Adult 18+,0::Male||1::Male,0::Jamal Haskett||1::Jaquarious Tyjuan Ardison,,"0::Killed||1::Unharmed, Arrested",0::Victim||1::Subject-Suspect,http://www.ksla.com/story/37854648/man-wanted-...,23.0,31.0
239674,1083151,2018-03-31,Louisiana,Gretna,1300 block of Cook Street,0,1,http://www.gunviolencearchive.org/incident/108...,http://www.nola.com/crime/index.ssf/2018/04/sh...,False,...,0::21,0::Adult 18+,0::Male,,,0::Injured,0::Victim,http://www.nola.com/crime/index.ssf/2018/04/sh...,85.0,7.0
239675,1082514,2018-03-31,Texas,Houston,12630 Ashford Point Dr,1,0,http://www.gunviolencearchive.org/incident/108...,https://www.chron.com/news/houston-texas/houst...,False,...,0::42,0::Adult 18+,0::Male,0::Leroy Ellis,,0::Killed,0::Victim,http://www.khou.com/article/news/hpd-investiga...,149.0,17.0
239676,1081940,2018-03-31,Maine,Norridgewock,434 Skowhegan Rd,2,0,http://www.gunviolencearchive.org/incident/108...,https://www.centralmaine.com/2018/03/31/police...,False,...,0::58||1::62,0::Adult 18+||1::Adult 18+,0::Female||1::Male,0::Marie Lancaster Hale||1::William Hale,1::Significant others - current or former,0::Killed||1::Killed,0::Victim||1::Subject-Suspect,https://www.centralmaine.com/2018/03/31/police...,111.0,3.0
239677,999999999,2017-10-01,Nevada,Las Vegas,,59,411,,,,...,,,,,,,,,,


#### Add columns that break up the date by year and month, and calculate total number of victims per gun incident. The total number of victims is defined as the sum of the number of people killed and the number of people injured.

In [5]:
# Parse the whole date column once (vectorised) instead of calling
# pd.to_datetime on every single row twice, which is very slow on ~240k rows.
incident_dates = pd.to_datetime(data['date'])
data['year'] = incident_dates.dt.year
data['month'] = incident_dates.dt.month

# Total victims per incident = people killed + people injured.
data['victims'] = data['n_killed'] + data['n_injured']

In [6]:
# Preview the first rows, now including the derived year, month and victims columns.
data.head()

Unnamed: 0,incident_id,date,state,city_or_county,address,n_killed,n_injured,incident_url,source_url,incident_url_fields_missing,...,participant_name,participant_relationship,participant_status,participant_type,sources,state_house_district,state_senate_district,year,month,victims
0,461105,2013-01-01,Pennsylvania,Mckeesport,1506 Versailles Avenue and Coursin Street,0,4,http://www.gunviolencearchive.org/incident/461105,http://www.post-gazette.com/local/south/2013/0...,False,...,0::Julian Sims,,0::Arrested||1::Injured||2::Injured||3::Injure...,0::Victim||1::Victim||2::Victim||3::Victim||4:...,http://pittsburgh.cbslocal.com/2013/01/01/4-pe...,,,2013,1,4
1,460726,2013-01-01,California,Hawthorne,13500 block of Cerise Avenue,1,3,http://www.gunviolencearchive.org/incident/460726,http://www.dailybulletin.com/article/zz/201301...,False,...,0::Bernard Gillis,,0::Killed||1::Injured||2::Injured||3::Injured,0::Victim||1::Victim||2::Victim||3::Victim||4:...,http://losangeles.cbslocal.com/2013/01/01/man-...,62.0,35.0,2013,1,4
2,478855,2013-01-01,Ohio,Lorain,1776 East 28th Street,1,3,http://www.gunviolencearchive.org/incident/478855,http://chronicle.northcoastnow.com/2013/02/14/...,False,...,0::Damien Bell||1::Desmen Noble||2::Herman Sea...,,"0::Injured, Unharmed, Arrested||1::Unharmed, A...",0::Subject-Suspect||1::Subject-Suspect||2::Vic...,http://www.morningjournal.com/general-news/201...,56.0,13.0,2013,1,4
3,478925,2013-01-05,Colorado,Aurora,16000 block of East Ithaca Place,4,0,http://www.gunviolencearchive.org/incident/478925,http://www.dailydemocrat.com/20130106/aurora-s...,False,...,0::Stacie Philbrook||1::Christopher Ratliffe||...,,0::Killed||1::Killed||2::Killed||3::Killed,0::Victim||1::Victim||2::Victim||3::Subject-Su...,http://denver.cbslocal.com/2013/01/06/officer-...,40.0,28.0,2013,1,4
4,478959,2013-01-07,North Carolina,Greensboro,307 Mourning Dove Terrace,2,2,http://www.gunviolencearchive.org/incident/478959,http://www.journalnow.com/news/local/article_d...,False,...,0::Danielle Imani Jameison||1::Maurice Eugene ...,3::Family,0::Injured||1::Injured||2::Killed||3::Killed,0::Victim||1::Victim||2::Victim||3::Subject-Su...,http://myfox8.com/2013/01/08/update-mother-sho...,62.0,27.0,2013,1,4


#### Read in population data. Data was downloaded from the Center for Disease Control website (https://wonder.cdc.gov/population-projections.html)

In [7]:
# Load the population projections from the local text file, reading the
# 'Year' and 'Projected Populations' columns as 32-bit integers.
pop_data = pd.read_table('./state_pop_projections_2014-2018.txt', dtype = {'Year': np.int32, 'Projected Populations': np.int32})

In [8]:
# Preview the raw population table (one row per state and year).
pop_data.head()

Unnamed: 0,Notes,State,State Code,Year,Year Code,Projected Populations
0,,Alabama,1,2014,2014,4649932
1,,Alabama,1,2015,2015,4663111
2,,Alabama,1,2016,2016,4676269
3,,Alabama,1,2017,2017,4689410
4,,Alabama,1,2018,2018,4702519


#### Create a dataframe to properly display the population data

In [9]:
# Full state name -> two-letter abbreviation (50 states plus DC).
# The insertion order is significant: later cells iterate this dict to build
# their per-state tables row by row.
state_dict = {
    "Alabama": "AL", "Alaska": "AK", "Arizona": "AZ",
    "Arkansas": "AR", "California": "CA", "Colorado": "CO",
    "Connecticut": "CT", "Delaware": "DE", "District of Columbia": "DC",
    "Florida": "FL", "Georgia": "GA", "Hawaii": "HI",
    "Idaho": "ID", "Illinois": "IL", "Indiana": "IN",
    "Iowa": "IA", "Kansas": "KS", "Kentucky": "KY",
    "Louisiana": "LA", "Maine": "ME", "Maryland": "MD",
    "Massachusetts": "MA", "Michigan": "MI", "Minnesota": "MN",
    "Mississippi": "MS", "Missouri": "MO", "Montana": "MT",
    "Nebraska": "NE", "Nevada": "NV", "New Hampshire": "NH",
    "New Jersey": "NJ", "New Mexico": "NM", "New York": "NY",
    "North Carolina": "NC", "North Dakota": "ND", "Ohio": "OH",
    "Oklahoma": "OK", "Oregon": "OR", "Pennsylvania": "PA",
    "Rhode Island": "RI", "South Carolina": "SC", "South Dakota": "SD",
    "Tennessee": "TN", "Texas": "TX", "Utah": "UT",
    "Vermont": "VT", "Virginia": "VA", "Washington": "WA",
    "West Virginia": "WV", "Wisconsin": "WI", "Wyoming": "WY",
}

# Years covered by the analysis: 2014 through 2018 inclusive.
years = tuple(range(2014, 2019))

In [10]:
# Reshape the long population table (one row per state and year) into one row
# per state with a column per year.
year_columns = [str(year) for year in years]

# Index by (State, Year) so every value is looked up by its actual year instead
# of by its position in the file; the result no longer depends on row order.
pop_by_state_year = pop_data.set_index(["State", "Year"])["Projected Populations"]

pop_data_list = []
for state in state_dict:
    row = {"State": state}
    for year in years:
        # Raises KeyError if a state/year pair is missing from the file.
        row[str(year)] = int(pop_by_state_year.loc[(state, year)])
    pop_data_list.append(row)

# Passing `columns` fixes the column order: State first, then the years.
pop_data_cleaned = pd.DataFrame(pop_data_list, columns=["State"] + year_columns)

In [11]:
# Preview the reshaped population table (one row per state, one column per year).
pop_data_cleaned.head()

Unnamed: 0,State,2014,2015,2016,2017,2018
0,Alabama,4649932,4663111,4676269,4689410,4702519
1,Alaska,724553,732544,740704,748945,757331
2,Arizona,7315364,7495238,7679249,7867317,8059487
3,Arkansas,2950287,2968913,2987368,3005726,3023989
4,California,39705716,40123232,40541658,40959943,41377233


#### Isolate the victim counts into their own dataframe, ordered by state and year.

In [12]:
# Build a table of total victims with one row per state and one column per year.
year_columns = [str(year) for year in years]

# Aggregate once over the whole dataset instead of re-filtering the full frame
# for every state and every year.
victims_by_state_year = data.groupby(['state', 'year'])['victims'].sum()

victim_counts = []
for state in state_dict:
    row = {'State': state}
    for year in years:
        # A state/year pair with no recorded incidents counts as 0 victims.
        row[str(year)] = victims_by_state_year.get((state, year), 0)
    victim_counts.append(row)

# Passing `columns` fixes the column order: State first, then the years.
victim_data_cleaned = pd.DataFrame(victim_counts, columns=['State'] + year_columns)

In [13]:
# Preview the victim totals per state and year.
victim_data_cleaned.head()

Unnamed: 0,State,2014,2015,2016,2017,2018
0,Alabama,916,947,1249,1400,354
1,Alaska,78,154,191,139,30
2,Arizona,442,408,571,587,158
3,Arkansas,400,400,541,653,126
4,California,2927,2821,3161,3376,736


#### Reformat the state name to its respective two letter abbreviation so Plotly can read it properly

In [14]:
# Swap full state names for their two-letter codes. The result is assigned back
# to the column instead of calling replace(..., inplace=True) on the selected
# column: that form is chained assignment, which triggers warnings in recent
# pandas and leaves the frame unchanged when Copy-on-Write is enabled.
victim_data_cleaned['State'] = victim_data_cleaned['State'].replace(state_dict)

#### Create another dataset for victim counts per 100K people in that state. 

In [15]:
# Victims per 100,000 residents for every state and year.
year_columns = [str(year) for year in years]

# The division below pairs rows of the two tables by position, so make sure
# both list the same states in the same order. Names are normalised through
# state_dict because one table may already hold two-letter codes.
assert (victim_data_cleaned["State"].replace(state_dict).tolist()
        == pop_data_cleaned["State"].replace(state_dict).tolist()), \
    "victim and population tables must list the same states in the same order"

# Element-wise (victims / population) * 100000 for all states and years at once,
# sized by the tables themselves rather than a hard-coded row count.
rates = (
    victim_data_cleaned[year_columns].astype(float)
    .div(pop_data_cleaned[year_columns].astype(float))
    .mul(100000)
)

per_pop_data_cleaned = pd.concat([pop_data_cleaned[["State"]], rates], axis=1)

# Same data as a list of per-state dicts, kept under the original name.
per_pop = per_pop_data_cleaned.to_dict("records")

In [16]:
# Preview the victims-per-100k table.
per_pop_data_cleaned.head()

Unnamed: 0,State,2014,2015,2016,2017,2018
0,Alabama,19.699213,20.308331,26.709327,29.854502,7.52788
1,Alaska,10.765258,21.022628,25.786279,18.55944,3.96128
2,Arizona,6.042078,5.443456,7.435623,7.461248,1.960423
3,Arkansas,13.558003,13.472944,18.109587,21.725201,4.166682
4,California,7.371735,7.030839,7.796918,8.242199,1.778756


In [17]:
# Swap full state names for their two-letter codes. The result is assigned back
# to the column instead of calling replace(..., inplace=True) on the selected
# column: that form is chained assignment, which triggers warnings in recent
# pandas and leaves the frame unchanged when Copy-on-Write is enabled.
per_pop_data_cleaned['State'] = per_pop_data_cleaned['State'].replace(state_dict)

In [18]:
# Per-100k table without the District of Columbia row.
no_dc = per_pop_data_cleaned.loc[per_pop_data_cleaned['State'].ne('DC')]

In [19]:
# Custom white-to-red colorscale for the choropleth maps: each entry is
# [normalized position in 0..1, CSS rgb colour].
# Fix: the 0.4 stop was '255,155,155)' (missing the 'rgb(' prefix), which is
# not a valid colour string.
scl = [
    [0.0, 'rgb(255,255,255)'],
    [0.1, 'rgb(255,230,230)'],
    [0.2, 'rgb(255,205,205)'],
    [0.3, 'rgb(255,180,180)'],
    [0.4, 'rgb(255,155,155)'],
    [0.5, 'rgb(255,130,130)'],
    [0.6, 'rgb(255,105,105)'],
    [0.8, 'rgb(255,80,80)'],
    [0.9, 'rgb(255,40,40)'],
    [1.0, 'rgb(255,0,0)'],
]

#### Plot the total victim count per 100K people using Plotly's choropleth map feature. Slider bars are included to scroll between year (2014-2018). 

In [20]:
# Choropleth of victims per 100k residents: one trace per year, with a slider
# that shows exactly one year's trace at a time.
all_year_map_data = []

for year in years:
    all_year_map_data.append(dict(
        type='choropleth',
        colorscale=scl,
        # Must be False, otherwise plotly ignores `colorscale` and uses a
        # default palette instead of the custom white-to-red scale.
        autocolorscale=False,
        locations=no_dc['State'],
        z=no_dc[str(year)].astype(float),
        name=str(year),
        locationmode='USA-states',
        marker=dict(
            line=dict(
                color='rgb(255,255,255)',
                width=2,
            )
        ),
        visible=False,
        colorbar=dict(
            # z holds a rate, so the colorbar states the unit.
            title="Victims per 100k (Deaths + Injuries)"
        ),
    ))

# Start with the first year's trace shown.
all_year_map_data[0]['visible'] = True

# One slider step per year; each step makes only that year's trace visible.
steps = []
for i, year in enumerate(years):
    visibility = [False] * len(all_year_map_data)
    visibility[i] = True
    steps.append(dict(method="restyle",
                      args=["visible", visibility],
                      label=str(year)))

# `active` is the index of the initially selected step. It was 10, which is out
# of range for five steps; 0 matches the trace that starts out visible.
sliders = [dict(active=0,
                currentvalue={"prefix": "Year: "},
                pad={"t": 50},
                steps=steps)]

layout = dict(
        title='US Gun Victims by State (per 100k people)',
        geo=dict(
            scope='usa',
            projection=dict(type='albers usa'),
            showlakes=True,
            lakecolor='rgb(255, 255, 255)'),
        sliders=sliders)

fig = dict(data=all_year_map_data, layout=layout)
plotly.iplot(fig, filename='d3-cloropleth-map')

#### Plot the raw total victim counts on another map

In [21]:
# Choropleth of raw victim counts: one trace per year, with a slider that shows
# exactly one year's trace at a time.
all_year_map_data = []

for year in years:
    all_year_map_data.append(dict(
        type='choropleth',
        colorscale=scl,
        # Must be False, otherwise plotly ignores `colorscale` and uses a
        # default palette instead of the custom white-to-red scale.
        autocolorscale=False,
        locations=victim_data_cleaned['State'],
        z=victim_data_cleaned[str(year)].astype(float),
        locationmode='USA-states',
        name=str(year),
        marker=dict(
            line=dict(
                color='rgb(255,255,255)',
                width=2,
            )
        ),
        visible=False,
        colorbar=dict(title="Victims (Deaths + Injuries)"),
    ))

# Start with the first year's trace shown.
all_year_map_data[0]['visible'] = True

# One slider step per year; each step makes only that year's trace visible.
steps = []
for i, year in enumerate(years):
    visibility = [False] * len(all_year_map_data)
    visibility[i] = True
    steps.append(dict(method="restyle",
                      args=["visible", visibility],
                      label=str(year)))

# `active` is the index of the initially selected step. It was 10, which is out
# of range for five steps; 0 matches the trace that starts out visible.
sliders = [dict(active=0,
                currentvalue={"prefix": "Year: "},
                pad={"t": 50},
                steps=steps)]

layout = dict(
        title='US Gun Victims by State (Counts)',
        geo=dict(
            scope='usa',
            projection=dict(type='albers usa'),
            showlakes=False,
            lakecolor='rgb(255, 255, 255)'
        ),
        sliders=sliders)

fig = dict(data=all_year_map_data, layout=layout)
# Use a filename of its own: this cell previously reused 'd3-cloropleth-map',
# the name the per-100k map is saved under, so this figure overwrote that one.
plotly.iplot(fig, filename='d3-cloropleth-map-counts')

#### Note: The 2017 Vegas shooting does skew the data a bit for Nevada. There were nearly 500 deaths and injuries combined, which is an anomaly for a gun incident, even in the US.

#### These two maps show why count data has to be displayed as a rate when comparing groups of different sizes. In this case, state populations are different, so the counts have to be converted to a rate. California has just under 10x the population of Alabama, so it would make sense that California has a lot more gun victims than Alabama. If you look at the rate of gun victims per 100K people per state, Alabama actually comes out to be worse for gun victims. California averages about 8 gun victims per 100K people, while Alabama averages roughly 20 victims per 100K people.

#### The difference between counts and rates is the difference between narratives. If only the raw counts were displayed, one would believe that California is a haven for gun violence and Alabama is actually the better state to live in if you care about not getting shot. However, when you look at the rate, you see that, in fact, California is a lot safer than the counts make it appear.

#### From a statistics point of view, this is why Poisson regression models include an offset term (for example, the log of the population) for situations like these.

### Isolate the original dataset to only include incidents where there were more than 10 victims. There is no clear definition for what constitutes a mass shooting, so for this scenario, we will define it as a gun incident with more than 10 victims.

In [22]:
# Keep only the incidents with more than 10 victims (killed + injured).
mass_shootings = data[data['victims'].gt(10)]

#### Using the mass shooting dataset, plot where each incident occurs using their respective latitude/longitude coordinate. Each bubble is centered around the location, and the size is scaled by how many victims there were. 

In [23]:
# Victim-count buckets as half-open ranges [low, high), one map trace each.
# Fix: the (20, 25) bucket was missing, so incidents with 20-24 victims were
# silently left off the map. Its colour is inserted at the matching position so
# every pre-existing bucket keeps the colour it had before.
limits = [(10,15),(15,20),(20,25),(25,30),(30,40),(40,100),(100,500)]
colors = ["rgb(128,128,255)","rgb(255,170,0)","rgb(0,170,0)","rgb(255,128,128)",
          "rgb(255,110,0)","rgb(0,0,255)","rgb(255,0,0)"]
assert len(limits) == len(colors), "each victim bucket needs exactly one colour"

cities = []
scale = 5  # multiplier applied to the victim count to get the marker size

# NOTE(review): mass_shootings is not filtered by year in this notebook, so any
# 2013 incidents it contains are plotted too, although the title below says
# 2014-2018 - confirm whether a year filter is wanted.
for (low, high), color in zip(limits, colors):
    in_bucket = (mass_shootings['victims'] >= low) & (mass_shootings['victims'] < high)
    subset = mass_shootings.loc[in_bucket]
    city = dict(
        type='scattergeo',
        locationmode='USA-states',
        lon=subset['longitude'],
        lat=subset['latitude'],
        text=subset['city_or_county'],
        marker=dict(
            size=subset['victims']*scale,
            color=color,
            line=dict(width=0.5, color='rgb(40,40,40)'),
            sizemode='area'
        ),
        name='{} - {} Victims'.format(low, high))
    cities.append(city)

layout = dict(
        title='2014-2018 US Gun Incidents With 10+ Victims',
        showlegend=True,
        geo=dict(
            scope='usa',
            projection=dict(type='albers usa'),
            showland=True,
            landcolor='rgb(217, 217, 217)',
            subunitwidth=1,
            countrywidth=1,
            subunitcolor="rgb(255, 255, 255)",
            countrycolor="rgb(255, 255, 255)"
        )
    )
fig = dict(data=cities, layout=layout)
plotly.iplot(fig, filename='d3-bubble-map-populations')

#### Create another dataset of total victim counts per month/year combination. 2018 data only goes up to March, so that is where the dataset ends.

In [24]:
# Total victims for each calendar month from 2014 onward, as a list of records
# that is then turned into a DataFrame.
month_year_dict = []

for year in years:
    # 2018 is only covered through March; every other year gets all 12 months.
    final_month = 3 if year == 2018 else 12
    in_year = data.loc[data['year'] == year]
    for month in range(1, final_month + 1):
        in_month = in_year.loc[in_year['month'] == month]
        month_year_dict.append({
            'date': '{:02d}/{}'.format(month, year),
            'month': month,
            'year': year,
            'total victims': in_month['victims'].sum(),
        })

month_year_totals = pd.DataFrame.from_dict(month_year_dict)

## Plotting the number of total victims per month from January 2014 to March 2018

In [25]:
# Line chart of total victims per month (x: 'MM/YYYY' label, y: victim total).
trace = [go.Scatter(
        x=month_year_totals['date'],
        y=month_year_totals['total victims']
)]

# Title and axis labels so the figure can be read on its own.
line_layout = dict(
        title='Total Gun Victims per Month (January 2014 - March 2018)',
        xaxis=dict(title='Month (MM/YYYY)'),
        yaxis=dict(title='Total victims (deaths + injuries)'))

line_fig = dict(data=trace, layout=line_layout)
plotly.iplot(line_fig, filename='basic-line')

#### This is a time series with seasonality. Within each calendar year, February has the lowest number of total victims. This could be partly because February has 2-3 fewer days than other months, but given that the number of victims is substantially lower than in January and March, other factors probably explain this better. Conversely, July has the highest number of total victims (except for 2017, but that is because the Las Vegas Shooting had 400+ victims, which would skew the data a bit). The high number of gun victims in July can be attributed to summer, when people tend to be outside more, and to July 4, one of the most violent days annually in America.

## Plot the total number of victims per year (2018 data consists of January to March data)

In [26]:
# Total victims per year, restricted to and ordered by `years`. Reindexing keeps
# the values aligned with the x axis and leaves out 2013 without raising an
# error if that year is absent; a year with no rows gets 0.
year_sums = data.groupby('year')['victims'].sum().reindex(list(years), fill_value=0)

bar_year = [go.Bar(
            x=years,
            y=year_sums
    )]

# Title and axis labels so the figure can be read on its own.
bar_layout = dict(
        title='Total Gun Victims per Year (2018: January - March only)',
        xaxis=dict(title='Year'),
        yaxis=dict(title='Total victims (deaths + injuries)'))

bar_fig = dict(data=bar_year, layout=bar_layout)
plotly.iplot(bar_fig, filename='basic-bar')

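#### As seen from this chart, there is an increasing trend in the number of gun violence victims per year from 2014 to 2017. The 2018 bar only covers January to March, so it cannot be compared directly with the full years.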