In [1]:
#import relevant packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from celluloid import Camera
from wordcloud import WordCloud, STOPWORDS
import matplotlib.axes as ax
import numbers
import plotly
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
import chart_studio.plotly as py
import seaborn as sns
import folium 
import plotly.express as px
import pandas_profiling as pp
from pandas_profiling import ProfileReport
from statannot import add_stat_annotation
from PIL import Image

#silence warnings for the rest of the notebook
#NOTE: filterwarnings('ignore') is not limited to pandas - it suppresses warnings from every library
import warnings
warnings.filterwarnings('ignore')

In [2]:
#load the incident records from the CSV file (path is relative to the working directory)
DATA_PATH = 'gun_violence.csv'
data = pd.read_csv(DATA_PATH)

#preview the first 5 rows to check the load
data.head()

Unnamed: 0,incident_id,date,state,city_or_county,address,n_killed,n_injured,incident_url,source_url,incident_url_fields_missing,...,participant_age,participant_age_group,participant_gender,participant_name,participant_relationship,participant_status,participant_type,sources,state_house_district,state_senate_district
0,461105,2013-01-01,Pennsylvania,Mckeesport,1506 Versailles Avenue and Coursin Street,0,4,http://www.gunviolencearchive.org/incident/461105,http://www.post-gazette.com/local/south/2013/0...,False,...,0::20,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male||1::Male||3::Male||4::Female,0::Julian Sims,,0::Arrested||1::Injured||2::Injured||3::Injure...,0::Victim||1::Victim||2::Victim||3::Victim||4:...,http://pittsburgh.cbslocal.com/2013/01/01/4-pe...,,
1,460726,2013-01-01,California,Hawthorne,13500 block of Cerise Avenue,1,3,http://www.gunviolencearchive.org/incident/460726,http://www.dailybulletin.com/article/zz/201301...,False,...,0::20,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male,0::Bernard Gillis,,0::Killed||1::Injured||2::Injured||3::Injured,0::Victim||1::Victim||2::Victim||3::Victim||4:...,http://losangeles.cbslocal.com/2013/01/01/man-...,62.0,35.0
2,478855,2013-01-01,Ohio,Lorain,1776 East 28th Street,1,3,http://www.gunviolencearchive.org/incident/478855,http://chronicle.northcoastnow.com/2013/02/14/...,False,...,0::25||1::31||2::33||3::34||4::33,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male||1::Male||2::Male||3::Male||4::Male,0::Damien Bell||1::Desmen Noble||2::Herman Sea...,,"0::Injured, Unharmed, Arrested||1::Unharmed, A...",0::Subject-Suspect||1::Subject-Suspect||2::Vic...,http://www.morningjournal.com/general-news/201...,56.0,13.0
3,478925,2013-01-05,Colorado,Aurora,16000 block of East Ithaca Place,4,0,http://www.gunviolencearchive.org/incident/478925,http://www.dailydemocrat.com/20130106/aurora-s...,False,...,0::29||1::33||2::56||3::33,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Female||1::Male||2::Male||3::Male,0::Stacie Philbrook||1::Christopher Ratliffe||...,,0::Killed||1::Killed||2::Killed||3::Killed,0::Victim||1::Victim||2::Victim||3::Subject-Su...,http://denver.cbslocal.com/2013/01/06/officer-...,40.0,28.0
4,478959,2013-01-07,North Carolina,Greensboro,307 Mourning Dove Terrace,2,2,http://www.gunviolencearchive.org/incident/478959,http://www.journalnow.com/news/local/article_d...,False,...,0::18||1::46||2::14||3::47,0::Adult 18+||1::Adult 18+||2::Teen 12-17||3::...,0::Female||1::Male||2::Male||3::Female,0::Danielle Imani Jameison||1::Maurice Eugene ...,3::Family,0::Injured||1::Injured||2::Killed||3::Killed,0::Victim||1::Victim||2::Victim||3::Subject-Su...,http://myfox8.com/2013/01/08/update-mother-sho...,62.0,27.0


In [3]:
#create new dataframe with only the columns needed for data analysis
ANALYSIS_COLUMNS = [
    'date', 'state', 'n_killed', 'n_injured',
    'participant_age_group', 'participant_gender', 'participant_relationship',
    'gun_type', 'latitude', 'longitude', 'location_description',
]

#.copy() makes df an independent frame instead of a slice of `data`, so the column
#assignments made on df in later cells (year, month, total_casualties) are applied to df
#itself and do not raise pandas' SettingWithCopyWarning
df = data[ANALYSIS_COLUMNS].copy()
df.head()

Unnamed: 0,date,state,n_killed,n_injured,participant_age_group,participant_gender,participant_relationship,gun_type,latitude,longitude,location_description
0,2013-01-01,Pennsylvania,0,4,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male||1::Male||3::Male||4::Female,,,40.3467,-79.8559,
1,2013-01-01,California,1,3,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male,,,33.909,-118.333,
2,2013-01-01,Ohio,1,3,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male||1::Male||2::Male||3::Male||4::Male,,0::Unknown||1::Unknown,41.4455,-82.1377,Cotton Club
3,2013-01-05,Colorado,4,0,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Female||1::Male||2::Male||3::Male,,,39.6518,-104.802,
4,2013-01-07,North Carolina,2,2,0::Adult 18+||1::Adult 18+||2::Teen 12-17||3::...,0::Female||1::Male||2::Male||3::Female,3::Family,0::Handgun||1::Handgun,36.114,-79.9569,


In [4]:
#convert the date column to datetimes once, then split out year and month for ease of analysis later
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates
df['year'] = parsed_dates.dt.year
df['month'] = parsed_dates.dt.month

#check dataframe now that the new columns are in place
df.head()




Unnamed: 0,date,state,n_killed,n_injured,participant_age_group,participant_gender,participant_relationship,gun_type,latitude,longitude,location_description,year,month
0,2013-01-01,Pennsylvania,0,4,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male||1::Male||3::Male||4::Female,,,40.3467,-79.8559,,2013,1
1,2013-01-01,California,1,3,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male,,,33.909,-118.333,,2013,1
2,2013-01-01,Ohio,1,3,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male||1::Male||2::Male||3::Male||4::Male,,0::Unknown||1::Unknown,41.4455,-82.1377,Cotton Club,2013,1
3,2013-01-05,Colorado,4,0,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Female||1::Male||2::Male||3::Male,,,39.6518,-104.802,,2013,1
4,2013-01-07,North Carolina,2,2,0::Adult 18+||1::Adult 18+||2::Teen 12-17||3::...,0::Female||1::Male||2::Male||3::Female,3::Family,0::Handgun||1::Handgun,36.114,-79.9569,,2013,1


In [5]:
#total casualties per incident = number killed + number injured
df['total_casualties'] = df['n_killed'].add(df['n_injured'])

#show the most serious incidents first (i.e. highest number of casualties)
#display only: the sorted result is not assigned, so df itself keeps its current row order
df.sort_values('total_casualties', ascending=False)



Unnamed: 0,date,state,n_killed,n_injured,participant_age_group,participant_gender,participant_relationship,gun_type,latitude,longitude,location_description,year,month,total_casualties
130448,2016-06-12,Florida,50,53,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male||1::Male||2::Male||3::Male||4::Male||5...,,0::223 Rem [AR-15]||1::9mm,28.5195,-81.3767,Pulse,2016,6,103
217151,2017-11-05,Texas,27,20,0::Teen 12-17||1::Child 0-11||2::Child 0-11||3...,0::Female||1::Female||2::Female||3::Female||4:...,,0::223 Rem [AR-15]||1::Rifle||2::9mm||3::22 LR,29.2733,-98.0564,First Baptist Church,2017,11,47
101531,2015-12-02,California,16,19,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male||1::Male||3::Male||4::Male||5::Female|...,,0::223 Rem [AR-15]||1::223 Rem [AR-15]||2::9mm...,34.0758,-117.2770,Inland Regional Center,2015,12,35
232745,2018-02-14,Florida,17,17,0::Adult 18+||1::Teen 12-17||2::Teen 12-17||3:...,0::Male||1::Male||2::Male||3::Male||4::Female|...,,0::223 Rem [AR-15]||1::Handgun,26.3045,-80.2694,Marjory Stoneman Douglas High School,2018,2,34
70511,2015-05-17,Texas,9,18,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male||1::Male||2::Male||3::Male||4::Male||5...,,,31.5039,-97.1290,Twin Peaks Restaurant,2015,5,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19426,2014-05-30,Virginia,0,0,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male||3::Female||4::Female,,,38.8451,-77.0764,,2014,5,0
70146,2015-05-15,Washington,0,0,0::Adult 18+,0::Male,,,47.6611,-117.4320,,2015,5,0
70147,2015-05-15,North Carolina,0,0,0::Adult 18+,0::Male,,,35.2430,-81.1897,,2015,5,0
70149,2015-05-15,Iowa,0,0,0::Teen 12-17||1::Teen 12-17||2::Teen 12-17||3...,0::Male||1::Male||2::Male||3::Male,,0::9mm||1::9mm||2::40 SW||3::40 SW||4::22 LR,42.4986,-92.3149,,2015,5,0


In [None]:
#create a map to see the geographical distribution of incidents by number of total casualties 
#zoom in to view state names

#keep only incidents with at least one casualty: the notes written about this map describe
#"incidents that result in casualties", and the previous `>= 0` filter excluded nothing
#(zero-casualty rows were still added as radius-0 markers)
#dropna() removes rows with missing values in any of the selected columns
df_map = df.loc[df['total_casualties'] > 0, ['latitude', 'longitude', 'total_casualties', 'n_killed']].dropna()

#base map with the initial view at (39.50, -98.35), zoom level 4
gun_violence_map_all = folium.Map([39.50, -98.35], zoom_start=4, tiles='OpenStreetMap')

#one circle per incident, radius = 0.30 per casualty
#itertuples avoids building a Series per row, which makes it much faster than iterrows
#on a frame with hundreds of thousands of rows
for incident in df_map.itertuples(index=False):
    folium.CircleMarker(
        [float(incident.latitude), float(incident.longitude)],
        radius=float(incident.total_casualties * 0.30),
        color='red',
        fill=True,
    ).add_to(gun_violence_map_all)

gun_violence_map_all


#from the map, it seems that gun violence incidents that result in casualties (i.e. injury or death) happen across the US, except for small pockets which might be remote or uninhabited areas

In [None]:
"""
since the map showing all incidents with casualties provides little information on difference between states, 
let's zoom into a smaller subset and map the incidents where total casualties>10
"""

#incidents with 10 or more total casualties; dropna() removes rows with missing values
#in any of the selected columns
df_map = df.loc[df['total_casualties'] >= 10, ['latitude', 'longitude', 'total_casualties', 'n_killed']].dropna()

#base map with the initial view at (39.50, -98.35), zoom level 4
gun_violence_map5 = folium.Map([39.50, -98.35], zoom_start=4, tiles='OpenStreetMap')

#one circle per incident, radius = 0.30 per casualty
#itertuples avoids building a Series per row, unlike iterrows
for incident in df_map.itertuples(index=False):
    folium.CircleMarker(
        [float(incident.latitude), float(incident.longitude)],
        radius=float(incident.total_casualties * 0.30),
        color='blue',
        fill=True,
    ).add_to(gun_violence_map5)

gun_violence_map5

In [None]:
"""

Now that we've looked at the geographical distribution of incidents, let's look at the specific locations 
where incidents took place using a word cloud. First we will drop NaN values.
*since only wordcloud analysis will be done, can drop rather than replace with unknown

"""
#keep only the incidents that have a location description
#NOTE: this rebinds df to the filtered frame, so every later use of df sees only these rows
has_location = df['location_description'].notna()
df = df[has_location]



In [None]:
#create word cloud to visualize the most common places gun violence incidences take place at

#import gun image; its pixel array is handed to WordCloud as the mask
gun_mask = np.array(Image.open("gun.jpg"))

stopwords = set(STOPWORDS)

#build a single lowercase, space-separated string from every location description
#dropna() makes this cell safe on its own: a missing description would otherwise be
#turned into the literal text "nan" by str() and counted as a word
#joining once over a generator replaces the manual index loop and repeated string +=
location_texts = df['location_description'].dropna()
comment_words = "".join(
    " ".join(word.lower() for word in str(text).split()) + " "
    for text in location_texts
)

#NOTE: per the wordcloud documentation, width/height are ignored when a mask is supplied
#(the mask's shape is used instead); they are kept here so the call is unchanged
wordcloud = WordCloud(width = 800, height = 800,
                background_color ='black',
                stopwords = stopwords,mask=gun_mask,
                min_font_size = 9).generate(comment_words)

# plot the WordCloud image without axes or padding
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)

plt.show()