In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt      # For base plotting
# Seaborn is a library for making statistical graphics
# in Python. It is built on top of matplotlib and 
#  numpy and pandas data structures.
import seaborn as sns                # Easier plotting

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which files the Kaggle input directory provides before loading anything.
input_dir_entries = os.listdir("../input/")
print(input_dir_entries)

# Any results you write to the current directory are saved as output.

In [None]:
# Load the incident-level CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer='../input/gun-violence-data_01-2013_03-2018.csv',
)


Let's read the file and try to understand the data. 

In [None]:
# Column labels of the loaded frame (rendered as the cell output).
data.columns

In [None]:
data.shape                           # (n_rows, n_columns) of the loaded frame, i.e. its dim()


In [None]:
data.head(1)                          # head(): first row only, as one example record


The dataset has nearly 240,000 data points, each describing one gun violence incident. 
Let us add an attribute that sums the number of people killed and the number of people injured in each incident. 

In [None]:
# Total people hit per incident: fatalities plus non-fatal injuries.
data['victims'] = data['n_killed'].add(data['n_injured'])


In [None]:
# Re-check the frame's dimensions after assigning the 'victims' column.
data.shape

Let us also extract Year and Month from the dates and add them as separate attributes. We might be able to see some pattern or trend year-on-year or if there are some particular months when the activity increases.

In [None]:
# Parse the raw 'date' column into datetimes, then derive calendar year and
# month as separate columns for the year-wise / month-wise analysis below.
data['date'] = pd.to_datetime(data['date'])

date_parts = data['date'].dt
data['year'], data['month'] = date_parts.year, date_parts.month

# Cell output: dimensions of the frame with the two new columns.
data.shape


Let's assess the data state-wise, year-wise, month-wise, incident-wise....

In [None]:
# How many states appear in the data, and how many incidents were reported in each?
# A notebook cell renders only its final expression, so the number of states and
# the list of states are printed explicitly; the per-state distribution is the
# cell output.
states_seen = data['state'].unique()
print('Number of states with reported incidents:', len(states_seen))
print(states_seen)

data['state'].value_counts()        # Distribution of incidents over states


There is a significant variation across states in the number of gun violence incidents. Let's see which are the states with the highest incidence.

In [None]:
# Bar chart of the 20 states with the most reported incidents.
top_state_counts = data['state'].value_counts()[:20]
hi_crime_states = top_state_counts.index.tolist()

# One bar position per state, labelled with the state name.
y_pos = np.arange(len(hi_crime_states))

fig, ax = plt.subplots()
ax.bar(y_pos, top_state_counts, align='center', alpha=0.5)
ax.set_xticks(y_pos)
ax.set_xticklabels(hi_crime_states, rotation='vertical')
ax.set_ylabel('No. of incidents')
ax.set_title('States with highest number of gun violence incidents')

plt.show()


We see that IL, CA and FL have the highest number of these incidents in the reported period.  
Let's also see if there is some trend visible across the years.

In [None]:
# Year-wise totals: people killed or injured (dd) and number of incidents (qq).
# Both series are indexed by year. To restrict the years, filter on that index
# (e.g. dd[dd.index > 2013]); a mask such as dd > 2013 compares the victim
# totals themselves with 2013 and therefore does not select years.
dd = data.groupby(['year'])['victims'].sum()
qq = data.groupby(['year'])['incident_id'].count()

# A cell renders only its final expression, so show both series side by side.
pd.DataFrame({'victims': dd, 'incidents': qq})

In [None]:

# Month-wise view: is there any trend in the number of victims or incidents?
mm = data.groupby(['month'])

# Exclude the 2018 rows before aggregating by month. The input file name says
# the data ends at 03-2018, so 2018 would add counts to the first months only.
# A row-level boolean mask is required here: GroupBy.filter() keeps or drops
# whole month groups, and every month group contains rows from other years, so
# a predicate like (x['year'] != 2018).any() removes nothing.
mmfiltered = data[data['year'] != 2018]

# Victims and incidents per calendar month, shown together as the cell output.
(
    mmfiltered
    .groupby(['month'])
    .agg({'victims': 'sum', 'incident_id': 'count'})
    .rename(columns={'incident_id': 'incidents'})
)


In [None]:
# Bar chart of incident counts per calendar month (bars labelled by month initial).
xx = mmfiltered.groupby(['month'])['incident_id'].count()
y_pos = np.arange(len(xx))

fig, ax = plt.subplots()
ax.bar(y_pos, xx, align='center', alpha=0.5)
ax.set_xticks(y_pos)
ax.set_xticklabels('JFMAMJJASOND', rotation='vertical')
ax.set_ylabel('No. of incidents')
ax.set_title('Month wise trend')
plt.show()


Let's check the number of people that are typically killed or injured in each of these incidents.

In [None]:
# Victim count of every incident, largest first, to see how many people are
# typically killed or injured per incident.
victims_desc = data['victims'].sort_values(ascending=False)
victims_desc


In [None]:
# How many victims are there in each incident?
print('Largest victim count in a single incident:', data['victims'].max())

# Coarse bins (width 4) covering the full range of victim counts.
bin_values = np.arange(start=0, stop=120, step=4)
print(bin_values)

# Unit-width bins for the low range (used for the "at most 10 victims" plot).
# The edges must run up to 11 so that 9 and 10 victims each get their own bin:
# with edges stopping at 9, incidents with 10 victims fall outside the
# histogram and the counts for 8 and 9 are merged into the last bin.
bin_values2 = np.arange(start=0, stop=12, step=1)

data['victims'].hist(bins=bin_values, figsize=[8,2])


This graph is heavily skewed: most incidents have a small number of victims, while a few incidents have a very large number. Let's split the range into two parts and plot each separately.

In [None]:


# Zoom in on the bulk of the data: incidents with at most 10 victims.
low_victim_counts = data.loc[data['victims'] <= 10, 'victims']
low_victim_counts.hist(bins=bin_values2, figsize=[8,2])


In [None]:
# The tail of the distribution: incidents with more than 10 victims.
high_victim_counts = data.loc[data['victims'] > 10, 'victims']
high_victim_counts.hist(bins=bin_values, figsize=[8,2])


In [None]:
# Frequency table: for each victim count, the number of incidents with that count.
data['victims'].value_counts()


Let's find the age of the suspect in each of these incidents. Are the suspects old? At what age are people most likely to commit such crimes?

However, this is not so easy to extract, because each participant attribute is stored as a single `||`-separated string that covers victims and suspects together. 

In [None]:
from collections import Counter


def _parse_participant_field(field):
    """Parse one participant_* cell into a {participant_index: value} dict.

    Cells are strings of the form 'idx::value||idx::value'. Entries written with
    single separators ('idx:value|idx:value') are accepted as well -- the data
    appears to contain some of those; verify against the CSV. A missing cell
    (NaN/None) yields an empty dict, and entries without any ':' are skipped.
    """
    if pd.isnull(field):
        return {}
    parsed = {}
    # Normalise '||' to '|' so both separator styles split the same way.
    for entry in str(field).replace('||', '|').split('|'):
        idx, sep, value = entry.partition(':')
        if sep:
            # After partitioning on the first ':', a '::' separator leaves one
            # leading ':' on the value; strip it.
            parsed[idx.strip()] = value.lstrip(':').strip()
    return parsed


# Collect the age (as a string of digits) of every participant tagged as a suspect.
# Iterating the two columns directly visits every row, including the last one,
# and does not depend on the frame having a 0..n-1 index.
ageOfSuspect = []
for type_field, age_field in zip(data['participant_type'], data['participant_age']):
    ages_by_idx = _parse_participant_field(age_field)
    if not ages_by_idx:
        continue
    for idx, role in _parse_participant_field(type_field).items():
        # Match age and role on the full participant index, so that participant
        # 1 is not confused with participants 10-19, and keep only clean
        # numeric ages so no junk keys have to be deleted afterwards.
        age = ages_by_idx.get(idx, '')
        if 'Subject-Suspect' in role and age.isdecimal():
            ageOfSuspect.append(age)

print('Suspects with a recorded age:', len(ageOfSuspect))

# Number of suspects per age label (keys are strings such as '15').
c = Counter(ageOfSuspect)
print(c.items())


Let's plot it to see the age profile of Suspects.

In [None]:
# Age profile of suspects: one bar per age, in numeric age order.
# `row` maps age label -> number of suspects and is reused by the cells below.
row = pd.Series(dict(c))
df = row.to_frame().T          # same counts as a single-row frame, ages as columns


def _age_order(label):
    """Sort key: numeric age labels first, in numeric order; anything else after."""
    text = str(label)
    return (0, int(text), text) if text.isdecimal() else (1, 0, text)


# The Counter keys are strings, so a plain sort_index() would order them as
# text (e.g. '5' after '49'); order the labels by their numeric value instead.
ax = row.loc[sorted(row.index, key=_age_order)].plot(kind='bar')
ax.set_xlabel('Suspect age')
ax.set_ylabel('No. of suspects')
plt.show()



Since there are too many values on the x-axis, let's look at a subset of the data: the ages for which more than 1000 suspects are recorded.

In [None]:

# Keep only the ages with more than 1000 recorded suspects and plot them
# in index order.
row2 = row.loc[row.gt(1000)]
row2.sort_index().plot.bar()


How is this age group distributed? Let's see it on a boxplot.

In [None]:
# Distribution of suspect ages. `row` holds one count per age label, so plotting
# it directly would summarise the counts rather than the ages. Expand it to one
# age value per suspect first; labels that are not numeric are dropped.
age_values = pd.to_numeric(pd.Series(row.index, index=row.index), errors='coerce')
has_age = age_values.notna()
suspect_ages = np.repeat(age_values[has_age].values, row[has_age].values.astype(int))

ax = sns.boxplot(x=suspect_ages)
ax.set_xlabel('Suspect age')
plt.show()

In [None]:
# Density of suspect ages. `row` holds one count per age label, so plotting it
# directly would describe the counts rather than the ages. Expand it to one age
# value per suspect first; labels that are not numeric are dropped.
age_values = pd.to_numeric(pd.Series(row.index, index=row.index), errors='coerce')
has_age = age_values.notna()
suspect_ages = np.repeat(age_values[has_age].values, row[has_age].values.astype(int))

ax = sns.violinplot(x=suspect_ages)
ax.set_xlabel('Suspect age')
plt.show()

A lot more information can be extracted from the dataset - particularly on the participants - both victims and suspects.

However, we can still infer that the number of suspects peaks in the late teens and early twenties, with most suspects falling in the 16-36 year range. Further, we see that most of the incidents have fewer than 10 victims, although there are a few stray events with 90-100 victims. 

There is no significant trend month-wise. Yearly rate of crime has increased slightly recently but there is not enough data to suggest a trend.

Another observation is on the states that have a high level of gun violence - however that information needs to be further assessed on other parameters such as population of state and regulations with respect to possession of guns.