# Author : Akash Kothare

Data Science & Business Analytics Intern (Batch - Dec'20)

## Task 4: Exploratory Data Analysis - Terrorism


## Importing Libraries

In [None]:
#basic imports
import numpy as np
import pandas as pd
#for visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import animation, rc
plt.style.use('fivethirtyeight')
import plotly.offline as py
py.init_notebook_mode(connected = True)
from mpl_toolkits.basemap import Basemap
import io
import base64
import codecs
import warnings
warnings.filterwarnings('ignore')
from IPython.display import HTML, display

## Loading Dataset

In [None]:
# Read the Global Terrorism Database CSV, decoding it as ISO-8859-1.
csv_path = "../input/tsf-datasets/globalterrorismdb_0718dist.csv"
data = pd.read_csv(csv_path, encoding = 'ISO-8859-1')

In [None]:
# Preview the first five rows of the raw data.
data.head()

In [None]:
# (rows, columns) of the raw data.
data.shape

In [None]:
# Summary statistics of the raw data.
data.describe()

In [None]:
# All raw column labels, to choose the subset kept for the analysis.
data.columns

In [None]:
# Keep only the 23 columns used in this analysis and give them readable names.
# .rename() without inplace returns a new DataFrame, so later column assignments
# on df do not act on a slice of `data` (which triggers SettingWithCopyWarning).
keep_columns = [
    'eventid', 'iyear', 'country_txt', 'region_txt', 'latitude', 'longitude',
    'provstate', 'city', 'crit1', 'crit2', 'crit3', 'success', 'suicide',
    'attacktype1_txt', 'targtype1_txt', 'targsubtype1_txt', 'natlty1_txt',
    'gname', 'guncertain1', 'claimed', 'weaptype1_txt', 'nkill', 'nwound',
]
# latitude and longitude keep their original names.
readable_names = {
    'eventid' : 'EventId', 'iyear' : 'Year', 'country_txt' : 'Country',
    'region_txt' : 'Region', 'provstate' : 'Provstate', 'city' : 'City',
    'crit1' : 'Crit1', 'crit2' : 'Crit2', 'crit3' : 'Crit3',
    'success' : 'Success', 'suicide' : 'Suicide',
    'attacktype1_txt' : 'AttackType', 'targtype1_txt' : 'TargType',
    'targsubtype1_txt' : 'TargSubType', 'natlty1_txt' : 'Nationality',
    'gname' : 'Group', 'guncertain1' : 'GunCertain1', 'claimed' : 'Claimed',
    'weaptype1_txt' : 'WeaponType', 'nkill' : 'Killed', 'nwound' : 'Wounded',
}
df = data[keep_columns].rename(columns = readable_names)

In [None]:
# Column labels of the working frame after the rename.
df.columns

Feature Engineering :
Creating new features from the existing features.

In [None]:
# "Casualities" = Killed + Wounded for each event (NaN when either count is missing).
killed_plus_wounded = df['Killed'] + df['Wounded']
df['Casualities'] = killed_plus_wounded
# Collapse every "Hostage Taking ..." variant into the single label "Hostage Taking".
merged_attack_type = df['AttackType'].replace(to_replace = 'Hostage Taking .*', value = 'Hostage Taking', regex = True)
df['AttackType'] = merged_attack_type

One of the important things to know about your dataset is how many null values there are in each feature.

In [None]:
# Number of missing values in each column.
df.isnull().sum()

## Statistics about the data

In [None]:
# Five most frequent values of Country, with their row counts.
df['Country'].value_counts().head()

In [None]:
# Five most frequent values of Region, with their row counts.
df['Region'].value_counts().head()

In [None]:
# Five most frequent values of Group, with their row counts.
df['Group'].value_counts().head()

Here, 82782 attacks are attributed to "Unknown": there is no data on which group carried them out.

In [None]:
# Five most frequent values of City, with their row counts.
df['City'].value_counts().head()

In [None]:
# Up to ten most frequent attack types (after the hostage-taking merge), with their row counts.
df['AttackType'].value_counts().head(10)

Here, we can observe that the most common type of attack is **Bombing/Explosion** with 88255 occurrences, followed by **Armed Assault** with 42669 incidents.

In [None]:
# Five most frequent target types, with their row counts.
df['TargType'].value_counts().head()

The top target type for terrorist attacks is **"Private Citizens & Property"**.

In [None]:
# Five most frequent target sub-types, with their row counts.
df['TargSubType'].value_counts().head()

In [None]:
# Headline figures: the most frequent country and region, and the event with the highest death toll.
top_country = df['Country'].value_counts().index[0]
top_region = df['Region'].value_counts().index[0]
max_killed = df['Killed'].max()
deadliest_event = df.loc[df['Killed'].idxmax()]
print('Country with Highest Terrorist Attacks:', top_country)
print('Regions with Highest Terrorist Attacks:', top_region)
print('Maximum people killed in an attack are:', max_killed, 'that took place in', deadliest_event.Country)

In [None]:
# Number of recorded attacks per year, ordered from the most to the fewest attacks (not chronologically).
df['Year'].value_counts()

In [None]:
# Number of attacks per targeted nationality, most frequent first.
df['Nationality'].value_counts()

In [None]:
# The first index label of value_counts() is the most frequent nationality.
top_nationality = df['Nationality'].value_counts().index[0]
print("Nationality of the maximally targetted group is:", top_nationality)

## Visualizing the Data

Plotting the global trend of terrorist activities over time:

In [None]:
# Number of attacks per year.
# The column is passed as x=...: from seaborn 0.12 the first positional argument
# of countplot is `data`, so countplot('Year', data = df) raises a TypeError there.
fig, ax = plt.subplots(figsize = (15, 8))
sns.countplot(x = 'Year', data = df, edgecolor = sns.color_palette('dark', 7), ax = ax)
ax.tick_params(axis = 'x', labelrotation = 90)
ax.set_xlabel("Year of Attack")
ax.set_ylabel("Count")
ax.set_title("Number of Terrorist Activities Each Year")
plt.show()

From the above graph, it's very clear that global terrorist activity hit its lowest point at the end of the 20th century. But all of a sudden, it surged to a whopping 18,000 mark in 2014.

We need to ask why terrorism rose after 2000. There may be various reasons for this that are not available in the dataset.

In [None]:
# Number of attacks per attack type, most frequent first.
# The column is passed as x=...: from seaborn 0.12 the first positional argument
# of countplot is `data`, so the positional form raises a TypeError there.
fig, ax = plt.subplots(figsize = (15, 4))
sns.countplot(x = 'AttackType', data = df, palette = 'inferno', order = df['AttackType'].value_counts().index, ax = ax)
ax.tick_params(axis = 'x', labelrotation = 90)
ax.set_xlabel('Attack Type')
ax.set_title('Attacking Methods by Terrorists')
plt.show()

In [None]:
# Create table_1: per attack type, total casualties, number of incidents and
# casualties per incident.

# Colours assigned to five named attack types (used when drawing the chart).
cat = ['Bombing/Explosion', 'Armed Assault', 'Assassination', 'Hostage Taking', 'Facility/Infrastructure Attack']
color_cat = sns.color_palette("Set2", 8)[:5]
color_cat_dict = dict(zip(cat, color_cat))

# Total casualties per attack type, largest first.
table_1 = df[['AttackType','Casualities']].groupby('AttackType', as_index = False).sum().sort_values(by = 'Casualities', ascending = False)
table_1 = table_1.reset_index()

# Incidents are counted from the data and matched to each row by attack-type
# label. The previous hard-coded list was assigned by position, so it silently
# mis-assigned counts if the sort order changed and raised if the number of
# attack types differed from eight.
incidents_per_type = df['AttackType'].value_counts()
table_1["Incidents"] = table_1['AttackType'].map(incidents_per_type)

table_1["Casuality_Rate"] = table_1["Casualities"]/table_1["Incidents"]
table_1

In [None]:
# Draw chart of Terrorist Attack Types and the Damage:
# bars = total casualties per attack type (left axis),
# dashed line = casualties per incident (right axis).
labels = table_1['AttackType'].tolist()
x = np.arange(len(labels))
cas = table_1['Casualities'].tolist()

gray = (0.5843137254901961, 0.6470588235294118, 0.6509803921568628)
# Attack types listed in color_cat_dict get their own colour; all others are gray.
color_list = [color_cat_dict.get(ter_type, gray) for ter_type in labels]

c_rate = table_1["Casuality_Rate"].tolist()
fig, ax1 = plt.subplots(figsize=(15,8))

# Bar chart
ax1.bar(labels, cas,
       color = color_list,
       align = 'center')

# Number in bar chart
# NOTE(review): the bar whose total equals exactly 37209 gets its label drawn
# inside the bar in white; presumably that avoids a clash with the line plot.
# The value is tied to one particular version of the data - confirm it still matches.
INSIDE_LABEL_VALUE = 37209
for i, v in enumerate(cas):
    label_inside = (v == INSIDE_LABEL_VALUE)
    ax1.text(i - 0.3, v - 13000 if label_inside else v + 3000, str(round(v)),
             color = 'w' if label_inside else 'k',
             fontweight = 'bold')


# Insert a second plot - line plot on a twin y-axis
ax2 = ax1.twinx()
ax2.plot(labels, c_rate, linestyle = '--', linewidth = 4, marker = 'o',
         markerfacecolor = 'black', markersize = 10,
         label = 'Mean Of Dead/Injured People',
         color = '#C44D51')

plt.title('Terrorist Attack Types and Damage', fontsize = 25, pad = 20, weight = 'bold',
             color = sns.cubehelix_palette(8, start = .5, rot = -.75)[-3])

ax1.set(xlabel='Types Of Terrorist Attacks', ylabel='Number Of Dead/Injured People')

# Rotate the existing tick labels instead of re-setting them with
# set_xticklabels(), which warns when the tick positions are not fixed first.
ax1.tick_params(axis = 'x', labelrotation = 45)

plt.yticks(fontsize = 10)

ax2.legend(loc = 'upper center')
# plt.show() instead of fig.show(): the latter only emits a warning on the
# non-GUI inline backend selected by %matplotlib inline.
plt.show()

From this, we can say that even though "Hijacking" is less frequent than the other attack types, its casualty rate is far higher than the trend for the other terrorist activities.

In [None]:
# Number of attacks per target type, most frequent first.
# The column is passed as x=...: from seaborn 0.12 the first positional argument
# of countplot is `data`, so the positional form raises a TypeError there.
fig, ax = plt.subplots(figsize = (15,5))
sns.countplot(x = 'TargType', data = df, palette = 'inferno', order = df['TargType'].value_counts().index, ax = ax)
ax.tick_params(axis = 'x', labelrotation = 90)
ax.set_xlabel('Target Type')
ax.set_title('Target by Terrorists')
plt.show()

In [None]:
# Number of attacks per target sub-type, most frequent first.
# The column is passed as x=...: from seaborn 0.12 the first positional argument
# of countplot is `data`, so the positional form raises a TypeError there.
fig, ax = plt.subplots(figsize = (20,5))
sns.countplot(x = 'TargSubType', data = df, palette = 'inferno', order = df['TargSubType'].value_counts().index, ax = ax)
ax.tick_params(axis = 'x', labelrotation = 90)
ax.set_xlabel('Sub Target Type')
ax.set_title('Sub Targets by Terrorists')
plt.show()

In [None]:
# The first five index labels of value_counts() are the five most frequent sub-targets.
top_sub_targets = df['TargSubType'].value_counts().index[:5]
print("The top sub targets of terrorists are:", top_sub_targets)

In [None]:
# Number of attacks per region, most frequent first.
# The column is passed as x=...: from seaborn 0.12 the first positional argument
# of countplot is `data`, so the positional form raises a TypeError there.
fig, ax = plt.subplots(figsize = (15,5))
sns.countplot(x = 'Region', data = df, palette = 'inferno', order = df['Region'].value_counts().index, ax = ax)
ax.tick_params(axis = 'x', labelrotation = 90)
ax.set_xlabel('Region')
ax.set_title('Number Of Terrorist Activities By Region')
plt.show()

In [None]:
# Number of attacks for the 15 countries with the most attacks.
# The column is passed as x=...: from seaborn 0.12 the first positional argument
# of countplot is `data`, so the positional form raises a TypeError there.
fig, ax = plt.subplots(figsize = (15,5))
sns.countplot(x = 'Country', data = df, palette = 'inferno', order = df['Country'].value_counts()[:15].index, ax = ax)
ax.tick_params(axis = 'x', labelrotation = 90)
ax.set_xlabel('Countries')
ax.set_title('Number Of Terrorist Activities By Countries')
plt.show()

Plotting the locations where the terrorist attacks claimed fewer than 100 casualties:

In [None]:
# Casualties per attack (killed + wounded); NaN when either count is missing.
# NOTE(review): same formula as the 'Casualities' column created in the
# feature-engineering cell; only the capitalisation of the column name differs.
df['casualities'] = df['Killed'] + df['Wounded']

In [None]:
# World map of attacks with fewer than 100 casualties (small green dots).
m1 = Basemap(projection = 'mill', llcrnrlat = -80, urcrnrlat = 80, llcrnrlon = -180, urcrnrlon = 180)
# Filter once, and drop events without coordinates: they cannot be placed on the map.
low_casualty_events = df[df['casualities'] < 100].dropna(subset = ['latitude', 'longitude'])
lat_100 = list(low_casualty_events.latitude)
long_100 = list(low_casualty_events.longitude)
x_100,y_100 = m1(long_100, lat_100)
m1.drawcoastlines()
m1.drawcountries()
# 'o' sets only the marker shape; the colour comes from `color` alone
# (the old 'go' format string specified the colour a second time).
m1.plot(x_100, y_100, 'o', markersize = 0.5, color = 'g')
fig=plt.gcf()
fig.set_size_inches(15, 10)
plt.title("Terroist attacks with lesser than 100 casualities")
plt.show()

From the above map, India, Middle east and European countries are the favorite targets of the terrorist groups.

Plotting the locations where the terrorist attacks claimed more than 100 casualities :

In [None]:
# World map of attacks with 100 or more casualties (red dots).
m2 = Basemap(projection = 'mill', llcrnrlat = -80, urcrnrlat = 80, llcrnrlon = -180, urcrnrlon = 180)
# Filter once, and drop events without coordinates: they cannot be placed on the map.
high_casualty_events = df[df['casualities'] >= 100].dropna(subset = ['latitude', 'longitude'])
lat_100 = list(high_casualty_events.latitude)
long_100 = list(high_casualty_events.longitude)
x_100,y_100 = m2(long_100, lat_100)
m2.drawcoastlines()
m2.drawcountries()
# 'o' sets only the marker shape; the old 'go' format string asked for green
# while color = 'r' asked for red.
m2.plot(x_100, y_100, 'o', markersize = 5, color = 'r')
fig = plt.gcf()
fig.set_size_inches(15, 10)
plt.title("Terroist attacks with more than 100 Casualities")
plt.show()

Most of the major attacks also happened in the Middle East and in countries like Pakistan and Afghanistan.

Which regions are facing more terrorist attacks worldwide :  

In [None]:
# Attacks per year (rows) and region (columns), drawn as one line per region.
region_palette = sns.color_palette('Set2', 12)
df_region = pd.crosstab(index = df['Year'], columns = df['Region'])
df_region.plot(color = region_palette)
fig = plt.gcf()
fig.set_size_inches(18, 6)
plt.show()

The Middle East, North Africa and South Asia are the regions most affected by terrorism. In this chart too, we can see a sharp drop in global terrorism at the end of the 20th century.

But after that, there was a sudden rise in terrorism worldwide. So it's very clear that something that happened around this time led to this rise. What was it?

**9/11 attack ?**

Let's analyse which type of attack is most common in each region:

In [None]:
# Attack counts with one row per region and one column per attack type.
df_type = pd.crosstab(index = df['Region'], columns = df['AttackType'])
df_type

In [None]:
# Stacked horizontal bars: one bar per region, segmented by attack type.
ax_type = df_type.plot.barh(stacked = True, width = 1)
fig = ax_type.get_figure()
fig.set_size_inches(12 ,8)
ax_type.legend(loc = 'lower right')
plt.show()

From the chart above, it is clear that bombing and explosion is the favourite attack of terrorist groups in every region.

This may be the reason why most number of civilians are killed in the attacks as a single explosion claims lots of lives.

Which countries are most affected by terrorism worldwide:

In [None]:
# The 20 countries with the most attacks, as a one-column frame named "Attacks".
top_country_counts = df['Country'].value_counts().head(20)
count_df = top_country_counts.to_frame()
count_df.columns = ['Attacks']
count_df

Lets mix and match the features and see if we get any insights out of it :

We are going to plot the number of attacks and the number of people killed in a bar chart for every significant country:

In [None]:
# Total number of people killed per country (one row per country, column "Killed").
count_kill = df.groupby('Country')[['Killed']].sum()
count_kill.head()

In [None]:
# Attach the per-country death toll to the top-20 attack counts.
# A left join on the country index keeps exactly the rows of count_df.
attack_kill = count_df.join(count_kill, how = 'left')
attack_kill

Let's plot both of these datasets in a single bar chart:

In [None]:
# Grouped bars per country: number of attacks next to number of people killed.
ax_attack_kill = attack_kill.plot.bar()
fig = ax_attack_kill.get_figure()
fig.set_size_inches(18, 6)

From the above chart, we can come to some obvious conclusions :
* In some Middle Eastern countries like Iraq, the number of people killed is more than twice the number of attacks. High population density may be the reason. Poor prevention and security may also be a reason for this.
* In developed countries like the UK, Spain and France, the number of attacks is greater than the number of people killed. This suggests that these countries are better at safety and at prevention before a terror attack happens. Low population density may also be a reason for this.

Which terror groups are highly active?

In [None]:
# The 20 most frequent values of Group with their attack counts.
group_frequency = df['Group'].value_counts()
count_group = group_frequency.head(20).to_frame()
count_group

In [None]:
# Bar chart of the most active named groups.
# "Unknown" is removed by label rather than by assuming it is always the first
# row; head(19) keeps the same 19 bars as before when "Unknown" is on top.
named_groups = count_group.drop(index = 'Unknown', errors = 'ignore').head(19)
named_groups.plot.bar()
fig = plt.gcf()
fig.set_size_inches(18, 6)
plt.title("Most active terrorist groups")
plt.show()

Plotting the activities of the top 10 groups as a time series:

In [None]:
# Yearly attack counts of the ten most active named groups.
# "Unknown" is excluded by label instead of by skipping position 0, so the
# selection stays correct even if "Unknown" is not the most frequent value.
named_group_counts = df['Group'].value_counts().drop('Unknown', errors = 'ignore')
top_groups10 = df[df['Group'].isin(named_group_counts.index[:10])]
noto_grp = pd.crosstab(top_groups10.Year, top_groups10.Group)
noto_grp.plot(color = sns.color_palette('Paired', 10))
fig = plt.gcf()
fig.set_size_inches(18, 6)
plt.title("Terrorist Activites by the Most Notorious Groups")
plt.show()

The above chart clearly shows that all the terror groups are active for some particular time and stopped their activities after some time before 2000. But after 2000, we can see lot of groups emerging.

We can ask questions like: what happened after the year 2000? What is the motive of these groups? Is somebody creating these groups and funding them for their own benefit?

Most notorious groups like Taliban, ISIS emerged after 2000.

In [None]:
# World map of the attacks of the 14 most frequent Group values, one colour per group.
groups = df[df['Group'].isin(df['Group'].value_counts()[:14].index)]
# NOTE(review): lat_0 = True and lat_1 = True pass booleans where Basemap
# expects latitudes; kept unchanged - confirm whether they are needed at all.
m3 = Basemap(projection = 'mill', llcrnrlat = -80, urcrnrlat = 80, llcrnrlon = -180, urcrnrlon = 180, lat_ts = 20, resolution = 'c', lat_0 = True, lat_1 = True)
m3.drawcoastlines()
m3.drawcountries()
m3.fillcontinents(lake_color =  'aqua')
m3.drawmapboundary(fill_color = 'aqua')
fig = plt.gcf()
fig.set_size_inches(22, 10)
colors = ['r', 'g', 'b', 'y', '#800000', '#ff1100', '#8202fa', '#20fad9', '#ff5733', '#fa02c6', "#f99504", '#b3b6b7', '#8e44ad', '#1a2b3c']
group = list(groups['Group'].unique())
def group_point(group, color, label):
    """Plot every attack of one group on map m3.

    group: value of the 'Group' column to select.
    color: matplotlib colour used for the markers.
    label: legend entry for this group.
    """
    members = groups[groups['Group'] == group]
    lat_group = list(members.latitude)
    long_group = list(members.longitude)
    x_group, y_group = m3(long_group, lat_group)
    # Use the function's own parameters: the previous body read the loop
    # globals `j` and `i`, so it only worked when called from that exact loop.
    # 'o' sets only the marker shape; the colour comes from `color`.
    m3.plot(x_group, y_group, 'o', markersize = 3, color = color, label = label)
for i,j in zip(group,colors):
    group_point(i, j, i)
legend = plt.legend(loc = 'lower left', frameon = True, prop = {'size' : 10})
frame = legend.get_frame()
frame.set_facecolor('white')
plt.title('Terrorist Groups')
plt.show()

In [None]:
# World map of attacks coloured by region (up to the 14 most frequent regions).
groups = df[df['Region'].isin(df['Region'].value_counts()[:14].index)]
# NOTE(review): lat_0 = True and lat_1 = True pass booleans where Basemap
# expects latitudes; kept unchanged - confirm whether they are needed at all.
m4 = Basemap(projection = 'mill', llcrnrlat = -80, urcrnrlat = 80, llcrnrlon = -180, urcrnrlon = 180, lat_ts = 20, resolution = 'c', lat_0 = True, lat_1 = True)
m4.drawcoastlines()
m4.drawcountries()
m4.fillcontinents(lake_color = 'black')
m4.drawmapboundary(fill_color = 'white')
fig = plt.gcf()
fig.set_size_inches(22, 10)
colors = ['r', 'g', 'b', 'y', '#800000', '#ff1100', '#8202fa', '#20fad9', '#ff5733', '#fa02c6', "#f99504", '#b3b6b7', '#8e44ad', '#1a2b3c']
group = list(groups['Region'].unique())
def group_point(group, color, label):
    """Plot every attack of one region on map m4.

    group: value of the 'Region' column to select.
    color: matplotlib colour used for the markers.
    label: legend entry for this region.
    """
    members = groups[groups['Region'] == group]
    lat_group = list(members.latitude)
    long_group = list(members.longitude)
    x_group, y_group =  m4(long_group,lat_group)
    # Use the function's own parameters: the previous body read the loop
    # globals `j` and `i`, so it only worked when called from that exact loop.
    # 'o' sets only the marker shape; the colour comes from `color`.
    m4.plot(x_group, y_group, 'o', markersize = 3, color = color, label = label)
for i,j in zip(group, colors):
    group_point(i, j, i)
legend = plt.legend(loc = 'lower left', frameon = True, prop = {'size' : 14})
frame = legend.get_frame()
frame.set_facecolor('white')
plt.title('Target areas of terrorist attacks')
plt.show()

In [None]:
# World map of attacks in the 14 countries with the most attacks, one colour per country.
groups = df[df['Country'].isin(df['Country'].value_counts()[:14].index)]
# NOTE(review): lat_0 = True and lat_1 = True pass booleans where Basemap
# expects latitudes; kept unchanged - confirm whether they are needed at all.
m5 = Basemap(projection = 'mill', llcrnrlat = -80, urcrnrlat = 80, llcrnrlon = -180, urcrnrlon = 180, lat_ts = 20, resolution = 'c', lat_0 =  True, lat_1 = True)
m5.drawcoastlines()
m5.drawcountries()
m5.fillcontinents(lake_color = 'black')
m5.drawmapboundary(fill_color = 'white')
fig = plt.gcf()
fig.set_size_inches(22, 10)
colors = ['r', 'g', 'b', 'y', '#800000', '#ff1100', '#8202fa', '#20fad9', '#ff5733', '#fa02c6', "#f99504", '#b3b6b7', '#8e44ad', '#1a2b3c']
group = list(groups['Country'].unique())
def group_point(group, color, label):
    """Plot every attack of one country on map m5.

    group: value of the 'Country' column to select.
    color: matplotlib colour used for the markers.
    label: legend entry for this country.
    """
    members = groups[groups['Country'] == group]
    lat_group = list(members.latitude)
    long_group = list(members.longitude)
    x_group, y_group = m5(long_group, lat_group)
    # Use the function's own parameters: the previous body read the loop
    # globals `j` and `i`, so it only worked when called from that exact loop.
    # 'o' sets only the marker shape; the colour comes from `color`.
    m5.plot(x_group, y_group, 'o', markersize = 3, color = color, label = label)
for i,j in zip(group, colors):
    group_point(i, j, i)
legend = plt.legend(loc = 'lower left', frameon = True, prop = {'size' : 16})
frame = legend.get_frame()
frame.set_facecolor('white')
plt.title('Target countries of terrorist attacks')
plt.show()

In [None]:
# Animated world map: one frame per year, marker area proportional to casualties.
fig = plt.figure(figsize = (10, 6))
# One Axes is created up front and reused for every frame. Calling plt.axes()
# inside the callback can add a new Axes to the figure on every frame.
ax = fig.add_subplot(111)
def animate(Year):
    """Redraw the map with the attacks recorded in the given year."""
    ax.clear()
    ax.set_title('Animation Of Terrorist Activities' + '\n' + 'Year:' +str(Year))
    m6 = Basemap(projection = 'mill', llcrnrlat = -80, urcrnrlat = 80, llcrnrlon = -180, urcrnrlon = 180, lat_ts = 20, resolution = 'c', ax = ax)
    # Select the year's events once instead of re-filtering df for every column.
    year_events = df[df['Year'] == Year]
    lat6 = list(year_events.latitude)
    long6 = list(year_events.longitude)
    x6, y6 = m6(long6, lat6)
    # Marker area = 0.1 * (killed + wounded) for each event.
    sizes = ((year_events['Killed'] + year_events['Wounded']) * 0.1).tolist()
    m6.scatter(x6, y6, s = sizes, color = 'r')
    m6.drawcoastlines()
    m6.drawcountries()
    m6.fillcontinents(zorder = 1, alpha = 0.4)
    m6.drawmapboundary()
# Sorted so the frames run chronologically regardless of the row order in df.
ani = animation.FuncAnimation(fig, animate, sorted(df.Year.unique()), interval = 1500)
ani.save('animation.gif', writer = 'imagemagick', fps=1)
# Close this figure explicitly; plt.close(1) closes whichever figure has number 1.
plt.close(fig)
filename = 'animation.gif'
# Read-only binary mode inside a context manager, so the handle is always closed.
with open(filename, 'rb') as gif_file:
    video = gif_file.read()
encoded = base64.b64encode(video)
HTML(data = '''<img src="data:image/gif;base64,{0}" type="gif" />'''.format(encoded.decode('ascii')))

Lets plot the terrorist activities in India :

In [None]:
# Subset of the events whose Country is 'India'.
df_india = df[df['Country'] == 'India']
df_india.head()

Attacks with fewer than 100 casualties in India:

In [None]:
# Map of attacks in India with fewer than 100 casualties (green dots).
mi1 = Basemap(projection = 'mill', llcrnrlat = 5, urcrnrlat = 37, llcrnrlon = 67, urcrnrlon = 99)
# Plot from df_india, not df: the map window also covers parts of neighbouring
# countries, whose attacks were previously drawn on a chart titled "(India)".
# Events without coordinates are dropped because they cannot be placed on the map.
india_low = df_india[df_india['casualities'] < 100].dropna(subset = ['latitude', 'longitude'])
lat_100 = list(india_low.latitude)
long_100 = list(india_low.longitude)
x_100, y_100 = mi1(long_100, lat_100)
mi1.drawcoastlines()
mi1.drawcountries()
# 'o' sets only the marker shape; the colour comes from `color` alone.
mi1.plot(x_100, y_100, 'o', markersize = 1, color = 'g')
fig = plt.gcf()
fig.set_size_inches(15, 10)
plt.title("Terroist attacks with lesser than 100 casualities(India)")
# plt.show() renders the figure; plt.plot() with no arguments only returned an empty list.
plt.show()

It is clear from the above map that states like Andhra, Bihar, Orissa and J&K are the states most affected by terrorism. The presence of Naxals is one of the main reasons for this.

Attacks with 100 or more casualties in India:

In [None]:
# Map of attacks in India with 100 or more casualties (red dots).
mi2 = Basemap(projection = 'mill', llcrnrlat = 5, urcrnrlat = 37, llcrnrlon = 67, urcrnrlon = 99)
# Plot from df_india, not df: the map window also covers parts of neighbouring
# countries, whose attacks were previously drawn on a chart titled "(India)".
# Events without coordinates are dropped because they cannot be placed on the map.
india_high = df_india[df_india['casualities'] >= 100].dropna(subset = ['latitude', 'longitude'])
lat_100 = list(india_high.latitude)
long_100 = list(india_high.longitude)
x_100, y_100 = mi2(long_100, lat_100)
mi2.drawcoastlines()
mi2.drawcountries()
# 'o' sets only the marker shape; the old 'go' format string asked for green
# while color = 'r' asked for red.
mi2.plot(x_100, y_100, 'o', markersize = 5, color = 'r')
fig = plt.gcf()
fig.set_size_inches(15, 10)
plt.title("Terroist attacks with more than 100 casualities(India)")
plt.show()