# EDA: Terrorism

### Importing Libraries

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn import datasets
from matplotlib import pyplot
from pandas.plotting import scatter_matrix
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
import os 
import mpl_toolkits
import json

### Reading Datasets

In [4]:
# Load the complete CSV export for a first look at its structure.
df_terrorism = pd.read_csv(
    r"../input/globalterrorism/globalterrorismdb_0718dist.csv",
    encoding='ISO-8859-1',
)

In [5]:
# Preview the first rows of the freshly loaded frame
df_terrorism.head()

In [6]:
# Dimensions of the DataFrame as a (rows, columns) tuple
df_terrorism.shape

### Cleaning

In [9]:
# Positional indices of the CSV columns that the analysis keeps
usecols = [
    1, 5, 8, 10, 11, 12, 13, 14,
    25, 26, 27, 29, 35, 58, 69, 71,
    82, 98, 100, 101, 103, 104, 106,
]
# Original column name -> shorter name used in the rest of the notebook
renamecols = dict(
    latitude='lat',
    longitude='lon',
    iyear='year',
    country_txt='country',
    region_txt='region',
    provstate='state',
    attacktype1_txt='attacktype',
    targtype1_txt='targettype',
    weaptype1_txt='weapontype',
    nperps='nter',
    nkill='nkilled',
    nkillter='nkilledter',
    nwound='nwounded',
    nwoundte='nwoundedter',
    propextent_txt='propertyextent',
)

In [11]:
# Re-read the file keeping only the columns listed in `usecols`, then switch
# to the shorter names from `renamecols` (rename returns a new frame, so no
# in-place mutation is needed).
df_terrorism = pd.read_csv(
    r"../input/globalterrorism/globalterrorismdb_0718dist.csv",
    encoding='ISO-8859-1',
    usecols=usecols,
).rename(columns=renamecols)

# Trim surrounding whitespace from text values.
# DataFrame.apply hands the callable a whole column (Series), never a single
# cell, so the string check has to run per element inside Series.map.
# Non-string cells (numbers, NaN) pass through unchanged. The values are kept
# as str rather than encoded to bytes, which would break later text handling.
text_cols = df_terrorism.select_dtypes(exclude=['number']).columns
df_terrorism[text_cols] = df_terrorism[text_cols].apply(
    lambda col: col.map(lambda v: v.strip() if isinstance(v, str) else v)
)

In [12]:
# Dimensions after restricting the read to the selected columns
df_terrorism.shape

In [13]:
# Preview of the reduced, renamed frame
df_terrorism.head()

In [14]:
# Removing unknown values in the coordinates: a row is dropped when either
# its latitude or its longitude is missing.
df_terrorism = df_terrorism.dropna(subset=['lat', 'lon'])
print("Unknown values in the coordinates are removed successfully")

In [16]:
# Unknowns in numeric columns: collect every float column except the ones
# named in `exclude_cols`, which are left untouched by the numeric clean-up.
exclude_cols = ['year', 'lat', 'lon']
float_cols = []
for col_name in df_terrorism.select_dtypes(include=[float]).columns.tolist():
    if col_name not in exclude_cols:
        float_cols.append(col_name)

In [17]:
# Missing values become 0 and the columns are cast to integers; any negative
# value is then clamped to 0 as well.
# NOTE(review): presumably negative numbers are "unknown" codes in the source data - confirm.
filled_counts = df_terrorism[float_cols].fillna(0).astype(int)
df_terrorism[float_cols] = filled_counts.where(filled_counts >= 0, 0)

In [18]:
# Unknowns in string columns.
# Select every non-numeric column: excluding only float would also sweep in
# the integer columns, which are not text and cannot hold missing values.
str_cols = df_terrorism.select_dtypes(exclude=['number']).columns.tolist()
df_terrorism[str_cols] = df_terrorism[str_cols].fillna('Unknown')

In [20]:
# Many fields contain only dots, which stand for an unknown value.
# The pattern matches a cell made up solely of '.' characters (and, because
# of the '*', an empty string as well).
dots_only_pattern = r'^\.*$'
df_terrorism[str_cols] = df_terrorism[str_cols].replace(
    dots_only_pattern, 'Unknown', regex=True
)

In [21]:
# Shorten long category labels so they fit on chart axes and legends
df_terrorism['weapontype'] = df_terrorism['weapontype'].replace({
    u'Vehicle (not to include vehicle-borne explosives, i.e., car or truck bombs)': 'Vehicle',
})
df_terrorism['propertyextent'] = df_terrorism['propertyextent'].replace({
    u'Minor (likely < $1 million)': u'Minor (< $1 million)',
    u'Major (likely > $1 million but < $1 billion)': u'Major (< $1 billion)',
    u'Catastrophic (likely > $1 billion)': u'Catastrophic (> $1 billion)',
})


In [22]:
# Number of duplicated rows (rows identical to an earlier row across all columns)
df_terrorism.duplicated().sum()

In [23]:
# Removal of duplicate rows: keep the first occurrence of each repeated row.
# keep=False would discard every copy, including the original record, and so
# throw away real events rather than just their repeats.
df_terrorism = df_terrorism.drop_duplicates(keep='first')
# Per-column count of remaining nulls (expected to be zero after the cleaning above)
df_terrorism.isnull().sum()

In [24]:
# Shape of the processed dataset as (rows, columns)
df_terrorism.shape

### EDA

In [25]:
# Column labels of the cleaned DataFrame
df_terrorism.columns

In [26]:
# Data type of each column in the DataFrame
df_terrorism.dtypes

In [27]:
# Summary of the dataset: index, columns, non-null counts, dtypes and memory usage
df_terrorism.info()

In [28]:
# Number of distinct values in each column of the DataFrame
df_terrorism.nunique()

In [29]:
# Distinct years present in the DataFrame
df_terrorism["year"].unique()

In [30]:
# Number of records per year, most frequent year first
df_terrorism["year"].value_counts()

In [31]:
# Number of records per region, most frequent region first
df_terrorism['region'].value_counts()

In [32]:
# Number of records per `gname` value (the group name attached to each record)
df_terrorism['gname'].value_counts()

In [33]:
# Number of records per city, most frequent city first
df_terrorism['city'].value_counts()

In [34]:
# Number of records per attack type, most frequent first
df_terrorism['attacktype'].value_counts()

In [35]:
# Number of records per target type, most frequent first
df_terrorism['targettype'].value_counts()

In [36]:
# Statistical summary of the data
df_terrorism.describe()

In [37]:
# Distinct values found in each column, printed one column at a time
for _, column_values in df_terrorism.items():
    print(column_values.unique())


In [38]:
# Covariance between the numeric variables.
# Text columns are filtered out first: pandas >= 2.0 raises on non-numeric
# data in cov() instead of silently skipping it.
df_terrorism.select_dtypes(include='number').cov()

In [39]:
# Slicing data: the first row by position
df_terrorism.iloc[0]

In [40]:
# Slicing data: every row of the first column by position
df_terrorism.iloc[:,0] 

### Visualization

In [41]:
# Heat map of the pairwise correlation among the numeric columns.
# Text columns are filtered out first: pandas >= 2.0 raises on non-numeric
# data in corr() instead of silently skipping it.
fig, axes = plt.subplots(1, 1, figsize=(16, 14))
numeric_data = df_terrorism.select_dtypes(include='number')
sns.heatmap(numeric_data.corr(), annot=True, ax=axes)
plt.show()

In [42]:
# Heat map of the pairwise covariance among the numeric columns.
# Text columns are filtered out first: pandas >= 2.0 raises on non-numeric
# data in cov() instead of silently skipping it.
fig, axes = plt.subplots(1, 1, figsize=(16, 14))
numeric_data = df_terrorism.select_dtypes(include='number')
sns.heatmap(numeric_data.cov(), annot=True, ax=axes)
plt.show()

In [47]:
# Share of records per attack type, as a pie chart with percentage labels
attacktype_share = df_terrorism['attacktype'].value_counts()
plt.figure(figsize=(10, 10))
attacktype_share.plot.pie(autopct="%1.1f%%")
plt.show()

In [48]:
# Share of records per target type, as a pie chart with percentage labels
targettype_share = df_terrorism['targettype'].value_counts()
plt.figure(figsize=(10, 10))
targettype_share.plot.pie(autopct="%1.1f%%")
plt.show()

In [50]:
# Share of records per weapon type, as a pie chart with percentage labels
weapontype_share = df_terrorism['weapontype'].value_counts()
plt.figure(figsize=(10, 10))
weapontype_share.plot.pie(autopct="%1.1f%%")
plt.show()

In [51]:
# Share of records per region, as a pie chart with percentage labels
region_share = df_terrorism['region'].value_counts()
plt.figure(figsize=(10, 10))
region_share.plot.pie(autopct="%1.1f%%")
plt.show()

In [52]:
# Number of terrorist activities by region, bars ordered from most to least frequent.
# The column is passed as x=...: seaborn >= 0.12 accepts only `data`
# positionally, so a positional column name collides with data=.
plt.subplots(figsize=(15, 5))
sns.countplot(
    x='region',
    data=df_terrorism,
    palette='inferno',
    order=df_terrorism['region'].value_counts().index,
)
plt.xticks(rotation=90)
plt.xlabel('Regions')
plt.title('Number Of Terrorist Activities By Region')
plt.show()

In [53]:
# Number of terrorist activities each year.
# The column is passed as x=...: seaborn >= 0.12 accepts only `data`
# positionally, so a positional column name collides with data=.
plt.subplots(figsize=(15, 8))
sns.countplot(
    x='year',
    data=df_terrorism,
    palette='RdYlGn_r',
    edgecolor=sns.color_palette('dark', 7),
)
plt.xticks(rotation=90)
plt.xlabel('Year of attack')
plt.title('Number Of Terrorist Activities Each Year')
plt.show()

In [54]:
# Frequency of attacks per year, split by the value of the `success` column
plt.figure(figsize=(18, 8))
sns.countplot(x="year", hue="success", data=df_terrorism)
plt.xlabel("Years")
plt.xticks(rotation=90)
plt.ylabel("Frequency of Attacks")
# Render the figure explicitly so the cell does not end on a Text repr
plt.show()

In [55]:
# Attacking methods of terrorists, bars ordered from most to least frequent.
# The column is passed as x=...: seaborn >= 0.12 accepts only `data`
# positionally, so a positional column name collides with data=.
plt.subplots(figsize=(15, 4))
sns.countplot(
    x='attacktype',
    data=df_terrorism,
    palette='inferno',
    order=df_terrorism['attacktype'].value_counts().index,
)
plt.xticks(rotation=90)
plt.xlabel('Attack Type')
plt.title('Attacking Methods by Terrorists')
plt.show()

In [56]:
# Frequency of each weapon type (at most the 50 most frequent categories).
# The counts are computed once and reused for both axes.
weapon_counts = df_terrorism["weapontype"].value_counts()[:50]
plt.figure(figsize=(15, 6))
sns.barplot(x=weapon_counts.index, y=weapon_counts.values)
plt.xlabel("Weapons used", fontsize=20)
plt.xticks(rotation=90)
plt.ylabel("Frequency of a Particular weapon used", fontsize=12)
plt.title("Types of Weapons", fontsize=20)
# Render the figure explicitly so the cell does not end on a Text repr
plt.show()

In [57]:
# Attack types ranked by how often they occur (coolwarm palette)
attacktype_order = df_terrorism['attacktype'].value_counts().index
plt.subplots(figsize=(15, 6))
sns.countplot(
    x='attacktype',
    data=df_terrorism,
    palette='coolwarm',
    order=attacktype_order,
)
plt.xticks(rotation=90)
plt.title('Attacking Methods by Terrorists')
plt.show()

In [58]:
# Different target types attacked, bars ordered from most to least frequent.
# The column is passed as x=...: seaborn >= 0.12 accepts only `data`
# positionally, so a positional column name collides with data=.
plt.subplots(figsize=(15, 6))
sns.countplot(
    x='targettype',
    data=df_terrorism,
    palette='inferno',
    order=df_terrorism['targettype'].value_counts().index,
)
plt.xticks(rotation=90)
plt.xlabel('Target Type')
plt.title('Target by Terrorists')
plt.show()

In [59]:
# Number of terrorist activities for the 25 countries with the most records.
# The counts are computed once and reused for both axes.
country_counts = df_terrorism['country'].value_counts()[:25]
plt.figure(figsize=(20, 6))
sns.barplot(x=country_counts.index, y=country_counts.values)
plt.xlabel("Countries")
plt.ylabel("Count")
plt.xticks(rotation=90)
# NOTE: this changes the default size of every figure created afterwards
plt.rcParams['figure.figsize'] = [10,8]
plt.title("Number of Terrorist Activities Countrywise")
# Render the figure explicitly so the cell does not end on a Text repr
plt.show()


In [60]:
# Number of terrorist activities per region, split by the value of the `success` column.
# The figure size is set here explicitly so the cell does not depend on the
# rcParams default left behind by an earlier cell.
plt.figure(figsize=(10, 8))
sns.countplot(x="region", hue="success", data=df_terrorism)
plt.xlabel("Regions", fontsize=20)
plt.xticks(rotation=90)
plt.ylabel("Terrorism Count", fontsize=20)
plt.title("Number of Terrorist Activities Regionwise", fontsize=20)
# Render the figure explicitly so the cell does not end on a Text repr
plt.show()

In [61]:
# Records per year for every region: one line per region, years on the x axis
df_region = pd.crosstab(df_terrorism.year, df_terrorism.region)
region_ax = df_region.plot(color=sns.color_palette('Set2', 12))
region_ax.set_title("Terrorist Attack Year VS Region", fontsize=20)
fig = region_ax.get_figure()
fig.set_size_inches(15, 8)
plt.show()

In [62]:
# Records per year for every target type: one line per target type.
# NOTE(review): `df_region` is reused here for a year x target-type table,
# replacing any earlier value bound to that name.
df_region = pd.crosstab(df_terrorism.year, df_terrorism.targettype)
targettype_ax = df_region.plot(color=sns.color_palette('Set2', 12))
targettype_ax.set_title("Terrorist Attack Year VS Target Type", fontsize=20)
fig = targettype_ax.get_figure()
fig.set_size_inches(15, 8)
plt.show()

In [63]:
# Wounded VS Year: total `nwounded` per year, one line per region
d = df_terrorism.groupby(['year', 'region'])['nwounded'].sum()
# Pivot the regions into columns so that each one is drawn as its own line
plot_df_terrorism = d.unstack('region')
# Treat the year labels as annual periods so the x axis is a time axis
plot_df_terrorism.index = pd.PeriodIndex(plot_df_terrorism.index.tolist(), freq='A')
plot_df_terrorism.plot(figsize=(15, 8), color=sns.color_palette('Set2', 12))
plt.title("Wounded Vs Year", fontsize=20)
plt.xlabel("Year")
plt.ylabel("Wounded")
# Render the figure explicitly so the cell does not end on a Text repr
plt.show()

In [64]:
# Histogram of each column that DataFrame.hist can plot
df_terrorism.hist(figsize=(35, 30))
# Render the figure explicitly so the cell does not end on an array-of-axes repr
plt.show()

In [65]:
# Attacks VS Killed: for the 15 countries with the most records, show the
# number of records next to the total of `nkilled`.
count_terror = df_terrorism['country'].value_counts().head(15).to_frame()
count_terror.columns = ['Attacks']
count_kill = df_terrorism.groupby('country')['nkilled'].sum().to_frame()
# Left join on the country index keeps exactly the 15 selected countries
attacks_and_kills = count_terror.join(count_kill, how='left')
attacks_and_kills.plot.bar(width=0.9)
fig = plt.gcf()
plt.title("Attacks VS Killed", fontsize=20)
fig.set_size_inches(16, 4)
plt.show()

In [68]:
import folium
from folium.plugins import MarkerCluster 
# Boolean mask selecting the records whose year is 1970
filterYear = df_terrorism['year'].eq(1970)

In [70]:
# Records selected by the year mask
filterData = df_terrorism[filterYear]
# Pick the three map fields by name, in the order the marker loop expects
# (city, lat, lon); a label slice would depend on these columns being
# adjacent in the frame.
reqFilterData = filterData[['city', 'lat', 'lon']]
reqFilterData = reqFilterData.dropna()  # drop rows with a missing city or coordinate
# One [city, lat, lon] list per record
reqFilterDataList = reqFilterData.values.tolist()

In [71]:
# Map with one clustered marker per record; the popup shows the city.
# NOTE(review): the name `map` shadows Python's built-in map() for the rest of the session.
map = folium.Map(location=[0, 30], tiles='CartoDB positron', zoom_start=2)
# clustered marker layer
markerCluster = folium.plugins.MarkerCluster().add_to(map)
for record in reqFilterDataList:
    # each record is laid out as [city, lat, lon]
    city_name, latitude, longitude = record[0], record[1], record[2]
    folium.Marker(location=[latitude, longitude], popup=city_name).add_to(markerCluster)
map

84% of the terrorist attacks in 1970 were carried out on the American continent. In 1970, the Middle East and North Africa, currently the center of wars and terrorist attacks, faced only one terrorist attack.

In [77]:
# Seven most frequent `gname` values after skipping the top one.
# NOTE(review): presumably the skipped first entry is the 'Unknown' placeholder - confirm.
terrorist_groups = df_terrorism.gname.value_counts().iloc[1:8].index.tolist()
# One row per (country, gname) pair among the records that have coordinates
terror_df_group = (
    df_terrorism
    .dropna(subset=['lat', 'lon'])
    .drop_duplicates(subset=['country', 'gname'])
)
is_selected_group = terror_df_group.gname.isin(terrorist_groups)
terror_df_group = terror_df_group.loc[is_selected_group]
print(terror_df_group.gname.unique())

In [79]:
# Map with one marker per row of `terror_df_group`; the popup shows the group
# name and the country.
# NOTE(review): the name `map` shadows Python's built-in map() for the rest of the session.
map = folium.Map(location=[20, 0], tiles="CartoDB positron", zoom_start=2)
markerCluster = folium.plugins.MarkerCluster().add_to(map)
# Markers are attached to the cluster layer rather than to the map itself;
# otherwise the cluster stays empty and nothing is grouped when zooming out.
for lat, lon, gname, country in zip(
    terror_df_group['lat'],
    terror_df_group['lon'],
    terror_df_group['gname'],
    terror_df_group['country'],
):
    folium.Marker(
        [lat, lon],
        popup='gname:{}<br>country:{}'.format(gname, country),
    ).add_to(markerCluster)
map

### Conclusion

* Country with the highest number of Terrorist Attacks: Iraq
* Region with the highest number of Terrorist Attacks: Middle East & North Africa
* Maximum number of people killed in a single terrorist attack: 1570, in an attack that took place in Iraq
* Year with the most Attacks: 2014
* Month with the most Attacks: 5
* Group with the most Attacks: Taliban
* Most common Attack Type: Bombing/Explosion