## Names : Sayak Mallick, Aayushi Ajmera
## Course : Masters in Quantitative Data Science Methods
## Matriculation Numbers : 6000578, 6001009

In [1]:
# importing all required modules

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from matplotlib import cm
import ipywidgets as widgets 
from ipywidgets import *

import geopandas
from shapely.geometry import Point, Polygon

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyClassifier
from sklearn.cluster import KMeans

In [None]:
# importing the data
# `terror` holds the workbook as loaded; `dk` is an independent copy that the
# preprocessing and clustering cells below work from.

terror = pd.read_excel(r"../data/globalterrorismdb_0221dist.xlsx")
dk = terror.copy()

In [None]:
# Base map and capital-city table used by the geographic plots.
world_map = geopandas.read_file(r'../data/World_Countries__Generalized_.shp')
capitals = pd.read_csv(r'../data/concap.csv')

# One happiness table per year, read in chronological order.
(
    happiness2015,
    happiness2016,
    happiness2017,
    happiness2018,
    happiness2019,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/2015.csv",
        "../data/2016.csv",
        "../data/2017.csv",
        "../data/2018.csv",
        "../data/2019.csv",
    )
)

# Data Preprocessing

In [None]:
df = dk

# Total casualties per incident. `sum(axis=1)` skips NaN, so an incident with
# only one of the two counts recorded keeps the known count, and an incident
# with neither recorded totals 0 instead of NaN.
df['ncasualties'] = df[['nkill', 'nwound']].sum(axis=1)

# Binary target: 1 when at least one casualty is recorded, else 0.
# The previous `0 if x == 0 else 1` labelled NaN totals as 1, because
# `NaN == 0` is False.
df['has_casualties'] = (df['ncasualties'] > 0).astype(int)

In [None]:
# Years for which a happiness table is loaded.
years = [2015, 2016, 2017, 2018, 2019]
terror_snip = terror[terror['iyear'].isin(years)]

# Take an explicit copy: the next cell assigns new columns to `terror_new`,
# and doing that on a filtered slice of `terror` is a chained assignment
# (SettingWithCopy) whose effect pandas does not guarantee.
terror_new = terror_snip.copy()

In [None]:
# Empty placeholder columns, added in place before the rename below.
terror_new["Rank"], terror_new["Score"] = "", ""

# Give every table a common 'Country' key and a year-specific score column.
terror_new = terror_new.rename({'country_txt': 'Country'}, axis='columns')
happiness2015 = happiness2015.rename({'Happiness Score': 'Score_2015'}, axis='columns')
happiness2016 = happiness2016.rename({'Happiness Score': 'Score_2016'}, axis='columns')
happiness2017 = happiness2017.rename({'Happiness.Score': 'Score_2017'}, axis='columns')
happiness2018 = happiness2018.rename(
    {'Country or region': 'Country', 'Score': 'Score_2018'}, axis='columns'
)
happiness2019 = happiness2019.rename(
    {'Country or region': 'Country', 'Score': 'Score_2019'}, axis='columns'
)

# Visualizing the data

In [None]:
# Bar chart of the number of recorded incidents per year.
plt.figure(figsize=(25,10))
# Pass the column as `x=`: seaborn's plotting functions take their variables
# as keywords, and a positional Series is treated as `data` by newer releases.
sns.countplot(x=terror['iyear'], palette="rocket_r").set_title('Year wise attacks', fontsize=25)
plt.xticks(rotation=90)
plt.xlabel('Year', fontsize=15)
plt.ylabel('No. of attacks', fontsize=15)

In [None]:
#Terrorist incident locations plotted

# Authority-string CRS; the {'init': 'EPSG:4326'} dict form is deprecated.
crs = 'EPSG:4326'
geometry = [Point(xy) for xy in zip(df['longitude'], df['latitude'])]
total_inc = geopandas.GeoDataFrame(df, crs = crs, geometry = geometry)

fig, ax = plt.subplots(figsize = (20,10))
ax.axes.xaxis.set_ticklabels([])
ax.axes.yaxis.set_ticklabels([])
world_map.to_crs(epsg=4326).plot(ax=ax, color='lightgrey')
# Very low alpha so that only dense areas of overlapping points show as dark.
total_inc.plot(ax=ax, alpha= 0.01, color = 'k')
ax.set_title('Incidents',fontsize = 20)

In [None]:
#Estimated global terrorist hubs based on country to which kidnappers diverted

# Authority-string CRS; the {'init': 'EPSG:4326'} dict form is deprecated.
crs = 'EPSG:4326'

# Lookup tables built from the capitals table only. The previous code zipped
# columns of the incident frame against columns of `capitals` positionally,
# pairing unrelated rows, so the resulting mappings were meaningless.
# NOTE(review): assumes concap.csv has a `CountryName` column next to
# CapitalName / CapitalLatitude / CapitalLongitude - confirm against the file.
capital_of_country = dict(zip(capitals.CountryName, capitals.CapitalName))
capital_longitude = dict(zip(capitals.CapitalName, capitals.CapitalLongitude))
capital_latitude = dict(zip(capitals.CapitalName, capitals.CapitalLatitude))

# Incidents with a recorded diversion destination.
div = df[pd.notna(df['divert'])]
div_cap = div.copy()
div_cap['Diverted to capital'] = div_cap['divert'].map(capital_of_country)
div_cap['CapLong'] = div_cap['Diverted to capital'].map(capital_longitude)
div_cap['CapLat'] = div_cap['Diverted to capital'].map(capital_latitude)
# Destinations that match no country in the capitals table have no coordinates;
# drop them rather than building points from NaN.
div_cap = div_cap.dropna(subset=['CapLong', 'CapLat'])

geometry = [Point(xy) for xy in zip(div['longitude'], div['latitude'])]
cap_geometry = [Point(xy) for xy in zip(div_cap['CapLong'], div_cap['CapLat'])]

div = geopandas.GeoDataFrame(div, crs = crs, geometry = geometry)
div_cap = geopandas.GeoDataFrame(div_cap, crs = crs, geometry = cap_geometry)

fig, ax = plt.subplots(figsize = (20,10))
ax.axes.xaxis.set_ticklabels([])
ax.axes.yaxis.set_ticklabels([])
world_map.to_crs(epsg=4326).plot(ax=ax, color='lightgrey')
div.plot(ax=ax, alpha= 1, color = 'cyan', label = 'Location of kidnapping and hijacking', legend = True)
div_cap.plot(ax=ax, alpha = 1, color = 'green', marker = '*', markersize = 300,label = 'Location diverted to', legend = True)
ax.legend(labels = ['Location of kidnapping and hijacking', 'Location diverted to'])
ax.set_title('Locations to which kidnappers & hijackers diverted',fontsize = 20)
plt.show()

# Economic Influence

In [None]:
#Terrorist incident locations plotted for which property damage was above 1 million dollars

# Keep incidents whose `propextent` code is 1 or 2.
# NOTE(review): presumably these are the two highest damage categories in the
# dataset codebook - confirm.
catas_inc = df[df['propextent'].isin([1,2])]
# Authority-string CRS; the {'init': 'EPSG:4326'} dict form is deprecated.
crs = 'EPSG:4326'
geometry = [Point(xy) for xy in zip(catas_inc['longitude'], catas_inc['latitude'])]
catas_inc = geopandas.GeoDataFrame(catas_inc, crs = crs, geometry = geometry)

fig, ax = plt.subplots(figsize = (20,10))
ax.axes.xaxis.set_ticklabels([])
ax.axes.yaxis.set_ticklabels([])
world_map.to_crs(epsg=4326).plot(ax=ax, color='lightgrey')
catas_inc.plot(ax=ax, alpha= 0.7, color = 'red')
ax.set_title('Incidents for which property damage was above 1 million dollars',fontsize = 20)

# Social Influence

In [None]:
# Number of incidents (non-null event ids) per country, as a one-column frame
# indexed by country.
attacks = (
    terror_new.groupby('Country')['eventid']
    .count()
    .to_frame(name='Incidents')
)
attacks

In [None]:
# Country -> happiness score lookup for each year.
hpterror15 = dict(zip(happiness2015['Country'], happiness2015['Score_2015']))
hpterror16 = dict(zip(happiness2016['Country'], happiness2016['Score_2016']))
hpterror17 = dict(zip(happiness2017['Country'], happiness2017['Score_2017']))
hpterror18 = dict(zip(happiness2018['Country'], happiness2018['Score_2018']))
hpterror19 = dict(zip(happiness2019['Country'], happiness2019['Score_2019']))

In [None]:
# Attach each year's score to the per-country incident counts; countries
# missing from a lookup get NaN.
for score_col, score_lookup in (
    ('score_2015', hpterror15),
    ('score_2016', hpterror16),
    ('score_2017', hpterror17),
    ('score_2018', hpterror18),
    ('score_2019', hpterror19),
):
    attacks[score_col] = attacks.index.map(score_lookup)

In [None]:
# One panel per year: incident count per country, placed at that country's
# happiness score on the x axis.
fig, ax = plt.subplots(5, figsize = (15,25))

lp = ['score_2015','score_2016','score_2017','score_2018','score_2019']
for panel, score_col in zip(ax, lp):
    # Countries with no score for this year are left out. Filling the gaps
    # with 0 drew them as bars at a happiness score of 0.
    scored = attacks.dropna(subset=[score_col]).sort_values(by=score_col, ascending=False)

    bars = panel.bar(scored[score_col], scored.Incidents, color='black', width = 0.2)
    separator = panel.axvline(5,color = 'r', linestyle = '--')
    panel.set_xlabel('Happiness Score')
    panel.set_ylabel('Terrorist Attacks')
    panel.legend(handles =[bars,separator],labels=['Terrorist Attacks','Separator for Happy countries (towards the right)'])
    panel.set_title(score_col)

fig.tight_layout()
plt.show()

# Prediction of casualties

In [None]:
# Candidate predictors for the casualty classifier.
feature_cols = [
    'iyear',
    'imonth',
    'iday',
    'latitude',
    'longitude',
    'extended',
    'vicinity',
    'doubtterr',
    'multiple',
    'success',
    'suicide',
    'claimed',
    'property',
    'ishostkid',
    'country_txt',
    'region',
    'attacktype1_txt',
    'targtype1_txt',
    'weaptype1_txt',
]

In [None]:
# Binary label column created in the preprocessing cell.
target_col = 'has_casualties'

In [None]:
# `df` is the same object as `dk` (`df = dk` in the preprocessing cell), and
# the clustering section later one-hot encodes these text columns from `dk`.
# Re-bind `df` to its own copy so the integer codes written below do not
# overwrite the text values in `dk`.
df = df.copy()

# Replace each categorical text column with integer codes for the tree models.
lb = LabelEncoder()
for text_col in ['country_txt', 'attacktype1_txt', 'targtype1_txt', 'weaptype1_txt']:
    df[text_col] = lb.fit_transform(df[text_col])

In [None]:
# Feature matrix (missing values replaced by 0) and target vector.
X = df.loc[:, feature_cols].fillna(0)
y = df.loc[:, target_col]

# 70 / 30 train-test split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
)

In [None]:
# Fit an extra-trees ensemble on all rows to rank the candidate features.
forest = ExtraTreesClassifier(n_estimators=20, random_state=0)
forest.fit(X, y)

importances = forest.feature_importances_
# Spread of each feature's importance across the individual trees.
std = np.array([tree.feature_importances_ for tree in forest.estimators_]).std(axis=0)
# Feature positions ordered from most to least important, and their names.
indices = np.flip(np.argsort(importances))
fnames = list(map(feature_cols.__getitem__, indices))

In [None]:
# Bar chart of the ranked importances with the per-tree spread as error bars.
n_features = X.shape[1]
bar_positions = np.arange(n_features)

plt.figure()
plt.title("Feature importances")
plt.bar(
    bar_positions,
    importances[indices],
    color="y",
    yerr=std[indices],
    align="center",
)
plt.xticks(bar_positions, fnames, rotation=90)
plt.xlim(-1, n_features)
plt.show()

In [None]:
# Reduced feature set used by the models below.
feature_cols = [
    'longitude',
    'targtype1_txt',
    'latitude',
    'attacktype1_txt',
    'property',
    'iday',
    'imonth',
]
X = df.loc[:, feature_cols].fillna(0)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
)

In [None]:
# Random forest evaluated on the held-out split; the last expression is the
# fraction of test rows predicted correctly.
model = RandomForestClassifier(n_estimators=20).fit(X_train, y_train)
y_pred = model.predict(X_test)
(y_pred == y_test).mean()

In [None]:
# Ten-fold cross-validated accuracy of the random forest.
model = RandomForestClassifier(n_estimators=20)
scores = cross_val_score(estimator=model, X=X, y=y, cv=10)

print(scores)
# Mean accuracy with a band of two standard deviations.
print('Accuracy: %0.2f (+/- %0.2f)' % (np.mean(scores), 2 * np.std(scores)))

In [None]:
# Baseline: always predict the most frequent class seen in training.
model = DummyClassifier(strategy="most_frequent")
# `%time` is an IPython magic that reports how long the statement took;
# these two lines only run inside IPython / Jupyter.
%time model.fit(X_train, y_train)
%time y_pred = model.predict(X_test)
# Fraction of test rows the baseline gets right.
np.mean(y_pred == y_test)

In [None]:
# Ten-fold cross-validated accuracy of AdaBoost.
model = AdaBoostClassifier(n_estimators=20)
scores = cross_val_score(estimator=model, X=X, y=y, cv=10)

print(scores)
# Mean accuracy with a band of two standard deviations.
print('Accuracy: %0.2f (+/- %0.2f)' % (np.mean(scores), 2 * np.std(scores)))

In [None]:
# Ten-fold cross-validated accuracy of gradient boosting.
model = GradientBoostingClassifier(n_estimators=20)

scores = cross_val_score(model, X, y, cv=10) # Ten-fold cross validation.
print(scores)
# Same format as the other model cells: the `+` flag in `%+0.2f` printed a
# redundant sign after "+/-" (e.g. "+/- +0.05").
print('Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))

# K means clustering to group nearby incidents

In [None]:
# removing outliers - incidents where a lot of people were killed/ injured
# Keep rows with at most 8 killed and at most 12 wounded. Rows where either
# count is missing fail the comparison and are dropped as well.
data = dk[(dk['nkill'] <= 8) & (dk['nwound'] <= 12)].reset_index(drop=True)

In [None]:
# Columns fed to the clustering; non-numeric ones are one-hot encoded.
fts = ['longitude','latitude','nwound','nkill','natlty1_txt','targtype1_txt','targsubtype1_txt',
            'weaptype1_txt','attacktype1_txt']

X = pd.get_dummies(data[fts])
# Keep only columns whose variance exceeds 0.05, then fill missing values with 0.
# Selecting columns with `.loc` avoids transposing the whole frame twice, which
# is costly and can upcast mixed column dtypes.
X = X.loc[:, X.var() > 0.05].fillna(0)

print('Shape :', X.shape)
X.head()

In [None]:
# K-means score on the training data for k = 2..10, shown as a bar chart.
scores = {}
for k in range(2, 11):
    print(k, end=', ')  # progress indicator
    clusterer = KMeans(n_clusters=k)
    clusterer.fit(X)
    scores[k] = clusterer.score(X)
pd.Series(scores).plot(kind='bar')

In [None]:
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import scale

# Six clusters, labelled 1..6.
data['Cluster'] = KMeans(n_clusters=6).fit(X).labels_ + 1

# Silhouette estimated on a 10000-row sample, shown as a percentage truncated
# to two decimals.
sampled_silhouette = silhouette_score(X, data['Cluster'], sample_size=10000)
print('Silhouette Score:', sampled_silhouette * 10000 // 1 / 100, '%')

In [None]:
# Name every cluster after its most frequent region. `names` is ordered by
# cluster number, so cluster c is found at position c - 1.
names = data.groupby('Cluster')['region_txt'].describe()['top'].to_numpy()
data['ClusterName'] = data['Cluster'].map(lambda cluster_id: names[cluster_id - 1])

# Non-object columns, minus identifiers, date parts and coded categoricals.
numerical = data.dtypes[data.dtypes != 'object'].index
exclude = [
    'eventid', 'Cluster', 'region', 'country', 'iyear',
    'natlty1', 'natlty2', 'natlty3', 'imonth', 'iday',
    'guncertain1', 'guncertain2', 'guncertain3',
]
exclude.extend(
    col for col in numerical
    if any(token in col for token in ('type', 'mode', 'ransom'))
)
X_profiling = data[numerical.drop(exclude)].fillna(0)

X_profiling['ClusterName'] = data['ClusterName']

In [None]:
# One tab20 colour per region.
ckeys = data['region_txt'].unique()
ckeys = dict(zip(ckeys, plt.cm.tab20(range(len(ckeys)))))

fig,ax = plt.subplots(1,figsize = (20,10))
ax.axes.xaxis.set_ticklabels([])
ax.axes.yaxis.set_ticklabels([])
for i, x in pd.concat([X_profiling, data['region_txt']], axis=1).groupby('region_txt'):
    # `color=` takes one RGBA value for the whole group. Passing that 4-element
    # array through `c=` is ambiguous (it is read as per-point values when a
    # group has exactly four points), and `cmap` had no effect.
    ax.scatter(x['longitude'], x['latitude'], color=ckeys[i], marker='.', label=i)
plt.legend(loc=3)

In [None]:
# Share of rows whose region equals the region name given to their cluster,
# as a percentage truncated to two decimals.
print(
    'Similarity between cluster and region labels:',
    int((data['region_txt'] == data['ClusterName']).sum()) / len(data) * 10000 // 1 / 100,
    '%',
)

# Extras

In [None]:
def plotf(Year):
    """Draw a pie chart of the attacks per region for a single year.

    Parameters
    ----------
    Year : int
        Value matched against the ``iyear`` column of ``terror``.
    """
    dataperyear = terror.loc[terror['iyear'] == Year]

    # Attack counts per region, largest first; the wedges follow this order.
    attacks = dataperyear['region_txt'].value_counts()
    fig = plt.figure(figsize =(13, 13))
    plt.pie(attacks)
    # Label the wedges from the same Series that produced them. The previous
    # legend used `unique()`, whose order of first appearance does not match
    # the count-sorted wedge order.
    plt.legend(attacks.index)
    fig.tight_layout()
# Slider-driven version of the yearly pie chart.
interactive_plot = interact(
    plotf,
    Year=widgets.IntSlider(value=2019, min=1970, max=2019, step=10),
)

interactive_plot

In [None]:
# Display the working frame.
df

In [None]:
# Incident counts for the groups ranked 2nd to 11th; the top entry is skipped.
df.groupby('gname')['eventid'].count().sort_values(ascending=False).iloc[1:11]

In [None]:
# Names of the groups ranked 2nd to 11th by incident count.
groups = (
    df.groupby('gname')['eventid']
    .count()
    .sort_values(ascending=False)
    .index[1:11]
    .tolist()
)
groups

In [None]:
# One colour from seaborn's "bright" palette per group.
g_dict = {
    group_name: rgb
    for group_name, rgb in zip(groups, sns.color_palette("bright", 10))
}
g_dict

In [None]:
#Top 10 terrorist groups plotted by group
from matplotlib.colors import to_hex

# Copy before adding a column, so the assignment is not made on a filtered
# slice of `df`.
group_plot = df[df['gname'].isin(groups)].copy()
group_plot['color'] = [g_dict[x] for x in group_plot.gname]

# Authority-string CRS; the {'init': 'EPSG:4326'} dict form is deprecated.
crs = 'EPSG:4326'
geometry = [Point(xy) for xy in zip(group_plot['longitude'], group_plot['latitude'])]
total_inc = geopandas.GeoDataFrame(group_plot, crs = crs, geometry = geometry)

fig, ax = plt.subplots(figsize = (30,17))
world_map.to_crs(epsg=4326).plot(ax=ax, color='lightgrey')

for group in groups:
    tt = total_inc[total_inc['gname'] == group]
    # Draw each group in its own `g_dict` colour and label the layer with the
    # group's name. Previously the colours were never applied and `label`
    # received the whole `gname` Series.
    # The colour is passed as a hex string so it cannot be read as a list of
    # per-point values.
    tt.plot(ax=ax, alpha = 1, color = to_hex(g_dict[group]), label = group)

ax.axes.xaxis.set_ticklabels([])
ax.axes.yaxis.set_ticklabels([])
# Legend entries come from the per-layer labels set above.
ax.legend(title="Terrorist Groups", fontsize= 9 , title_fontsize=10, loc = 'lower right')
ax.set_title('Top 10 terror groups and their activites',fontsize = 20)