# II. Stats

## Basic statistics

In [1]:
from __future__ import division
from collections import Counter
import math

In [2]:
import pandas as pd
import numpy as np

# Load the cleaned global-terrorism CSV that every later cell reads from.
df = pd.read_csv(
    'data/globalterrorism_cleaned.csv',
    encoding='utf-8',
)

In [3]:
df.shape

(152252, 24)

In [23]:
df.columns

Index([u'year', u'extended', u'country', u'region', u'state', u'city', u'lat',
       u'lon', u'multiple', u'success', u'suicide', u'attacktype',
       u'targettype', u'gname', u'nter', u'claimed', u'weapontype', u'nkilled',
       u'nkilledter', u'nwounded', u'nwoundedter', u'property',
       u'propertyextent', u'countrycode'],
      dtype='object')

In [38]:
# Summarise the integer columns: min / max per column plus, in the 'nans'
# row, the percentage of rows holding the -9 placeholder value.
num_df = df.select_dtypes(include=[int])
# Pick the summary rows by label rather than by position, so the cell keeps
# selecting min/max even if describe() returns its rows in another layout.
num_desc_df = np.round(num_df.describe(), decimals=0).astype(int).loc[['min', 'max']]
# The mean of a boolean mask is the fraction of matching rows (always a float).
nans = np.round((num_df == -9).mean() * 100, decimals=2).astype(str) + '%'
nans.name = 'nans'
# pd.concat instead of DataFrame.append, which newer pandas no longer provides.
pd.concat([num_desc_df, nans.to_frame().T])

Unnamed: 0,year,extended,multiple,success,suicide,nter,claimed,nkilled,nkilledter,nwounded,nwoundedter,property
min,1970,0,0,0,0,0,0,0,0,0,0,-9
max,2015,1,1,1,1,25000,2,1500,500,5500,200,1
nans,0.0%,0.0%,0.0%,0.0%,0.0%,0.0%,0.0%,0.0%,0.0%,0.0%,0.0%,11.12%


In [39]:
# Summarise the float columns: min / max per column plus, in the 'nans'
# row, the percentage of rows holding the -9 placeholder value.
num_df = df.select_dtypes(include=[float])
# Pick the summary rows by label rather than by position, so the cell keeps
# selecting min/max even if describe() returns its rows in another layout.
num_desc_df = num_df.describe().loc[['min', 'max']]
# The mean of a boolean mask is the fraction of matching rows (always a float).
nans = np.round((num_df == -9).mean() * 100, decimals=2).astype(str) + '%'
nans.name = 'nans'
# pd.concat instead of DataFrame.append, which newer pandas no longer provides.
pd.concat([num_desc_df, nans.to_frame().T])

Unnamed: 0,lat,lon
min,-53.1546,-176.176
max,74.6336,179.367
nans,0.0%,0.0%


In [5]:
# Summarise the non-numeric columns: number of distinct values, most frequent
# value and, in the 'nans' row, the percentage of rows equal to 'Unknown'.
str_df = df.select_dtypes(exclude=[float, int])
# Keep the columns in file order; round-tripping them through a set made the
# column order of the resulting table vary from run to run.
str_cols = str_df.columns.tolist()
# Pick the summary rows by label rather than by position.
str_desc_df = str_df.describe().loc[['unique', 'top']]
# The mean of a boolean mask is the fraction of matching rows (always a float).
nans = np.round((str_df == 'Unknown').mean() * 100, decimals=2).astype(str) + '%'
nans.name = 'nans'
# pd.concat instead of DataFrame.append, which newer pandas no longer provides.
pd.concat([str_desc_df, nans.to_frame().T])

Unnamed: 0,propertyextent,targettype,countrycode,city,country,region,weapontype,state,attacktype,gname
unique,4,22,191,29579,204,12,12,2487,9,3216
top,Unknown,Private Citizens & Property,IRQ,Unknown,Iraq,Middle East & North Africa,Explosives/Bombs/Dynamite,Unknown,Bombing/Explosion,Unknown
nans,75.27%,2.46%,0.0%,4.69%,0.0%,0.0%,7.59%,8.68%,3.38%,46.41%


## Categorical data

In [6]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly as py
import plotly.graph_objs as go
from plotly import tools
import colorlover as cl

init_notebook_mode(connected=True)
print __version__ # requires version >= 1.9.0

2.0.7


In [7]:
# Setup colors: figure background, grid lines and font colour shared by all plots
bgcolor, gridcolor, fontcolor = 'white', 'lightgrey', 'black'

def clamp(x):
    """Limit ``x`` to the 0..255 range of a single colour channel."""
    capped = min(x, 255)
    return max(0, capped)
# Hex colour strings for colorlover's 12-class qualitative 'Set3' scale.
colors = [
    '#%02x%02x%02x' % tuple(map(clamp, rgb[:3]))
    for rgb in cl.to_numeric(cl.scales['12']['qual']['Set3'])
]

# Setup fonts: sizes for figure titles, axis titles, default text and small labels
titlesize, labelsize, defaultsize, smallsize = 16, 14, 12, 10

### Distribution of binary attributes

In [8]:
def barplot(x, y, t, xl, yl, margin=None):
    """Render a single-series bar chart inline with plotly.

    Parameters
    ----------
    x, y : sequences
        Bar positions (or categories) and bar heights.
    t : str
        Figure title.
    xl, yl : str
        Titles of the x and y axes.
    margin : dict, optional
        Forwarded to the layout as ``margin`` (e.g. ``dict(b=150)`` to leave
        room for long tick labels). Defaults to an empty dict.

    The figure is shown via ``iplot``; nothing is returned.
    """
    # Build a fresh dict per call: a ``dict()`` default is created once at
    # definition time and would be shared by every call that omits ``margin``.
    if margin is None:
        margin = {}

    data = [
        go.Bar(
            x=x,
            y=y,
            marker=dict(
                color=colors[9]
            )
        )
    ]
    layout = go.Layout(
        showlegend=False,
        bargap=0,
        title=t,
        titlefont=dict(
            size=titlesize
        ),
        xaxis=dict(
            title=xl,
            titlefont=dict(
                size=labelsize
            ),
            zeroline=False,
            showgrid=False,
            autotick=True,
            # Ticks are drawn in the background colour, i.e. effectively hidden.
            tickcolor=bgcolor
        ),
        yaxis=dict(
            title=yl,
            titlefont=dict(
                size=labelsize
            ),
            zeroline=False,
            gridcolor=gridcolor,
            autotick=True,
            tickcolor=bgcolor
        ),
        plot_bgcolor=bgcolor,
        paper_bgcolor=bgcolor,
        font=dict(
            color=fontcolor,
            size=defaultsize
        ),
        margin=margin
    )
    fig = go.Figure(data=data, layout=layout)
    iplot(fig)

In [9]:
def count_pos(col):
    """Return the fraction of rows in ``df`` whose ``col`` value equals 1."""
    # Vectorised form of counting ``x == True`` row by row. The mean of a
    # boolean column is already a float, so the result no longer depends on
    # ``from __future__ import division`` being active.
    return float((df[col] == 1).mean())


options = {
    'Extended': count_pos('extended'),
    'Success': count_pos('success'),
    'Multiple incidents': count_pos('multiple'),
    'Suicide': count_pos('suicide'),
    'Responsibility claimed': count_pos('claimed'),
    'Property damaged': count_pos('property')
}
# Order the attributes from most to least frequent before plotting.
ranked = sorted(options.items(), key=lambda item: item[1], reverse=True)
x, y = zip(*ranked)

barplot(x, y, 'Distribution of Various Attributes', 'Attribute', 'Frequency')

We are able to discover additional attributes of terror attacks, such as success. For example, a bomb that exploded in a building would be counted as a success even if it did not succeed in killing or injuring targets. We see that nearly 90% of attacks succeeded. Also, most attacks resulted in property damage. Nearly 13% of attacks were part of a multiple incident, while only nearly 4% lasted more than 24 hours. In 8% of cases the underlying terror group claimed responsibility. To our great surprise, a suicide attack took place in only 3% of cases - no need to fear Kamikaze nowadays.

### Terrorist Attacks per Year

In [10]:
# One (year, number of attacks) row per year, split into x and y for the plot.
yearly_counts = df.groupby('year').size().reset_index(name='size')
yearly_rows = yearly_counts.values.tolist()
x, y = zip(*yearly_rows)

barplot(x, y, 'Terrorist Attacks per Year', 'Year', 'Count')

We see a continual development of terrorism right up to the year 2014, which may support the fear of global insecurity and play into the hands of populists of all kinds. But before drawing any conclusions, let us take a look at the geographic distribution of terrorism.

### Terrorist Attacks per Region

In [11]:
# One (region, number of attacks) row per region, busiest region first.
region_counts = df.groupby('region').size().reset_index(name='size')
region_rows = region_counts.values.tolist()
region_rows.sort(key=lambda row: row[1], reverse=True)
x, y = zip(*region_rows)

barplot(x, y, 'Terrorist Attacks per Region', 'Region', 'Count', dict(b=150))

The Middle East, North Africa and South Asia together account for more than half of all terrorist attacks worldwide. After South America, Europe takes only fourth place among the regions most targeted by terrorists, while North America takes only ninth. Together, these two regions account for only 12% of attacks around the globe.

### Top 10 Terrorist Groups

In [12]:
# Count attacks per named group (rows attributed to 'Unknown' are left out)
# and keep the ten groups with the most attacks.
named_groups = df.loc[df.gname != 'Unknown', 'gname']
x, y = zip(*Counter(named_groups.values.tolist()).most_common(10))

# dict(b=200) enlarges the bottom margin to make room for the group names.
barplot(x, y, 'Top 10 Terrorist Groups', 'Terrorist Group', 'Count', dict(b=200))

Taliban is (or was) the most dangerous organization through 45 years of terrorism. 

### Property Damage Extent

In [13]:
# Attacks with confirmed property damage and a known extent, counted per
# extent category (most frequent category first).
known_damage = df[(df.propertyextent != 'Unknown') & (df.property == 1)]
extent_counts = Counter(known_damage['propertyextent'].values.tolist())
x, y = zip(*extent_counts.most_common())

barplot(x, y, 'Property Damage Extent', 'Extent', 'Count')

Lastly, we do not know anything about the property damage in the majority of cases, but it seems that only a few attacks caused more than $1 billion in damage (11 Sep 2001).

## Small multiples

### Development of Terrorist Attacks in Focus Regions

In [14]:
def smallmulti(x, y, t, xl, yl):
    """Draw up to four bar charts as a 2x2 grid of small multiples.

    Parameters
    ----------
    x, y : sequences of sequences
        ``x[i]`` / ``y[i]`` hold the bar positions and heights of panel ``i``.
    t : str
        Figure title.
    xl : sequence of str
        One x-axis title per panel; its length decides how many panels are
        drawn. Only four grid positions are defined.
    yl : str, sequence of str or None
        Y-axis title(s). A single string is used for every panel, a sequence
        supplies one title per panel, and a panel without an entry falls back
        to 'Count'.

    The figure is shown via ``iplot``; nothing is returned.
    """
    # (x-domain, y-domain) of each panel: top-left, top-right,
    # bottom-left, bottom-right.
    pos = [
        ([0, 0.45], [0.6, 1]),
        ([0.55, 1], [0.6, 1]),
        ([0, 0.45], [0, 0.4]),
        ([0.55, 1], [0, 0.4])
    ]

    xlabels = list(xl)
    # Normalise ``yl`` to a list of per-panel titles. It used to be ignored,
    # which labelled every y axis 'Count' regardless of what the caller passed.
    if yl is None:
        ylabels = []
    elif isinstance(yl, (str, type(u''))):
        ylabels = [yl] * len(xlabels)
    else:
        ylabels = list(yl)

    data = []
    layout = dict(
        bargap=0,
        title=t,
        titlefont=dict(
            size=titlesize
        ),
        plot_bgcolor=bgcolor,
        paper_bgcolor=bgcolor,
        font=dict(
            color=fontcolor,
            size=defaultsize
        ),
        showlegend=False
    )
    for i, xlabel in enumerate(xlabels):
        # plotly numbers subplot axes from 1: x1/y1, x2/y2, ...
        axis_no = i + 1
        data.append(go.Bar(
            x=x[i],
            y=y[i],
            marker=dict(
                color=colors[5]
            ),
            xaxis='x%d' % axis_no,
            yaxis='y%d' % axis_no
        ))
        layout['xaxis%d' % axis_no] = dict(
            title=xlabel,
            domain=pos[i][0],
            anchor='y%d' % axis_no,
            zeroline=False,
            autotick=True,
            tickcolor=bgcolor,
            nticks=5
        )
        layout['yaxis%d' % axis_no] = dict(
            title=ylabels[i] if i < len(ylabels) else 'Count',
            domain=pos[i][1],
            anchor='x%d' % axis_no,
            zeroline=False,
            gridcolor=gridcolor,
            autotick=True,
            tickcolor=bgcolor,
            nticks=5
        )

    fig = go.Figure(data=data, layout=layout)
    iplot(fig)

In [15]:
regions = ['Middle East & North Africa', 'South Asia', 'Western Europe', 'North America']

# For every focus region collect the years and the number of attacks per year.
x, y = [], []
for focus_region in regions:
    yearly = df[df["region"] == focus_region].groupby('year').size().reset_index(name='size')
    years, counts = zip(*yearly.values.tolist())
    x.append(years)
    y.append(counts)
x, y = tuple(x), tuple(y)

smallmulti(x, y, 'Development of Terrorist Attacks in Focus Regions', regions, ['Count']*4)

Furthermore, we must have the most terrifying times behind us, back in the 20th century, when the number of attacks was at its maximum. After that, we have actually done quite well (compared to other hot regions).

### Distribution of Fatalities and Injuries

We also want to know how many attacks caused how many deaths and injuries, both on targets and terrorists. For this, we can rely on small multiples to show four different histograms.

In [16]:
# (column in df, panel title) for each of the four histograms.
col_labels = [
    ('nkilled', 'Killed'),
    ('nwounded', 'Wounded'),
    ('nkilledter', 'Killed Terrorists'),
    ('nwoundedter', 'Wounded Terrorists')
]

# Explicit float so the division below is true division even when
# ``from __future__ import division`` is not active.
n_attacks = float(len(df.index))

x, y, panel_titles = [], [], []
for col, label in col_labels:
    # Negative values are excluded from the counts; the ten most frequent
    # remaining values are plotted as a share of all attacks.
    known = df.loc[df[col] >= 0, col]
    x_, y_ = zip(*Counter(known.values.tolist()).most_common(10))
    x.append(x_)
    y.append(np.asarray(y_) / n_attacks)
    panel_titles.append(label)

# panel_titles replaces ``zip(*col_labels)[1]``, which only works where zip()
# returns a list (Python 2).
smallmulti(x, y, 'Distribution of Fatalities and Injuries', panel_titles, ['Frequency'] * len(col_labels))

To our big surprise, nearly half of all attacks on targets went without deaths and more than half without injuries. In more than half of all cases the terrorist survived.

## Multidimensional data

### Terrorist Attacks by Weapon

In [17]:
def scatterplot(col, title, xlabel, ylabel, color='skyblue'):
    """Plot one bubble per category of ``col`` on log-log axes.

    For every distinct value of ``df[col]`` the marker is placed at
    x = total ``nkilled`` and y = total ``nwounded``; its size grows with the
    number of attacks in that category.

    Parameters
    ----------
    col : str
        Categorical column of ``df`` to group by.
    title : str
        Figure title.
    xlabel, ylabel : str
        Axis titles. The x axis carries the killed totals and the y axis the
        wounded totals, so pass the labels in that order.
    color : str
        Marker fill colour.

    The figure is shown via ``iplot``; nothing is returned.
    """
    grouped = df.groupby(col)
    z = grouped.size()
    x = grouped['nkilled'].sum()
    y = grouped['nwounded'].sum()
    # Take the labels from the grouped index so they line up with x, y and z.
    # ``df[col].unique()`` is in order of first appearance while groupby sorts
    # its keys, which attached the wrong name to each marker.
    labels = z.index.tolist()
    attacks, killed, wounded = z.values.tolist(), x.values.tolist(), y.values.tolist()
    hover = [
        '%s<br>%s Attacks<br>%s Killed<br>%s Wounded<br>' % row
        for row in zip(labels, attacks, killed, wounded)
    ]

    data = [
        go.Scatter(
            x=killed,
            y=wounded,
            mode='markers+text',
            marker=dict(
                # 1000.0 keeps this a true division without relying on
                # ``from __future__ import division``.
                size=10 + np.asarray(attacks) / 1000.0,
                color=color,
                opacity=1,
                line=dict(
                    color=bgcolor
                )
            ),
            # ``text`` is drawn next to the marker; ``hovertext`` is what the
            # hover box shows because hoverinfo is 'text'.
            hoverinfo='text',
            text=labels,
            hovertext=hover,
            textfont=dict(
                size=smallsize
            ),
            textposition='top'
        )
    ]

    layout = go.Layout(
        title=title,
        titlefont=dict(
            size=titlesize
        ),
        plot_bgcolor=bgcolor,
        paper_bgcolor=bgcolor,
        font=dict(
            color=fontcolor,
            size=defaultsize
        ),
        xaxis=dict(
            type='log',
            title=xlabel,
            showgrid=True,
            gridcolor=gridcolor,
            zeroline=False,
            autotick=True,
            tickcolor=bgcolor,
            nticks=4
        ),
        yaxis=dict(
            type='log',
            title=ylabel,
            showgrid=True,
            gridcolor=gridcolor,
            zeroline=False,
            autotick=True,
            tickcolor=bgcolor,
            nticks=4
        )
    )

    fig = go.Figure(data=data, layout=layout)
    iplot(fig)

In [18]:
# scatterplot() puts the killed totals on x and the wounded totals on y, so
# the axis labels are passed in that order.
scatterplot('weapontype', 'Terrorist Attacks by Weapon', 'Killed', 'Wounded', color='orange')

What about typical terror-related weapons? The most death- and injury-bringing weapons are incendiary bombs and devices, but also chemical weapons. Both are also the most commonly used ones.

### Terrorist Attacks by Type

In [19]:
# scatterplot() puts the killed totals on x and the wounded totals on y, so
# the axis labels are passed in that order.
scatterplot('attacktype', 'Terrorist Attacks by Type', 'Killed', 'Wounded', color='orange')

After we familiarized ourselves with weapons, we would like to know how terrorists proceed to cause the highest damage. From the list of all possible actions, explosions and armed assaults both create the highest impact. 

### Terrorist Attacks by Target

In [20]:
# scatterplot() puts the killed totals on x and the wounded totals on y, so
# the axis labels are passed in that order.
scatterplot('targettype', 'Terrorist Attacks by Target', 'Killed', 'Wounded', color='orange')

Less surprising is the vulnerability of tourists, private citizens and transportation towards those types of attacks. Police officers feel the most secure though (even more than military).

## Geodata

### Choropleth

Because we have the coordinates of each attack, we can easily draw them on a map. For this, we either use a choropleth to highlight those countries hardest hit by terrorism, or we use a scatter plot to highlight attacks in response to their attributes (e.g., the number of fatalities and injuries).

In [21]:
# Choropleth of the number of attacks per country within one region.
region = 'Middle East & North Africa'
region_df = df.loc[df.region.isin([region])]
# One (country code, number of attacks) pair per country in the region.
country_codes, nattacks = zip(*region_df.groupby('countrycode').size().reset_index(name='size').values.tolist())

# Bounding box of the region's attacks, used to zoom the map. Series.min() /
# .max() are vectorised and skip missing coordinates, unlike the builtin
# min() / max() applied to a Series.
lon_range = [region_df.lon.min(), region_df.lon.max()]
lat_range = [region_df.lat.min(), region_df.lat.max()]

data = [
    go.Choropleth(
        locations=country_codes,
        z=nattacks,
        colorscale=[[0, colors[8]], [1, colors[4]]],
        autocolorscale=False,
        reversescale=False,
        showscale=False,
        marker=dict(
            line=dict(
                color=bgcolor
            )
        )
    )
]

layout = go.Layout(
    title='Terrorist Attacks across %s (1970-2015)'%region,
    titlefont=dict(
        size=titlesize
    ),
    geo=dict(
        scope='world',
        showframe=False,
        showcoastlines=True,
        showland=True,
        bgcolor=bgcolor,
        landcolor=colors[8],
        showcountries=True,
        countrycolor=bgcolor,
        coastlinecolor=bgcolor,
        projection=dict(
            type='conic conformal',
        ),
        lonaxis=dict(range=lon_range),
        lataxis=dict(range=lat_range)
    ),
    autosize=False,
    width=1000,
    height=700,
    plot_bgcolor=bgcolor,
    paper_bgcolor=bgcolor,
    font=dict(
        color=fontcolor
    ),
    margin=dict(
        l=50,
        r=50
    )
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [22]:
# Group terrorist group names by country and count
gname_by_country = region_df.gname[region_df.gname != 'Unknown'].groupby([region_df.countrycode, region_df.gname]).size().reset_index(name='count')
# Return row indices of the groups with biggest counts
rowidx = gname_by_country['count'].groupby(gname_by_country['countrycode']).idxmax().reset_index(name='rowidx')['rowidx'].values.tolist()
# Extract columns separately
countrycodes, gnames, counts = zip(*gname_by_country.loc[rowidx, :].values.tolist())

# Setup color scale
# Sorting makes the group -> index (and therefore group -> colour) assignment
# the same on every run; iterating over a bare set has no fixed order.
mapper = {
    gname: i
    for i, gname in enumerate(sorted(set(gnames)))
}
n_groups = len(mapper)
scale = cl.scales['12']['qual']['Set3']
# clamp() is the helper defined in the colour-setup cell; it is reused here
# rather than defined a second time.
colorscl = [
    # float() keeps this a true division without relying on
    # ``from __future__ import division``.
    [i / float(n_groups), '#%02x%02x%02x' % (clamp(rgb[0]), clamp(rgb[1]), clamp(rgb[2]))]
    for i, rgb in enumerate(cl.to_numeric(cl.to_rgb(cl.interp(scale, n_groups))))
]
# Colorscale must begin with 0 and end with 1 !
colorscl[-1] = [1.0, colorscl[-1][1]]

# Bounding box of the region's attacks, used to zoom the map. Series.min() /
# .max() are vectorised and skip missing coordinates.
lon_range = [region_df.lon.min(), region_df.lon.max()]
lat_range = [region_df.lat.min(), region_df.lat.max()]

data = [
    go.Choropleth(
        # Use the codes extracted together with gnames above. The previous
        # cell's ``country_codes`` also contains countries whose attacks are
        # all attributed to 'Unknown', so it may not line up with z / text.
        locations=countrycodes,
        z=[mapper[gname] for gname in gnames],
        colorscale=colorscl,
        autocolorscale=False,
        reversescale=False,
        showscale=False,
        marker=dict(
            line=dict(
                color=bgcolor
            )
        ),
        text=gnames
    )
]

layout = go.Layout(
    title='Most Present Terrorist Groups across %s (1970-2015)'%region,
    titlefont=dict(
        size=titlesize
    ),
    geo=dict(
        scope='world',
        showframe=False,
        showcoastlines=True,
        showland=True,
        bgcolor=bgcolor,
        landcolor=colors[8],
        showcountries=True,
        countrycolor=bgcolor,
        coastlinecolor=bgcolor,
        projection=dict(
            type='conic conformal',
        ),
        lonaxis=dict(range=lon_range),
        lataxis=dict(range=lat_range)
    ),
    autosize=False,
    width=1000,
    height=700,
    plot_bgcolor=bgcolor,
    paper_bgcolor=bgcolor,
    font=dict(
        color=fontcolor
    ),
    margin=dict(
        l=50,
        r=50
    )
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

The map above colors each country by the predominant terrorist group. 