# III. Data for visualization

## Load dataset

In [1]:
from __future__ import division
from collections import Counter
import math
import pandas as pd
import numpy as np

# Load the cleaned dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(
    'data/globalterrorism_cleaned.csv',
    encoding='utf-8',
)

In [2]:
# Show one randomly drawn row as a quick sanity check of the loaded frame.
df.sample(n=1)

Unnamed: 0,year,extended,country,region,state,city,lat,lon,multiple,success,...,nter,claimed,weapontype,nkilled,nkilledter,nwounded,nwoundedter,property,propertyextent,countrycode
19241,1983,0,Nicaragua,Central America & Caribbean,Unknown,San Juan Del Rio Coco,13.544303,-86.165168,0,1,...,0,0,Firearms,0,0,0,0,1,Unknown,NIC


In [3]:
# Display the column labels of the loaded frame.
df.columns

Index([u'year', u'extended', u'country', u'region', u'state', u'city', u'lat',
       u'lon', u'multiple', u'success', u'suicide', u'attacktype',
       u'targettype', u'gname', u'nter', u'claimed', u'weapontype', u'nkilled',
       u'nkilledter', u'nwounded', u'nwoundedter', u'property',
       u'propertyextent', u'countrycode'],
      dtype='object')

## Encode string values

In [4]:
import json

# Columns whose values are strings and therefore get replaced by integer codes.
str_cols = ['region', 'country', 'weapontype', 'attacktype', 'targettype', 'gname', 'countrycode']

# Collect the title-cased distinct values of every string column, then sort them.
# Values are NOT deduplicated across columns: a string present in several columns
# occupies several positions, and str_map keeps the last (highest) one.
unique_str = []
for col in str_cols:
    unique_str += [value.title() for value in df[col].unique().tolist()]
unique_str.sort()

# Fixed labels used by the exports; appended after sorting so they take the highest codes.
unique_str += ['World', 'Region', 'Country', 'Terrorist Group', 'Weapon Type', 'Attack Type', 'Target Type']
unique_str += ['All Regions', 'All Types']

# Forward (string -> code) and reverse (code -> string) lookups over the same list.
str_map = dict((label, code) for code, label in enumerate(unique_str))
str_map_rev = dict(enumerate(unique_str))

# Persist the reverse lookup as JSON (integer keys are written as JSON strings).
# NOTE(review): absolute, machine-specific output path.
with open('/Users/olegpolakow/Google Drive/My Documents/Education/Master/DTU_SocialData_SS17/socialdata2017/project/dicts/strings.json', 'w') as fp:
    json.dump(str_map_rev, fp)
    
def encode_df(df_):
    """Replace the values of every non-numeric column of ``df_`` with integer codes, in place.

    Each value is title-cased and looked up in the module-level ``str_map``
    (string -> code). Numeric columns of any width are left untouched.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If a value's title-cased form is missing from ``str_map``; the message
        names the offending column and value.
    """
    # Exclude the whole numeric family instead of only `float`/`int`, so columns such
    # as float32 or int8 are not mistaken for text and passed to `.title()`.
    text_cols = df_.select_dtypes(exclude=['number']).columns.tolist()
    for col in text_cols:
        # Title-case and look up each distinct value once rather than once per row.
        try:
            codes = dict((value, str_map[value.title()]) for value in df_[col].unique())
        except KeyError as err:
            raise KeyError('column %r: value %r is not in str_map' % (col, err.args[0]))
        df_[col] = df_[col].map(codes)

## Histogram: Temporal distribution

In [5]:
# Grouping column -> label written to the 'category' column of the export.
categories = {
    'region': 'Region',
    'country': 'Country',
    'gname': 'Terrorist Group',
    'weapontype': 'Weapon Type',
    'attacktype': 'Attack Type',
    'targettype': 'Target Type'
}
columns = ['category', 'subcategory', 'year', 'nkilled', 'nwounded', 'nkilledter', 'nwoundedter']
metrics = ['nkilled', 'nwounded', 'nkilledter', 'nwoundedter']

# Build one aggregated frame per category and concatenate them once at the end.
# This avoids DataFrame.append inside the loop (quadratic, removed in pandas 2.0)
# and avoids seeding with an empty object-dtype frame, which can upcast numeric columns.
frames = []

# items() exists on both Python 2 and Python 3 dicts.
for category, label in categories.items():
    grouped = df.groupby([category, 'year'])
    totals = grouped[metrics].sum()        # per-group sums of the casualty metrics
    totals['nattacks'] = grouped.size()    # per-group row count, aligned on the group index
    totals = totals.reset_index().rename(columns={category: 'subcategory'})
    totals['category'] = label
    frames.append(totals)

# Same aggregation over all rows, grouped by year only.
grouped = df.groupby('year')
totals = grouped[metrics].sum()
totals['nattacks'] = grouped.size()
totals = totals.reset_index()
totals['category'] = 'World'
totals['subcategory'] = 'All Regions'
frames.append(totals)

# Select an explicit column order so the CSV layout does not depend on how
# concat orders the union of columns.
df_ = pd.concat(frames, ignore_index=True)[columns + ['nattacks']]

In [6]:
# Drop the 'Unknown' bucket. .copy() yields an independent frame, so the in-place
# edits below do not operate on a filtered slice (SettingWithCopyWarning).
df_ = df_[df_.subcategory != 'Unknown'].copy()

# Force the count-like columns to integers before encoding and export.
count_cols = metrics + ['nattacks', 'year']
df_[count_cols] = df_[count_cols].astype(int)

# Replace the remaining string columns with their integer codes (in place).
encode_df(df_)

df_.to_csv('/Users/olegpolakow/Google Drive/My Documents/Education/Master/DTU_SocialData_SS17/socialdata2017/project/data/histogram.csv', encoding='utf-8', index=False)

## Scatterplot: Terrorism types

In [7]:
# Geographic grouping column -> label written to the 'category' column.
categories = {
    'region': 'Region',
    'country': 'Country'
}
# Type grouping column -> label written to the 'type' column.
types = {
    'weapontype': 'Weapon Type',
    'attacktype': 'Attack Type',
    'targettype': 'Target Type'
}
columns = ['category', 'subcategory', 'type', 'subtype', 'nattacks', 'nkilled', 'nwounded', 'nkilledter', 'nwoundedter']
metrics = ['nkilled', 'nwounded', 'nkilledter', 'nwoundedter']

# Collect the aggregated pieces and concatenate once. This avoids DataFrame.append
# in the loop (quadratic, removed in pandas 2.0) and avoids seeding with an empty
# object-dtype frame, which can upcast numeric columns.
frames = []

# items() exists on both Python 2 and Python 3 dicts.
for type_, tlabel in types.items():
    for category, clabel in categories.items():
        grouped = df.groupby([category, type_])
        totals = grouped[metrics].sum()        # per-group sums of the casualty metrics
        totals['nattacks'] = grouped.size()    # per-group row count, aligned on the group index
        totals = totals.reset_index().rename(columns={category: 'subcategory', type_: 'subtype'})
        totals['category'] = clabel
        totals['type'] = tlabel
        frames.append(totals)

    # Same aggregation over all rows, grouped by the type column only.
    grouped = df.groupby(type_)
    totals = grouped[metrics].sum()
    totals['nattacks'] = grouped.size()
    totals = totals.reset_index().rename(columns={type_: 'subtype'})
    totals['category'] = 'World'
    totals['subcategory'] = 'All Regions'
    totals['type'] = tlabel
    frames.append(totals)

# Explicit column order so the CSV layout does not depend on concat's ordering.
df_ = pd.concat(frames, ignore_index=True)[columns]

In [8]:
# Keep only rows where both the subcategory and the subtype are known. .copy() yields
# an independent frame, so the in-place edits below do not act on a filtered slice.
known = (df_['subcategory'] != 'Unknown') & (df_['subtype'] != 'Unknown')
df_ = df_[known].copy()

# Cast the count-like columns to integers before encoding, as the other export cells
# do; otherwise encode_df's dtype-based selection may pick them up as string columns.
count_cols = metrics + ['nattacks']
df_[count_cols] = df_[count_cols].astype(int)

# Replace the remaining string columns with their integer codes (in place).
encode_df(df_)

df_.to_csv('/Users/olegpolakow/Google Drive/My Documents/Education/Master/DTU_SocialData_SS17/socialdata2017/project/data/scatterplot.csv', encoding='utf-8', index=False)

## Choropleth: Geographical distribution

In [9]:
# Grouping column -> label written to the 'category' column of the export.
categories = {
    'gname': 'Terrorist Group',
    'weapontype': 'Weapon Type',
    'attacktype': 'Attack Type',
    'targettype': 'Target Type'
}
columns = ['category', 'subcategory', 'year', 'countrycode', 'nkilled', 'nwounded', 'nkilledter', 'nwoundedter']
metrics = ['nkilled', 'nwounded', 'nkilledter', 'nwoundedter']

# Collect the aggregated pieces and concatenate once. This avoids DataFrame.append
# in the loop (quadratic, removed in pandas 2.0) and avoids seeding with an empty
# object-dtype frame, which can upcast numeric columns.
frames = []

# items() exists on both Python 2 and Python 3 dicts.
for category, label in categories.items():
    grouped = df.groupby([category, 'countrycode', 'year'])
    totals = grouped[metrics].sum()        # per-group sums of the casualty metrics
    totals['nattacks'] = grouped.size()    # per-group row count, aligned on the group index
    totals = totals.reset_index().rename(columns={category: 'subcategory'})
    totals['category'] = label
    frames.append(totals)

# Same aggregation over all rows, grouped by country code and year only.
grouped = df.groupby(['countrycode', 'year'])
totals = grouped[metrics].sum()
totals['nattacks'] = grouped.size()
totals = totals.reset_index()
totals['category'] = 'All Types'
totals['subcategory'] = 'All Types'
frames.append(totals)

# Select an explicit column order so the CSV layout does not depend on how
# concat orders the union of columns.
df_ = pd.concat(frames, ignore_index=True)[columns + ['nattacks']]

In [10]:
# Drop the 'Unknown' bucket. .copy() yields an independent frame, so the in-place
# edits below do not operate on a filtered slice (SettingWithCopyWarning).
df_ = df_[df_.subcategory != 'Unknown'].copy()

# Force the count-like columns to integers before encoding and export.
count_cols = metrics + ['nattacks', 'year']
df_[count_cols] = df_[count_cols].astype(int)

# Replace the remaining string columns with their integer codes (in place).
encode_df(df_)

df_.to_csv('/Users/olegpolakow/Google Drive/My Documents/Education/Master/DTU_SocialData_SS17/socialdata2017/project/data/choropleth.csv', encoding='utf-8', index=False)

## K-Means: Core areas of terrorism

In [11]:
# Independent copy of the coordinate and type columns, so `df` itself stays untouched.
df_loc = df[['lon', 'lat', 'weapontype', 'attacktype', 'targettype']].copy()

# Round both coordinates to two decimals with Python's built-in round().
for coord in ('lon', 'lat'):
    df_loc[coord] = df_loc[coord].map(lambda value: round(value, 2))

# Replace the string columns with their integer codes (in place).
encode_df(df_loc)

df_loc.to_csv('/Users/olegpolakow/Google Drive/My Documents/Education/Master/DTU_SocialData_SS17/socialdata2017/project/data/points.csv', encoding='utf-8', index=False)

In [12]:
# Cluster every category and type
from sklearn.cluster import KMeans

str_cols = ['weapontype', 'attacktype', 'targettype']

# Cluster counts to try. Kept at module level because the "Cluster all attacks"
# cell reads `ks` as well.
ks = range(2, 7)

# One centroid frame per (column, subtype); concatenated once after the loops
# instead of calling DataFrame.append repeatedly (quadratic, removed in pandas 2.0).
centroid_frames = []
for col in str_cols:
    # Parenthesised single-argument form prints the same text on Python 2 and 3.
    print('Processing column "%s"' % col)
    for subtype in df_loc[col].unique().tolist():
        # Select this subtype's points once instead of re-filtering for every k.
        points = df_loc.loc[df_loc[col] == subtype, ['lon', 'lat']]
        centroid_data = []
        for k in ks:
            # Fixed random_state so a re-run reproduces the same centroids.
            model = KMeans(n_clusters=k, max_iter=50, random_state=0).fit(points)
            centroid_data.extend((center[0], center[1], k) for center in model.cluster_centers_)
        pd_temp = pd.DataFrame(centroid_data, columns=['lon', 'lat', 'k'])
        pd_temp['type'] = subtype
        centroid_frames.append(pd_temp)

if centroid_frames:
    df_ = pd.concat(centroid_frames, ignore_index=True)[['lon', 'lat', 'k', 'type']]
else:
    # Nothing was clustered: keep the empty, correctly labelled frame.
    df_ = pd.DataFrame(columns=['lon', 'lat', 'k', 'type'])

Processing column "weapontype"
Processing column "attacktype"
Processing column "targettype"


In [13]:
# Cluster all attacks
# Define the cluster counts here instead of relying on `ks` left over from the
# inner loop of the per-type clustering cell.
ks = range(2, 7)
points = df_loc[['lon', 'lat']]

centroid_data = []
for k in ks:
    # Fixed random_state so a re-run reproduces the same centroids.
    model = KMeans(n_clusters=k, max_iter=50, random_state=0).fit(points)
    centroid_data.extend((center[0], center[1], k) for center in model.cluster_centers_)

pd_temp = pd.DataFrame(centroid_data, columns=['lon', 'lat', 'k'])
# Encoding for "All Types"
all_types_code = str_map['All Types']
pd_temp['type'] = all_types_code

# Drop "All Types" rows left by an earlier run of this cell, so re-running does not
# duplicate them. Then add the new rows with concat (DataFrame.append was removed
# in pandas 2.0).
df_ = pd.concat([df_[df_['type'] != all_types_code], pd_temp], ignore_index=True)

In [14]:
# Cast the cluster count and the type code to integers, one column at a time.
for int_col in ('k', 'type'):
    df_[int_col] = df_[int_col].astype(int)

df_.to_csv('/Users/olegpolakow/Google Drive/My Documents/Education/Master/DTU_SocialData_SS17/socialdata2017/project/data/centroids.csv', encoding='utf-8', index=False)