# I. Preprocessing data

The code below cleans and filters the original data.

In [12]:
# Wanted columns
# Integer positions of the columns to load from the raw CSV; the list is passed
# to pd.read_csv(usecols=...) below, so it depends on the column order of that file.
usecols = [1, 5, 8, 10, 11, 12, 13, 14, 25, 26, 27, 29, 35, 58, 71, 73, 84, 100, 102, 103, 105, 106, 108]
# Maps original column labels to the shorter names used in the rest of the
# notebook (applied with df.rename); labels not listed here are left unchanged.
renamecols = {
    'latitude': 'lat',
    'longitude': 'lon',
    'iyear': u'year',
    'country_txt': u'country',
    'region_txt': u'region',
    'provstate': u'state',
    'attacktype1_txt': u'attacktype',
    'targtype1_txt': u'targettype',
    'weaptype1_txt': u'weapontype',
    'nperps': u'nter',
    'nkill': u'nkilled',
    'nkillter': u'nkilledter',
    'nwound': u'nwounded',
    'nwoundte': u'nwoundedter',
    'propextent_txt': u'propertyextent'
}

In [13]:
import pandas as pd
import numpy as np

# Load only the wanted columns; the file is decoded as ISO-8859-1.
df = pd.read_csv('data/globalterrorismdb_0616dist.csv', encoding='ISO-8859-1', usecols=usecols)
df = df.rename(columns=renamecols)
# Trim surrounding whitespace from every text cell. DataFrame.apply hands the
# function whole columns (Series), not individual cells, so the work is done
# per column through the .str accessor. No manual UTF-8 encoding is needed:
# the encoding is chosen when the frame is written out.
for text_col in df.select_dtypes(include=['object']).columns:
    stripped = df[text_col].str.strip()
    # .str.strip() returns NaN for non-string cells; keep the original value there.
    df[text_col] = stripped.where(stripped.notnull(), df[text_col])

In [14]:
# Show the column labels after renaming.
df.columns

Index([          u'year',       u'extended',        u'country',
               u'region',          u'state',           u'city',
                  u'lat',            u'lon',       u'multiple',
              u'success',        u'suicide',     u'attacktype',
           u'targettype',          u'gname',           u'nter',
              u'claimed',     u'weapontype',        u'nkilled',
           u'nkilledter',       u'nwounded',    u'nwoundedter',
             u'property', u'propertyextent'],
      dtype='object')

In [4]:
# Show the frame's dimensions as (rows, columns).
df.shape

(156772, 23)

In [5]:
# Discard every record whose year is stored as 0
zero_year_rows = df.index[(df.year == 0).values]
df = df.drop(zero_year_rows)

In [6]:
# Keep only records that have both a latitude and a longitude
df = df.dropna(subset=['lat', 'lon'])

In [7]:
# Clean up unknown values in the numeric columns
exclude_cols = ['year', 'lat', 'lon']
float_cols = [
    name
    for name in df.select_dtypes(include=[float]).columns.tolist()
    if name not in exclude_cols
]
# A NaN forces its column to dtype=float, so missing values are replaced by 0
# and the columns are then cast to int.
int_values = df[float_cols].fillna(0).astype(int)
# Negative entries (presumably the dataset's "unknown" codes - confirm) are
# reset to 0 as well.
df[float_cols] = int_values.mask(int_values < 0, 0)

In [8]:
# Clean up unknown values in the text columns.
# The selection is "every non-float column".
str_cols = list(df.select_dtypes(exclude=[float]).columns)
# Missing cells become 'Unknown' ...
filled = df[str_cols].fillna('Unknown')
# ... and so do cells made up only of dots (or empty), which also mean "unknown"
df[str_cols] = filled.replace(r'^\.*$', 'Unknown', regex=True)

In [9]:
# Swap verbose category labels for shorter ones
weapon_labels = {
    u'Vehicle (not to include vehicle-borne explosives, i.e., car or truck bombs)': 'Vehicle',
}
extent_labels = {
    u'Minor (likely < $1 million)': u'Minor (< $1 million)',
    u'Major (likely > $1 million but < $1 billion)': u'Major (< $1 billion)',
    u'Catastrophic (likely > $1 billion)': u'Catastrophic (> $1 billion)',
}

df['weapontype'] = df['weapontype'].replace(weapon_labels)
df['propertyextent'] = df['propertyextent'].replace(extent_labels)

In [10]:
import pycountry

# Remove the records whose country is listed as 'International'
international_rows = df.index[(df.country == u'International').values]
df = df.drop(international_rows)

# Country names used in the dataset that are resolved by hand instead of through
# pycountry's name lookup. Built once at definition time instead of on every call.
_ALPHA_3_OVERRIDES = {
    u'West Germany (FRG)': u'DEU',
    u'East Germany (GDR)': u'DEU',
    u'Vatican City': u'VAT',
    u'Venezuela': u'VEN',
    u'Iran': u'IRN',
    u'South Yemen': u'YEM',
    u'North Yemen': u'YEM',
    u'Bolivia': u'BOL',
    u'West Bank and Gaza Strip': u'PSE',
    u'South Vietnam': u'VNM',
    # NOTE(review): MNE is Montenegro's code, while 'Serbia-Montenegro' below maps
    # to SRB; confirm that Yugoslavia -> MNE is intended.
    u'Yugoslavia': u'MNE',
    u'Zaire': u'COD',
    u'Democratic Republic of the Congo': u'COD',
    u'Syria': u'SYR',
    u'South Korea': u'KOR',
    u'Taiwan': u'TWN',
    u"People's Republic of the Congo": u"COG",
    u"Republic of the Congo": u"COG",
    u'Rhodesia': u'ZWE',
    u'Soviet Union': u'RUS',
    u'Tanzania': u'TZA',
    u'New Hebrides': u'VUT',
    u'Falkland Islands': u'FLK',
    u'Czechoslovakia': u'CZE',
    u'Laos': u'LAO',
    u'Moldova': u'MDA',
    u'Russia': u'RUS',
    u'Ivory Coast': u'CIV',
    u'Bosnia-Herzegovina': u'BIH',
    u'Brunei': u'BRN',
    u'Vietnam': u'VNM',
    u'Slovak Republic': u'SVK',
    u'Macedonia': u'MKD',
    u'Czech Republic': u'CZE',
    u'North Korea': u'PRK',
    u'St. Kitts and Nevis': u'KNA',
    u'Macau': u'MAC',
    u'Kosovo': u'XKX',
    u'East Timor': u'TLS',
    u'St. Lucia': u'LCA',
    u'Serbia-Montenegro': u'SRB'
}


def alpha_3(country):
    """Return the three-letter country code for a country name.

    Names listed in ``_ALPHA_3_OVERRIDES`` are answered from that table; any
    other name is looked up by exact name in ``pycountry.countries``.

    Parameters
    ----------
    country : str
        Country name as it appears in the ``country`` column.

    Returns
    -------
    str
        The three-letter code for ``country``.

    Raises
    ------
    KeyError
        If the name is neither in the override table nor known to pycountry.
    """
    override = _ALPHA_3_OVERRIDES.get(country)
    if override is not None:
        return override
    # Depending on the pycountry version, a failed lookup either raises
    # KeyError or returns None; both are turned into one descriptive KeyError.
    try:
        match = pycountry.countries.get(name=country)
    except KeyError:
        match = None
    if match is None:
        raise KeyError('No alpha-3 country code found for country name %r' % (country,))
    return match.alpha_3
# Look up the three-letter code for each record's country
df['countrycode'] = df['country'].map(alpha_3)
# Write the cleaned frame to disk as UTF-8, without the index column
df.to_csv('data/globalterrorism_cleaned.csv', encoding='utf-8', index=False)

In [11]:
# Preview the first rows of the cleaned frame instead of displaying all of it.
df.head(10)

Unnamed: 0,year,extended,country,region,state,city,lat,lon,multiple,success,...,nter,claimed,weapontype,nkilled,nkilledter,nwounded,nwoundedter,property,propertyextent,countrycode
0,1970,0,Dominican Republic,Central America & Caribbean,Unknown,Santo Domingo,18.456792,-69.951164,0,1,...,0,0,Unknown,1,0,0,0,0,Unknown,DOM
1,1970,0,Mexico,North America,Unknown,Mexico city,19.432608,-99.133207,0,1,...,7,0,Unknown,0,0,0,0,0,Unknown,MEX
2,1970,0,Philippines,Southeast Asia,Tarlac,Unknown,15.478598,120.599741,0,1,...,0,0,Unknown,1,0,0,0,0,Unknown,PHL
3,1970,0,Greece,Western Europe,Attica,Athens,37.983773,23.728157,0,1,...,0,0,Explosives/Bombs/Dynamite,0,0,0,0,1,Unknown,GRC
4,1970,0,Japan,East Asia,Unknown,Fukouka,33.580412,130.396361,0,1,...,0,0,Incendiary,0,0,0,0,1,Unknown,JPN
5,1970,0,United States,North America,Illinois,Cairo,37.005105,-89.176269,0,1,...,0,0,Firearms,0,0,0,0,1,Minor (< $1 million),USA
6,1970,0,Uruguay,South America,Montevideo,Montevideo,-34.891151,-56.187214,0,0,...,3,0,Firearms,0,0,0,0,0,Unknown,URY
7,1970,0,United States,North America,California,Oakland,37.805065,-122.273024,0,1,...,0,0,Explosives/Bombs/Dynamite,0,0,0,0,1,Minor (< $1 million),USA
8,1970,0,United States,North America,Wisconsin,Madison,43.076592,-89.412488,0,1,...,1,1,Incendiary,0,0,0,0,1,Minor (< $1 million),USA
9,1970,0,United States,North America,Wisconsin,Madison,43.072950,-89.386694,0,1,...,1,0,Incendiary,0,0,0,0,1,Minor (< $1 million),USA
