Code for pre-processing the terrorism data. The rationale for these choices is in <rationale.ipynb>.

In [None]:
import math
import numpy as np
import re
import string
import scipy
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import datetime as dt

#this recodes WeapRecode1 and WeapRecode2 to specs in weaptype recode.xlsx
def weap_recode(wtype, wsubtype):
    """Collapse a weapon type / sub-type pair into one recoded value.

    Mapping applied (per the specs in weaptype recode.xlsx):

    * wtype 1-4    -> wtype
    * wtype 5      -> wsubtype + 3
    * wtype 6      -> wsubtype + 3 if wsubtype < 28, else wsubtype - 7
    * wtype 7      -> 23
    * wtype 8      -> wsubtype + 6
    * wtype 9      -> wsubtype + 6 if wsubtype < 26, else wsubtype + 5
    * wtype 10-13  -> wtype + 23

    Parameters:
        wtype: weapon type code; may be missing (NaN/None).
        wsubtype: weapon sub-type code; only read for types 5, 6, 8 and 9.

    Returns:
        The recoded value, or NaN when wtype is missing or is not one of
        the codes listed above.
    """
    if pd.isnull(wtype):
        return np.nan
    if wtype in (1, 2, 3, 4):
        return wtype
    if wtype == 5:
        return wsubtype + 3
    if wtype == 6:
        return wsubtype + 3 if wsubtype < 28 else wsubtype - 7
    if wtype == 7:
        return 23
    if wtype == 8:
        return wsubtype + 6
    if wtype == 9:
        return wsubtype + 6 if wsubtype < 26 else wsubtype + 5
    if wtype in (10, 11, 12, 13):
        return wtype + 23
    # Unrecognised code: report it as missing instead of failing on an
    # unassigned result variable.
    return np.nan

def c_weekday(y, m, d):
    """Return the weekday index of the date y-m-d.

    The index is the one given by datetime.weekday() (Monday is 0).
    A month or day equal to 0 yields NaN instead of a weekday.
    """
    if m == 0 or d == 0:
        return np.nan
    return dt.datetime(y, m, d).weekday()
    
# this recodes propextent to:
# 0 = no damage
# 1 = Minor (likely < $1 million)
# 2 = Major (likely > $1 million but < $1 billion)
# 3 =  Catastrophic (likely > $1 billion)       
# Nan = unknown

def prop_extent_recode(prop, prop_ext):
    """Recode the property-damage extent onto an increasing 0-3 scale.

    Parameters:
        prop: the property flag; 0 means no property damage.
        prop_ext: the original extent code (1, 2, 3, 4 or missing).

    Returns:
        * 4 - prop_ext when prop_ext is 1, 2 or 3 and prop is not missing
          (this reverses the original ordering so larger means more damage)
        * 0 when prop == 0 and no extent 1-3 is recorded
        * NaN otherwise (extent 4, missing extent, or missing prop)
    """
    if prop_ext in (1, 2, 3):
        # A recorded extent wins, but only if the property flag is present.
        return np.nan if pd.isnull(prop) else 4 - prop_ext
    if prop == 0:
        # No property damage: must be checked before treating a blank
        # extent as unknown, otherwise the 0 category is lost to NaN.
        return 0
    return np.nan


##########################
#
# Start preprocessing
#
##########################

# Load the raw spreadsheet globalterrorismdb_0616dist.xlsx from the data folder.

home = r"C:\Users\ibshi\Desktop\startup.ml\challenge 2\global terrorism\data"
infile = rf"{home}\globalterrorismdb_0616dist.xlsx"
indata = pd.read_excel(infile)

# Count the entries of every feature and write the counts to count.csv
# (author's note: the counts are put into data_summary.xlsx).

countfile = rf"{home}\count.csv"
indata_count = indata.count()
indata_count.to_csv(countfile)

# 1. Add Weekday

# NOTE: weekday_data is an alias of indata (no copy is made), so the new
# column is also visible through indata.
weekday_data = indata

# Compute the weekday of every row first and insert the finished column in
# one step. Writing cell by cell through weekday_data.Weekday[i] is chained
# assignment, which pandas does not guarantee to write back to the frame.
# The loop runs over weekday_data's own index; the name `weekday` does not
# exist yet at this point of the script.
weekday_values = []
rows = zip(weekday_data.index, weekday_data.iyear,
           weekday_data.imonth, weekday_data.iday)
for i, t_year, t_month, t_day in rows:
    if (i % 1000 == 0):
        print(i)  # progress indicator
    weekday_values.append(c_weekday(t_year, t_month, t_day))
weekday_data.insert(5, 'Weekday', weekday_values)

# 2. remove rows where terrorism status is in doubt

weekday_data = weekday_data[weekday_data.doubtterr != 1]

# save data to intermediary file weekday.xlsx
# NOTE(review): the save below is commented out, so the reload that follows
# reads whatever weekday.xlsx already exists on disk rather than the
# weekday_data frame built above -- presumably it was written by an earlier
# run; confirm the file is current before relying on it.

#home = r"C:\Users\ibshi\Desktop\startup.ml\challenge 2\global terrorism\data"
#weekdayfile = home + r"\weekday.xlsx"
#weekday_data.to_excel(weekdayfile)


# reload the intermediary file into `weekday`, the frame used by the
# remaining steps
home = r"C:\Users\ibshi\Desktop\startup.ml\challenge 2\global terrorism\data"
infile = home + r"\weekday.xlsx"
weekday = pd.read_excel(infile)

# 3. remove guncertain == 1, keeping ==0 and ==NaN
#    .copy() makes the filtered frame independent of the reloaded data, so
#    the edits in the following steps are not made on a slice.

weekday = weekday[weekday.guncertain1 != 1].copy()

# 4. if competing claim, set claimed to NaN
#    Series.mask builds the updated column, which is then assigned back as a
#    whole; the chained form weekday.claimed[...] = ... is not guaranteed to
#    write through to the frame.

weekday['claimed'] = weekday['claimed'].mask(weekday['compclaim'] == 1)

# 5. recode WeapRecode1, WeapRecode2
#    The recoded values are computed row by row with weap_recode and each
#    finished column is inserted in one step (cell-by-cell chained assignment
#    is not guaranteed to write back to the frame).

weap_recode2_values = [
    weap_recode(w, ws)
    for w, ws in zip(weekday['weaptype2'], weekday['weapsubtype2'])
]
weap_recode1_values = [
    weap_recode(w, ws)
    for w, ws in zip(weekday['weaptype1'], weekday['weapsubtype1'])
]

# Same positions and order as before: WeapRecode2 goes in first at column 93,
# then WeapRecode1 at column 88.
weekday.insert(93, 'WeapRecode2', weap_recode2_values)
weekday.insert(88, 'WeapRecode1', weap_recode1_values)

# 6. recode propextent so that property ==0 is included.  Also values switched
#    so 0 to 3 increase in value
#    The whole column is rebuilt with prop_extent_recode and assigned back in
#    one step instead of cell-by-cell chained assignment.

weekday['propextent'] = [
    prop_extent_recode(p, pe)
    for p, pe in zip(weekday['property'], weekday['propextent'])
]
    
# 7. recode nhostkid. If ishostkid == 0, set nhostkid = 0
#    Series.mask replaces the selected rows with 0; assigning the result back
#    as a whole column avoids chained assignment.

weekday['nhostkid'] = weekday['nhostkid'].mask(weekday['ishostkid'] == 0, 0)

# 8. nhours and ndays
#    there are nhours == 999, I'm assuming that's a NaN code
#    then ndays is rounded to nearest day. combine nhours into ndays so that
#    it is also rounded to nearest day (either 0 or 1)

hours = weekday['nhours']
hours_unknown = hours == 999.0
# Exclude the 999 rows from the conversion; otherwise they would be written
# as round(999 / 24) = 42 days instead of staying NaN.
hours_known = hours.notnull() & ~hours_unknown

ndays = weekday['ndays'].mask(hours_known, np.round(hours / 24.0))
weekday['ndays'] = ndays.mask(hours_unknown)

# 9. motive had many unknown text codes, change to NaN
#    The matching rows are blanked with Series.mask and the column is assigned
#    back as a whole instead of using chained assignment.

unknown_motives = [
    "The specific motive for the attack is unknown.",
    "Unknown",
]
weekday['motive'] = weekday['motive'].mask(
    weekday['motive'].isin(unknown_motives)
)

# Checkpoint the cleaned frame to the intermediary file weekday10.xlsx.

weekday10file = rf"{home}\weekday10.xlsx"
weekday.to_excel(weekday10file)

# 10. select features for analysis (into dataframe final_data)
#     final_data is the same object as weekday (no copy), so the columns
#     listed in droplist are removed from weekday as well.

droplist = [
    'approxdate', 'extended', 'resolution', 'country_txt', 'region_txt',
    'provstate', 'city', 'latitude', 'longitude', 'specificity', 'vicinity',
    'location', 'summary', 'doubtterr', 'alternative', 'alternative_txt',
    'attacktype1_txt', 'attacktype2_txt', 'attacktype3', 'attacktype3_txt',
    'targtype1_txt', 'targsubtype1_txt', 'corp1', 'target1', 'natlty1',
    'natlty1_txt',
    'targtype2', 'targtype2_txt', 'targsubtype2', 'targsubtype2_txt', 'corp2',
    'target2', 'natlty2', 'natlty2_txt', 'targtype3', 'targtype3_txt',
    'targsubtype3',
    'targsubtype3_txt', 'corp3', 'target3', 'natlty3', 'natlty3_txt',
    'gsubname',
    'gname2', 'gsubname2', 'gname3', 'ingroup', 'ingroup2', 'ingroup3',
    'gsubname3',
    'motive', 'guncertain1', 'guncertain2', 'guncertain3', 'claimmode_txt',
    'claim2', 'claimmode2', 'claimmode2_txt', 'claim3', 'claimmode3',
    'claimmode3_txt', 'compclaim', 'weaptype1_txt', 'weapsubtype1',
    'weapsubtype1_txt', 'weaptype2_txt', 'weapsubtype2', 'weapsubtype2_txt',
    'weaptype3', 'weaptype3_txt', 'weapsubtype3', 'weapsubtype3_txt',
    'weaptype4',
    'weaptype4_txt', 'weapsubtype4', 'weapsubtype4_txt', 'weapdetail',
    'nkillus',
    'nkillter', 'nwoundus', 'nwoundte', 'property', 'propextent_txt',
    'propvalue',
    'propcomment', 'ishostkid', 'nhostkidus', 'nhours', 'divert',
    'kidhijcountry',
    'ransomamt', 'ransomamtus', 'ransompaid', 'ransompaidus', 'ransomnote',
    'hostkidoutcome_txt', 'addnotes', 'scite1', 'scite2', 'scite3', 'dbsource',
    'INT_ANY', 'related',
]

final_data = weekday
final_data.drop(columns=droplist, inplace=True)

# 11. remove rows with gname == 'Unknown'

final_data = final_data[final_data.gname != 'Unknown']

# 12. Choose gname with entries (rows) >= 150
#     This left 63 gnames as the number of categories to predict
#     from the original 2865

# Terrorist group names with more than 149 entries.
temp = final_data.gname.value_counts()
temp149 = temp[temp > 149]
terrlist = list(temp149.index)

# Keep only the rows whose gname is in terrlist. Series.isin replaces the
# temporary 'inlist' column, which was filled cell by cell through chained
# assignment on a filtered frame (not guaranteed to write back) and tested
# membership against a list for every row. The .copy() makes the result an
# independent frame so later in-place drops do not act on a slice.
final_data = final_data[final_data.gname.isin(terrlist)].copy()

# Write the files used for the analyses: the gname labels go to Y_in.csv and
# the remaining feature columns (without gname and eventid) go to
# final_data.xlsx.

Y_in = final_data['gname']
final_data.drop(columns=['gname', 'eventid'], inplace=True)

outfile = rf"{home}\final_data.xlsx"
final_data.to_excel(outfile)

outfile = rf"{home}\Y_in.csv"
np.savetxt(outfile, Y_in, fmt='%s', delimiter=",")
