In [33]:
# Data Libraries 
import pandas as pd 
import numpy as np

# plotting libraries 
import matplotlib.pyplot as plt 
import seaborn as sns

# ignore warnings 
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)

_orginal Data source_
https://www.kaggle.com/START-UMD/gtd

In [2]:
# data encoding = latin1
archive = pd.read_csv('./Data/global-terrorism-db.csv',encoding='latin1',low_memory=False)

In [3]:
# data stats
print('Number of rows : {}'.format(archive.shape[0]))
print('Number of columns : {}'.format(archive.shape[1]))
print('Data Size :  {}'.format(archive.size))

Number of rows : 181691
Number of columns : 135
Data Size :  24528285


### Manual Column Selection 

In [4]:
cols = ['eventid', 'iyear', 'imonth', 'iday', 
        'extended', 'country', 'country_txt', 
        'region', 'region_txt', 'city', 'provstate', 
        'latitute', 'longitude','location','summary', 
        'success', 'suicide', 'location', 'attacktype_txt', 
        'targettype_txt', 'tagetsubtype1_txt', 'corp1', 
        'target1', 'natlty1', 'natlty1_txt', 'gname', 'motive', 
        'nperps', 'weaptype1', 'weaptype1_txt', 'nkill', 'propextent'] 

In [5]:
allcols = archive.columns.to_list()

In [6]:
# preliminary check 
def check_cols():
    for column in cols:
        if column in allcols:
            pass
        else:
            print('Invalid Column Name : ', column)
check_cols()

Invalid Column Name :  latitute
Invalid Column Name :  attacktype_txt
Invalid Column Name :  targettype_txt
Invalid Column Name :  tagetsubtype1_txt


In [7]:
# corrected 
cols = ['eventid', 'iyear', 'imonth', 'iday', 
        'extended', 'country', 'country_txt', 
        'region', 'region_txt', 'city', 'provstate', 
        'latitude', 'longitude','location','summary', 
        'success', 'suicide', 'attacktype1_txt', 
        'targtype1_txt', 'targsubtype1_txt', 'corp1', 
        'target1', 'natlty1', 'natlty1_txt', 'gname', 'motive', 
        'nperps', 'weaptype1', 'weaptype1_txt', 'nkill', 'propextent'] 

In [8]:
check_cols() # all okay 

### Slicing data 

In [9]:
print('Number of Columns Selected are {} out of total {}'.format(len(cols),archive.shape[1]))

Number of Columns Selected are 31 out of total 135


In [10]:
mainsheet = archive[cols]

In [11]:
mainsheet.head(10) 

Unnamed: 0,eventid,iyear,imonth,iday,extended,country,country_txt,region,region_txt,city,...,target1,natlty1,natlty1_txt,gname,motive,nperps,weaptype1,weaptype1_txt,nkill,propextent
0,197000000001,1970,7,2,0,58,Dominican Republic,2,Central America & Caribbean,Santo Domingo,...,Julio Guzman,58.0,Dominican Republic,MANO-D,,,13,Unknown,1.0,
1,197000000002,1970,0,0,0,130,Mexico,1,North America,Mexico city,...,"Nadine Chaval, daughter",21.0,Belgium,23rd of September Communist League,,7.0,13,Unknown,0.0,
2,197001000001,1970,1,0,0,160,Philippines,5,Southeast Asia,Unknown,...,Employee,217.0,United States,Unknown,,,13,Unknown,1.0,
3,197001000002,1970,1,0,0,78,Greece,8,Western Europe,Athens,...,U.S. Embassy,217.0,United States,Unknown,,,6,Explosives,,
4,197001000003,1970,1,0,0,101,Japan,4,East Asia,Fukouka,...,U.S. Consulate,217.0,United States,Unknown,,,8,Incendiary,,
5,197001010002,1970,1,1,0,217,United States,1,North America,Cairo,...,Cairo Police Headquarters,217.0,United States,Black Nationalists,To protest the Cairo Illinois Police Deparment,-99.0,5,Firearms,0.0,3.0
6,197001020001,1970,1,2,0,218,Uruguay,3,South America,Montevideo,...,Juan Maria de Lucah/Chief of Directorate of in...,218.0,Uruguay,Tupamaros (Uruguay),,3.0,5,Firearms,0.0,
7,197001020002,1970,1,2,0,217,United States,1,North America,Oakland,...,Edes Substation,217.0,United States,Unknown,,-99.0,6,Explosives,0.0,3.0
8,197001020003,1970,1,2,0,217,United States,1,North America,Madison,...,"R.O.T.C. offices at University of Wisconsin, M...",217.0,United States,New Year's Gang,To protest the War in Vietnam and the draft,1.0,8,Incendiary,0.0,3.0
9,197001030001,1970,1,3,0,217,United States,1,North America,Madison,...,Selective Service Headquarters in Madison Wisc...,217.0,United States,New Year's Gang,To protest the War in Vietnam and the draft,1.0,8,Incendiary,0.0,3.0


### Data Preprocessing

In [12]:
# Numerical 
missing = (mainsheet.isna().sum() / mainsheet.size ) * 100 # In percentages 

In [13]:
missing[missing > 0]

city                0.007705
provstate           0.007475
latitude            0.080889
longitude           0.080907
location            2.240529
summary             1.174078
targsubtype1_txt    0.184166
corp1               0.755448
target1             0.011292
natlty1             0.027679
natlty1_txt         0.027679
motive              2.328129
nperps              1.262601
nkill               0.183101
propextent          2.088374
dtype: float64

In [14]:
missing[missing > 0.5]

location      2.240529
summary       1.174078
corp1         0.755448
motive        2.328129
nperps        1.262601
propextent    2.088374
dtype: float64

In [15]:
def impute_value(column,value='Unknown'):
    column = column.fillna(value=value)
    return column

In [16]:
mainsheet['city'] = impute_value(mainsheet['city'],value='Unknown')

In [17]:
mainsheet['provstate'].value_counts()

Baghdad                7645
Northern Ireland       4498
Unknown                4290
Balochistan            3710
Saladin                3411
                       ... 
Pcinja ( District )       1
Ciudad Real               1
Volga (District)          1
Yamagata                  1
Paktika Province          1
Name: provstate, Length: 2855, dtype: int64

In [18]:
mainsheet['provstate'] = impute_value(mainsheet['provstate'],value='Unknown')

In [19]:
mainsheet['provstate'].value_counts()

Baghdad                7645
Unknown                4711
Northern Ireland       4498
Balochistan            3710
Saladin                3411
                       ... 
Pcinja ( District )       1
Ciudad Real               1
Volga (District)          1
Yamagata                  1
Paktika Province          1
Name: provstate, Length: 2855, dtype: int64

In [20]:
mainsheet['location'] 

0                                                       NaN
1                                                       NaN
2                                                       NaN
3                                                       NaN
4                                                       NaN
                                ...                        
181686       The incident occurred near the town of Balcad.
181687       The incident occurred at the Humaymim Airport.
181688    The incident occurred in the Datu Hoffer distr...
181689    The incident occurred in the Mantripukhri neig...
181690                                                  NaN
Name: location, Length: 181691, dtype: object

In [21]:
mainsheet['location'] = impute_value(mainsheet['location'],value='Not Registered')

In [22]:
# mainsheet['location'].value_counts()

In [23]:
mainsheet['summary'] = impute_value(mainsheet['summary'], value='Not Recorded')

In [24]:
# mainsheet['summary'].value_counts()

In [25]:
mainsheet['motive'] = impute_value(mainsheet['motive'], value='Unknown')

In [26]:
mainsheet['natlty1_txt'] = impute_value(mainsheet['natlty1_txt'], value='Unknown')

In [27]:
mainsheet['corp1'] = impute_value(mainsheet['corp1'],value='Unknown')

In [28]:
# Furthur Preprocessing Can be Done While practically working with columns in plots

### Data Segmentation 

- The thought process behind segmenting or filtering out data elements like summary and other categorical data out of the main dataset is to quickly gain access to any event's summary, location, type of event pivoting on the eventid; this not only increases the efficiency to learn about any event at hand which is interesting but also keeps things organised. 

__Total Number of Datasets After Segmenting__
1. The Mainsheet with Selected 31 columns 
2. Numerical Sheet ( on which all the Statistics are worked out )
3. Summary Sheet  ( To Get synopsis of an event using the eventid ) 


In [29]:
# MainSheet 
mainsheet.to_csv('./Data/Main-Sheet.csv') # csv

In [30]:
# Numerical
data = mainsheet.drop(['summary','location'],axis=1)
data.to_csv('./Data/Data.csv') # csv

In [31]:
# # Summary 
summary_cols = ['eventid', 'iyear', 'imonth', 'country_txt', 'region_txt', 'city', 'location', 'success', 'motive','summary' ]
summary = mainsheet[summary_cols]
summary.to_csv('./Data/summary.csv') # csv