# Importing Libraries

In [1]:
import numpy as np
import pandas as pd

# Importing The Dataset

In [2]:
# Load the output of the first cleaning pass.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
input_csv_path = "A:/MinorProjectData/GlobalTerrorCleanPartOne.csv"
data = pd.read_csv(input_csv_path)

# Handling Missing Values

In [3]:
# Count the missing values per column (isna is the same check as isnull).
data.isna().sum()

Unnamed: 0              0
eventid                 0
iyear                   0
imonth                  0
iday                    0
extended                0
country                 0
country_txt             0
region                  0
region_txt              0
provstate           14502
city                  446
latitude             4606
longitude            4606
specificity             4
vicinity                0
crit1                   0
crit2                   0
crit3                   0
doubtterr               0
multiple                0
success                 0
suicide                 0
attacktype1             0
attacktype1_txt         0
targtype1               0
targtype1_txt           0
targsubtype1         9345
targsubtype1_txt     9345
corp1               42566
target1               636
natlty1              1394
natlty1_txt          1394
gname                   0
guncertain1           379
individual              0
weaptype1               0
weaptype1_txt           0
weapsubtype1

| Using -1 for unknown values in each column | <br>
| Also replacing -9 with -1, so that unknown values that were earlier encoded as -9 are consistently represented by -1 |

In [4]:
# Column specificity: encode missing values as -1 (unknown).
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# selected column may act on a temporary copy and leave `data` unchanged
# (pandas Copy-on-Write), and raises a FutureWarning on pandas 2.x.
data['specificity'] = data['specificity'].fillna(-1)

In [5]:
# Column ishostkid: encode missing values as -1 (unknown), then fold the
# -9 code into -1 as well.
# fillna is assigned back rather than called with inplace=True on the selected
# column, which may only modify a temporary copy under pandas Copy-on-Write.
data['ishostkid'] = data['ishostkid'].fillna(-1)
data.loc[data['ishostkid'] == -9, 'ishostkid'] = -1

In [6]:
# Column vicinity: recode the -9 marker to -1.
vicinity_is_minus_nine = data['vicinity'] == -9
data.loc[vicinity_is_minus_nine, 'vicinity'] = -1

In [7]:
# Column doubtterr: recode the -9 marker to -1.
doubtterr_is_minus_nine = data['doubtterr'] == -9
data.loc[doubtterr_is_minus_nine, 'doubtterr'] = -1

In [8]:
# Target columns: text columns get 'Unknown', the numeric code column gets -1.
# Results are assigned back instead of using fillna(inplace=True) on a selected
# column, which may only modify a temporary copy under pandas Copy-on-Write.
data['targsubtype1_txt'] = data['targsubtype1_txt'].fillna('Unknown')
data['targsubtype1'] = data['targsubtype1'].fillna(-1)
data['target1'] = data['target1'].fillna('Unknown')

In [9]:
# Column corp1: fill missing values with 'Unknown'.
# Assigned back instead of fillna(inplace=True) on a selected column, which may
# only modify a temporary copy under pandas Copy-on-Write.
data['corp1'] = data['corp1'].fillna('Unknown')

In [10]:
# Nationality columns: text column gets 'Unknown', numeric code column gets -1.
# Assigned back instead of fillna(inplace=True) on a selected column, which may
# only modify a temporary copy under pandas Copy-on-Write.
data['natlty1_txt'] = data['natlty1_txt'].fillna('Unknown')
data['natlty1'] = data['natlty1'].fillna(-1)

In [11]:
# Column guncertain1: encode missing values as -1 (unknown).
# Assigned back instead of fillna(inplace=True) on a selected column, which may
# only modify a temporary copy under pandas Copy-on-Write.
data['guncertain1'] = data['guncertain1'].fillna(-1)

In [12]:
# Weapon sub-type columns: text column gets 'Unknown', numeric code column gets -1.
# Assigned back instead of fillna(inplace=True) on a selected column, which may
# only modify a temporary copy under pandas Copy-on-Write.
data['weapsubtype1_txt'] = data['weapsubtype1_txt'].fillna('Unknown')
data['weapsubtype1'] = data['weapsubtype1'].fillna(-1)

In [14]:
# Column provstate (state or region): fill missing values with 'Unknown'.
# Assigned back instead of fillna(inplace=True) on a selected column, which may
# only modify a temporary copy under pandas Copy-on-Write.
data['provstate'] = data['provstate'].fillna('Unknown')

In [15]:
# Column city: fill missing values with 'Unknown'.
# Assigned back instead of fillna(inplace=True) on a selected column, which may
# only modify a temporary copy under pandas Copy-on-Write.
data['city'] = data['city'].fillna('Unknown')

In [16]:
# Column corp1 (entity or organisation targeted): fill missing values with 'Unknown'.
# Assigned back instead of fillna(inplace=True) on a selected column, which may
# only modify a temporary copy under pandas Copy-on-Write.
# The fill is idempotent, so it is harmless if corp1 has no missing values left.
data['corp1'] = data['corp1'].fillna('Unknown')

In [19]:
# Column target1 (primary target): fill missing values with 'Unknown'.
# Assigned back instead of fillna(inplace=True) on a selected column, which may
# only modify a temporary copy under pandas Copy-on-Write.
# The fill is idempotent, so it is harmless if target1 has no missing values left.
data['target1'] = data['target1'].fillna('Unknown')

In [20]:
# Re-check the per-column missing-value counts after the fills above.
data.isna().sum()

Unnamed: 0              0
eventid                 0
iyear                   0
imonth                  0
iday                    0
extended                0
country                 0
country_txt             0
region                  0
region_txt              0
provstate               0
city                    0
latitude             4606
longitude            4606
specificity             0
vicinity                0
crit1                   0
crit2                   0
crit3                   0
doubtterr               0
multiple                0
success                 0
suicide                 0
attacktype1             0
attacktype1_txt         0
targtype1               0
targtype1_txt           0
targsubtype1            0
targsubtype1_txt        0
corp1                   0
target1                 0
natlty1                 0
natlty1_txt             0
gname                   0
guncertain1             0
individual              0
weaptype1               0
weaptype1_txt           0
weapsubtype1

In [23]:
# Distribution of the casualty columns at every decile, one row per column.
decile_points = [0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 1.0]
casualty_summary = data[['nkill', 'nwound']].describe(percentiles=decile_points)
casualty_summary.T

Unnamed: 0,count,mean,std,min,10%,20%,30%,40%,50%,60%,70%,80%,90%,100%,max
nkill,160668.0,2.387246,11.327709,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,5.0,1500.0,1500.0
nwound,155025.0,3.200239,34.647365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,7.0,7366.0,7366.0


In [24]:
#FUNCTION TO FILL MISSING VALUES IN 'nkill' and 'nwound'
def filling_missing_values(col, k=3):
    """Choose a single value to impute the missing entries of a numeric column.

    The median is returned when the column contains outliers (it is robust to
    extreme values); otherwise the mean is returned.

    Parameters
    ----------
    col : pandas.Series
        Numeric column, possibly containing missing values.
    k : int or float, default 3
        A value is treated as an outlier when it lies more than ``k`` standard
        deviations away from the column mean, on either side.

    Returns
    -------
    float
        The median of the non-missing values if any outlier is present, else
        their mean. ``0.0`` when the column has no non-missing values at all.
    """
    col_clean = col.dropna()

    # Nothing to compute statistics from: fall back to the neutral default
    # instead of returning NaN, which would make a later fillna a silent no-op.
    if col_clean.empty:
        return 0.0

    col_mean = col_clean.mean()
    col_std = col_clean.std()

    # Outlier if not within k standard deviations of the mean. The distance is
    # measured from the mean (not from zero) and in both directions.
    # With a single value the std is NaN, the comparison is False, and the
    # mean (the value itself) is returned.
    has_outliers = ((col_clean - col_mean).abs() > k * col_std).any()

    if has_outliers:
        return col_clean.median()
    return col_mean
    

In [27]:
# Impute 'nkill' and 'nwound' with the value chosen by filling_missing_values.
for casualty_col in ('nkill', 'nwound'):
    replacement = filling_missing_values(data[casualty_col])
    data[casualty_col] = data[casualty_col].fillna(replacement)

In [44]:
# Latitude And Longitude Values
# No procedure can be used for handling the missing longitude and latitude values.
# Best option is to drop the rows with these missing values. Number of rows are 4606.
# The drop is restricted to the two coordinate columns so that a missing value
# in any unrelated column cannot silently remove additional rows.
data = data.dropna(subset=['latitude', 'longitude']).copy()

In [45]:
# Print the per-column non-null counts and dtypes of the frame.
data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 165744 entries, 0 to 170349
Data columns (total 49 columns):
Unnamed: 0          165744 non-null int64
eventid             165744 non-null int64
iyear               165744 non-null int64
imonth              165744 non-null int64
iday                165744 non-null int64
extended            165744 non-null int64
country             165744 non-null int64
country_txt         165744 non-null object
region              165744 non-null int64
region_txt          165744 non-null object
provstate           165744 non-null object
city                165744 non-null object
latitude            165744 non-null object
longitude           165744 non-null float64
specificity         165744 non-null float64
vicinity            165744 non-null int64
crit1               165744 non-null int64
crit2               165744 non-null int64
crit3               165744 non-null int64
doubtterr           165744 non-null int64
multiple            165744 non-null int6

In [46]:
# Final check: per-column missing-value counts after dropping rows.
data.isna().sum()

Unnamed: 0          0
eventid             0
iyear               0
imonth              0
iday                0
extended            0
country             0
country_txt         0
region              0
region_txt          0
provstate           0
city                0
latitude            0
longitude           0
specificity         0
vicinity            0
crit1               0
crit2               0
crit3               0
doubtterr           0
multiple            0
success             0
suicide             0
attacktype1         0
attacktype1_txt     0
targtype1           0
targtype1_txt       0
targsubtype1        0
targsubtype1_txt    0
corp1               0
target1             0
natlty1             0
natlty1_txt         0
gname               0
guncertain1         0
individual          0
weaptype1           0
weaptype1_txt       0
weapsubtype1        0
weapsubtype1_txt    0
nkill               0
nwound              0
property            0
ishostkid           0
dbsource            0
INT_LOG   

# Writing The Clean Dataset To A CSV File

In [43]:
# Persist the cleaned frame for the next stage.
# NOTE(review): the row index is written too (to_csv default), so it will show
# up as an extra unnamed first column when the file is read back.
output_csv_path = "A:/MinorProjectData/GlobalTerrorCleanPartTwo.csv"
data.to_csv(output_csv_path, encoding="utf-8")