In [147]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib


from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
import bokeh
from bokeh.io import output_notebook, show
from bokeh.plotting import figure, output_file, show
from bokeh.plotting import  curdoc
output_notebook()

%matplotlib inline

## Importing raw data and inital EDA

In [2]:
raw_data = pd.read_csv('/home/roland/Workspace/Data/project_5/globalterrorismdb_0617dist.csv', encoding = "ISO-8859-1")# Looking at raw data
print(raw_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170350 entries, 0 to 170349
Columns: 135 entries, eventid to related
dtypes: float64(53), int64(24), object(58)
memory usage: 175.5+ MB
None


  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
from fuzzywuzzy import fuzz

def eda_helper(df):
    dict_list = []
    for col in df.columns:
        data = df[col]
        dict_ = {}
        # The null count for a column. Columns with no nulls are generally more interesting
        dict_.update({"null_count" : data.isnull().sum()})
        # Counting the unique values in a column
        # This is useful for seeing how interesting the column might be as a feature
        dict_.update({"unique_count" : len(data.unique())})
        # Finding the types of data in the column
        # This is useful for finding out potential problems with a column having strings and ints
        dict_.update({"data_type" : set([type(d).__name__ for d in data])})
        dict_list.append(dict_)
    eda_df = pd.DataFrame(dict_list)
    eda_df.index = df.columns
    eda_df = eda_df.sort_values(['null_count','unique_count'], ascending=[True, False])
    return eda_df

from fuzzywuzzy import fuzz
def match_name(name, list_names, min_score=0):
    # -1 score incase we don't get any matches
    max_score = -1
    # Returning empty name for no match as well
    max_name = ""
    # Iternating over all names in the other
    for name2 in list_names:
        #Finding fuzzy match score
        score = fuzz.ratio(name, name2)
        # Checking if we are above our threshold and have a better score
        if(score > max_score):
            max_name = name2
            max_score = score
    
    if max_score < min_score: max_score = -1
    return (max_name, max_score)

def date_renamer(df,threshold = 0):
    # This is where you define what you want your column names to be
    # I know for the terrorist data that iday, imonth, and iyear exist
    # but this could apply to any kind of naming convention.
    dates = ['day','month','year']
    for d in dates:
        match_result = match_name(d, df.columns, min_score=threshold)
        name = match_result[0]
        # If we do not find a score that meets our threshold skip it
        if match_result[1] == -1:
            continue
        df.rename(columns={ name: d }, inplace=True)

# def format_dates(df):
#     dates = ['month','date','year']
#     for d in dates:
#         if d == 'day':
            


In [4]:
# Renaming my columns
date_renamer(raw_data, threshold=70)

In [5]:
raw_data['year'] = pd.to_datetime(raw_data.year, format='%Y')

In [6]:
eda_df = eda_helper(raw_data)

In [7]:
pd.set_option("max_row", None)
eda_df

Unnamed: 0,data_type,null_count,unique_count
eventid,{int64},0,170350
gname,{str},0,3454
country,{int64},0,205
country_txt,{str},0,205
year,{Timestamp},0,46
day,{int64},0,32
dbsource,{str},0,26
targtype1,{int64},0,22
targtype1_txt,{str},0,22
month,{int64},0,13


In [8]:
# pulling columns without a lot of null values 
features = eda_df[eda_df.null_count < 10000].index
sub_data = raw_data[features].dropna()
del sub_data['eventid']

In [9]:
year_data = sub_data.groupby(['country', 'year']).count()

In [50]:
top_countries = sub_data.groupby('region_txt').count().nlargest(20, 'gname')
top_countries

Unnamed: 0_level_0,gname,country,country_txt,year,day,dbsource,targtype1,targtype1_txt,month,region,...,guncertain1,city,target1,natlty1,natlty1_txt,latitude,longitude,targsubtype1,targsubtype1_txt,nkill
region_txt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Middle East & North Africa,40765,40765,40765,40765,40765,40765,40765,40765,40765,40765,...,40765,40765,40765,40765,40765,40765,40765,40765,40765,40765
South Asia,37159,37159,37159,37159,37159,37159,37159,37159,37159,37159,...,37159,37159,37159,37159,37159,37159,37159,37159,37159,37159
South America,15528,15528,15528,15528,15528,15528,15528,15528,15528,15528,...,15528,15528,15528,15528,15528,15528,15528,15528,15528,15528
Western Europe,14002,14002,14002,14002,14002,14002,14002,14002,14002,14002,...,14002,14002,14002,14002,14002,14002,14002,14002,14002,14002
Sub-Saharan Africa,13175,13175,13175,13175,13175,13175,13175,13175,13175,13175,...,13175,13175,13175,13175,13175,13175,13175,13175,13175,13175
Southeast Asia,9823,9823,9823,9823,9823,9823,9823,9823,9823,9823,...,9823,9823,9823,9823,9823,9823,9823,9823,9823,9823
Central America & Caribbean,6862,6862,6862,6862,6862,6862,6862,6862,6862,6862,...,6862,6862,6862,6862,6862,6862,6862,6862,6862,6862
Eastern Europe,4586,4586,4586,4586,4586,4586,4586,4586,4586,4586,...,4586,4586,4586,4586,4586,4586,4586,4586,4586,4586
North America,2891,2891,2891,2891,2891,2891,2891,2891,2891,2891,...,2891,2891,2891,2891,2891,2891,2891,2891,2891,2891
East Asia,689,689,689,689,689,689,689,689,689,689,...,689,689,689,689,689,689,689,689,689,689


## Visualizations of terrorist attacks over time below



cmap = plt.get_cmap(name='autumn')
len(cc.b_cyclic_grey_15_85_c0)

In [259]:
import bokeh
from bokeh.io import output_file, show
from bokeh.layouts import layout
from bokeh.models import Toggle, BoxAnnotation, CustomJS
from bokeh.io import curdoc
from bokeh.io import output_notebook, show
from bokeh.plotting import figure, output_file, show, curdoc
from bokeh.models import Legend


p = figure(plot_width=1200, plot_height=500, x_axis_type="datetime")
norm = matplotlib.colors.Normalize(vmin=0, vmax=len(top_countries))

regions = {}
region_colors= {}
cmap = bokeh.palettes.Category20_20
items = []

for (i,c) in enumerate(top_countries.index):
    # create a new plot with a datetime axis type
    d = sub_data[sub_data.region_txt == c].groupby('year').count()
    region_colors[c] = cmap[int(norm(i)*len(cmap))]
    
    items.append((c,[p.line(d.index, d['attacktype1'], alpha=0.5,
                           color = region_colors[c],
                           muted_color='grey',
                           muted_alpha=0.05, 
                            line_cap = 'butt',
                            line_width=5)
                    ]
                 ))  


legend = Legend(items = items, location = (0,100))
legend.click_policy='mute'
p.add_layout(legend, 'right')
p.title.text_font_size = '20pt'
p.xaxis.axis_label = 'Year'
p.yaxis.axis_label = 'Total Terrorist Attacks'
p.title.text = 'Terrorist Attacks by Region'
p.toolbar_location = 'above'
    
    
# # We write coffeescript to link toggle with visible property of box and line
# code = '''\
# object.visible = toggle.active
# '''

#p.legend.location = "top_left"


# toggles = []
# for r in regions.keys():
#     callback1 = CustomJS.from_coffeescript(code=code, args={})
#     toggle1 = Toggle(label = r, button_type = "success", callback= callback1)
#     callback1.args = {'toggle': toggle1, 'object': regions[r]}
#     toggles.append(toggle1)
curdoc().clear()
show(p)


In [245]:
sub_data.columns

Index(['gname', 'country', 'country_txt', 'year', 'day', 'dbsource',
       'targtype1', 'targtype1_txt', 'month', 'region', 'region_txt',
       'weaptype1', 'weaptype1_txt', 'attacktype1', 'attacktype1_txt',
       'vicinity', 'doubtterr', 'property', 'INT_LOG', 'INT_IDEO', 'INT_MISC',
       'INT_ANY', 'extended', 'crit1', 'crit2', 'crit3', 'multiple', 'success',
       'suicide', 'individual', 'specificity', 'ishostkid', 'guncertain1',
       'city', 'target1', 'natlty1', 'natlty1_txt', 'latitude', 'longitude',
       'targsubtype1', 'targsubtype1_txt', 'nkill'],
      dtype='object')

In [258]:
import bokeh
from bokeh.io import output_file, show
from bokeh.layouts import layout
from bokeh.models import Toggle, BoxAnnotation, CustomJS
from bokeh.io import curdoc
from bokeh.io import output_notebook, show
from bokeh.plotting import figure, output_file, show, curdoc
from bokeh.models import Legend


p = figure(plot_width=1300, plot_height=500, x_axis_type="datetime")
norm = matplotlib.colors.Normalize(vmin=0, vmax=len(top_countries))

regions = {}
region_colors= {}
cmap = bokeh.palettes.Category20_20
items = []

for (i,c) in enumerate(top_countries.index):
    # create a new plot with a datetime axis type
    d = sub_data[sub_data.region_txt == c].groupby('year').sum()
    region_colors[c] = cmap[int(norm(i)*len(cmap))]
    
    items.append((c,[p.line(d.index, d['nkill'], alpha=0.5,
                           color = region_colors[c],
                           muted_color='grey',
                           muted_alpha=0.05, 
                            line_cap = 'butt',
                            line_width=5
          )]))  

    
# # We write coffeescript to link toggle with visible property of box and line
# code = '''\
# object.visible = toggle.active
# '''

#p.legend.location = "top_left"
legend = Legend(items = items, location = (0,100))
legend.click_policy='mute'
p.add_layout(legend, 'right')
p.title.text_font_size = '20pt'
#p.xaxis.axis_label = 'Year'

p.xaxis.axis_label_text_font_size= '12pt'
p.xaxis.major_label_text_font_size= '12pt'
p.yaxis.axis_label = 'Number of people killed'
p.title.text = 'Fatalities from Terrorist Attacks'
p.toolbar_location = 'above'

# toggles = []
# for r in regions.keys():
#     callback1 = CustomJS.from_coffeescript(code=code, args={})
#     toggle1 = Toggle(label = r, button_type = "success", callback= callback1)
#     callback1.args = {'toggle': toggle1, 'object': regions[r]}
#     toggles.append(toggle1)
curdoc().clear()
show(p)


## Predicting 1993