## Global Terrorism Attribution Predictor

Imports and field variables

In [1]:
from kaggle.api.kaggle_api_extended import KaggleApi
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
# Local directory the Kaggle archive is downloaded into and later read back from.
data_path = "./data"

Download Global Terrorism Database from Kaggle

In [2]:
# Authenticate with the Kaggle API, then fetch the GTD dataset files into data_path.
api = KaggleApi()
api.authenticate()
api.dataset_download_files(
    "START-UMD/gtd",
    path=data_path,
    quiet=False,
)

gtd.zip: Skipping, found more recently modified local copy (use --force to force download)


Read data into Pandas DataFrame

In [3]:
# Columns loaded from the GTD CSV; all other columns in the file are skipped.
usecols = [
    "iyear",
    "suicide",
    "imonth",
    "country",
    "region",
    "provstate",
    "city",
    "alternative",
    "attacktype1",
    "targtype1",
    "natlty1",
    "gname",
    "weaptype1",
    "ransom",
]

# Read the CSV directly out of the downloaded zip archive.
data = pd.read_csv(
    data_path + "/gtd.zip",
    usecols=usecols,
    compression='zip',
    encoding="ISO-8859-1",
)

Set column dtypes

In [4]:
# Note: missing values are neither dropped nor filled in this step.

# Columns converted straight to the pandas 'category' dtype.
category_columns = [
    'alternative',
    'suicide',
    'iyear',
    'imonth',
    'country',
    'region',
    'attacktype1',
    'targtype1',
    'natlty1',
    'weaptype1',
    'ransom',
]
# Columns converted to str first and only then to 'category'.
text_category_columns = ['provstate', 'city']

for column in category_columns:
    data[column] = data[column].astype('category')

for column in text_category_columns:
    data[column] = data[column].astype('str').astype('category')

Filter out unattributed attacks

In [5]:
# Keep only the rows whose perpetrator group name is something other than "Unknown".
is_attributed = data['gname'] != "Unknown"
attributed = data.loc[is_attributed]

Shuffle Data

In [6]:
# Shuffle all rows. The fixed seed keeps the shuffled order -- and therefore the
# positional train/test split performed further down -- identical after a
# kernel restart, so the reported accuracy is reproducible.
attributed = attributed.sample(frac=1, random_state=42)

Label-encode the data

In [7]:
# Object-dtype columns of the frame. Not used by the encoding below; kept so the
# name stays defined for any cell that may refer to it.
to_fit = attributed.select_dtypes(include=['object'])

# One LabelEncoder per column. Re-fitting a single shared encoder on every
# column leaves it holding only the classes of the last column processed,
# which makes it impossible to turn encoded 'gname' ids back into group names.
label_encoders = {name: preprocessing.LabelEncoder() for name in attributed.columns}

# Replace every column by its integer label encoding.
attributed = attributed.apply(
    lambda column: label_encoders[column.name].fit_transform(column)
)

# `le` is the encoder of the target column, so that
# le.inverse_transform(predictions) yields the original group names.
le = label_encoders['gname']

Make a copy of the `attributed` DataFrame that does not contain the dependent variable

In [8]:
# Feature matrix: every column of `attributed` except those whose name matches 'gname'.
gname_feature_columns = list(attributed.filter(regex='gname'))
independent_vars = attributed.drop(columns=gname_feature_columns)
independent_vars.shape

(98909, 13)

Make a DataFrame of the dependent variable (the organizations to which the attacks were attributed)

In [9]:
# Target frame: only the columns of `attributed` whose name matches 'gname'.
gname_target_columns = list(attributed.filter(regex='gname'))
dependent_vars = attributed.loc[:, gname_target_columns]
dependent_vars.shape

(98909, 1)

Split the data into training and test sets

In [10]:
# Features and targets must have the same number of rows, because both are cut
# at the same row position below.
independant_size, dependant_size = independent_vars.shape, dependent_vars.shape
if not independant_size[0] == dependant_size[0]:
    raise ValueError('Independent and Dependant DFs do not match')

# Row position of the cut: the first 70% of the rows are used for training,
# the remaining rows for testing.
split_size = int(0.7 * independant_size[0])

training_independant_vars, test_independant_vars = (
    independent_vars.iloc[:split_size],
    independent_vars.iloc[split_size:],
)
training_dependant_vars, test_dependant_vars = (
    dependent_vars.iloc[:split_size],
    dependent_vars.iloc[split_size:],
)

Implement Training model

In [18]:
# Random forest with a fixed seed so that the fitted trees, the predictions and
# the reported accuracy are the same on every Restart & Run All.
classifier = RandomForestClassifier(verbose=2, n_estimators=35, warm_start=True,
                                    random_state=42)
# The target is a one-column DataFrame; flatten it to the 1-D array fit() is given.
classifier.fit(training_independant_vars, training_dependant_vars.values.ravel())

# Score the held-out rows: fraction of test rows whose predicted label equals the true one.
predictions = classifier.predict(test_independant_vars)
total = accuracy_score(test_dependant_vars, predictions)
print("Accuracy is {}%".format(total * 100))


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 35


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.3s remaining:    0.0s


building tree 2 of 35
building tree 3 of 35
building tree 4 of 35
building tree 5 of 35
building tree 6 of 35
building tree 7 of 35
building tree 8 of 35
building tree 9 of 35
building tree 10 of 35
building tree 11 of 35
building tree 12 of 35
building tree 13 of 35
building tree 14 of 35
building tree 15 of 35
building tree 16 of 35
building tree 17 of 35
building tree 18 of 35
building tree 19 of 35
building tree 20 of 35
building tree 21 of 35
building tree 22 of 35
building tree 23 of 35
building tree 24 of 35
building tree 25 of 35
building tree 26 of 35
building tree 27 of 35
building tree 28 of 35
building tree 29 of 35
building tree 30 of 35
building tree 31 of 35
building tree 32 of 35
building tree 33 of 35
building tree 34 of 35
building tree 35 of 35


[Parallel(n_jobs=1)]: Done  35 out of  35 | elapsed:  1.6min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  35 out of  35 | elapsed:   29.8s finished


Accuracy is 76.13655511744683%


In [12]:
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """
    Draw a confusion matrix as an annotated heat map and show it.

    Arguments
    ---------
    cm:           confusion matrix from sklearn.metrics.confusion_matrix;
                  rows are plotted as true labels, columns as predicted labels

    target_names: given classification classes such as [0, 1, 2]
                  the class names, for example: ['high', 'medium', 'low'];
                  pass None to leave the default numeric ticks

    title:        the text to display at the top of the matrix

    cmap:         the gradient of the values displayed from matplotlib.pyplot.cm
                  see http://matplotlib.org/examples/color/colormaps_reference.html
                  plt.get_cmap('jet') or plt.cm.Blues
                  (defaults to 'Blues' when None)

    normalize:    If False, plot the raw numbers
                  If True, plot the proportions (each row divided by its sum;
                  a row that sums to zero is shown as all zeros)

    Returns
    -------
    None. The figure is displayed with plt.show().

    Usage
    -----
    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
                                                              # sklearn.metrics.confusion_matrix
                          normalize    = True,                # show proportions
                          target_names = y_labels_vals,       # list of names of the classes
                          title        = best_estimator_name) # title of graph

    Citation
    --------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    """
    import matplotlib.pyplot as plt
    import numpy as np
    import itertools

    cm = np.asarray(cm)

    # Accuracy is always computed from the raw counts, before any normalization.
    total = float(np.sum(cm))
    accuracy = np.trace(cm) / total if total else float('nan')
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    # Normalize BEFORE drawing, so that the heat-map colours, the text
    # annotations and the text-colour threshold all refer to the same values.
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis].astype('float')
        # A class with no true samples has an all-zero row; dividing by 1
        # keeps that row at 0 instead of producing NaN.
        row_totals[row_totals == 0] = 1.0
        cm = cm.astype('float') / row_totals

    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    # Cells above the threshold are dark, so their annotation is drawn in white.
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    cell_format = "{:0.4f}" if normalize else "{:,}"
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    # Lay out only after the axis labels exist so they are taken into account.
    plt.tight_layout()
    plt.show()

In [13]:
from sklearn.metrics import confusion_matrix

Compute the Confusion Matrix. (Currently this computes an unwieldy, very large CM. TODO: I believe I need to implement this as a One-vs-All CM.)

In [20]:
# Confusion matrix of the true test labels against the predicted labels
import numpy as np
np.set_printoptions(precision=2)  # numpy arrays are printed with 2 decimals from here on
cnf_matrix = confusion_matrix(y_true=test_dependant_vars, y_pred=predictions)
cnf_matrix.shape

(2128, 2128)

Plot the Confusion Matrix. (Currently, the CM is too large and will cause the kernel to hang.)

In [15]:
# Tick labels for the confusion-matrix plot
import itertools
# The matrix has one row/column per class label, not per feature column, so the
# tick labels must be the sorted set of labels that occur in the true or the
# predicted test values (the same labels, in the same order, that
# confusion_matrix uses when no explicit `labels` argument is given).
class_names = sorted(set(test_dependant_vars.values.ravel()) | set(predictions))
# Plotting is left disabled: the matrix is too large to draw.
# plot_confusion_matrix(cnf_matrix, class_names, normalize=False)


In [16]:
# plt.figure()
# plot_confusion_matrix(cnf_matrix, target_names=class_names, normalize=True,
#                       title='Normalized confusion matrix')