In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

from sklearn.calibration import CalibratedClassifierCV

import lightgbm as lgb

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import precision_recall_curve

from skopt import BayesSearchCV

from scipy.stats import randint

import re

from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from scipy.sparse import csr_matrix
from scipy.sparse import hstack

import dash
from dash import dcc, html, Input, Output
import plotly.express as px

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\casti\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\casti\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\casti\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
pd.set_option('display.max_columns', None)

In [3]:
X_tr_vec = sp.sparse.load_npz(r'C:\Users\casti\OneDrive\Documents\A Springboard\Capstone 2 - Global Terrorism\data\interim\X_tr_vec.npz')
X_te_vec = sp.sparse.load_npz(r'C:\Users\casti\OneDrive\Documents\A Springboard\Capstone 2 - Global Terrorism\data\interim\X_te_vec.npz')
y_tr = pd.read_csv(r'C:\Users\casti\OneDrive\Documents\A Springboard\Capstone 2 - Global Terrorism\data\interim\y_tr.csv')
y_te = pd.read_csv(r'C:\Users\casti\OneDrive\Documents\A Springboard\Capstone 2 - Global Terrorism\data\interim\y_te.csv')
y_tr = y_tr.drop(columns = ['Unnamed: 0']).values.ravel()
y_te = y_te.drop(columns = ['Unnamed: 0']).values.ravel()

df1 = pd.read_csv(r'C:\Users\casti\OneDrive\Documents\A Springboard\Capstone 2 - Global Terrorism\data\interim\dfall.csv').drop(columns=['Unnamed: 0'])
df2 = pd.read_csv(r'C:\Users\casti\OneDrive\Documents\A Springboard\Capstone 2 - Global Terrorism\data\interim\dfpr.csv').drop(columns=['Unnamed: 0'])


In [4]:
X = df1.drop(columns=['success','weap_Unknown'])
y = df1[['success']]
print(X.columns)
print(y)

Index(['motive', 'is_weekend', 'targ_Business', 'targ_Educational Institution',
       'targ_Government (Diplomatic)', 'targ_Government (General)',
       'targ_Journalists & Media', 'targ_Military', 'targ_Other',
       'targ_Police', 'targ_Private Citizens & Property',
       'targ_Religious Figures/Institutions', 'targ_Telecommunication',
       'targ_Terrorists/Non-State Militia', 'targ_Transportation',
       'targ_Utilities', 'targ_Violent Political Party', 'att_Armed Assault',
       'att_Assassination', 'att_Bombing/Explosion',
       'att_Facility/Infrastructure Attack', 'att_Hostage Taking', 'att_Other',
       'weap_Biological', 'weap_Chemical', 'weap_Explosives',
       'weap_Fake Weapons', 'weap_Firearms', 'weap_Incendiary', 'weap_Melee',
       'weap_Other', 'weap_Radiological', 'weap_Sabotage Equipment',
       'weap_Vehicle', 'fatal_enc', 'wound_enc'],
      dtype='object')
       success
0            1
1            1
2            1
3            0
4            1
...    

In [5]:
X['tok'] = X['motive'].apply(word_tokenize)
wnl = WordNetLemmatizer()
def lem_tokens(tokens) :
    lem = [wnl.lemmatize(token) for token in tokens]
    return ' '.join(lem)
    

X['lem'] = X.tok.apply(lem_tokens)
X['lem']
tv = TfidfVectorizer(ngram_range = (1,2), stop_words='english')
tv.fit(X['lem'])
motive_vec = tv.transform(X['lem'])
X_num = X.drop(columns = ['motive','tok','lem']).astype(int)

X_sparse = csr_matrix(X_num.values)

# Concatenate sparse matrix (motive_vec) horizontally with DataFrame (X_tr)
X_vec = hstack([X_sparse, motive_vec])

In [6]:
X_vec.shape

(23764, 100554)

In [7]:
df1.drop(columns=['motive','success','weap_Unknown'], inplace=True)

In [8]:
df1.columns

Index(['is_weekend', 'targ_Business', 'targ_Educational Institution',
       'targ_Government (Diplomatic)', 'targ_Government (General)',
       'targ_Journalists & Media', 'targ_Military', 'targ_Other',
       'targ_Police', 'targ_Private Citizens & Property',
       'targ_Religious Figures/Institutions', 'targ_Telecommunication',
       'targ_Terrorists/Non-State Militia', 'targ_Transportation',
       'targ_Utilities', 'targ_Violent Political Party', 'att_Armed Assault',
       'att_Assassination', 'att_Bombing/Explosion',
       'att_Facility/Infrastructure Attack', 'att_Hostage Taking', 'att_Other',
       'weap_Biological', 'weap_Chemical', 'weap_Explosives',
       'weap_Fake Weapons', 'weap_Firearms', 'weap_Incendiary', 'weap_Melee',
       'weap_Other', 'weap_Radiological', 'weap_Sabotage Equipment',
       'weap_Vehicle', 'fatal_enc', 'wound_enc'],
      dtype='object')

In [9]:
def model():
    
    #takes whatever they input and creates a 'motive' feature
    #creates X_te_vec for custom motive and all combinations taken from the gtd data.
    #makes predictions of each X_te_vec and assigns labels and probabilities
    #outputs interactive dashboard of highest probable successes and associated features

    motive = input('Enter a potential suspect or affiliation and motive in your own words: ')

    mot_arr = np.array([motive for i in range(len(df1))])
    
    feats = pd.concat([df1, pd.Series(mot_arr)], axis=1).rename(columns={0:'motive'})
    
    print('Tokenizing...')
    feats['tok'] = feats['motive'].apply(lambda x: word_tokenize(x))    
    
    print('Lemmatizing...')
    wnl = WordNetLemmatizer()
    
    def lem_tokens(tokens) :
        lem = [wnl.lemmatize(token) for token in tokens]
        return ' '.join(lem)

    feats['lem'] = feats.tok.apply(lem_tokens)
    
    print('Vectorizing...')

    motive_vec = tv.transform(feats['lem'])
    
    print('Converting to sparse matrix...')
    feats_num = feats.drop(columns = ['motive','tok','lem']).astype(int)

    feats_sparse = csr_matrix(feats_num.values)

    feats_vec = hstack([feats_sparse, motive_vec])
    
    print('Initializing LGBM...')
    lgbm = lgb.LGBMClassifier(objective='binary',class_weight= {0: 1.0, 1: 1.0}, n_estimators= 76)
    
    print('Fitting LGBM...')
    lgbm.fit(X_vec, y.values.ravel())
    
    print('\n')
    print('Making Predictions...')
    preds = lgbm.predict(feats_vec)
    probs = lgbm.predict_proba(feats_vec)
    
    df_pr = df2
    df_pr['prob_success'] = probs[:,1]
    df_sort = df_pr.sort_values(by='prob_success', ascending=False)
    
    df_out = df_sort[['prob_success','targtype1_txt','attacktype1_txt','weaptype1_txt', 'fatalities_cat','wound_cat','is_weekend']]

    print('Creating interactive dashboard...')
    
    
    print('Results ready for analysis!')
    print('\n')
    
    print('Motive under investigation:')
    print('\n')
    print(f'{motive}')
    
    threshold1 = 300
    threshold2 = 400
    max_thr = 8000
    
    
    app = dash.Dash(__name__)

    # Define the layout of the dashboard
    app.layout = html.Div([
    html.Div([
        dcc.Slider(
            id='threshold1-slider',
            min=0,
            max=max_thr,
            step=100,
            value=300,
            marks={i: str(i) if i % 500 == 0 else '' for i in range(0, max_thr + 1, 1000)},
            tooltip={'placement': 'bottom'}
        ),
        html.P("Target Type Value Counts Threshold"),
    ]),
    html.Div([
        dcc.Slider(
            id='threshold2-slider',
            min=0,
            max=max_thr,
            step=100,
            value=400,
            marks={i: str(i) if i % 500 == 0 else '' for i in range(0, max_thr + 1, 1000)},
            tooltip={'placement': 'bottom'}
        ),
        html.P("Attack Type Value Counts Threshold"),
    ]),
    html.Div([
        dcc.Slider(
            id='threshold3-slider',
            min=0,
            max=1,
            step=0.01,
            value=0.98,
            marks={i/10: str(i/10) if i % 2 == 0 else '' for i in range(0, 11)},
            tooltip={'placement': 'bottom'}
        ),
        html.P("Probability of Success Threshold"),
    ]),
    dcc.Graph(id='boxplot')
    ])

    # Define callback to update the boxplot based on the threshold slider values
    @app.callback(
        Output('boxplot', 'figure'),
        [
            Input('threshold1-slider', 'value'),
            Input('threshold2-slider', 'value'),
            Input('threshold3-slider', 'value')
        ]
    )
    def update_boxplot(threshold1, threshold2, threshold3):
        # Filter the data based on the thresholds
        
        valid_rows = df_out[df_out['prob_success'] > threshold3].copy()
        counts1 = valid_rows['targtype1_txt'].value_counts()
        counts2 = valid_rows['attacktype1_txt'].value_counts()
        valid_categories1 = counts1[(counts1 >= threshold1) & (counts1 <= max_thr)].index
        valid_categories2 = counts2[(counts2 >= threshold2) & (counts2 <= max_thr)].index

        df_filtered = valid_rows[
            (df_out['targtype1_txt'].isin(valid_categories1)) &
            (df_out['attacktype1_txt'].isin(valid_categories2))
        ].sort_values(by='prob_success', ascending=False)

        # Create the boxplot using Plotly Express
        fig = px.box(df_filtered, x='prob_success', y='targtype1_txt', color='attacktype1_txt')
        fig.update_layout(
            title='Probability of Success by Target Type and Attack Type',
            xaxis_title='Probability of Success',
            yaxis_title='Target Type',
            legend_title='Attack Type',
            height=800
        )

        return fig
            
    # Run the app
    if __name__ == '__main__':
        app.run_server(debug=True)
        

Sample Motives:

Animal rights extremists protest university administration's decision to increase funding for animal research. Protesters are members of the "Rights for Mice" group.  

Communist leader took power via military coup. Protesters are planning to overthrow current regime and establish a republic with an elected majority leader. 

Political Extremist student organization at University of Tennessee is protesting recent educational bill that hinders the development of government-funded charter schools

In [10]:
model()

Enter a potential suspect or affiliation and motive in your own words: Tensions are high between Uganda natives and rebel armed forces in the neighboring democratic republic of the congo. Intel reveals that some rebel forces have crossed into Ugandan territory. Rebel forces have a history of aligning with religious cults such as The Lord's Resistance Army or derivatives of the Islamic State. Previous targets include government buildings, schools, and public officials.
Tokenizing...
Lemmatizing...
Vectorizing...
Converting to sparse matrix...
Initializing LGBM...
Fitting LGBM...
[LightGBM] [Info] Number of positive: 21705, number of negative: 2059
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.373266 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83407
[LightGBM] [Info] Number of data points in the train set: 23764, number of used feat


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.

