In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

from sklearn.calibration import CalibratedClassifierCV

import lightgbm as lgb

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import precision_recall_curve

from skopt import BayesSearchCV

from scipy.stats import randint

import re

from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from scipy.sparse import csr_matrix
from scipy.sparse import hstack

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\casti\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\casti\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\casti\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
pd.set_option('display.max_columns', None)

In [3]:
X_tr_vec = sp.sparse.load_npz(r'C:\Users\casti\OneDrive\Documents\A Springboard\Capstone 2 - Global Terrorism\data\interim\X_tr_vec.npz')
X_te_vec = sp.sparse.load_npz(r'C:\Users\casti\OneDrive\Documents\A Springboard\Capstone 2 - Global Terrorism\data\interim\X_te_vec.npz')
y_tr = pd.read_csv(r'C:\Users\casti\OneDrive\Documents\A Springboard\Capstone 2 - Global Terrorism\data\interim\y_tr.csv')
y_te = pd.read_csv(r'C:\Users\casti\OneDrive\Documents\A Springboard\Capstone 2 - Global Terrorism\data\interim\y_te.csv')
y_tr = y_tr.drop(columns = ['Unnamed: 0']).values.ravel()
y_te = y_te.drop(columns = ['Unnamed: 0']).values.ravel()

df = pd.read_csv(r'C:\Users\casti\OneDrive\Documents\A Springboard\Capstone 2 - Global Terrorism\data\interim\dfall.csv').drop(columns=['Unnamed: 0'])

In [4]:
X = df.drop(columns='success')
y = df[['success']]
print(X.columns)
print(y)

Index(['motive', 'is_weekend', 'targ_Business', 'targ_Educational Institution',
       'targ_Government (Diplomatic)', 'targ_Government (General)',
       'targ_Journalists & Media', 'targ_Military', 'targ_Other',
       'targ_Police', 'targ_Private Citizens & Property',
       'targ_Religious Figures/Institutions', 'targ_Telecommunication',
       'targ_Terrorists/Non-State Militia', 'targ_Transportation',
       'targ_Utilities', 'targ_Violent Political Party', 'att_Armed Assault',
       'att_Assassination', 'att_Bombing/Explosion',
       'att_Facility/Infrastructure Attack', 'att_Hostage Taking', 'att_Other',
       'weap_Biological', 'weap_Chemical', 'weap_Explosives',
       'weap_Fake Weapons', 'weap_Firearms', 'weap_Incendiary', 'weap_Melee',
       'weap_Other', 'weap_Radiological', 'weap_Sabotage Equipment',
       'weap_Unknown', 'weap_Vehicle', 'fatal_enc', 'wound_enc'],
      dtype='object')
       success
0            1
1            1
2            1
3            0
4     

In [5]:
X['tok'] = X['motive'].apply(word_tokenize)
wnl = WordNetLemmatizer()
def lem_tokens(tokens) :
    lem = [wnl.lemmatize(token) for token in tokens]
    return ' '.join(lem)
    

X['lem'] = X.tok.apply(lem_tokens)
X['lem']
tv = TfidfVectorizer(ngram_range=(2,3), stop_words='english')
tv.fit(X['lem'])
motive_vec = tv.transform(X['lem'])
X_num = X.drop(columns = ['motive','tok','lem']).astype(int)

X_sparse = csr_matrix(X_num.values)

# Concatenate sparse matrix (motive_vec) horizontally with DataFrame (X_tr)
X_vec = hstack([X_sparse, motive_vec])

In [6]:
X_vec.shape

(23764, 203335)

In [62]:
# def make_unique_df(df, category):
#     cat_list = list(df.columns[df.columns.str.contains(f'{category}')])
#     val1 = list(False for i in range(len(cat_list)))
#     dfn = pd.DataFrame({k:v for k,v in zip(cat_list, val1)}, index=df.index).loc[range(len(cat_list)),:]
#     for i, col in enumerate(cat_list):
#         dfn[col][i] = True
#     return dfn

In [7]:
df.drop(columns=['success','motive'], inplace=True)

In [10]:
def model():
    
    #takes whatever they input and creates a 'motive' feature
    #creates X_te_vec for custom motive and all combinations taken from the gtd data.
    #makes predictions of each X_te_vec and assigns labels and probabilities
    #outputs 5 highest probable successes and associated features

    motive = input('Enter a potential suspect or affiliation and motive in your own words.')
     
    mot_arr = np.array([motive for i in range(len(df))])
    
    feats = pd.concat([df, pd.Series(mot_arr)], axis=1).rename(columns={0:'motive'})
    
    print('Tokenizing...')
    feats['tok'] = feats['motive'].apply(lambda x: word_tokenize(x))    
    
    print('Lemmatizing...')
    wnl = WordNetLemmatizer()
    
    def lem_tokens(tokens) :
        lem = [wnl.lemmatize(token) for token in tokens]
        return ' '.join(lem)

    feats['lem'] = feats.tok.apply(lem_tokens)
    
    print('Vectorizing...')

    motive_vec = tv.transform(feats['lem'])
    
    print('Converting to sparse matrix...')
    feats_num = feats.drop(columns = ['motive','tok','lem']).astype(int)

    feats_sparse = csr_matrix(feats_num.values)

    feats_vec = hstack([feats_sparse, motive_vec])
    
    print('Initializing LGBM...')
    lgbm = lgb.LGBMClassifier(objective='binary',class_weight= {0: 1.0, 1: 1.0}, n_estimators= 76)
    
    print('Fitting LGBM...')
    lgbm.fit(X_vec, y.values.ravel())
    
    print('Making Predictions...')
    preds = lgbm.predict(feats_vec)
    probs = lgbm.predict_proba(feats_vec)
    
    df_pr = feats.drop(columns=['tok','lem'])
    df_pr['prob_success'] = probs[:,1]
    df_sort = df_pr.sort_values(by='prob_success', ascending=False)
    
    df_out = pd.DataFrame()
    for col in df_sort:
        if df_sort[col].dtype == bool and df_sort[col].any():
            df_out[col] = df_sort[col]
        elif df_sort[col].dtype in ['float64', 'int64']:
            df_out[col] = df_sort[col]
    
    return(df_out)
    
    

In [11]:
model()

Enter a potential suspect or affiliation and motive in your own words.University students protested the university funding initiative to increase budget for animal research. Animal rights activists partnered with local militants to rally against university administration.
Tokenizing...
Lemmatizing...
Vectorizing...
Converting to sparse matrix...
Initializing LGBM...
Fitting LGBM...
[LightGBM] [Info] Number of positive: 21705, number of negative: 2059
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.237997 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 53326
[LightGBM] [Info] Number of data points in the train set: 23764, number of used features: 2877
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.913356 -> initscore=2.355322
[LightGBM] [Info] Start training from score 2.355322
Making Predictions...


Unnamed: 0,is_weekend,targ_Business,targ_Educational Institution,targ_Government (Diplomatic),targ_Government (General),targ_Journalists & Media,targ_Military,targ_Other,targ_Police,targ_Private Citizens & Property,targ_Religious Figures/Institutions,targ_Telecommunication,targ_Terrorists/Non-State Militia,targ_Transportation,targ_Utilities,targ_Violent Political Party,att_Armed Assault,att_Assassination,att_Bombing/Explosion,att_Facility/Infrastructure Attack,att_Hostage Taking,att_Other,weap_Biological,weap_Chemical,weap_Explosives,weap_Fake Weapons,weap_Firearms,weap_Incendiary,weap_Melee,weap_Other,weap_Radiological,weap_Sabotage Equipment,weap_Unknown,weap_Vehicle,fatal_enc,wound_enc,prob_success
1979,0,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,1.0,3.0,0.996187
3631,0,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,1.0,3.0,0.996187
15550,1,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,2.0,3.0,0.995672
15551,1,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,2.0,3.0,0.995672
8846,1,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,2.0,4.0,0.995653
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17901,0,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,0.0,0.0,0.011660
23047,0,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,0.0,0.0,0.011660
2236,1,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,0.0,1.0,0.011215
14061,0,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,0.0,1.0,0.010615


In [12]:
df_out

NameError: name 'df_out' is not defined