In [15]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

In [16]:
# Load the terrorism dataset and discard the 'Unnamed: 0' column
# (presumably an index saved by an earlier to_csv call -- TODO confirm).
# NOTE(review): this absolute path only resolves on the author's machine.
global_terrorism_model_df = pd.read_csv(
    '/Users/skylerwilson/Desktop/Lighthouse_Labs/Projects/midterm_project/data/global_terrorism.csv'
).drop(columns='Unnamed: 0')

# Show the remaining column names.
global_terrorism_model_df.columns

Index(['eventid', 'approxdate', 'extended', 'country', 'country_txt', 'region',
       'region_txt', 'provstate', 'city', 'latitude', 'longitude',
       'specificity', 'vicinity', 'crit1', 'crit2', 'crit3', 'doubtterr',
       'multiple', 'success', 'suicide', 'attacktype1', 'attacktype1_txt',
       'targtype1', 'targtype1_txt', 'targsubtype1', 'targsubtype1_txt',
       'corp1', 'target1', 'natlty1', 'natlty1_txt', 'gname', 'motive',
       'guncertain1', 'individual', 'nperps', 'nperpcap', 'weaptype1',
       'weaptype1_txt', 'weapsubtype1', 'weapsubtype1_txt', 'weapdetail',
       'nkill', 'nkillter', 'nwound', 'nwoundte', 'property', 'propextent',
       'propvalue', 'ransom', 'ransomamt', 'ransompaid'],
      dtype='object')

In [17]:
# For each column, report whether it contains at least one missing value.
global_terrorism_model_df.isna().any()

eventid             False
approxdate          False
extended            False
country             False
country_txt         False
region              False
region_txt          False
provstate            True
city                 True
latitude             True
longitude            True
specificity          True
vicinity            False
crit1               False
crit2               False
crit3               False
doubtterr            True
multiple             True
success             False
suicide             False
attacktype1         False
attacktype1_txt     False
targtype1           False
targtype1_txt       False
targsubtype1         True
targsubtype1_txt     True
corp1                True
target1              True
natlty1              True
natlty1_txt          True
gname               False
motive              False
guncertain1          True
individual          False
nperps               True
nperpcap             True
weaptype1           False
weaptype1_txt       False
weapsubtype1

<h3>Preliminary Model</h3>

<h5>Questions:</h5>

1) Understand whether there is a relationship between the target of a terrorist attack and the other columns in the data set.

2) Predict which targets are most likely to be hit, depending on location and other predictors.

3) Determine which predictors are most important in predicting the target type.

In [18]:
# Build the predictor matrix: remove the dependent variable (targtype1), its
# text label, the event identifier, and every non-numeric column.
# Selecting on the generic 'number' dtype keeps numeric columns of any width
# instead of only int64/float64.
non_numeric_columns = global_terrorism_model_df.select_dtypes(exclude='number').columns

# A label listed explicitly may also be non-numeric (presumably 'targtype1_txt'),
# so de-duplicate the drop list while preserving its order.
columns_to_drop = list(
    dict.fromkeys(['targtype1', 'targtype1_txt', 'eventid'] + non_numeric_columns.to_list())
)

# Drop the columns
X_subset = global_terrorism_model_df.drop(columns=columns_to_drop)

# Keep only the predictors that have no missing values.
X_subset = X_subset.dropna(axis=1)

In [19]:
# Helper that fits an OLS model on a train split and predicts the test split
def linear_regression_model(X, y, test_size=0.2, random_state=42):
    """Fit an OLS regression on a training split and predict the held-out split.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns, passed without an intercept column; the intercept
        is added inside this function.
    y : pandas.Series
        Response values, one per row of ``X``.
    test_size : float, default 0.2
        Fraction of rows held out for prediction.
    random_state : int, default 42
        Seed passed to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    results
        The fitted statsmodels results object (trained on the training rows).
    y_pred
        Predictions of the fitted model for the held-out rows.
    """
    # Add the intercept once, before splitting, so the train and test design
    # matrices always share the same columns. With the default
    # has_constant='skip', add_constant silently omits the intercept whenever
    # some column happens to be constant within a split, which makes the
    # train/test columns disagree and breaks predict().
    X_design = sm.add_constant(X, has_constant='add')

    # Split into training and testing data; the held-out response is not
    # needed here because only predictions are returned.
    X_train, X_test, y_train, _ = train_test_split(
        X_design, y, test_size=test_size, random_state=random_state
    )

    # Fit the model using statsmodels
    results = sm.OLS(y_train, X_train).fit()

    # Predict the response for the held-out rows
    y_pred = results.predict(X_test)

    return results, y_pred

In [20]:
# Define the predictors (X) and the response (y).
# NOTE(review): targtype1 looks like a categorical code (it has a matching
# targtype1_txt column); OLS treats it as a continuous value -- confirm this
# is intended for the preliminary model.
X = X_subset
y = global_terrorism_model_df['targtype1']

In [21]:
# Fit the preliminary model on all retained predictors and predict the
# held-out test split
model_results, y_pred = linear_regression_model(X, y)

<h5>Evaluate the preliminary model</h5>

In [22]:
# Print the model summary to evaluate the key coefficients and their p-values
print(model_results.summary())

                            OLS Regression Results                            
Dep. Variable:              targtype1   R-squared:                       0.082
Model:                            OLS   Adj. R-squared:                  0.082
Method:                 Least Squares   F-statistic:                     808.0
Date:                Wed, 12 Jul 2023   Prob (F-statistic):               0.00
Time:                        16:59:17   Log-Likelihood:            -4.7536e+05
No. Observations:              145316   AIC:                         9.507e+05
Df Residuals:                  145299   BIC:                         9.509e+05
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const          13.9158      0.275     50.640      

In [23]:
# Backward stepwise regression, step 1: remove the predictor that had the
# highest p-value in the previous summary, then refit on the reduced set.
X_reg_1 = X_subset.drop('attacktype1', axis=1)
model_results, y_pred = linear_regression_model(X_reg_1, y)

In [24]:
# Print the summary of the refitted model
print(model_results.summary())

                            OLS Regression Results                            
Dep. Variable:              targtype1   R-squared:                       0.082
Model:                            OLS   Adj. R-squared:                  0.082
Method:                 Least Squares   F-statistic:                     861.8
Date:                Wed, 12 Jul 2023   Prob (F-statistic):               0.00
Time:                        16:59:17   Log-Likelihood:            -4.7536e+05
No. Observations:              145316   AIC:                         9.507e+05
Df Residuals:                  145300   BIC:                         9.509e+05
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         13.9177      0.274     50.717      0.0

In [25]:
# Backward stepwise regression, step 2: remove the predictor that now has the
# highest p-value, then refit on the reduced set.
X_reg_2 = X_reg_1.drop('ransomamt', axis=1)
model_results, y_pred = linear_regression_model(X_reg_2, y)


In [26]:
# Print the summary of the model refitted after the second elimination step
print(model_results.summary())

                            OLS Regression Results                            
Dep. Variable:              targtype1   R-squared:                       0.082
Model:                            OLS   Adj. R-squared:                  0.082
Method:                 Least Squares   F-statistic:                     923.3
Date:                Wed, 12 Jul 2023   Prob (F-statistic):               0.00
Time:                        16:59:17   Log-Likelihood:            -4.7536e+05
No. Observations:              145316   AIC:                         9.507e+05
Df Residuals:                  145301   BIC:                         9.509e+05
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         13.9184      0.274     50.720      0.0