In [1]:
import os

try:
    from dotenv import load_dotenv
except ImportError:
    from pip._internal import main as pip
    pip(['install', 'python-dotenv'])
    from dotenv import load_dotenv

try:
    import psycopg2
except ImportError:
    from pip._internal import main as pip
    pip(['install', 'psycopg2'])
    import psycopg2

try:
    import plotly.graph_objects as go
except ImportError:
    from pip._internal import main as pip
    pip(['install', 'plotly'])
    import plotly.graph_objects as go

try:
    import scipy
except ImportError:
    from pip._internal import main as pip
    pip(['install', 'scipy'])
    import scipy
    
from scipy import stats
from scipy.stats import ttest_ind
import sqlalchemy
import pandas as pd
from pandas.plotting import scatter_matrix
import numpy as np
import sklearn
import plotly.express as px
import plotly.figure_factory as ff
import matplotlib.pyplot as plt


In [2]:
load_dotenv(dotenv_path='//.env')
db = os.getenv('DB')


In [3]:
dorne_sql = sqlalchemy.text("""SELECT sub_event_type,fatalities FROM acled 
WHERE sub_event_type = 'Air/drone strike'
AND notes ILIKE ALL (ARRAY['%drone%']);""")

air_sql = sqlalchemy.text("""SELECT sub_event_type,fatalities FROM acled 
WHERE sub_event_type = 'Air/drone strike'
AND notes NOT ILIKE ALL (ARRAY['%drone%']);""")

sql = sqlalchemy.text("""SELECT CASE
           WHEN sub_event_type = 'Air/drone strike' THEN 'Drone Strike'
       END type_of_strike,
       acled.*,
       ST_Distance(acled.geom::geography,cities.geom::geography) AS distance_to_nearest_city,
       cities.name AS nearest_city,
       cities.gn_pop
FROM acled
CROSS JOIN LATERAL
    (SELECT name,
            geom,
            cities.gn_pop
     FROM cities
     WHERE gn_pop > 0
     ORDER BY cities.geom <-> acled.geom
     LIMIT 1) cities
WHERE country IN ('Afghanistan', 'Syria', 'Iraq', 'Yemen', 'Pakistan', 'Mali',
       'Turkey', 'Ukraine', 'Azerbaijan', 'Saudi Arabia', 'Palestine',
       'Armenia', 'Libya', 'Burkina Faso', 'Somalia', 'Egypt', 'Israel',
       'Lebanon', 'Venezuela', 'United Arab Emirates', 'Nigeria',
       'South Sudan')
	   AND year > 2000
    AND year < 2021
    AND sub_event_type = 'Air/drone strike'
    AND notes LIKE '%drone%'
UNION
SELECT CASE
           WHEN sub_event_type = 'Air/drone strike' THEN 'Air Strike'
       END type_of_strike,
       acled.*,
       ST_Distance(acled.geom::geography,cities.geom::geography) AS distance_to_nearest_city,
       cities.name AS nearest_city,
       cities.gn_pop
FROM acled
CROSS JOIN LATERAL
    (SELECT name,
            geom,
            cities.gn_pop
     FROM cities
     WHERE gn_pop > 0
     ORDER BY cities.geom <-> acled.geom
     LIMIT 1) cities
WHERE country IN ('Afghanistan', 'Syria', 'Iraq', 'Yemen', 'Pakistan', 'Mali',
       'Turkey', 'Ukraine', 'Azerbaijan', 'Saudi Arabia', 'Palestine',
       'Armenia', 'Libya', 'Burkina Faso', 'Somalia', 'Egypt', 'Israel',
       'Lebanon', 'Venezuela', 'United Arab Emirates', 'Nigeria',
       'South Sudan') 
	AND year > 2000
    AND year < 2021
    AND sub_event_type = 'Air/drone strike'
    AND notes NOT ILIKE ALL (ARRAY['%drone%'])
ORDER BY event_date;""")


In [4]:
drone_df = pd.read_sql_query(dorne_sql, db)

air_df = pd.read_sql_query(air_sql, db)

df = pd.read_sql_query(sql,db)

In [5]:
ttest_ind(df.query('type_of_strike == "Air Strike"')['fatalities'], df.query('type_of_strike == "Drone Strike"')['fatalities'])

Ttest_indResult(statistic=-9.84635079836412, pvalue=7.404517241451441e-23)

from scyipy import stats

In [36]:
air_df.replace('Air/drone strike','Air Strike',inplace=True)

air_df.head()

Unnamed: 0,sub_event_type,fatalities
0,Air Strike,0
1,Air Strike,4
2,Air Strike,0
3,Air Strike,0
4,Air Strike,0


In [40]:
drone_df.replace('Air/drone strike', 'Drone Strike', inplace=True)
drone_df.head()

Unnamed: 0,sub_event_type,fatalities
0,Drone Strike,10
1,Drone Strike,0
2,Drone Strike,4
3,Drone Strike,2
4,Drone Strike,1


In [42]:
df = pd.concat([air_df,drone_df])
df.head()

Unnamed: 0,sub_event_type,fatalities
0,Air Strike,0
1,Air Strike,4
2,Air Strike,0
3,Air Strike,0
4,Air Strike,0


In [37]:

stats.stats.normaltest(air_df['fatalities'])


NormaltestResult(statistic=109465.91578645803, pvalue=0.0)

In [38]:
stats.stats.normaltest(drone_df['fatalities'])


NormaltestResult(statistic=3049.8184164829745, pvalue=0.0)

In [9]:
air_means = []
drone_means = []

for __ in range(1000):
    boot_sample = df.query('type_of_strike == "Drone Strike"').sample(len(df.query('type_of_strike == "Drone Strike"')), replace=True)
    drone_means.append(boot_sample.fatalities.mean())

for __ in range(1000):
    boot_sample = df.query('type_of_strike == "Air Strike"').sample(len(df.query('type_of_strike == "Air Strike"')), replace=True)
    air_means.append(boot_sample.fatalities.mean())


In [10]:
fig = px.histogram(x=[air_means,drone_means],
                   marginal="box")
fig.show()


In [11]:
hist_data = [air_means,drone_means]

group_labels = ['Air Strike', 'Drone Strike']

# Create distplot with custom bin_size
fig = ff.create_distplot(hist_data, group_labels, bin_size=.02)
fig.show()


In [12]:
ttest_ind(air_means, drone_means)

Ttest_indResult(statistic=-1062.2128729367682, pvalue=0.0)

In [None]:
df.query('type_of_strike == "Drone Strike"').info()