In [0]:
df = spark.table("bigdata_finals.default.clean_terror")
df.show(5)
df.printSchema()


+------------+-----+------+----+-------+------------------+------+--------------------+---------+-------------+---------+----------+-----------+--------------------+--------------------+-----+------+-------------+--------------------+-------+-------+-------------+----------+------+----------------+
|     eventid|iyear|imonth|iday|country|       country_txt|region|          region_txt|provstate|         city| latitude| longitude|attacktype1|     attacktype1_txt|               gname|nkill|nwound|weaptype1_txt|       targtype1_txt|success|suicide|incident_date|casualties|decade|   high_casualty|
+------------+-----+------+----+-------+------------------+------+--------------------+---------+-------------+---------+----------+-----------+--------------------+--------------------+-----+------+-------------+--------------------+-------+-------+-------------+----------+------+----------------+
|197000000001| 1970|     7|   2|     58|Dominican Republic|     2|Central America &...|     NULL|San

In [0]:
# Basic Stats
df.describe(["nkill", "nwound", "casualties"]).show()

+-------+------------------+------------------+-----------------+
|summary|             nkill|            nwound|       casualties|
+-------+------------------+------------------+-----------------+
|  count|            181691|            181691|           181691|
|   mean|2.2668596683380025|2.8832963658078827|5.150156034145885|
| stddev| 11.22705708247857|34.309747287962395|40.55541615668081|
|    min|               0.0|               0.0|              0.0|
|    max|            1570.0|            8191.0|           9574.0|
+-------+------------------+------------------+-----------------+



In [0]:
# Attack per Year
from pyspark.sql.functions import count

df_attacks_year = (
    df.groupBy("iyear")
      .agg(count("*").alias("num_attacks"))
      .orderBy("iyear")
)
df_attacks_year.show()


+-----+-----------+
|iyear|num_attacks|
+-----+-----------+
| 1970|        651|
| 1971|        471|
| 1972|        568|
| 1973|        473|
| 1974|        581|
| 1975|        740|
| 1976|        923|
| 1977|       1319|
| 1978|       1526|
| 1979|       2662|
| 1980|       2662|
| 1981|       2586|
| 1982|       2544|
| 1983|       2870|
| 1984|       3495|
| 1985|       2915|
| 1986|       2860|
| 1987|       3183|
| 1988|       3721|
| 1989|       4324|
+-----+-----------+
only showing top 20 rows


In [0]:
# Casualities by Region
from pyspark.sql.functions import sum as spark_sum

df_casualties_region = (
    df.groupBy("region_txt")
      .agg(
          spark_sum("casualties").alias("total_casualties"),
          spark_sum("nkill").alias("total_killed"),
          spark_sum("nwound").alias("total_wounded")
      )
      .orderBy("total_casualties", ascending=False)
)
df_casualties_region.show()


+--------------------+----------------+------------+-------------+
|          region_txt|total_casualties|total_killed|total_wounded|
+--------------------+----------------+------------+-------------+
|Middle East & Nor...|        351950.0|    137642.0|     214308.0|
|          South Asia|        242679.0|    101319.0|     141360.0|
|  Sub-Saharan Africa|        131243.0|     78386.0|      52857.0|
|       South America|         45553.0|     28849.0|      16704.0|
|      Southeast Asia|         41896.0|     15637.0|      26259.0|
|Central America &...|         37699.0|     28708.0|       8991.0|
|       North America|         26447.0|      4916.0|      21531.0|
|      Western Europe|         25026.0|      6694.0|      18332.0|
|      Eastern Europe|         19460.0|      7415.0|      12045.0|
|           East Asia|         10365.0|      1152.0|       9213.0|
|        Central Asia|          3009.0|      1000.0|       2009.0|
|Australasia & Oce...|           410.0|       150.0|        26

In [0]:
# Top 10 countries by casualities
from pyspark.sql.functions import col
df_top_countries = (
    df.groupBy("country_txt")
      .agg(spark_sum("casualties").alias("total_casualties"))
      .orderBy(col("total_casualties").desc())
      .limit(10)
)
df_top_countries.show()


+-------------+----------------+
|  country_txt|total_casualties|
+-------------+----------------+
|         Iraq|        213279.0|
|  Afghanistan|         83661.0|
|     Pakistan|         65860.0|
|        India|         48321.0|
|      Nigeria|         32921.0|
|    Sri Lanka|         31091.0|
|        Syria|         29338.0|
|     Colombia|         25026.0|
|United States|         24473.0|
|  Philippines|         22926.0|
+-------------+----------------+



In [0]:
# Attack type VS Suicide
from pyspark.sql.functions import avg

df_suicide_attacktype = (
    df.groupBy("attacktype1_txt")
      .agg(
          avg("suicide").alias("suicide_rate"),
          avg("casualties").alias("avg_casualties")
      )
      .orderBy(col("suicide_rate").desc())
)
df_suicide_attacktype.show()



+--------------------+--------------------+------------------+
|     attacktype1_txt|        suicide_rate|    avg_casualties|
+--------------------+--------------------+------------------+
|   Bombing/Explosion|  0.0704096085207637| 6.005404792929579|
|Hostage Taking (B...|0.054490413723511606| 8.520686175580222|
|           Hijacking| 0.01669195751138088|31.440060698027313|
|       Assassination|0.013566694283347142| 2.009475973487987|
|Hostage Taking (K...|0.003405628248790...| 2.749327836529844|
|     Unarmed Assault|0.001970443349753...|14.686699507389163|
|       Armed Assault|0.001148374698258689| 5.569921957392955|
|             Unknown|2.748763056624519E-4|  6.47416162726773|
|Facility/Infrastr...|9.656237929702588E-5|0.7152375434530707|
+--------------------+--------------------+------------------+



In [0]:
%sql
SELECT
  iyear,
  region_txt,
  COUNT(*) AS num_attacks
FROM bigdata_finals.default.clean_terror
GROUP BY iyear, region_txt
ORDER BY iyear, num_attacks DESC;

iyear,region_txt,num_attacks
1970,North America,472
1970,South America,65
1970,Western Europe,50
1970,Middle East & North Africa,28
1970,Eastern Europe,12
1970,Southeast Asia,10
1970,Central America & Caribbean,7
1970,Sub-Saharan Africa,3
1970,East Asia,2
1970,South Asia,1


In [0]:
%sql
SELECT
  weaptype1_txt,
  SUM(nkill + nwound) AS total_casualties
FROM bigdata_finals.default.clean_terror
GROUP BY weaptype1_txt
ORDER BY total_casualties DESC;



weaptype1_txt,total_casualties
Explosives,560181.0
Firearms,252113.0
Unknown,60195.0
"Vehicle (not to include vehicle-borne explosives, i.e., car or truck bombs)",20305.0
Melee,16070.0
Chemical,14449.0
Incendiary,11006.0
Biological,814.0
Sabotage Equipment,369.0
Other,228.0


In [0]:
%sql
SELECT
  targtype1_txt,
  SUM(nkill + nwound) AS total_casualties
FROM bigdata_finals.default.clean_terror
GROUP BY targtype1_txt
ORDER BY total_casualties DESC;


targtype1_txt,total_casualties
Private Citizens & Property,319176.0
Military,177085.0
Police,118407.0
Business,78018.0
Government (General),67255.0
Transportation,54595.0
Religious Figures/Institutions,37890.0
Terrorists/Non-State Militia,17311.0
Educational Institution,13972.0
Government (Diplomatic),13398.0
