
# Step  #1: Crawling 
## Download information on terrorist incidents from around the world

## About this step
In this step,  I collected the data using the tools: scraping and crawling.

The information comes from the  [GTD web site](https://www.imdb.com/chart/top/).<br/>

### import modules (packages)

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
import time

import requests
import json


In [2]:
"""
    The function gets url of html and return soup object of this url.


    :param html_file_name:       String of url
    :type file_path:        str 

    :return:                soup object of this url
    :rtype:                 soup object 
"""
def load_soup_object(html_file_name):
    url = html_file_name
    html = requests.get(url)
    soup = BeautifulSoup(html.content, "html.parser")
    return soup


In [3]:
"""
    The function gets url of html and return the number of this page.
    for example:
    the url: "https://www.start.umd.edu/gtd/search/Results.aspx?expanded=no&casualties_type=&casualties_max=&success=yes&ob=GTDID&od=desc&page=10&count=1000#results-table"
                                                                                                                                                                                                                                                          ^                              
                                                                                                                                                                                                                                          The number of this page is 10           
    :param url:              String of url
    :type file_path:        str 

    :return:                     The number of this urt
    :rtype:                       int 
"""
def number_of_url(url):
    num_page_index_in_url = url.find("page=")
    index = num_page_index_in_url
    number = ""
    i = 0
    while (i < (len(url)-num_page_index_in_url)):
        if (url[index+5+i] == "&"):
            break
        i = i +1

    for i in range(i):
        number = number + url[index+5+i]
    
    return int(number)

In [4]:
"""
    The function gets url of html and return list of data.
    Details of the list:
    [
         Region,
         PROVINCE/ADMINISTRATIVE REGION/U.S. STATE,
         LOCATION DETAILS,
    
          ###Attack Information###
          Type of Attack,
          Successful Attack?,
          
          ###Target Information###
          Name of Entity,
          Specific Description,
          Nationality of Target,
          
          ###Additional Information###
          Hostages,
          Ransom,
          Property Damage,
         
          ###Weapon Information###
          Type of Weapon,
          Sub-type of Weapon,
          
          ###Additional Information###
          Suicide Attack?,
    
          ###Perpetrator Group Information###
          Name of Perpetrator Group,
          Sub name of Perpetrator Group,
          Claimed Responsibility,
          
          ###Perpetrator Statistics###
          Number of Perpetrators,
          
          ###Casualty Information###
          Total Number of Fatalities,
          Number of Perpetrator Fatalities,
          Total Number of Injured
    
    ]
    
    :param url:              String of url
    :type file_path:        str 

    :return:                     List of data
    :rtype:                       list 
"""

def more_data(url_address):
    Region = "Unknown"
    Province = "Unknown"
    Location = "Unknown"
              ###Attack Information###
    Type_of_Attack = "Unknown"
    Successful_Attack = "Unknown"
          
          ###Target Information###
    Name_of_Entity = "Unknown"
    Specific_Description = "Unknown"
    Nationality_of_Target = "Unknown"
          
          ###Additional Information###
    Hostages = "Unknown"
    Ransom = "Unknown"
    Property_Damage = "Unknown"
          
          ###Weapon Information###
    Type_of_Weapon = "Unknown"
    Type_of_sub_type = "Unknown"
    if_title_Weapon = False    
          ###Additional Information###
    Suicide_Attack = "Unknown"

          ###Perpetrator Group Information###
    Group_Name = "Unknown"
    Group_Sub_name = "Unknown"
    Claimed_Responsibility = "Unknown"
    if_title_Perpetrator_Group_2 = False
    if_title_Perpetrator_Group_3 = False
    
          ###Perpetrator Statistics###
    Number_of_Perpetrators = "Unknown"
          
          ###Casualty Information###
    Total_Number_of_Fatalities = "Unknown"
    Number_of_Perpetrator_Fatalities = "Unknown"
    Total_Number_of_Injured = "Unknown"

              
    soup = load_soup_object(url_address)
    url2 = soup("div",attrs={"class":"summary-overview"})
    for p in url2[0].find_all('p'):
        category = p.find('span', attrs={'class':'leftHead'}).text
        if (category == "Region:"):   
            Region = p.find('span', attrs={'class':'leftLarge'}).text
        elif (category == "Province/administrativeregion/u.s. state:"):
            Province = p.find('span', attrs={'class':'leftLarge'}).text
        elif (category == "Location Details:" ):
            Location = p.find('span', attrs={'class':'leftLarge'}).text

    url3 = soup("div",attrs={"id":"secondary-right"}) 
    nunber_of_table = 0 
    num_of_rows_in_nunber_of_table_3 = 0
    
    for tbl in url3[0].find_all('table'):
        nunber_of_table = nunber_of_table+1
        end_of_table_3 = True
        for row in tbl("tr"):
                cells = row("td")+row("th")

                if  (len(cells) == 2):
                    if (if_title_Weapon):
                        Type_of_Weapon =  cells[0].get_text().strip()
                        Type_of_sub_type =  cells[1].get_text().strip()
                        if_title_Weapon = False
                    elif(if_title_Perpetrator_Group_2):
                        Group_Name =  cells[0].get_text().strip()
                        Claimed_Responsibility =  cells[1].get_text().strip() 
                        if_title_Perpetrator_Group_2 = False                        
                    elif ( cells[0].get_text().strip()  =="Type of Attack (more)"):
                        Type_of_Attack =  cells[1].get_text().strip()
                    elif ( cells[0].get_text().strip()  =="Successful Attack? (more)"):
                        Successful_Attack =  cells[1].get_text().strip()
                    elif ( cells[0].get_text().strip()  =="Name of Entity"):
                        Name_of_Entity =  cells[1].get_text().strip()
                    elif ( cells[0].get_text().strip()  =="Specific Description"):
                        Specific_Description =  cells[1].get_text().strip()
                    elif ( cells[0].get_text().strip()  =="Nationality of Target"):
                        Nationality_of_Target =  cells[1].get_text().strip()
                    elif ( cells[0].get_text().strip()  =="Hostages"):
                        Hostages =  cells[1].get_text().strip()
                    elif ( cells[0].get_text().strip()  =="Ransom"):
                        Ransom =  cells[1].get_text().strip()
                    elif ( cells[0].get_text().strip()  =="Property Damage"):
                        Property_Damage =  cells[1].get_text().strip()
                    elif ( cells[0].get_text().strip()  =="Suicide Attack?"):
                        Suicide_Attack =  cells[1].get_text().strip()
                    elif ( cells[0].get_text().strip()  =="Number of Perpetrators"):
                        Number_of_Perpetrators =  cells[1].get_text().strip()
                    elif ( cells[0].get_text().strip()  =="Total Number of Fatalities"):
                        Total_Number_of_Fatalities =  cells[1].get_text().strip()
                    elif ( cells[0].get_text().strip()  =="Total Number of Injured"):
                        Total_Number_of_Injured =  cells[1].get_text().strip()
                    elif ( cells[0].get_text().strip()  =="Number of Perpetrator Fatalities"):
                        Number_of_Perpetrator_Fatalities =  cells[1].get_text().strip()
                    elif (cells[0].get_text().strip()  == "Type"):
                        if_title_Weapon = True
                    elif (cells[0].get_text().strip()  == "Group Name"):
                        if_title_Perpetrator_Group_2 = True
                if  (len(cells) == 3):
                    if(if_title_Perpetrator_Group_3):
                        Group_Name =  cells[0].get_text().strip()
                        Group_Sub_name =  cells[1].get_text().strip()
                        Claimed_Responsibility =  cells[2].get_text().strip()
                    elif (cells[0].get_text().strip()  == "Group Name"):
                        if_title_Perpetrator_Group_3 = True
                    
                

    list_from_table = [Region,
            Province,
            Location,
            Type_of_Attack,
            Successful_Attack,
            Name_of_Entity,
            Specific_Description,
            Nationality_of_Target,
            Hostages,
            Ransom,
            Property_Damage,
            Type_of_Weapon,
            Type_of_sub_type,
            Suicide_Attack,
            Group_Name,
            Group_Sub_name,
            Claimed_Responsibility,
            Number_of_Perpetrators,
            Total_Number_of_Fatalities,
            Total_Number_of_Injured,
            Number_of_Perpetrator_Fatalities]  
    for n, i in enumerate(list_from_table):
        if i == "":
            list_from_table[n] = "Unknown"
    return list_from_table                            

In [5]:
"""
    The function gets url of the website "GTD" (Global Terrorism Database) and return the database from him.

    :param url:              String of url
    :type file_path:        str 

    :return:                     Database of Global Terrorism
    :rtype:                       DataFrame 
"""

def load_data_of_terrorism(url_address,n_pages):
    list_of_data = list()
    DATE = list()
    COUNTRY = list()
    CITY = list()
    PERPETRATOR_GROUP = list()
    FATALITIES = list()
    INJURED = list()
    TARGET_TYPE = list()
    links_page = list()
    Region = list()
    Type_of_State = list()
    LOCATION_DETAILS = list()
    
          ###Attack Information###
    Type_of_Attack = list()
    Successful_Attack = list()
          
          ###Target Information###
    Name_of_Entity = list()
    Specific_Description = list()
    Nationality_of_Target = list()
          
          ###Additional Information###
    Hostages = list()
    Ransom = list()
    Property_Damage = list()
    
          ###Weapon Information###
    Type = list()
    Sub_type = list()
          
          ###Additional Information###
    Suicide_Attack = list()
          
          ###Perpetrator Group Information###
    Group_Name_of_Perpetrator_Group = list()
    Group_Sub_Name_of_Perpetrator_Group = list() 
    Claimed_Responsibility = list()
          
         ###Perpetrator Statistics###
    Number_of_Perpetrators = list()       
    
          ###Casualty Information###
    Total_Number_of_Fatalities = list()
    Total_Number_of_Injured = list()
    Number_of_Perpetrator_Fatalities = list()
          
    
    start_url = "https://www.start.umd.edu/gtd/search/Results.aspx" 
    url = url_address
    for i in range(n_pages):
        soup = load_soup_object(url)
        if (i !=0):            
            url2 = soup("div",attrs={"class":"search-bottom-nav screen-only"})
            for a in url2[0].find_all('a'):
                next_url = start_url + a['href']
                num_page_in_next_url = number_of_url(next_url)
                num_page_in_url = number_of_url(url)
                if (num_page_in_next_url > num_page_in_url):
                    links_page.append(next_url)
                    url = next_url
    url = url_address

    for i in range(n_pages):
        time.sleep(5.4)
        if (i ==0):
            soup = load_soup_object(url)
        else:
            soup = load_soup_object(links_page[i-1])
        tbl = soup ("table",attrs={"class":"results"})[0]
        for row in tbl("tr"):
            cells = row("td")
            if  (len(cells)<8):
                continue
            
            try:
                if(len(cells[0].get_text().strip()) == 12):
                    DATE.append(cells[1].get_text().strip())
                    COUNTRY.append(cells[2].get_text().strip())
                    CITY.append(cells[3].get_text().strip())
                    PERPETRATOR_GROUP.append(cells[4].get_text().strip())
                    FATALITIES.append(cells[5].get_text().strip())
                    INJURED.append(cells[6].get_text().strip())
                    TARGET_TYPE.append(cells[7].get_text().strip())
                    list_of_data = more_data( "https://www.start.umd.edu/gtd/search/IncidentSummary.aspx?gtdid=" + cells[0].get_text().strip())
                    Region.append(list_of_data[0])
                    Type_of_State.append(list_of_data[1])
                    LOCATION_DETAILS.append(list_of_data[2])
                    Type_of_Attack.append(list_of_data[3])
                    Successful_Attack.append(list_of_data[4])
                    Name_of_Entity.append(list_of_data[5])
                    Specific_Description.append(list_of_data[6])
                    Nationality_of_Target.append(list_of_data[7])
                    Hostages.append(list_of_data[8])
                    Ransom.append(list_of_data[9])
                    Property_Damage.append(list_of_data[10])
                    Type.append(list_of_data[11])
                    Sub_type.append(list_of_data[12])
                    Suicide_Attack.append(list_of_data[13])
                    Group_Name_of_Perpetrator_Group.append(list_of_data[14])
                    Group_Sub_Name_of_Perpetrator_Group.append(list_of_data[15])   
                    Claimed_Responsibility.append(list_of_data[16])
                    Number_of_Perpetrators.append(list_of_data[17])
                    Total_Number_of_Fatalities.append(list_of_data[18])
                    Total_Number_of_Injured.append(list_of_data[19])
                    Number_of_Perpetrator_Fatalities.append(list_of_data[20])
                    data_frame_of_rerror = pd.DataFrame({"DATE":DATE,
                             "COUNTRY":COUNTRY,
                             "CITY":CITY,
                             "PERPETRATOR GROUP":PERPETRATOR_GROUP,
                             "FATALITIES":FATALITIES,
                             "INJURED":INJURED,
                             "TARGET TYPE":TARGET_TYPE,
                             "Region":Region,
                             "PROVINCE/ADMINISTRATIVE REGION/U.S. STATE":Type_of_State,
                             "LOCATION DETAILS":LOCATION_DETAILS,
                             "Type of Attack":Type_of_Attack,
                             "Successful Attack?":Successful_Attack,
                             "Name of Entity":Name_of_Entity,
                             "Specific Description":Specific_Description,
                             "Nationality of Target":Nationality_of_Target,
                             "Hostages":Hostages,
                             "Ransom":Ransom,
                             "Property Damage":Property_Damage,
                             "Weapon Type":Type,
                             "Weapon Sub_type":Sub_type,
                             "Suicide_Attack":Suicide_Attack,
                             "Group Name of Perpetrator Group":Group_Name_of_Perpetrator_Group,
                             "Group Sub Name of Perpetrator Group":Group_Sub_Name_of_Perpetrator_Group,  
                             "Claimed Responsibility":Claimed_Responsibility,
                             "Number of Perpetrators":Number_of_Perpetrators,
                             "Total Number of Fatalities":Total_Number_of_Fatalities,
                             "Number of Perpetrator Fatalities":Number_of_Perpetrator_Fatalities,
                             "Total Number of Injured":Total_Number_of_Injured
                            })
            except:
                print("try ID:  ",cells[0].get_text().strip())
                print("The url is:  ",links_page[i-1])


                
    return data_frame_of_rerror

           


### Start Scraping 

At this point, I'll begin compiling data on terrorist occurrences for a central database.

In [6]:
url  = "https://www.start.umd.edu/gtd/search/Results.aspx?expanded=no&casualties_type=b&casualties_max=&dtp2=all&success=yes&region=10&ob=GTDID&od=desc&page=1&count=100#results-table"


print("Start Crawling")
df = load_data_of_terrorism(url,300)
print("Stop Crawling")



Start Crawling
Stop Crawling


### I'll now add the terrorist incidents' location coordinates into the central database.

#### First, I'll make a dictionary that will assign each city its own province.

In [7]:
Cities = list(df.CITY.unique())
Dict_Cities_and_States = dict()
for row in range(df.shape[0]):
    if (df.CITY[row] in Cities):
        city = df.CITY[row]
        Dict_Cities_and_States[city] = df["PROVINCE/ADMINISTRATIVE REGION/U.S. STATE"][row]
        Cities.remove(city)
len(Dict_Cities_and_States)

4521

#### Now I'm going to build a new dictionary that will give each city its own nation.

In [8]:
Cities = list(df.CITY.unique())
Dict_Cities_and_Countries = dict()
for row in range(df.shape[0]):
    if (df.CITY[row] in Cities):
        city = df.CITY[row]
        Dict_Cities_and_Countries[city] = df.COUNTRY[row]
        Cities.remove(city)
len(Dict_Cities_and_Countries)
    

4521

For all terrorist incidences, I will now add the position coordinates to the database using the Geopy library.

If he is unable to locate with the assistance of the city and state, the city district will assist him, and if he is still unable to discover the coordinates, the state will assist us.

In [9]:
Dict_Cities_and_location = dict()
Cities_without_location = list()
Cities_and_Countries_without_location = list()
geolocator = Nominatim(user_agent= '`geopy.geocoders.options.default_user_agent = "my-application"`.')
for city in Dict_Cities_and_Countries.keys():
    city_and_country = city + ", " +Dict_Cities_and_Countries[city]
    try:                                                                                                               #Finding Latitude and Longitude by city and country
        location = geolocator.geocode(city_and_country)                                 
        Dict_Cities_and_location[city] = (location.latitude, location.longitude)
    except:
        try:                                                                                                          #Finding Latitude and Longitude by city and state
            city_and_State = city + ", " +Dict_Cities_and_States[city]
            location = geolocator.geocode(city_and_State)
            Dict_Cities_and_location[city] = (location.latitude, location.longitude)
        except:
            try:                                                                                                        #Finding Latitude and Longitude by country
                location = geolocator.geocode(Dict_Cities_and_Countries[city])
                Dict_Cities_and_location[city] = (location.latitude, location.longitude)               
            except:   
                Cities_and_Countries_without_location.append(city_and_country)
                Cities_without_location.append(city)


In [10]:
latitude = list()
longitude = list()
for row in range(df.shape[0]):
    latitude.append(Dict_Cities_and_location[df.CITY[row]][0])
    longitude.append(Dict_Cities_and_location[df.CITY[row]][1])
df['latitude'] = latitude
df['longitude'] = longitude


In [11]:
df.head()

Unnamed: 0,CITY,COUNTRY,Claimed Responsibility,DATE,FATALITIES,Group Name of Perpetrator Group,Group Sub Name of Perpetrator Group,Hostages,INJURED,LOCATION DETAILS,...,Successful Attack?,Suicide_Attack,TARGET TYPE,Total Number of Fatalities,Total Number of Injured,Type of Attack,Weapon Sub_type,Weapon Type,latitude,longitude
0,Baghdad,Iraq,No,2019-12-31,0,Unknown,Unknown,No,0,The incident occurred along Palestine Street.,...,Yes,No,Private Citizens & Property,0,0,Bombing/Explosion,Unknown Explosive Type,Explosives/Bombs/Dynamite,33.302431,44.378799
1,Nada,Iraq,No,2019-12-30,1,Islamic State of Iraq and the Levant (ISIL),Unknown,No,0,Unknown,...,Yes,No,Military,1,0,Unknown,Unknown,Unknown,33.90684,45.083841
2,Tabqah,Syria,No,2019-12-31,3,Islamic State of Iraq and the Levant (ISIL),Unknown,No,0,Unknown,...,Yes,No,Private Citizens & Property,3,0,Unknown,Unknown,Unknown,35.832365,38.54147
3,Ataq district,Yemen,No,2019-12-31,Unknown,Al-Islah Party,Unknown,Yes,Unknown,Unknown,...,Yes,No,Government (General),Unknown,Unknown,Facility/Infrastructure Attack,Unknown,Unknown,14.572856,46.831872
4,Mahfad,Yemen,No,2019-12-30,1,Southern Yemen Separatists,Unknown,No,3,Unknown,...,Yes,No,Military,1,3,Armed Assault,Automatic Weapon,Firearms,14.475597,43.913878


You can see how big the table is.

In [12]:
df.shape

(30000, 30)

I'm going to save the table in CSV format right now.

In [13]:
df.to_csv('data_of_terrorism.csv')