# Importing from Humanitarian Data Exchange (HUM) 

This script pulls data from the ArcGIS API that backs this dashboard: https://data.humdata.org/dataset/covid-19-global-travel-restrictions-and-airline-information

The data is divided into two datasets:

- COVID-19 restrictions by country: This dataset shows current travel restrictions. Information is collected from various sources: IATA, media, national sources, WFP internal or any other.
- COVID-19 airline restrictions information: This dataset shows restrictions taken by individual airlines or countries. Information is again collected from various sources, including WFP internal and public sources.

In [None]:
import requests
import json
import pandas as pd
import datetime
import pycountry

In [None]:
# papermill parameters
output_folder = '../output/'

In [None]:
def get_df_from_arcgis_api(url, timeout=60):
    """Fetch an ArcGIS feature query and return its attributes as a DataFrame.

    Args:
        url: Full query URL; the response is expected to be JSON with a
            top-level "features" list whose items each hold an "attributes"
            mapping.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook run indefinitely.

    Returns:
        A pandas DataFrame with one row per feature, built from the
        "attributes" mapping of each feature.

    Raises:
        requests.HTTPError: If the server answers with an HTTP error status.
        ValueError: If the response body is not JSON or has no "features"
            entry.
    """
    res = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx here instead of trying to parse an error page as JSON.
    res.raise_for_status()
    json_response = res.json()

    # A payload without "features" (for instance a service-side error object)
    # would otherwise surface as a bare KeyError with no context.
    features = None
    if isinstance(json_response, dict):
        features = json_response.get("features")
    if features is None:
        raise ValueError(
            "No 'features' in ArcGIS response from {}: {:.500}".format(
                url, repr(json_response)))

    data = [feature["attributes"] for feature in features]
    return pd.DataFrame(data)
    

 ### COUNTRY RESTRICTIONS

In [None]:
# Query URL for the country travel-restrictions layer: all rows (where=1=1),
# all fields, JSON output.
url = (
    "https://services3.arcgis.com/t6lYS2Pmd8iVx1fy/ArcGIS/rest/services/"
    "COVID_Travel_Restrictions_V2/FeatureServer/0/query"
    "?where=1%3D1&outFields=*&f=pjson"
)
# Raw, uncleaned feature attributes as returned by the service.
countryDf = get_df_from_arcgis_api(url)

### Data Quality
1. rename columns
2. filtering data based on "Sources" and "Info Date" data, because the dataset has a lot of empty country data
3. drop unnecessary columns
4. converting date string to datetime format
5. adding Last Update Date column

In [None]:
# Map the raw ArcGIS attribute names onto the column names used in the export.
reNamedCountryDf = countryDf.rename(
    columns={
        'adm0_name': 'COUNTRY',
        'iso3': 'ISO3_COUNTRY_CODE',
        'X': 'LONG',
        'Y': 'LAT',
        'published': 'PUBLISHED',
        'sources': 'SOURCES',
        'info': 'RESTRICTION_TEXT',
        'optional1': 'INFO_DATE',
        'optional2': 'QUARANTINE_TEXT'
    })

# Keep only rows that carry both a source and an info date.
cleanCountryDf = reNamedCountryDf[reNamedCountryDf['SOURCES'].notnull()
                                  & reNamedCountryDf['INFO_DATE'].notnull()]
cleanCountryDf = cleanCountryDf.drop(['optional3', 'ObjectId'], axis=1)

# PUBLISHED is parsed as day.month.year and INFO_DATE as yyyymmdd; values that
# do not match the format become NaT instead of raising.
cleanCountryDf['PUBLISHED'] = pd.to_datetime(
    cleanCountryDf['PUBLISHED'].astype(str),
    format='%d.%m.%Y',
    errors="coerce")
cleanCountryDf['INFO_DATE'] = pd.to_datetime(
    cleanCountryDf['INFO_DATE'].astype(str), format='%Y%m%d', errors="coerce")

# datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC "now"
# and drop the tzinfo so the stored value remains a naive UTC timestamp,
# exactly as before.
cleanCountryDf['LAST_UPDATE_DATE'] = datetime.datetime.now(
    datetime.timezone.utc).replace(tzinfo=None)

## Recode ISO 3166-1 alpha-3 codes to ISO 3166-1 alpha-2 codes

In [None]:
def get_country(row):
    """Return the ISO 3166-1 alpha-2 code for the row's ISO3_COUNTRY_CODE.

    Args:
        row: Object exposing an ``ISO3_COUNTRY_CODE`` attribute (e.g. a
            DataFrame row). The value is an alpha-3 code or a comma-separated
            list of them; for a list only the first entry is used.

    Returns:
        The alpha-2 code as a string, or "" when the value is missing
        (None, NaN or any other non-string), blank, or not found by
        pycountry.
    """
    code = row.ISO3_COUNTRY_CODE

    # Missing values can arrive as None or as a float NaN; neither supports
    # the string operations below.
    if not isinstance(code, str):
        return ""

    # Split on the bare comma and strip afterwards, so "AAA,BBB" and
    # "AAA, BBB" both yield the first code.
    country_trigram = code.split(",")[0].strip()
    if not country_trigram:
        return ""

    country = pycountry.countries.get(alpha_3=country_trigram)
    return country.alpha_2 if country else ""

In [None]:
# Compute the recoded value for every row first, then overwrite the column and
# rename it to reflect its new content.
country_alpha2 = cleanCountryDf.apply(get_country, axis=1)
cleanCountryDf["ISO3_COUNTRY_CODE"] = country_alpha2
cleanCountryDf = cleanCountryDf.rename(
    columns={"ISO3_COUNTRY_CODE": "ISO3166_1"}
)

## Export country restrictions

In [None]:
# Write the cleaned country restrictions into the output folder, leaving the
# DataFrame index out of the file.
country_csv_path = output_folder + "HUM_RESTRICTIONS_COUNTRY.csv"
cleanCountryDf.to_csv(country_csv_path, index=False)

In [None]:
# Bare expression: as the last statement of the cell it makes the notebook
# render the cleaned country table for visual inspection.
cleanCountryDf

# AIRLINE RESTRICTIONS

In [None]:
# Query URL for the airline-information layer: all rows (where=1=1), all
# fields, JSON output.
url = (
    "https://services3.arcgis.com/t6lYS2Pmd8iVx1fy/ArcGIS/rest/services/"
    "COVID_Airline_Information_V2/FeatureServer/0/query"
    "?where=1%3D1&outFields=*&f=pjson"
)
# Raw, uncleaned feature attributes as returned by the service.
airlineDf = get_df_from_arcgis_api(url)

### Data Quality
1. rename columns
2. filtering data based on the restriction text ("info"), because the dataset has a lot of empty country data
3. drop unnecessary columns
4. converting date string to datetime format
5. adding Last Update Date column

In [None]:
# Map the raw ArcGIS attribute names onto the column names used in the export.
reNamedAirlineDf = airlineDf.rename(
    columns={
        'adm0_name': 'COUNTRY',
        'iso3': 'ISO3_COUNTRY_CODE',
        'X': 'LONG',
        'Y': 'LAT',
        'published': 'PUBLISHED',
        'source': 'SOURCES',
        'airline': 'AIRLINE',
        'info': 'RESTRICTION_TEXT'
    })

# Keep only rows that actually carry a restriction text.
cleanAirlineDf = reNamedAirlineDf[reNamedAirlineDf['RESTRICTION_TEXT'].notnull()]
cleanAirlineDf = cleanAirlineDf.drop(
    ['optional1', 'optional2', 'optional3', 'ObjectId'], axis=1)

# PUBLISHED is parsed as day.month.year; values that do not match the format
# become NaT instead of raising.
cleanAirlineDf['PUBLISHED'] = pd.to_datetime(
    cleanAirlineDf['PUBLISHED'].astype(str),
    format='%d.%m.%Y',
    errors='coerce')

# datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC "now"
# and drop the tzinfo so the stored value remains a naive UTC timestamp,
# exactly as before.
cleanAirlineDf['LAST_UPDATE_DATE'] = datetime.datetime.now(
    datetime.timezone.utc).replace(tzinfo=None)

### Airline Lookup

Use IATA codes to look up, de-duplicate and explode airline codes, so that each row holds exactly one airline.

In [None]:
# The file is read without a header row (header=None), so the column names
# are supplied here.
airline_columns = [
    "ID", "Name", "Alias", "IATA", "ICAO", "Callsign", "Country", "Active",
]
# Lookup table indexed by IATA code; the marker \N is read as a missing value.
# NOTE(review): read_csv's default NA markers also include the literal "NA",
# so an airline whose IATA code is "NA" presumably ends up without a usable
# index label -- confirm against the data.
airlines = pd.read_csv(
    "https://raw.githubusercontent.com/jpatokal/openflights/master/data/airlines.dat",
    names=airline_columns,
    header=None,
    index_col="IATA",
    na_values='\\N',
)

In [None]:
# Remove literal periods. regex=False states the literal intent explicitly, so
# "." can never be read as the regex wildcard, whatever the pandas default is.
cleanAirlineDf['AIRLINE'] = cleanAirlineDf['AIRLINE'].str.replace(
    '.', '', regex=False)
# This pattern uses "." as a wildcard. Read literally it could never match,
# because all periods were removed on the line above, so it has to run as a
# regular expression. pandas 2.0 changed the default to regex=False, hence the
# explicit flag.
cleanAirlineDf['AIRLINE'] = cleanAirlineDf['AIRLINE'].str.replace(
    'zAirlines .Airline Code..', '', regex=True)

# Turn comma-separated airline lists into one row per airline and trim the
# whitespace left around each entry.
cleanAirlineDf['AIRLINE'] = cleanAirlineDf['AIRLINE'].str.split(',')
cleanAirlineDf = cleanAirlineDf.explode('AIRLINE')
cleanAirlineDf['AIRLINE'] = cleanAirlineDf['AIRLINE'].str.strip()


In [None]:
def lookup_iata_codes(row):
    """Replace a two-character IATA code in row["AIRLINE"] by the airline name.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with an "AIRLINE" entry.

    Returns:
        The row itself when "AIRLINE" is missing, is not a two-character
        string, or is not an index label of the module-level ``airlines``
        table. Otherwise a copy of the row whose "AIRLINE" holds the "Name"
        of the matching airline; if the code occurs several times in the
        table, the first match is used.
    """
    code = row["AIRLINE"]

    # Missing values may be NaN, which is truthy and has no len(); only
    # two-character strings are treated as IATA codes.
    if not isinstance(code, str) or len(code) != 2:
        return row

    # Keep an unknown code as it is instead of aborting the whole apply()
    # with a KeyError.
    if code not in airlines.index:
        return row

    airline = airlines.loc[code]
    # A duplicated index label yields a DataFrame; use its first entry.
    if isinstance(airline, pd.DataFrame):
        airline = airline.iloc[0]

    # Work on a copy: mutating the object handed in by DataFrame.apply is not
    # supported by pandas.
    row = row.copy()
    row["AIRLINE"] = airline["Name"]
    return row

# Run the IATA lookup row by row (axis=1 hands each row to the function) and
# continue with the resulting frame.
airline_rows_with_names = cleanAirlineDf.apply(lookup_iata_codes, axis=1)
cleanAirlineDf = airline_rows_with_names

In [None]:
# Select the airline rows without a country code; as the last expression of
# the cell the selection is shown as cell output.
missing_country_mask = cleanAirlineDf["ISO3_COUNTRY_CODE"].isnull()
cleanAirlineDf[missing_country_mask]

## Write to output CSV

In [None]:
# Compute the recoded value for every row first, then overwrite the column and
# rename it to reflect its new content.
airline_alpha2 = cleanAirlineDf.apply(get_country, axis=1)
cleanAirlineDf["ISO3_COUNTRY_CODE"] = airline_alpha2
cleanAirlineDf = cleanAirlineDf.rename(
    columns={"ISO3_COUNTRY_CODE": "ISO3166_1"}
)

In [None]:
# Write the cleaned airline restrictions into the output folder, leaving the
# DataFrame index out of the file.
airline_csv_path = output_folder + "HUM_RESTRICTIONS_AIRLINE.csv"
cleanAirlineDf.to_csv(airline_csv_path, index=False)