## Hurricane Florence Graph Object Generation

The following code was used to create the graph object for Hurricane Florence, and is modified from the Harvey notebooks.

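Note that two cells use Django DB calls and therefore must be run against the correct database. Each of them first checks the event name and raises an exception on a mismatch; the exceptions shown in this notebook's output are from that check, as those cells were actually run on a different machine.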

In [1]:
### Initialisation ###
import os

# Location of data files.
# Paths below are built by plain concatenation (`DIR + <file name>`), so the
# trailing slash is required.
DIR = './data/florence_user_location/'
# Network export that the graph-loading cells read with `nx.read_gexf`.
GEXF_FILE = 'Florence_network_data_20210720.gexf' # All components of network of class=2 users detected within 7 days of event

In [2]:
import functools
import json
import math
import re

import googlemaps
import yaml

def parse_coordinates(string):
    '''
    Parse a free-text string for a pair of coordinates.

    Handles decimal degrees (e.g. '40.7128, -74.0060'), hemisphere letters
    attached to a value (e.g. '33.5s, 151.2e'; S/W become negative, N/E are
    dropped) and degrees-minutes notation (e.g. "40°30', -74°15'"; seconds
    are discarded).

    Returns a tuple of two floats, in the order the values appear in the
    string, or False when the string does not contain exactly two
    coordinate-like values or one of them cannot be converted to a number.
    '''
    # Raw strings: the patterns contain regex escapes (\s, \d) which are
    # invalid escape sequences in ordinary string literals.
    reg = r'[nsewNSEW]?\s?-?\d+[\.°]\s?\d+°?\s?[nsewNSEW]?'
    reg_has_minutes = r"\d+\s?°\s*\d+.?\d+\s?'"
    reg_middle_degree = r'(\d+)°\s?(\d+)'
    reg_strip = r'[°neNE\s]'
    reg_replace_sw = r'[swSW](\d+\.\d+)|(\d+\.\d+)[swSW]'

    result = re.findall(reg, string)
    if len(result) != 2:  # No coordinate pair detected
        return False

    # Check if format is degrees minutes rather than degrees decimal (discard seconds)
    in_minutes = len(re.findall(reg_has_minutes, string)) == 2

    coords = []
    for raw in result:
        # Replace middle degree symbol with decimal:
        value = re.sub(reg_middle_degree, r'\1.\2', raw)
        # Remove trailing degree symbol, N and E marks:
        value = re.sub(reg_strip, '', value)
        # Replace south/west with negative sign:
        value = re.sub(reg_replace_sw, r'-\1\2', value)
        # Remove double negative (where string contained eg. '-99.10w')
        value = re.sub('--', '-', value)
        try:
            value = float(value)
        except ValueError:
            # Leftover characters (e.g. 's-33.5'): treat the string as
            # unparseable instead of raising, so callers can fall back to
            # another lookup method.
            return False
        # Convert minutes to decimal: at this point the fractional part
        # holds the minutes as hundredths (40°30' was rewritten to 40.30).
        if in_minutes:
            frac, whole = math.modf(value)
            value = round(whole + frac / 60 * 100, 5)
        coords.append(value)
    return (coords[0], coords[1])

def is_in_bounding_box(coords, boxes):
    '''
    Report whether a point lies strictly inside at least one bounding box.

    Each box is given as two corner points, NW first and SE second; the
    point's first value is tested against the corners' first values and
    its second value against their second values. A point exactly on an
    edge is not counted as inside.
    '''
    return any(
        box[1][0] < coords[0] < box[0][0] and box[0][1] < coords[1] < box[1][1]
        for box in boxes
    )


def is_local(location, boxes, known_localities=[]):
    '''
    Decide whether a location string refers to a place inside any of the
    given bounding boxes.

    Returns True when the string contains one of `known_localities`
    (checked first, so no lookup is needed) or when its coordinates from
    get_coords() fall inside a box, False when they fall outside every
    box, and None when the string is empty or no coordinates were found.

    <<<< WARNING >>>>
    A broad location (e.g. 'Texas', 'USA') resolves to a single point,
    which may land inside a bounding box by chance and produce a false
    positive.
    '''
    if location:
        # Known localities short-circuit the lookup to save on API requests.
        if any(name in location for name in known_localities):
            return True
        point = get_coords(location)
        if point:
            return is_in_bounding_box(point, boxes)
    return None

@functools.lru_cache(maxsize=None)
def _get_gmaps_client():
    '''Read the API key from auth.yml and build the Google Maps client once.'''
    with open("auth.yml", 'r') as ymlfile:
        auth = yaml.load(ymlfile, Loader=yaml.BaseLoader)
    return googlemaps.Client(key=auth['apikeys']['googlemaps2'])


def get_coords(location):
    '''
    Resolve a location string to a pair of coordinates.

    The string is first parsed for literal coordinates with
    parse_coordinates(); only when that fails is the Google Maps geocoding
    API queried, in which case the first result's (lat, lng) is used.

    Returns a 2-tuple of coordinates, None for an empty location, or
    False when neither parsing nor geocoding produced a result. Errors
    raised by the API client or by reading auth.yml are not caught here.
    '''
    if not location:
        return
    # Try and parse coordinates from string rather than API query:
    coords = parse_coordinates(location)
    # Get coords from API:
    if not coords:
        # The client is cached so auth.yml is not re-read and the client
        # is not rebuilt for every one of the (many) lookups.
        geocode_result = _get_gmaps_client().geocode(location)
        if geocode_result:
            point = geocode_result[0]['geometry']['location']
            coords = (point['lat'], point['lng'])
    return coords

In [23]:
# Get list of profile location strings based on users who are part of the
# largest connected component (based on existing gexf file exported from
# website interface)
# Note -- users limited to those appearing in first week of event.

from streamcollect.models import User, Event, Tweet 
import networkx as nx

# Load the exported network, then restrict it to its largest connected
# component (components are ranked by node count, largest first).
G = nx.read_gexf(DIR + GEXF_FILE)
Gcc = list(nx.connected_components(G.to_undirected()))
Gcc.sort(key=len, reverse=True)
G = G.subgraph(Gcc[0])

In [30]:
# NOTE: run on correct database

# Confirm correct database is set in Django settings.py.
# Slicing with [:1] instead of indexing [0] means an empty Event table
# falls through to the mismatch error below rather than an IndexError.
first_events = list(Event.objects.all()[:1])
EVENT_NAME = first_events[0].name.replace(' ', '') if first_events else ''
if 'Florence' not in EVENT_NAME:
    raise Exception('Event name mismatch -- check database set in Django')

# Profile locations of every user in the largest connected component,
# skipping users whose location is NULL or empty.
user_ids = list(G.nodes)
locs = (User.objects
        .filter(user_id__in=user_ids)
        .filter(location__isnull=False)
        .exclude(location='')
        .values_list('location', flat=True))

# Normalise (lower-case, trimmed) and de-duplicate the location strings.
locs = {loc.lower().strip() for loc in locs}

# Sorted so the exported file is identical from run to run.
with open(DIR + 'flr_locs.txt', 'w') as file:
    json.dump(sorted(locs), file)

Exception: Event name mismatch -- check database set in Django

In [11]:
# Running again after disruption, skipping where vals already recorded

LOCALITY_COORDS_DICT_FILE = "locality_coords_dict_flr_v2.txt"

with open(DIR + 'flr_locs.txt') as json_file:
    locs = json.load(json_file)
print('Running get_coords() for {} strings...'.format(len(locs)))

# Resume from the results saved by an earlier (interrupted) run, if any.
try:
    with open(DIR + LOCALITY_COORDS_DICT_FILE) as file:
        loc_coords_dict = json.load(file)
except (OSError, ValueError):
    # Missing/unreadable file or invalid JSON: start from an empty cache.
    # (json.JSONDecodeError is a subclass of ValueError.)
    loc_coords_dict = {}

for c, loc in enumerate(locs, start=1):
    if c % 1000 == 0:
        print('Requesting {} of {}'.format(c, len(locs)))
    # Explicit membership test: strings already recorded (even with an
    # empty result) are skipped, and an error raised inside get_coords()
    # cannot be mistaken for a missing key.
    if loc not in loc_coords_dict:
        loc_coords_dict[loc] = get_coords(loc)

Running get_coords() for 29507 strings...
Requesting 1000 of 29507
Requesting 2000 of 29507
Requesting 3000 of 29507
Requesting 4000 of 29507
Requesting 5000 of 29507
Requesting 6000 of 29507
Requesting 7000 of 29507
Requesting 8000 of 29507
Requesting 9000 of 29507
Requesting 10000 of 29507
Requesting 11000 of 29507
Requesting 12000 of 29507
Requesting 13000 of 29507
Requesting 14000 of 29507
Requesting 15000 of 29507
Requesting 16000 of 29507
Requesting 17000 of 29507
Requesting 18000 of 29507
Requesting 19000 of 29507
Requesting 20000 of 29507
Requesting 21000 of 29507
Requesting 22000 of 29507
Requesting 23000 of 29507
Requesting 24000 of 29507
Requesting 25000 of 29507
Requesting 26000 of 29507
Requesting 27000 of 29507
Requesting 28000 of 29507
Requesting 29000 of 29507


In [12]:
LOCALITY_COORDS_DICT_FILE = "locality_coords_dict_flr_v3.txt"

# Save the location -> coordinates lookup as JSON (read back with `json.load`).
with open(DIR + LOCALITY_COORDS_DICT_FILE, 'w') as file:
    json.dump(loc_coords_dict, file)

In [20]:
# TODO: Data is not yet adjusted to exclude the exclusion box.
# 36.4134, -75.1782     Top Right
# 31.7161, -81.4328     Bottom left
# 34.9624, -78.2680     Middle Point for exclusion 

# Single bounding box given as its (NW, SE) corner points.
boxes = [[(36.4134, -81.4328), (31.7161, -75.1782)]]
# Boxes with exclusion:
# boxes = [[(34.9624, -81.4328), (31.7161, -78.2680)],  # Top Left, Bottom Middle
#         [(36.4134, -78.2680), (31.7161, -75.1782)]]   # Top Middle, Bottom right

# Classify every location string: True/False when coordinates are known,
# None when they are not.
loc_dict = {}
for k, v in loc_coords_dict.items():
    try:
        loc_dict[k] = is_in_bounding_box(v, boxes)
    except (TypeError, IndexError):
        # v is None/False (no coordinates found) or not a usable pair.
        # Only these errors are expected; anything else should surface.
        loc_dict[k] = None

In [27]:
LOCALITY_DICT_FILE = "locality_dict_flr.txt"

# Save the location -> is-local lookup as JSON (read back with `json.load`).
with open(DIR + LOCALITY_DICT_FILE, 'w') as file:
    json.dump(loc_dict, file)

In [31]:
# NOTE: run on correct database

# Imports come first so that `Event` is defined before the database check.
import json
from datetime import datetime

import networkx as nx
from streamcollect.models import User, Event, Tweet

# Confirm correct database is set in Django settings.py.
# Slicing with [:1] instead of indexing [0] means an empty Event table
# falls through to the mismatch error below rather than an IndexError.
first_events = list(Event.objects.all()[:1])
EVENT_NAME = first_events[0].name.replace(' ', '') if first_events else ''
if 'Florence' not in EVENT_NAME:
    raise Exception('Event name mismatch -- check database set in Django')

GEXF_FILE = 'Florence_network_data_20210720.gexf'
LOCALITY_DICT_FILE = "locality_dict_flr.txt"

# Location string -> True/False/None lookup produced earlier in the notebook.
with open(DIR + LOCALITY_DICT_FILE) as json_file:
    loc_dict = json.load(json_file)

# Reload the network and keep only its largest connected component.
G = nx.read_gexf(DIR + GEXF_FILE)
Gcc = sorted(nx.connected_components(G.to_undirected()), key=len, reverse=True)
G = G.subgraph(Gcc[0])

# user_id -> normalised (lower-case, trimmed) profile location, or None
# when the user has no location set.
usd = User.objects.filter(user_id__in=G.nodes).values('user_id', 'location')
loc_dict_db = {}
for x in usd:
    if x['location']:
        loc_dict_db[x['user_id']] = x['location'].lower().strip()
    else:
        loc_dict_db[x['user_id']] = None

# Node attribute payload: only nodes whose profile location exists AND was
# classified get an 'lcl_profile' flag; all other nodes are left untouched.
loc_prf_db = {}
for n in G.nodes:
    # Node ids are cast to int to match the user_id keys built above.
    profile_loc = loc_dict_db[int(n)]
    if profile_loc is None:
        continue
    lcl_profile = loc_dict[profile_loc]
    if lcl_profile is None:
        continue
    loc_prf_db[n] = {'lcl_profile': lcl_profile}

nx.set_node_attributes(G, loc_prf_db)

# Date-stamped output file, written to the current working directory.
filename = 'Florence_network_data_' + datetime.today().strftime('%Y%m%d') + '.gexf'
nx.write_gexf(G, filename, prettyprint=True)


Exception: Event name mismatch -- check database set in Django