In [None]:
# Requires an up-to-date checkout of https://github.com/palewire/news-homepages at ../news-homepages (relative to this notebook).

In [None]:
from opencage.geocoder import OpenCageGeocode
import simplejson as json
from tqdm import tqdm

import csv
import os

In [None]:
# Base directories: the news-homepages source CSVs and this notebook's support files.
sourcedir = "../news-homepages/newshomepages/sources/"
supportdir = "support/"
# Paths derived from supportdir: the saved geocoding results and the config module.
geodatafile = f"{supportdir}geodata.json"
configfile = f"{supportdir}config.py"

In [None]:
# Pull settings in from support/config.py when it exists; otherwise tell the user to create it.
# NOTE(review): the star-import hides which names arrive here (presumably `apikey`) — confirm
# against the sample config.
if os.path.exists(configfile):
    from support.config import *
    print("Configuration successfully loaded")
else:
    print(f"You need to build out a configuration file at {configfile} using the sample.")

In [None]:
# Build a lookup of site records from sites.csv, keyed by each site's handle.
sitedict = {}
# newline="" is what the csv module asks for, so quoted fields containing line
# breaks are parsed correctly on every platform.
with open(sourcedir + "sites.csv", "r", encoding="utf-8", newline="") as infile:
    for row in csv.DictReader(infile):
        # Placeholder; a later cell fills this in with a geocodable place name.
        row['geoname'] = None
        # "bundle" is pipe-delimited. An empty (or missing) cell becomes an empty list
        # instead of [''] or a TypeError from len(None).
        row['bundle'] = row['bundle'].split("|") if row['bundle'] else []
        sitedict[row['handle']] = row

In [None]:
# Index the rows of bundles.csv by their slug.
# newline="" is what the csv module asks for, so quoted fields containing line
# breaks are parsed correctly on every platform.
with open(sourcedir + "bundles.csv", "r", encoding="utf-8", newline="") as infile:
    bundledict = {row['slug']: row for row in csv.DictReader(infile)}

In [None]:
# Map each upper-case two-letter country code to its display name from countries.csv.
# newline="" is what the csv module asks for, so quoted fields containing line
# breaks are parsed correctly on every platform.
with open(supportdir + "countries.csv", "r", encoding="utf-8", newline="") as infile:
    countrydict = {row['2codeupper']: row['goodname'] for row in csv.DictReader(infile)}

In [None]:
# Derive a geocodable "location, region-or-country" string for every site.
for handle, site in sitedict.items():
    country = site['country']
    if country == 'US':
        # US sites are qualified by the name of a "region"-type bundle; when a site
        # has several such bundles, the last one listed wins.
        for bundle in site['bundle']:
            if bundle not in bundledict:
                print(f"Missing bundle {bundle} from lookup")
                continue
            if bundledict[bundle]['type'] == "region":
                site['geoname'] = ", ".join((site['location'], bundledict[bundle]['name']))
    elif country in countrydict:
        # Everywhere else is qualified by the country's display name.
        site['geoname'] = ", ".join((site['location'], countrydict[country]))
    else:
        print(f"Missing country code {country}")

In [None]:
# Start from the previously saved geocoding results when that file exists,
# otherwise from an empty mapping.
geodata = {}
if os.path.exists(geodatafile):
    with open(geodatafile, "r", encoding='utf-8') as cachefile:
        geodata = json.load(cachefile)

In [None]:
# Make sure every derived place name has an entry in geodata; None marks a name
# that has no geocoding data yet. Existing entries are left untouched.
for site in sitedict.values():
    placename = site['geoname']
    if placename:
        geodata.setdefault(placename, None)

In [None]:
# Create the OpenCage client used by the geocoding loop in this notebook.
# NOTE(review): apikey is not defined in any visible cell; presumably it arrives via the
# star-import of support.config — confirm against the sample config.
geocoder = OpenCageGeocode(apikey)

In [None]:
print("Looking for geography names that were not yet geocoded.")
# Snapshot the names that still lack data so the progress bar counts only real lookups.
pending = [geoname for geoname in geodata if not geodata[geoname]]
unresolved = []
for geoname in tqdm(pending):
    results = geocoder.geocode(geoname)
    if not results:
        # No match: indexing results[0] would raise and abort the run before the
        # results gathered so far are saved. Store an empty (falsy) record instead:
        # it is retried on the next run, and membership tests such as
        # `'latlong' in geodata[geoname]` keep working.
        geodata[geoname] = {}
        unresolved.append(geoname)
        continue
    best = results[0]     # The first result is used, as before
    geometry = best['geometry']
    geodata[geoname] = {
        'details': best,
        'latlong': str(geometry['lat']) + ", " + str(geometry['lng']),
    }
if unresolved:
    print(f"No geocoding results for: {unresolved}")

In [None]:
# Save the geocoding results. The file is written as UTF-8 explicitly because it is
# read back with encoding='utf-8', rather than relying on the platform default.
with open(geodatafile, "w", encoding="utf-8") as outfile:
    outfile.write(json.dumps(geodata, indent=4 * ' ', encoding='utf-8'))

In [None]:
# Find countries not represented
# Every known country code starts at zero sites.
countrytally = dict.fromkeys(countrydict, 0)
for site in sitedict.values():
    code = site['country']
    # A site may carry a code that is absent from countries.csv (an earlier cell
    # reports those as "Missing country code"); skip it rather than raise KeyError.
    if code in countrytally:
        countrytally[code] += 1
# Print the display name of every country that no site claims.
for countrycode, count in countrytally.items():
    if count == 0:
        print(countrydict[countrycode])

In [None]:
# Collect, per site handle, the derived place name and (when available) its coordinates.
valueadds = {}
for handle, site in sitedict.items():
    geoname = site['geoname']
    if not geoname:
        continue
    valueadds[handle] = {'geoname': geoname}
    # A geodata entry can still be the None placeholder (name registered but never
    # geocoded); `'latlong' in None` would raise TypeError, so check the entry first.
    entry = geodata.get(geoname)
    if entry and 'latlong' in entry:
        valueadds[handle]['latlong'] = entry['latlong']

In [None]:
# Write the per-site additions to valueadds.json in the working directory,
# pretty-printed with a four-space indent.
with open("valueadds.json", "w", encoding="utf-8") as outfile:
    serialized = json.dumps(valueadds, indent="    ", encoding='utf-8')
    outfile.write(serialized)