This script processes the log files generated by the AWS_Access script. It uses the IP addresses of visitors to WHRB.org to build a feature layer showing all visitors to the site, loads a second layer containing the locations of all prisons in the US, and then performs a spatial join to identify any listeners within the boundaries of a prison.

In [6]:
import pandas, arcpy, requests, json, os, datetime 
import pyprojroot as ppr

# Preprocessing

Cycle through each log file, formatting IP addresses, timestamp, and seconds spent on site into a dataframe.

Make sure to change "loc" to the storage location of the logs on your computer.

In [11]:
# Folder holding the log files, named whrblog1.txt, whrblog2.txt, ...
loc = str(ppr.here('./data'))

# Number of log files to read. For testing, change this value to 50 to make
# sure everything works, then go back and change it to 1440 * 14.
num_logs = 1440 * 14

# Errors a malformed or incomplete log line can raise while being parsed:
# invalid JSON / non-numeric field (ValueError), missing key (KeyError),
# too few '-'-separated date pieces (IndexError), or a field of the wrong
# type (TypeError, AttributeError). Such lines are skipped; anything else
# is a real problem and is allowed to surface.
bad_line_errors = (ValueError, KeyError, IndexError, TypeError, AttributeError)

ips = []
for i in range(1, num_logs + 1):
    filename = loc + r"/whrblog" + str(i) + '.txt'
    # The context manager closes the file even if reading fails part-way.
    with open(filename, 'r') as file:
        for line in file:
            try:
                items = json.loads(line)
                ip = items['host']
                timestamp = items['date']
                # The date is split on '-': year, month, then a third piece
                # holding the day (chars 0-1), hour (3-4), minute (6-7)
                # and second (9-10).
                timesplit = timestamp.split("-")
                year = int(timesplit[0])
                month = int(timesplit[1])
                day_and_time = timesplit[2]
                day = int(day_and_time[0:2])
                hour = int(day_and_time[3:5])
                minute = int(day_and_time[6:8])
                sec = int(day_and_time[9:11])
                dt = datetime.datetime(year, month, day, hour, minute, sec)
                dur = int(items['listentime'])
            except bad_line_errors:
                continue
            ips.append([ip, dt, dur])

# One row per successfully parsed log line.
df = pandas.DataFrame(ips, columns=['IP Address', 'Timestamp', 'Duration'])
df

Unnamed: 0,IP Address,Timestamp,Duration
0,147.135.36.151,2022-05-30 22:48:20,0
1,140.247.235.238,2022-05-30 22:48:31,60
2,147.135.36.151,2022-05-30 22:48:40,0
3,51.91.219.191,2022-05-30 22:48:44,0
4,52.91.56.216,2022-05-30 22:48:44,0
...,...,...,...
242310,50.200.93.68,2022-06-14 03:10:40,0
242311,50.200.93.68,2022-06-14 03:10:40,0
242312,65.112.8.60,2022-06-14 03:10:41,0
242313,65.112.8.60,2022-06-14 03:10:41,0


Merge entries based on IP

IP addresses will be in the DataFrame multiple times if they were on the site for longer than a minute. Merging entries makes the data neater and makes processing much faster.

Please note that this code block will take a while to run. Doing another activity while waiting is highly recommended!

In [None]:
# Collapse the log rows into one row per IP address:
#   Timestamp - the value from the last row seen for that IP
#   Duration  - the sum of all durations for that IP
#   Count     - the number of log rows for that IP
# sort=False keeps the IPs in order of first appearance, matching a
# row-by-row merge. A single groupby replaces the per-row scan (and the
# DataFrame.append call, which no longer exists in pandas 2.x).
uniqueips = (
    df.groupby('IP Address', sort=False)
    .agg(
        Timestamp=('Timestamp', 'last'),
        Duration=('Duration', 'sum'),
        Count=('Duration', 'size'),
    )
    .reset_index()
)
df = uniqueips
df

Unnamed: 0,IP Address,Timestamp,Duration,Count
0,147.135.36.151,2022-05-04 20:45:21,886,4518
1,213.226.123.30,2022-05-04 19:30:14,40,17
2,140.247.235.238,2022-05-04 20:45:22,87093,1437
3,34.88.144.110,2022-05-04 17:40:25,29,6
4,51.91.219.191,2022-05-04 20:45:25,16,2214
...,...,...,...,...
1219,96.249.228.253,2022-05-04 20:34:09,14510,1
1220,72.218.132.79,2022-05-04 20:35:20,9,1
1221,162.142.125.8,2022-05-04 20:35:25,1,2
1222,173.54.51.98,2022-05-04 20:39:47,66,1


# IP Geolocation

Test geolocator

This code block should return "Cambridge, Massachusetts". If you get an error or blank response, you may need to make your own account with Geolocation DB at https://geolocation-db.com. If doing so, make sure to replace the geolocator string with one that uses your API key!

In [10]:
# Base URL of the Geolocation DB JSON endpoint; an IP address is appended to it.
# NOTE(review): the API key is embedded in the URL - consider loading it from
# an environment variable or config file instead of committing it.
geolocator = 'https://geolocation-db.com/json/8dd79c70-0801-11ec-a29f-e381a788c2c0/'

# A timeout keeps the notebook from hanging forever if the service is down,
# and raise_for_status turns an HTTP error into a clear exception.
callback = requests.get(geolocator + "65.112.8.27", timeout=30)
callback.raise_for_status()
decoded = json.loads(callback.content)

# The f-string prints "None" for a missing field instead of raising TypeError,
# so a blank response is visible at a glance.
print(f"{decoded.get('city')}, {decoded.get('state')}")

Cambridge, Massachusetts


Match IP addresses to Lat/Long coords using Geolocation DB

Please note that this code block will take a while to run. Doing another activity while waiting is highly recommended!

In [None]:
lats = []
longs = []
unlocateable = []

# Failures that mean "this IP could not be located": network/HTTP problems,
# a non-JSON body or non-numeric coordinate (ValueError), a missing
# coordinate (float(None) -> TypeError), or a non-dict response
# (AttributeError).
lookup_errors = (requests.RequestException, ValueError, TypeError, AttributeError)

for ip in df['IP Address']:
    try:
        callback = requests.get(geolocator + ip, timeout=30)
        decoded = json.loads(callback.content)
        # Convert both coordinates before storing either one, so a failure on
        # the longitude cannot leave the two lists with different lengths.
        lat = float(decoded.get("latitude"))
        lon = float(decoded.get("longitude"))
    except lookup_errors:
        print(f"Could not locate {ip}")
        # (0.0, 0.0) marks an unlocated IP; those rows are removed below.
        lat = lon = 0.0
        unlocateable.append(ip)
    lats.append(lat)
    longs.append(lon)

df.insert(0, 'Latitude', lats)
df.insert(1, 'Longitude', longs)

# Keep only rows that are not at the (0.0, 0.0) placeholder.
located = (df['Latitude'] != 0.0) | (df['Longitude'] != 0.0)
df = df[located]
df

Could not locate 103.178.237.233
Could not locate 45.92.247.76
Could not locate 45.154.255.140
Could not locate 190.8.19.122
Could not locate 23.128.248.105
Could not locate 177.128.52.9


Unnamed: 0,Latitude,Longitude,IP Address,Timestamp,Duration,Count
0,38.9615,-77.3418,147.135.36.151,2022-05-04 20:45:21,886,4518
1,55.7386,37.6068,213.226.123.30,2022-05-04 19:30:14,40,17
2,42.3647,-71.1042,140.247.235.238,2022-05-04 20:45:22,87093,1437
3,37.7510,-97.8220,34.88.144.110,2022-05-04 17:40:25,29,6
4,48.8582,2.3387,51.91.219.191,2022-05-04 20:45:25,16,2214
...,...,...,...,...,...,...
1219,37.0768,-76.4967,96.249.228.253,2022-05-04 20:34:09,14510,1
1220,36.8546,-76.2143,72.218.132.79,2022-05-04 20:35:20,9,1
1221,37.7510,-97.8220,162.142.125.8,2022-05-04 20:35:25,1,2
1222,40.7681,-74.0208,173.54.51.98,2022-05-04 20:39:47,66,1


# ArcGIS

**Please note: this section was originally written for online hosting. I cannot run it on my own computer, so I unfortunately cannot debug it. You may run into glitches here - please let me know if anything does not work.**

In [None]:
# ArcGIS API for Python names used by the cells below.
from arcgis.gis import GIS
from arcgis.features import SpatialDataFrame, FeatureLayerCollection, GeoAccessor

# Use the project's gis folder as the arcpy workspace and allow tools to
# overwrite outputs that already exist there.
folder = str(ppr.here('./gis'))
arcpy.env.workspace = folder
arcpy.env.overwriteOutput = True

# NOTE(review): 'pro' presumably reuses the active ArcGIS Pro sign-in - confirm.
gis = GIS('pro')

print(f"logged in as {gis.properties.user.username}")

logged in as rog796_Harvard_CGA


Create Spatial DataFrame with all visitors

In [None]:
# Disabled steps from the hosted version of this notebook: fetch the existing
# hosted layer so its contents can be replaced further down.
#features = gis.content.get('dc6794324b5744548bdb760f5ec8e2e0')
#lyr = features.layers[0]
#querysdf = lyr.query().sdf

# Build a spatially enabled DataFrame from the Longitude/Latitude columns.
newsdf = GeoAccessor.from_xy(df, 'Longitude', 'Latitude')
newsdf.rename(
    columns={'IP Address': 'ip_address', 'Timestamp': 'last_visit', "Count": "numLogs"},
    inplace=True,
)

# Store the last visit as a formatted string. Converting the whole column at
# once avoids the row-by-row loop with chained indexing, and avoids writing
# strings one at a time into a timestamp column.
newsdf['last_visit'] = pandas.to_datetime(newsdf['last_visit']).dt.strftime("%m/%d/%Y, %H:%M:%S")

# Disabled steps from the hosted version: swap the hosted layer's features.
#lyr.edit_features(deletes=querysdf)
#lyr.edit_features(adds=newsdf)

{'addResults': [{'objectId': 253167,
   'uniqueId': 253167,
   'globalId': None,
   'success': True},
  {'objectId': 253168, 'uniqueId': 253168, 'globalId': None, 'success': True},
  {'objectId': 253169, 'uniqueId': 253169, 'globalId': None, 'success': True},
  {'objectId': 253170, 'uniqueId': 253170, 'globalId': None, 'success': True},
  {'objectId': 253171, 'uniqueId': 253171, 'globalId': None, 'success': True},
  {'objectId': 253172, 'uniqueId': 253172, 'globalId': None, 'success': True},
  {'objectId': 253173, 'uniqueId': 253173, 'globalId': None, 'success': True},
  {'objectId': 253174, 'uniqueId': 253174, 'globalId': None, 'success': True},
  {'objectId': 253175, 'uniqueId': 253175, 'globalId': None, 'success': True},
  {'objectId': 253176, 'uniqueId': 253176, 'globalId': None, 'success': True},
  {'objectId': 253177, 'uniqueId': 253177, 'globalId': None, 'success': True},
  {'objectId': 253178, 'uniqueId': 253178, 'globalId': None, 'success': True},
  {'objectId': 253179, 'uniqu

Spatial join listener points with prison polygons

In [None]:
workspace = folder

# Write the listener points out as a shapefile so arcpy tools can read them.
listeners = workspace + '/listeners.shp'
newsdf.spatial.to_featureclass(location=listeners)

# Re-project the prison boundaries to WGS 1984 (EPSG:4326).
spatialReference = arcpy.SpatialReference(4326)  # GCS_WGS_1984
prisonloc = workspace + '/Prison_Boundaries/Prison_Boundaries.shp'
prisons = workspace + '/prisons_projected.shp'
arcpy.management.Project(prisonloc, prisons, spatialReference)

# Keep only listeners that fall within a prison polygon. The optional
# parameters that are not needed (field_mapping, search_radius,
# distance_field_name) are left at their defaults; the previous call passed
# documentation placeholders positionally after keyword arguments, which is
# a SyntaxError.
joined = workspace + '/joined.shp'
arcpy.analysis.SpatialJoin(
    listeners,
    prisons,
    joined,
    join_type='KEEP_COMMON',
    match_option='WITHIN',
)

`joined.shp` in the gis folder should contain the locations of any listeners from the period between 9/28 and 10/12 within incarceration facilities. You can take a look at the attribute table to see the properties for each. Here's a snippet (which may or may not work) to quickly check how many listeners are within incarceration facilities:

In [None]:
# `joined` is the path to the output shapefile (a string), so it has no
# `.indexes` attribute. Ask arcpy for the row count instead; the first
# output of GetCount is the count as a string.
int(arcpy.management.GetCount(joined)[0])