In [2]:
import os

In [3]:
import plotly.express as px
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.basemap import Basemap
import datetime as dt
import re
import tempfile

In [4]:
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, __version__

# Import People Groups Files (Currently from Azure)

In [5]:
try:
    connect_str = os.getenv('AZURE_STORAGE_CONNECTION_STRING')

    container_name="jp-snapshots"
    blob_service_client = BlobServiceClient.from_connection_string(connect_str)
    container_client = blob_service_client.get_container_client(container_name)

    # local_path = tempfile.gettempdir()

    local_path = os.getcwd()

    blob_list = container_client.list_blobs()
    for blob in blob_list: 
        blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob.name)
        download_file_path = os.path.join(local_path, blob.name)
        print("\nDownloading blob to \n\t" + download_file_path)
        with open(download_file_path, "wb") as download_file:
            download_file.write(blob_client.download_blob().readall())
except Exception as ex:
    print('Exception:')
    print(ex)


Downloading blob to 
	/Users/patricksaul/Documents/Projects/Python Projects/Unreached-Analysis-Engine/Notebooks/AllPeoplesByCountry_2020-06-29.csv

Downloading blob to 
	/Users/patricksaul/Documents/Projects/Python Projects/Unreached-Analysis-Engine/Notebooks/AllPeoplesByCountry_2020-07-28.csv

Downloading blob to 
	/Users/patricksaul/Documents/Projects/Python Projects/Unreached-Analysis-Engine/Notebooks/AllPeoplesByCountry_2020-12-03.csv

Downloading blob to 
	/Users/patricksaul/Documents/Projects/Python Projects/Unreached-Analysis-Engine/Notebooks/AllPeoplesByCountry_2020-12-05.csv

Downloading blob to 
	/Users/patricksaul/Documents/Projects/Python Projects/Unreached-Analysis-Engine/Notebooks/AllPeoplesByCountry_2020-12-11.csv

Downloading blob to 
	/Users/patricksaul/Documents/Projects/Python Projects/Unreached-Analysis-Engine/Notebooks/AllPeoplesByCountry_2020-12-18.csv

Downloading blob to 
	/Users/patricksaul/Documents/Projects/Python Projects/Unreached-Analysis-Engine/Notebooks

# Create Snapshots Over Time

In [6]:
def readJpSnapshots(directory):
    ssDict = {}
    for entry in os.scandir(directory):
        if entry.path.endswith(".csv"):
            fname = entry.name
            match = re.search('\d{4}-\d{2}-\d{2}', fname)
            date = dt.datetime.strptime(match.group(), '%Y-%m-%d').date()
            print(fname)
            ssDict[date] = pd.read_csv(fname,low_memory=False,skiprows=1)
    return ssDict

readDir = os.getcwd()
ssDict = readJpSnapshots(readDir)

AllPeoplesByCountry_2020-12-18.csv
AllPeoplesByCountry_2020-12-25.csv
AllPeoplesByCountry_2020-07-28.csv
AllPeoplesByCountry_2020-06-29.csv
AllPeoplesByCountry_2020-12-05.csv
AllPeoplesByCountry_2020-12-11.csv
AllPeoplesByCountry_2020-12-03.csv


In [20]:
#Cleaning & updating

numBins = 10
# binLabels = list(range(numBins))
# binLabels = [x + 1 for x in binLabels]
# print(binLabels)

for ss, data in ssDict.items():
    #Dropping non - people group related data at bottom of files
    ssDict[ss] = ssDict[ss].dropna(subset=['PeopleID3'])
    #Creating a unique ID for each group based on country and people ID
    ssDict[ss]['ID'] = data['ROG3'].str.cat(data['PeopleID3'].astype(str),sep='_')
    ssDict[ss] = ssDict[ss].set_index(['ID'])
    #Creating population bins based on quartiles  = used to display on map later
    ssDict[ss]['PopBin'] = pd.qcut(data['Population'], numBins, labels=False) + 1
    #Creating snapshot date
    ssDict[ss]['Snapshot Date'] = ss

In [21]:
ssDict[ss].head()

Unnamed: 0_level_0,ROG3,Ctry,PeopleID3,ROP3,PeopNameAcrossCountries,PeopNameInCountry,Population,JPScale,LeastReached,ROL3,...,ROG2,Continent,10_40Window,IndigenousCode,WorkersNeeded,Frontier,Latitude,Longitude,PopBin,Snapshot Date
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AF_14372.0,AF,Afghanistan,14372.0,107989.0,"Afghan, Tajik","Afghan, Tajik",10585000.0,1.0,Y,prs,...,ASI,Asia,Y,Y,212.0,Y,31.15621,62.14612,10.0,2020-12-03
AF_19409.0,AF,Afghanistan,19409.0,100096.0,Afshari,Afshari,15000.0,1.0,Y,azb,...,ASI,Asia,Y,N,1.0,Y,34.44796,69.28976,6.0,2020-12-03
AF_21454.0,AF,Afghanistan,21454.0,118270.0,Aimaq,Aimaq,1595000.0,1.0,Y,aiq,...,ASI,Asia,Y,Y,32.0,Y,35.00501,63.128594,10.0,2020-12-03
AF_15741.0,AF,Afghanistan,15741.0,110448.0,"Americans, U.S.","Americans, U.S.",10000.0,5.0,N,eng,...,ASI,Asia,Y,N,,N,34.528621,69.168549,5.0,2020-12-03
AF_16221.0,AF,Afghanistan,16221.0,111334.0,Ansari,Ansari,2400.0,1.0,Y,urd,...,ASI,Asia,Y,N,1.0,Y,34.54777,69.3198,3.0,2020-12-03


In [22]:
#Below logic used to create a change over time dataframe for all snapshots in the dictionary passed in

def createDelta(ssDict, fields):
    deltDf = pd.DataFrame()
    chgFields = [field + '_CHG' for field in fields]
    for ss, data in ssDict.items():
        for ss2, data2 in ssDict.items():
            if ss2 < ss:
                newDelt = data.copy()
                
                newDelt['Begin Snapshot'] = ss2
                newDelt = newDelt.rename(columns={'Snapshot Date' : 'End Snapshot'})

                newDelt[chgFields] = newDelt[fields].subtract(data2[fields])

                deltDf = deltDf.append(newDelt)
                
    return deltDf

#Specifying fields to be used for change calculations
deltFields = ['Population','PercentAdherents','PercentEvangelical','CountOfCountries','WorkersNeeded']
deltDf = createDelta(ssDict,deltFields)

# Map Visualization Prep

In [23]:
ur_plot = ssDict[pd.to_datetime('2020-12-18').date()].copy()
ur_plot = ur_plot[ur_plot['LeastReached'] == 'Y']
ur_plot = ur_plot.fillna(0)
ur_plot.head()

Unnamed: 0_level_0,ROG3,Ctry,PeopleID3,ROP3,PeopNameAcrossCountries,PeopNameInCountry,Population,JPScale,LeastReached,ROL3,...,ROG2,Continent,10_40Window,IndigenousCode,WorkersNeeded,Frontier,Latitude,Longitude,PopBin,Snapshot Date
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AF_14372.0,AF,Afghanistan,14372.0,107989.0,"Afghan, Tajik","Afghan, Tajik",10585000.0,1.0,Y,prs,...,ASI,Asia,Y,Y,212.0,Y,31.15621,62.14612,10.0,2020-12-18
AF_19409.0,AF,Afghanistan,19409.0,100096.0,Afshari,Afshari,15000.0,1.0,Y,azb,...,ASI,Asia,Y,N,1.0,Y,34.44796,69.28976,6.0,2020-12-18
AF_21454.0,AF,Afghanistan,21454.0,118270.0,Aimaq,Aimaq,1595000.0,1.0,Y,aiq,...,ASI,Asia,Y,Y,32.0,Y,35.00501,63.128594,10.0,2020-12-18
AF_16221.0,AF,Afghanistan,16221.0,111334.0,Ansari,Ansari,2400.0,1.0,Y,urd,...,ASI,Asia,Y,N,1.0,Y,34.54777,69.3198,3.0,2020-12-18
AF_15202.0,AF,Afghanistan,15202.0,109734.0,"Arab, Tajiki","Arab, Tajiki",18000.0,1.0,Y,abh,...,ASI,Asia,Y,N,1.0,N,36.9447,66.80422,6.0,2020-12-18


In [24]:
ur_plot.columns.to_list()

['ROG3',
 'Ctry',
 'PeopleID3',
 'ROP3',
 'PeopNameAcrossCountries',
 'PeopNameInCountry',
 'Population',
 'JPScale',
 'LeastReached',
 'ROL3',
 'PrimaryLanguageName',
 'BibleStatus',
 'RLG3',
 'PrimaryReligion',
 'PercentAdherents',
 'PercentEvangelical',
 'PeopleID1',
 'ROP1',
 'AffinityBloc',
 'PeopleID2',
 'ROP2',
 'PeopleCluster',
 'CountOfCountries',
 'RegionCode',
 'RegionName',
 'ROG2',
 'Continent',
 '10_40Window',
 'IndigenousCode',
 'WorkersNeeded',
 'Frontier',
 'Latitude',
 'Longitude',
 'PopBin',
 'Snapshot Date']

In [25]:
chg_plot = deltDf.copy()
chg_plot = chg_plot[chg_plot['LeastReached'] == 'Y']
chg_plot = chg_plot[chg_plot['Begin Snapshot'] == chg_plot['Begin Snapshot'].min()]
chg_plot = chg_plot[chg_plot['End Snapshot'] == chg_plot['End Snapshot'].max()]
chg_plot = chg_plot.fillna(0)
chg_plot = chg_plot[chg_plot['PercentEvangelical_CHG'] != 0]
chg_plot.head()

Unnamed: 0_level_0,ROG3,Ctry,PeopleID3,ROP3,PeopNameAcrossCountries,PeopNameInCountry,Population,JPScale,LeastReached,ROL3,...,Latitude,Longitude,PopBin,End Snapshot,Begin Snapshot,Population_CHG,PercentAdherents_CHG,PercentEvangelical_CHG,CountOfCountries_CHG,WorkersNeeded_CHG
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AG_10379.0,AG,Algeria,10379.0,100226.0,"Algerian, Arabic-speaking","Algerian, Arabic-speaking",29618000.0,1.0,Y,arq,...,35.997425,5.389439,10.0,2020-12-25,2020-06-29,789000.0,-1.491,-1.474,0.0,15.0
AG_12399.0,AG,Algeria,12399.0,104365.0,"Berber, Kabyle","Berber, Kabyle",6029000.0,1.0,Y,kab,...,36.39003,4.55784,10.0,2020-12-25,2020-06-29,160000.0,-2.75,-1.56,0.0,4.0
BR_20070.0,BR,Brazil,20070.0,115119.0,Karuazu,Karuazu,800.0,1.0,Y,por,...,-9.2972,-38.0336,2.0,2020-12-25,2020-06-29,0.0,-80.0,-20.0,0.0,0.0
BR_20100.0,BR,Brazil,20100.0,115636.0,Pipipa,Pipipa,1000.0,1.0,Y,por,...,-8.579,-38.0333,2.0,2020-12-25,2020-06-29,100.0,-96.0,-20.0,0.0,0.0
BR_15497.0,BR,Brazil,15497.0,110070.0,Tingui-Boto,Tingui-Boto,900.0,1.0,Y,por,...,-8.9263,-36.71777,2.0,2020-12-25,2020-06-29,600.0,-50.0,-8.0,0.0,0.0


## Unreached Map

In [29]:
#From https://plotly.com/python/scatter-plots-on-maps/
#Size by population, color by # evangelical

px.set_mapbox_access_token(open('mapbox_token.txt').read())
fig1 = px.scatter_mapbox(ur_plot, lat="Latitude",
                     lon = "Longitude",
                     color="PercentEvangelical", # which column to use to set the color of markers
                     hover_name="PeopNameInCountry", # column added to hover information
                     size="PopBin", # size of markers,
                     hover_data=['Population'],
                     size_max=10,
                     color_continuous_scale=px.colors.sequential.YlOrRd_r,
                     zoom=0.5)
# fig.update_layout(mapbox_style="open-street-map")
fig1.show()

## Change over time map

In [28]:
#From https://plotly.com/python/scatter-plots-on-maps/
#Size by population, color by % change evangelical
#Need to fix visual

px.set_mapbox_access_token(open('mapbox_token.txt').read())
fig2 = px.scatter_mapbox(chg_plot, lat="Latitude",
                     lon = "Longitude",
                     color="PercentEvangelical_CHG", # which column to use to set the color of markers
                     hover_name="PeopNameInCountry", # column added to hover information
                     size="PopBin", # size of markers
                     size_max=10,
                     hover_data=['Population','Begin Snapshot', 'End Snapshot'],
                     color_continuous_scale=px.colors.sequential.Bluered_r,
                     zoom=0.5)
# fig.update_layout(mapbox_style="open-street-map")
fig2.show()