## Create code that processes model output into nice visualizations:

1) The files with daily case, hospitalisation and death counts across the whole population per district (filename format output_workplaceBubblesSophie_) need to be averaged to give daily trajectories
2) The files with daily cases, deaths, and in occupation by occupation (occ4) with format output_workplaceBubblesSophie_1_Economic_Status_Covid need to be averaged for daily trajectories
3) The files with daily demographics of cases and deaths by sex and gender

In [19]:
# generic packages
import os
from os.path import isfile, join
import glob
import subprocess
import re

#dataframe packages
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# plot packages
import matplotlib as mpl
import matplotlib.pyplot as plt
import pylab as plt
import seaborn as sns
sns.set(style="darkgrid")

#maps
import geopandas as gpd
import plotly.express as px
import json


In [20]:

# Folder holding the raw model output files; it is assigned to folder_path and searched with glob further down.
# Both paths end in "/" because later code builds file names by plain string concatenation.
dist_input_path= "/Users/sophieayling/Library/CloudStorage/GoogleDrive-sophie2ayling@gmail.com/My Drive/PhD/06_Data and Modelling/thesis_data/model_output/e_perc_dist_covid/"
# Folder that the aggregated CSV files and the plots are written to.
dist_output_path= "/Users/sophieayling/Library/CloudStorage/GoogleDrive-sophie2ayling@gmail.com/My Drive/PhD/06_Data and Modelling/thesis_data/model_output/e_perc_dist_covid/plots/"

In [40]:
# Decide which scenario(s) to create graphics for.

# Folder that is searched for the raw model output files
folder_path = dist_input_path


# Scenarios handled by the loop in the next cell. Each pair is
# (prefix of the raw output file names, identifier used in the exported file names).

prefix_pairs = [
    ('output_workplaceBubblesSophie_', 'bubblesNorm'),
    ('output_perfectMixingSophie_', 'perfMix'),
    ('output_schoolToHomeSophie_', 'schoolToHome'),
    ('output_schoolToComSophie_', 'schoolToCom'),
    ('output_comWorkToHomeSophie_', 'comWorkersToHome'),
    ('output_workToHomeSophie_', 'workToHome'),
    ('output_allToHomeSophie_', 'allToHome'),
    # NOTE(review): this identifier ends in '_' (exports become 'bubblesLd__...csv'),
    # while the step-by-step section further down uses 'bubblesLd' - confirm which is intended.
    ('output_BubblesLd_', 'bubblesLd_'),
    ('output_BubblesLd1a_', 'bubblesLd_1a'),
    ('output_BubblesLd1b_', 'bubblesLd_1b'),
    ('output_BubblesLd2a_', 'bubblesLd_2a'),
    ('output_BubblesLd2b_', 'bubblesLd_2b'),
    ('output_BubblesLd3a_', 'bubblesLd_3a'),
    ('output_BubblesLd3b_', 'bubblesLd_3b')
]

In [41]:
# Aggregate the per-run model output for every scenario in prefix_pairs.
#
# For each (file_prefix, id_prefix) pair this writes to dist_output_path:
#   {id_prefix}_perc_w_covid_dist.csv  - run-averaged percentage with covid per day and district
#   {id_prefix}_tot_pop_dist.csv       - run-averaged population per day and district
#   {id_prefix}_comb_dist.csv          - the two tables above merged on (dist_no, day)
#   _top5_prev_dists_{id_prefix}.csv   - the 5 districts with the highest peak percentage


def _load_runs(pattern):
    """Read every tab-separated file matching *pattern* into one DataFrame.

    The run number is the third '_'-separated token of the file name
    (e.g. 'output_workplaceBubblesSophie_5_...' -> 5) and is stored in an
    integer 'run' column.

    Returns None when no file matches, so the caller can skip the scenario
    instead of failing inside pd.concat with "No objects to concatenate".
    """
    frames = []
    for path in glob.glob(pattern):
        frame = pd.read_csv(path, delimiter='\t')
        frame['run'] = int(os.path.basename(path).split('_')[2])
        frames.append(frame)
    if not frames:
        return None
    return pd.concat(frames, ignore_index=True)


# Columns d_1 ... d_60 hold one value per district
district_cols = [f'd_{i}' for i in range(1, 61)]

for file_prefix, id_prefix in prefix_pairs:
    covid_runs = _load_runs(f"{folder_path}/{file_prefix}*Percent_In_District_With_Covid.txt")
    pop_runs = _load_runs(f"{folder_path}/{file_prefix}*District_Level_Population_Size.txt")
    if covid_runs is None or pop_runs is None:
        print(f"Skipping {id_prefix}: no output files found for prefix {file_prefix!r}")
        continue

    # --- percentage of each district with covid ---
    # wide -> long: one row per (day, run, district)
    df_melted = pd.melt(covid_runs, id_vars=['day', 'run'], value_vars=district_cols,
                        var_name='district', value_name='perc_w_covid')
    # district number as text: the column name without its 'd_' prefix
    df_melted['dist_no'] = df_melted['district'].str.replace('d_', '', regex=False)
    # average over runs, keeping days and districts separate
    r_data = df_melted.groupby(['day', 'dist_no'])['perc_w_covid'].mean().reset_index()
    # round the averaged percentage to 3 decimal places
    r_data['perc_w_covid'] = r_data['perc_w_covid'].round(3)
    # make the district numbers numeric so they sort as 1, 2, 3 ... rather than '1', '10', '11' ...
    r_data['dist_no'] = pd.to_numeric(r_data['dist_no'])
    r_data = r_data.sort_values(by=['day', 'dist_no'])
    r_data.to_csv(dist_output_path + f'{id_prefix}_perc_w_covid_dist.csv')

    # --- district population ---
    df_melted_tot = pd.melt(pop_runs, id_vars=['day'], value_vars=district_cols,
                            var_name='district', value_name='dist_pop')
    df_melted_tot['dist_no'] = pd.to_numeric(
        df_melted_tot['district'].str.replace('d_', '', regex=False)
    )
    # average over runs, then round to whole people
    r_data_tot = df_melted_tot.groupby(['day', 'dist_no'])['dist_pop'].mean().reset_index()
    r_data_tot['dist_pop'] = r_data_tot['dist_pop'].round(0)
    r_data_tot.to_csv(dist_output_path + f'{id_prefix}_tot_pop_dist.csv')

    # --- combine the two datasets ---
    r_data_comb = r_data.merge(r_data_tot, on=['dist_no', 'day'])
    r_data_comb.to_csv(dist_output_path + f'{id_prefix}_comb_dist.csv')

    # --- top 5 districts by peak percentage with covid ---
    prev_per_dist = r_data_comb.groupby('dist_no', as_index=False)['perc_w_covid'].max()
    r_data_t5_prev = prev_per_dist.nlargest(5, 'perc_w_covid')
    r_data_t5_prev.to_csv(dist_output_path + f'_top5_prev_dists_{id_prefix}.csv')

# Show the top-5 table of the last scenario that was processed
r_data_t5_prev.head()

Unnamed: 0,dist_no,perc_w_covid
1,2,0.659
0,1,0.65
55,56,0.645
58,59,0.637
35,36,0.634


In [None]:
# NOTE(review): 'stop' is never defined, so executing this cell raises NameError.
# Presumably a deliberate brake so that "Run All" halts here - confirm.
stop

In [None]:
## Below is a step-by-step breakdown of the loop above; it is only needed again if something breaks :)

# Manual scenario selection for the step-by-step cells that follow.
# Every assignment overwrites the previous one, so as written only the LAST pair
# ('output_BubblesLd3b_' / 'bubblesLd_3b') is in effect once this cell has run.
# Comment out or reorder the pairs to work on a different scenario.

file_prefix = 'output_workplaceBubblesSophie_'
id_prefix='bubblesNorm'

file_prefix = 'output_perfectMixingSophie_'
id_prefix="perfMix"

file_prefix ='output_schoolToHomeSophie_'
id_prefix = "schoolToHome"

file_prefix ='output_schoolToComSophie_'
id_prefix = "schoolToCom"

file_prefix ='output_comWorkToHomeSophie_'
id_prefix = "comWorkersToHome"

file_prefix = 'output_workToHomeSophie_'
id_prefix = 'workToHome'

file_prefix = 'output_allToHomeSophie_'
id_prefix = 'allToHome'

# mobility scenarios set 

# NOTE(review): prefix_pairs uses 'bubblesLd_' (trailing underscore) for this scenario - confirm which is intended.
file_prefix = 'output_BubblesLd_'
id_prefix = 'bubblesLd'

# NOTE(review): this scenario has no entry in prefix_pairs, so the loop never processes it.
file_prefix = 'output_BubblesLdv2_'
id_prefix = 'bubblesLdv2'

file_prefix = 'output_BubblesLd1a_'
id_prefix = 'bubblesLd_1a'

file_prefix = 'output_BubblesLd1b_'
id_prefix = 'bubblesLd_1b'

file_prefix = 'output_BubblesLd2a_'
id_prefix = 'bubblesLd_2a'

file_prefix = 'output_BubblesLd2b_'
id_prefix = 'bubblesLd_2b'

file_prefix = 'output_BubblesLd3a_'
id_prefix = 'bubblesLd_3a'

file_prefix = 'output_BubblesLd3b_'
id_prefix = 'bubblesLd_3b'

## 3. Percentage of covid cases per district- maps

In [23]:


# Container for the per-run DataFrames; it is filled when the files are read
df_list = []

# Every "percent with covid" output file belonging to the selected scenario
file_pattern = "{}/{}*Percent_In_District_With_Covid.txt".format(folder_path, file_prefix)
file_list = glob.glob(file_pattern)

In [24]:


# Read each matched file into its own DataFrame and tag it with its run number
for file in file_list:
    df = pd.read_csv(file, delimiter='\t')  # the model output is tab-separated
    # run number = third '_'-separated token of the file name,
    # e.g. 'output_workplaceBubblesSophie_5_...' -> 5
    df['run'] = int(os.path.basename(file).split('_')[2])
    df_list.append(df)

# pd.concat fails with an opaque "No objects to concatenate" on an empty list,
# so report the actual cause (nothing matched the glob pattern) instead.
if not df_list:
    raise ValueError(f"No files matched {file_pattern!r}; check folder_path and file_prefix")

# Stack all runs into a single DataFrame ('run' is already an integer column)
final_df = pd.concat(df_list, ignore_index=True)
# Display the resulting DataFrame
final_df.head()

Unnamed: 0,day,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,...,d_53,d_54,d_55,d_56,d_57,d_58,d_59,d_60,Unnamed: 61,run
0,0,0.000397,8.5e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,5
1,1,0.000611,0.000104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,5
2,2,0.000856,0.000161,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,5
3,3,0.000917,0.000217,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,5
4,4,0.000978,0.000236,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,5


In [25]:
# Reshape from wide (one column per district) to long (one row per day / run / district)
district_columns = ['d_{}'.format(n) for n in range(1, 61)]
df_melted = final_df.melt(
    id_vars=['day', 'run'],
    value_vars=district_columns,
    var_name='district',
    value_name='perc_w_covid',
)
# District number as text: the column name without its 'd_' prefix
df_melted['dist_no'] = df_melted['district'].str.replace('d_', '')

# NOTE: these are not cumulative percentages, they still need to be converted
df_melted.head()

Unnamed: 0,day,run,district,perc_w_covid,dist_no
0,0,5,d_1,0.000397,1
1,1,5,d_1,0.000611,1
2,2,5,d_1,0.000856,1
3,3,5,d_1,0.000917,1
4,4,5,d_1,0.000978,1


In [26]:
# Average over runs for every (day, district); districts are kept independent
r_data = df_melted.groupby(['day', 'dist_no'])['perc_w_covid'].mean().reset_index()
# Round the averaged percentage to 3 decimal places
r_data['perc_w_covid'] = r_data['perc_w_covid'].round(3)

# dist_no is still text here; make it numeric so the sort below orders districts
# as 1, 2, 3 ... rather than '1', '10', '11' ...
r_data['dist_no'] = pd.to_numeric(r_data['dist_no'])
r_data = r_data.sort_values(by=['day', 'dist_no'])

r_data.to_csv(dist_output_path + f'{id_prefix}_perc_w_covid_dist.csv')
# (the former bare `r_data.sort_values(by='dist_no')` was dropped: its result was discarded)
r_data.head()

Unnamed: 0,day,dist_no,perc_w_covid
0,0,1,0.0
11,0,2,0.0
22,0,3,0.0
33,0,4,0.0
44,0,5,0.0


### import district population files to make prevalence per 100,000 figures 

In [27]:
## Import the district total population files

# Fresh container for the per-run population DataFrames
df_list = []

# Every district-level population output file belonging to the selected scenario
file_pattern = "{}/{}*District_Level_Population_Size.txt".format(folder_path, file_prefix)
file_list = glob.glob(file_pattern)

In [28]:
# Read each matched population file into its own DataFrame and tag it with its run number
for file in file_list:
    df = pd.read_csv(file, delimiter='\t')  # the model output is tab-separated
    # run number = third '_'-separated token of the file name,
    # e.g. 'output_workplaceBubblesSophie_8_...' -> 8
    df['run'] = int(os.path.basename(file).split('_')[2])
    df_list.append(df)

# pd.concat fails with an opaque "No objects to concatenate" on an empty list,
# so report the actual cause (nothing matched the glob pattern) instead.
if not df_list:
    raise ValueError(f"No files matched {file_pattern!r}; check folder_path and file_prefix")

# Stack all runs into a single DataFrame ('run' is already an integer column)
final_df = pd.concat(df_list, ignore_index=True)
# Display the resulting DataFrame
final_df.head()

Unnamed: 0,day,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,...,d_53,d_54,d_55,d_56,d_57,d_58,d_59,d_60,Unnamed: 61,run
0,0,32721,105869,12460,6782,16182,14984,22775,8417,6385,...,3926,11964,16448,12421,15499,9320,4886,5754,,8
1,1,32721,105869,12460,6782,16182,14984,22775,8417,6385,...,3926,11964,16448,12421,15499,9320,4886,5754,,8
2,2,32721,105869,12460,6782,16182,14984,22775,8417,6385,...,3926,11964,16448,12421,15499,9320,4886,5754,,8
3,3,32720,105869,12460,6782,16182,14984,22775,8417,6385,...,3926,11964,16448,12421,15499,9320,4886,5754,,8
4,4,32720,105869,12459,6782,16182,14984,22774,8417,6385,...,3926,11964,16448,12421,15499,9320,4886,5754,,8


In [29]:
# Reshape the population table from wide to long: one row per day / district
# (the run column is not carried over)
df_melted_tot = final_df.melt(
    id_vars=['day'],
    value_vars=['d_{}'.format(n) for n in range(1, 61)],
    var_name='district',
    value_name='dist_pop',
)

# Numeric district number: the column name without its 'd_' prefix
district_labels = df_melted_tot['district'].str.replace('d_', '')
df_melted_tot['dist_no'] = pd.to_numeric(district_labels)
df_melted_tot.head()

Unnamed: 0,day,district,dist_pop,dist_no
0,0,d_1,32721,1
1,1,d_1,32721,1
2,2,d_1,32721,1
3,3,d_1,32720,1
4,4,d_1,32720,1


In [30]:
# Mean population for every (day, district) across all rows read, rounded to whole people
pop_by_day_dist = df_melted_tot.groupby(['day', 'dist_no'])['dist_pop']
r_data_tot = pop_by_day_dist.mean().round(0).reset_index()

r_data_tot.to_csv(f'{dist_output_path}{id_prefix}_tot_pop_dist.csv')


In [31]:
# Join prevalence and population on district and day
r_data_comb = pd.merge(r_data, r_data_tot, on=['dist_no', 'day'])
r_data_comb.to_csv(f'{dist_output_path}{id_prefix}_comb_dist.csv')

r_data_comb.head()


Unnamed: 0,day,dist_no,perc_w_covid,dist_pop
0,0,1,0.0,32721.0
1,0,2,0.0,105869.0
2,0,3,0.0,12460.0
3,0,4,0.0,6782.0
4,0,5,0.0,16182.0


In [32]:
# Peak percentage with covid reached by each district over the whole simulation
prev_per_dist = r_data_comb.groupby('dist_no', as_index=False)['perc_w_covid'].max()

# The five districts with the highest peak
r_data_t5_prev = prev_per_dist.nlargest(n=5, columns='perc_w_covid')

r_data_t5_prev.to_csv(f'{dist_output_path}_top5_prev_dists_{id_prefix}.csv')
r_data_t5_prev.head()

Unnamed: 0,dist_no,perc_w_covid
1,2,0.659
0,1,0.65
55,56,0.645
58,59,0.637
35,36,0.634


In [33]:
# NOTE(review): 'stop' is never defined, so executing this cell raises NameError.
# Presumably a deliberate brake so that "Run All" does not continue into the plotting cells - confirm.
stop

NameError: name 'stop' is not defined

In [None]:
# District-level prevalence comparison: one line per district over time

# One colour per district actually present in the data. A list palette whose
# length differs from the number of hue levels makes seaborn raise or warn
# (depending on the version), so size it from the data rather than hard-coding 60.
n_districts = r_data_comb['dist_no'].nunique()
palette = sns.color_palette("husl", n_districts)

# Plotting the data
plt.figure(figsize=(10, 5))
sns.lineplot(data=r_data_comb, x="day", y="perc_w_covid", hue="dist_no", palette=palette)

# Adding titles and labels
plt.title(f'{id_prefix}_Prevalence Over Time by District', size=18)
plt.xlabel('Day')
plt.ylabel('Prevalence of COVID cases')
# Legend sits outside the axes on the right, in 3 columns
plt.legend(title='District', loc='upper left', bbox_to_anchor=(1, 1), ncol=3, prop={'size': 8})
plt.grid(True)
plt.subplots_adjust(right=0.7)  # shrink the axes to make room for the legend

# Export the plot
plt.savefig(dist_output_path + f'{id_prefix}case_prevalence_over_time_district.png', dpi=300)

## The graph is of little use: it shows nearly the same value for all districts,
## so no meaningful difference between them is detectable from this visual.

## Prevalence map at end of simulation

In [None]:
# Load the district shapefile that the model results will be mapped onto
map_input_path = "/Users/sophieayling/Library/CloudStorage/GoogleDrive-sophie2ayling@gmail.com/My Drive/PhD/06_Data and Modelling/thesis_data/shapefiles/60_districts/"
dist_shape = "ZWE_adm2.shp"
dists = gpd.read_file(f"{map_input_path}{dist_shape}")

# Copy ID_2 into 'dist_no', the key the later merge with the model output uses
# (assumes ID_2 follows the same district numbering as the model - TODO confirm)
dists['dist_no'] = dists['ID_2']

# Quick look at the raw district outlines
dists.plot()
plt.show()

In [None]:
# Running total of the daily percentage within each district (rows are summed in their current order)
per_district = r_data_comb.groupby(['dist_no'])['perc_w_covid']
r_data_comb['cum_prevalence'] = per_district.cumsum()
r_data_comb.to_csv(f'{dist_output_path}{id_prefix}_cum_prev_data_dist.csv')

# Keep only day 99 (hard-coded; presumably the final simulated day - confirm)
is_day_99 = r_data_comb['day'] == 99
r_data_cum = r_data_comb[is_day_99]

r_data_cum.head()

In [None]:
# Attach the day-99 figures to the district shapes via the shared 'dist_no' key
merged_gdf = dists.merge(r_data_cum, on='dist_no')

# Expose the shapefile's NAME_2 column under the shorter name 'name'
merged_gdf = merged_gdf.rename(columns={'NAME_2': 'name'})

merged_gdf.info()

In [None]:
import matplotlib.pyplot as plt
import geopandas as gpd

# Choropleth of the cumulative prevalence per district, drawn from merged_gdf

# Colour map, given by name
cmap = 'magma_r'

# Single-axes figure
fig, ax = plt.subplots(figsize=(10, 8))

# Colour each district by 'cum_prevalence'; the colour scale is fixed to 8.5 - 12
plot_options = dict(
    column='cum_prevalence',
    cmap=cmap,
    linewidth=0.8,
    ax=ax,
    edgecolor='0.8',
    legend=True,
    vmin=8.5,
    vmax=12,
)
merged_gdf.plot(**plot_options)

# Title
plt.title(f'{id_prefix}_Perc Covid by District', fontsize=18)

# Export, then display
plt.savefig(f'{dist_output_path}{id_prefix}dist_perc_cases.png', dpi=300)
plt.show()

In [None]:
# # use plotly to make an interactive map instead

# import plotly.express as px


# # Convert the GeoDataFrame back to a geographic CRS (EPSG:4326) for Plotly
# gdf= merged_gdf.to_crs(epsg=4326)

# # covert geodataframe to geojson
# geojson_data= json.loads(gdf.to_json())

# # Coordinates for Zimbabwe's approximate center
# zimbabwe_center = {"lat": -19.015438, "lon": 29.154857}
    
# # Create the choropleth map using Plotly
# fig = px.choropleth_mapbox(
#     gdf,
#     geojson=geojson_data,
#     locations='dist_no',  # Column in merged_gdf to match with the GeoJSON
#     featureidkey="properties.dist_no",  # Property in GeoJSON to match locations - if you don't do this it gets super confused, this command is essential
#     color='tot_cases',  # Column to use for color
#     color_continuous_scale="magma_r",
#     center=zimbabwe_center,
#     mapbox_style="carto-positron",
#     zoom=5,
#     title='Total Cases by District', 
#     hover_data={'dist_no': True, 'name': True, 'tot_cases': True}  # Add district name and other relevant data to hover info

# )

# # Update the layout to set the color scale and other properties
# fig.update_layout(
#     coloraxis_colorbar=dict(
#         title="Total Cases",
#         ticks="outside"
#     ),
#     margin={"r":0,"t":0,"l":0,"b":0}
# )

# # Show the map
# fig.show()



