### Import libraries

In [1]:
# Libraries for data processing and math 
import pandas as pd
import numpy as np

# Library for file path manipulation 
import os

# Library for romanizing Chinese characters (Hanzi to pinyin)
import pinyin

# Library for simplifying function calls 
from functools import partial

### Set data paths

In [2]:
# The data folder is expected at <parent of the current working directory>/data
root = os.path.dirname(os.getcwd())
data_dir = os.path.join(root, 'data')

# Cleaned snapshot data set
snapshot_path = os.path.join(data_dir, 'cleaned_snapshot_data.csv')

# Cleaned per-cluster snapshot data sets (clusters 1, 2 and 3, in that order)
cluster1_path, cluster2_path, cluster3_path = (
    os.path.join(data_dir, file_name)
    for file_name in ('cluster1_snapshot.csv',
                      'cluster2_snapshot.csv',
                      'cluster3_snapshot.csv')
)

# 3-day moving average time series: folder first, then the file inside it
time_series_3_day_dir = os.path.join(data_dir, 'time_series', '3_day_moving_average')
time_series_path = os.path.join(time_series_3_day_dir, 'df_m3.csv')

### Create Summary Statistics Table for Snapshot data

In [3]:
# Read snapshots data set
snapshots = pd.read_csv(snapshot_path)

# Map raw column codes to the row labels shown in the summary table.
# Only columns listed here are kept; every other column is dropped below.
# NOTE(review): the 'GDPpc' label says "per capita" but gives units
# "per km^2" -- confirm the intended unit before publishing the table.
column_name_map = {
                   'POP':'Population (in thousands)',
                   'Area':'City area (in km^2)',
                   'POPDENS':'Population Density (people per km^2)',
                   'GDP':'GDP (Billions USD)',
                   'PRIM':'Primary sector (Billions USD)',
                   'SEC':'Secondary sector (Billions USD)',
                   'TERT':'Tertiary sector (Billions USD)',
                   'Prim.':'Primary sector % of GDP',
                   'Sec.':'Secondary sector % of GDP',
                   'Tert.':'Tertiary sector % of GDP',
                   'GDPpc':'GDP per capita (Billions USD per km^2)',
                   'X.60yr.':'Elderly population %',
                   'BED':'Hospital Beds (per thousand people)',
                   'DOC':'Registered doctors (per thousand)',
                   'NRS':'Registered nurses (per thousand)',
                   'TVLR':'Wuhan travellers (thousands)',
                   'TVLR.':'Wuhan travellers (per thousand pop.)',
                   'ACTV':'Average degree of activeness (0-8)'
                  }

# Keep only the mapped features (original column order) and relabel them
is_mapped = snapshots.columns.isin(list(column_name_map))
snapshots_general = snapshots.loc[:, is_mapped].rename(columns=column_name_map)

# Quartile helpers. They are no longer needed by the computation below but
# stay defined for any other cell that still references these names.
first_quartile = partial(np.percentile, q=25, axis=0)
third_quartile = partial(np.percentile, q=75, axis=0)

# Compute summary statistics: one row per feature, one column per statistic.
# The pandas reducers are called directly instead of passing NumPy callables
# to `apply`: in a list, pandas silently swaps np.mean/np.std/... for its own
# methods (a substitution that recent pandas versions deprecate), and
# `result_type` has no effect when axis=0. Calling the methods explicitly
# pins the behaviour: missing values are skipped, SD is the sample standard
# deviation (ddof=1), and quantiles use linear interpolation (the same
# default as np.percentile).
summary_stats_snapshot = pd.DataFrame({
    'Mean': snapshots_general.mean(),
    'SD': snapshots_general.std(),
    'Min': snapshots_general.min(),
    '25th %ile': snapshots_general.quantile(0.25),
    'Median': snapshots_general.median(),
    '75th %ile': snapshots_general.quantile(0.75),
    'Max': snapshots_general.max(),
})

# Add the IQR from the unrounded quartiles, then round once at the end so the
# IQR does not inherit rounding error from both quartiles.
summary_stats_snapshot['IQR'] = (summary_stats_snapshot['75th %ile']
                                 - summary_stats_snapshot['25th %ile'])
summary_stats_snapshot = summary_stats_snapshot.round(2)

# Display summary table
summary_stats_snapshot

Unnamed: 0,Mean,SD,Min,25th %ile,Median,75th %ile,Max,IQR
Population (in thousands),5624.67,4029.79,720.96,3176.92,4666.55,7181.67,31243.2,4004.75
GDP (Billions USD),66.02,81.53,5.13,23.14,39.94,72.06,552.18,48.92
Primary sector (Billions USD),3.54,2.53,0.17,1.93,3.13,4.65,22.45,2.72
Secondary sector (Billions USD),25.8,26.94,1.89,10.2,16.21,29.99,151.89,19.79
Tertiary sector (Billions USD),36.68,57.06,2.85,11.53,19.1,35.68,427.53,24.15
Elderly population %,19.51,4.51,4.92,17.13,19.69,22.48,32.2,5.35
Hospital Beds (per thousand people),6.22,1.22,3.82,5.43,6.1,6.9,9.67,1.47
Registered doctors (per thousand),2.81,0.76,1.32,2.29,2.73,3.14,5.76,0.85
Registered nurses (per thousand),3.19,1.01,1.27,2.51,3.03,3.6,6.72,1.09
City area (in km^2),11733.64,9080.77,1459.0,6339.5,10238.0,14288.5,82402.0,7949.0


### Create Summary Statistics Table for 3-day moving average time-series data

In [4]:
# Load the 3-day moving average time-series data set
time_series = pd.read_csv(time_series_path)

# Row labels for the summary table, keyed by raw column name.
# Columns that are not listed here are dropped later in this cell.
column_name_map_2 = dict([
    ('PM2.5', 'PM2.5 (μg/m3)'),
    ('PM10', 'PM10 (μg/m3)'),
    ('SO2', 'SO2 (μg/m3)'),
    ('CO', 'CO (mg/m3)'),
    ('NO2', 'NO2 (μg/m3)'),
    ('O3', 'O3 (μg/m3)'),
    ('HUM', 'Relative humidity (%)'),
    ('PRES', 'Atmospheric pressure (hpa)'),
    ('WSPD', 'Wind speed (m/s)'),
    ('TEMP', 'Average air temperature'),
    ('ACTV', 'Degree of activeness'),
    ('Case', 'New confirmed cases'),
    ('MORB%', 'Morbidity rate'),
])

# Define utility functions for romanizing Chinese text (Hanzi to pinyin)
def eng_translator(phrase_list):
    """Map each phrase to its capitalised, space-free pinyin romanisation.

    Despite the name, nothing is translated: every phrase is passed to
    ``pinyin.get(phrase, format="strip", delimiter=" ")``, the spaces are
    removed from the result, and its first character is upper-cased.

    Parameters
    ----------
    phrase_list : iterable of str
        Phrases to romanise. Each phrase becomes a key of the result, so
        duplicates collapse to a single entry.

    Returns
    -------
    dict
        ``{phrase: romanised phrase}``; an empty dict for empty input.
    """
    translation_map = {}
    for phrase in phrase_list:
        romanised = pinyin.get(phrase, format="strip", delimiter=" ")
        romanised = romanised.replace(' ', '')
        # Slice rather than index the first character: an empty romanisation
        # (e.g. from an empty phrase) then maps to '' instead of raising
        # IndexError.
        translation_map[phrase] = romanised[:1].upper() + romanised[1:]
    return translation_map

def translate_column_to_eng(df, col):
    """Return column ``col`` of ``df`` with every value romanised.

    The distinct non-missing values of the column are romanised once via
    ``eng_translator`` and the resulting lookup is mapped over the column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is not modified.
    col : str
        Name of the column to convert.

    Returns
    -------
    pandas.Series
        Same index as ``df[col]``. Missing values stay missing, because they
        have no entry in the lookup.
    """
    # Drop missing values before de-duplicating: np.unique sorts its input
    # and raises TypeError when an object column mixes strings with NaN/None.
    unique_phrases = df[col].dropna().unique().tolist()
    translation_map = eng_translator(unique_phrases)
    return df[col].map(translation_map)

# Disabled: naming the first two columns 'Date'/'City' and romanising the
# city names. The filter below keeps only the keys of column_name_map_2, so
# both columns would be dropped before the summary is computed anyway.
# time_series.columns.values[0] = 'Date'
# time_series.columns.values[1] = 'City'
# time_series['City'] = translate_column_to_eng(time_series, 'City')

# Keep only the mapped features (original column order) and relabel them
is_mapped = time_series.columns.isin(list(column_name_map_2))
time_series = time_series.loc[:, is_mapped].rename(columns=column_name_map_2)

# Compute summary statistics: one row per feature, one column per statistic.
# The pandas reducers are called directly instead of passing NumPy callables
# to `apply`: in a list, pandas silently swaps np.mean/np.std/... for its own
# methods (a substitution that recent pandas versions deprecate), and
# `result_type` has no effect when axis=0. This also removes the dependency
# on quartile helpers defined in another cell. Missing values are skipped,
# SD is the sample standard deviation (ddof=1), and quantiles use linear
# interpolation (the same default as np.percentile).
summary_stats_time_series = pd.DataFrame({
    'Mean': time_series.mean(),
    'SD': time_series.std(),
    'Min': time_series.min(),
    '25th %ile': time_series.quantile(0.25),
    'Median': time_series.median(),
    '75th %ile': time_series.quantile(0.75),
    'Max': time_series.max(),
})

# Add the IQR from the unrounded quartiles, then round once at the end so the
# IQR does not inherit rounding error from both quartiles.
summary_stats_time_series['IQR'] = (summary_stats_time_series['75th %ile']
                                    - summary_stats_time_series['25th %ile'])
summary_stats_time_series = summary_stats_time_series.round(2)

# Display summary table
summary_stats_time_series

Unnamed: 0,Mean,SD,Min,25th %ile,Median,75th %ile,Max,IQR
PM2.5 (μg/m3),46.67,31.34,3.67,27.33,39.67,55.67,349.0,28.34
PM10 (μg/m3),70.27,38.73,6.33,42.67,63.67,89.33,378.0,46.66
SO2 (μg/m3),10.33,7.41,1.67,6.0,8.0,12.33,92.0,6.33
CO (mg/m3),0.81,0.35,0.2,0.6,0.73,0.93,4.5,0.33
NO2 (μg/m3),25.26,11.17,2.67,16.67,24.0,32.0,87.0,15.33
O3 (μg/m3),83.82,22.06,5.0,69.0,83.33,97.67,166.67,28.67
Relative humidity (%),71.23,18.2,8.0,60.33,74.67,85.33,100.0,25.0
Atmospheric pressure (hpa),991.77,50.3,644.33,984.0,1011.0,1018.67,1035.33,34.67
Wind speed (m/s),2.23,1.31,0.1,1.4,1.9,2.97,11.47,1.57
Average air temperature,8.98,6.34,-22.0,5.0,9.17,13.17,27.67,8.17


### Create Summary Statistics Table for Snapshot data by Cluster

In [5]:
# Read cluster snapshots data set
cluster1 = pd.read_csv(cluster1_path)
cluster2 = pd.read_csv(cluster2_path)
cluster3 = pd.read_csv(cluster3_path)


def _general_features(cluster_df):
    """Keep only the columns named in column_name_map and apply its labels.

    Reuses the column_name_map defined in the snapshot summary cell, so the
    cluster table and the overall snapshot table always share one set of
    row labels.
    """
    is_mapped = cluster_df.columns.isin(list(column_name_map))
    return cluster_df.loc[:, is_mapped].rename(columns=column_name_map)


# Drop unnecessary columns
cluster1_general = _general_features(cluster1)
cluster2_general = _general_features(cluster2)
cluster3_general = _general_features(cluster3)

# Compute cluster-wise feature averages: one row per feature and a single
# 'mean' column (the same layout the previous apply([np.mean]).T produced).
summary_stats_cluster1 = cluster1_general.mean().to_frame('mean').round(2)
summary_stats_cluster2 = cluster2_general.mean().to_frame('mean').round(2)
summary_stats_cluster3 = cluster3_general.mean().to_frame('mean').round(2)

# Concatenate results together, one column per cluster
mean_feature_by_cluster = pd.concat(
    [summary_stats_cluster1, summary_stats_cluster2, summary_stats_cluster3],
    axis=1)

# Label the cluster columns by assigning a new column index. Writing into
# `columns.values` mutates an Index that pandas treats as immutable, which is
# not reliable across pandas versions.
mean_feature_by_cluster.columns = ['Cluster 1', 'Cluster 2', 'Cluster 3']

# Display summary table
mean_feature_by_cluster

Unnamed: 0,Cluster 1,Cluster 2,Cluster 3
Population (in thousands),19019.47,6266.86,4620.88
GDP (Billions USD),380.27,97.45,36.97
Primary sector (Billions USD),5.9,2.92,3.61
Secondary sector (Billions USD),117.5,39.16,15.92
Tertiary sector (Billions USD),256.87,55.37,17.44
Elderly population %,17.46,16.54,20.62
Hospital Beds (per thousand people),6.37,6.73,6.04
Registered doctors (per thousand),3.49,3.57,2.51
Registered nurses (per thousand),4.3,4.32,2.74
City area (in km^2),19653.14,10440.7,11702.39


### Notes
- Add `\usepackage{booktabs}` to the LaTeX preamble before using the generated tables; they rely on booktabs rules such as `\toprule` and `\midrule`.

### Generate $\LaTeX$ code from dataframe

In [6]:
# Render the snapshot summary table as LaTeX source (row labels in bold)
snapshot_latex = summary_stats_snapshot.to_latex(
    index=True,
    multirow=True,
    bold_rows=True,
)
print(snapshot_latex)

\begin{tabular}{lrrrrrrrr}
\toprule
{} &      Mean &       SD &      Min &  25th \%ile &    Median &  75th \%ile &       Max &      IQR \\
\midrule
\textbf{Population (in thousands)             } &   5624.67 &  4029.79 &   720.96 &    3176.92 &   4666.55 &    7181.67 &  31243.20 &  4004.75 \\
\textbf{GDP (Billions USD)                    } &     66.02 &    81.53 &     5.13 &      23.14 &     39.94 &      72.06 &    552.18 &    48.92 \\
\textbf{Primary sector (Billions USD)         } &      3.54 &     2.53 &     0.17 &       1.93 &      3.13 &       4.65 &     22.45 &     2.72 \\
\textbf{Secondary sector (Billions USD)       } &     25.80 &    26.94 &     1.89 &      10.20 &     16.21 &      29.99 &    151.89 &    19.79 \\
\textbf{Tertiary sector (Billions USD)        } &     36.68 &    57.06 &     2.85 &      11.53 &     19.10 &      35.68 &    427.53 &    24.15 \\
\textbf{Elderly population \%                  } &     19.51 &     4.51 &     4.92 &      17.13 &     19.69 &      22.48 &

In [7]:
# Render the time-series summary table as LaTeX source (row labels in bold)
time_series_latex = summary_stats_time_series.to_latex(
    index=True,
    multirow=True,
    bold_rows=True,
)
print(time_series_latex)

\begin{tabular}{lrrrrrrrr}
\toprule
{} &    Mean &     SD &     Min &  25th \%ile &   Median &  75th \%ile &      Max &    IQR \\
\midrule
\textbf{PM2.5 (μg/m3)             } &   46.67 &  31.34 &    3.67 &      27.33 &    39.67 &      55.67 &   349.00 &  28.34 \\
\textbf{PM10 (μg/m3)              } &   70.27 &  38.73 &    6.33 &      42.67 &    63.67 &      89.33 &   378.00 &  46.66 \\
\textbf{SO2 (μg/m3)               } &   10.33 &   7.41 &    1.67 &       6.00 &     8.00 &      12.33 &    92.00 &   6.33 \\
\textbf{CO (mg/m3)                } &    0.81 &   0.35 &    0.20 &       0.60 &     0.73 &       0.93 &     4.50 &   0.33 \\
\textbf{NO2 (μg/m3)               } &   25.26 &  11.17 &    2.67 &      16.67 &    24.00 &      32.00 &    87.00 &  15.33 \\
\textbf{O3 (μg/m3)                } &   83.82 &  22.06 &    5.00 &      69.00 &    83.33 &      97.67 &   166.67 &  28.67 \\
\textbf{Relative humidity (\%)     } &   71.23 &  18.20 &    8.00 &      60.33 &    74.67 &      85.33 &   100.

In [8]:
# Render the per-cluster mean table as LaTeX source (row labels in bold)
cluster_latex = mean_feature_by_cluster.to_latex(
    index=True,
    multirow=True,
    bold_rows=True,
)
print(cluster_latex)

\begin{tabular}{lrrr}
\toprule
{} &  Cluster 1 &  Cluster 2 &  Cluster 3 \\
\midrule
\textbf{Population (in thousands)             } &   19019.47 &    6266.86 &    4620.88 \\
\textbf{GDP (Billions USD)                    } &     380.27 &      97.45 &      36.97 \\
\textbf{Primary sector (Billions USD)         } &       5.90 &       2.92 &       3.61 \\
\textbf{Secondary sector (Billions USD)       } &     117.50 &      39.16 &      15.92 \\
\textbf{Tertiary sector (Billions USD)        } &     256.87 &      55.37 &      17.44 \\
\textbf{Elderly population \%                  } &      17.46 &      16.54 &      20.62 \\
\textbf{Hospital Beds (per thousand people)   } &       6.37 &       6.73 &       6.04 \\
\textbf{Registered doctors (per thousand)     } &       3.49 &       3.57 &       2.51 \\
\textbf{Registered nurses (per thousand)      } &       4.30 &       4.32 &       2.74 \\
\textbf{City area (in km\textasciicircum 2)                   } &   19653.14 &   10440.70 &   11702.39 \