### Import libraries

In [None]:
# Libraries for data processing and math 
import pandas as pd
import numpy as np

# Library for file path manipulation 
import os

# Library for Mandarin to English translation 
import pinyin

# Library for simplifying function calls 
from functools import partial

### Set data paths

In [None]:
# The data folder of EnvCausal-replication is resolved relative to the
# working directory: it is the `data` folder inside the parent of os.getcwd().
root = os.path.dirname(os.getcwd())
data_dir = os.path.join(root, 'data')

# Snapshot data set (a single CSV directly inside the data folder)
snapshot_path = os.path.join(data_dir, 'snapshot_data.csv')

# 3-day moving average time series: <data>/time_series/3_day_moving_average/df_m3.csv
time_series_3_day_dir = os.path.join(
    os.path.join(data_dir, 'time_series'),
    '3_day_moving_average',
)
time_series_path = os.path.join(time_series_3_day_dir, 'df_m3.csv')

### Create Summary Statistics Table for Snapshot data

In [None]:
# Read snapshots data set
snapshots = pd.read_csv(snapshot_path)

# Rename features for clarity (raw column name -> label used in the table)
column_name_map = {
    'POP': 'Population (in thousands)',
    'Area': 'City area (in km^2)',
    'POPDENS': 'Population Density (people per km^2)',
    'GDP': 'GDP (Billions USD)',
    'PRIM': 'Primary sector (Billions USD)',
    'SEC': 'Secondary sector (Billions USD)',
    'TERT': 'Tertiary sector (Billions USD)',
    'Prim%': 'Primary sector % of GDP',
    'Sec%': 'Secondary sector % of GDP',
    'Tert%': 'Tertiary sector % of GDP',
    # NOTE(review): "per km^2" looks wrong for a per-capita quantity; the
    # correct unit cannot be determined from this notebook -- confirm and fix.
    'GDPpc': 'GDP per capita (Billions USD per km^2)',
    '>60yr%': 'Elderly population %',
    'BED': 'Hospital Beds (per thousand people)',
    'DOC': 'Registered doctors (per thousand)',
    'NRS': 'Registered nurses (per thousand)',
    'TVLR': 'Wuhan travellers (thousands)',
    'TVLR‰': 'Wuhan travellers (per thousand pop.)',
    'ACTV': 'Average degree of activeness (0-8)',
}

# Rescale raw columns so the values agree with the units named in the labels
# above (see the Notes section: values to thousands, proportions to
# percentages / per-thousand rates).
snapshots['POP'] *= 10
snapshots['POPDENS'] *= 10000
snapshots['Prim%'] *= 100
snapshots['Sec%'] *= 100
snapshots['Tert%'] *= 100
snapshots['>60yr%'] *= 100
snapshots['TVLR'] *= 10
snapshots['TVLR‰'] *= 1000

# Keep only the mapped features, in their original column order, and switch
# to the descriptive labels.
mapped_columns = [col for col in snapshots.columns if col in column_name_map]
snapshots = snapshots[mapped_columns].rename(columns=column_name_map)

# Column-wise quartile helpers (np.percentile along axis 0, i.e. one value
# per column when given the whole frame).
first_quartile = partial(np.percentile, q=25, axis=0)
third_quartile = partial(np.percentile, q=75, axis=0)

# Build the table explicitly, one named statistic per column, instead of
# passing a list of callables to DataFrame.apply: that route ignores
# `result_type`, gives both partials the same function name, and leaves the
# dispatch of np.mean/np.std (and hence ddof) to pandas internals.
summary_stats_snapshot = pd.DataFrame(
    {
        'Mean': snapshots.mean(),
        'SD': snapshots.std(),  # pandas default: sample SD (ddof=1)
        'Min': snapshots.min(),
        '25th %ile': first_quartile(snapshots),
        'Median': snapshots.median(),
        '75th %ile': third_quartile(snapshots),
        'Max': snapshots.max(),
    },
    index=snapshots.columns,
).round(2)

# IQR is derived from the rounded quartiles; round again so float subtraction
# noise (e.g. 0.30000000000000004) does not leak into the table.
summary_stats_snapshot['IQR'] = (
    summary_stats_snapshot['75th %ile'] - summary_stats_snapshot['25th %ile']
).round(2)
summary_stats_snapshot

In [None]:
# Preview the raw time-series data. It is loaded here because this cell comes
# before the cell that builds the time-series summary table, so `time_series`
# would otherwise be undefined (NameError) when the notebook is run top to
# bottom.
time_series = pd.read_csv(time_series_path)
time_series

In [None]:
# Read the 3-day moving average time-series data set
time_series = pd.read_csv(time_series_path)

# Raw column name -> label used in the summary table
column_name_map_2 = {
    'PM2.5': 'PM2.5 (μg/m3)',
    'PM10': 'PM10 (μg/m3)',
    'SO2': 'SO2 (μg/m3)',
    'CO': 'CO (mg/m3)',
    'NO2': 'NO2 (μg/m3)',
    'O3': 'O3 (μg/m3)',
    'HUM': 'Relative humidity (%)',
    'PRES': 'Atmospheric pressure (hpa)',
    'WSPD': 'Wind speed (m/s)',
    'TEMP': 'Average air temperature',
    'ACTV': 'Degree of activeness',
    'Case': 'New confirmed cases',
    'MORB%': 'Morbidity rate',
}

# Disabled: romanise the city names with pinyin and name the first two
# columns 'Date' and 'City'. Neither column is in column_name_map_2, so both
# would be dropped by the selection step below anyway.
# def eng_translator(phrase_list):
#     translation_map = {}
#     for phrase in phrase_list:
#         pinyin_phrase = pinyin.get(phrase, format="strip", delimiter=" ")
#         pinyin_phrase = ''.join(pinyin_phrase.split(' '))
#         pinyin_phrase = pinyin_phrase[0].upper() + pinyin_phrase[1:]
#         translation_map[phrase] = pinyin_phrase
#     return translation_map

# def translate_column_to_eng(df, col):
#     unique_phrases = np.unique(df[col]).tolist()
#     translation_map = eng_translator(unique_phrases)
#     return df[col].map(translation_map)

# time_series.columns.values[0] = 'Date'
# time_series.columns.values[1] = 'City'
# time_series['City'] = translate_column_to_eng(time_series, 'City')

# Keep only the mapped measurements, in their original column order, and
# switch to the descriptive labels.
mapped_columns_2 = [col for col in time_series.columns if col in column_name_map_2]
time_series = time_series[mapped_columns_2].rename(columns=column_name_map_2)

# Every statistic is computed over all rows of the file at once (no grouping
# is applied before aggregating).
# The table is built explicitly, one named statistic per column, instead of
# passing a list of callables to DataFrame.apply: that route ignores
# `result_type`, gives both quartile partials the same function name, and
# leaves the dispatch of np.mean/np.std (and hence ddof) to pandas internals.
# first_quartile / third_quartile are the column-wise np.percentile helpers
# defined in the snapshot cell.
summary_stats_time_series = pd.DataFrame(
    {
        'Mean': time_series.mean(),
        'SD': time_series.std(),  # pandas default: sample SD (ddof=1)
        'Min': time_series.min(),
        '25th %ile': first_quartile(time_series),
        'Median': time_series.median(),
        '75th %ile': third_quartile(time_series),
        'Max': time_series.max(),
    },
    index=time_series.columns,
).round(2)
summary_stats_time_series

### Notes
- The original data are not recorded in the same units as the summary table
    - Values had to be converted to thousands, and proportions to percentages

- The GDP-related measurements do not match the summary table (probably because different data were used):
    - GDP
    - Primary sector
    - Secondary sector 
    - Tertiary sector 
    - Primary sector %
    - Secondary sector %
    - Tertiary sector %
    - GDP per capita
    
- Time series measurements are difficult to replicate
    - Need to figure out whether to aggregate over all dates first, then over all cities

- Add `\usepackage{booktabs}` to the LaTeX preamble before using the LaTeX representation of the tables

In [None]:
# Emit the snapshot summary table as LaTeX source; row labels are kept as the
# index and typeset in bold.
print(
    summary_stats_snapshot.to_latex(
        index=True,
        multirow=True,
        bold_rows=True,
    )
)

In [None]:
# Emit the time-series summary table as LaTeX source; row labels are kept as
# the index and typeset in bold.
print(
    summary_stats_time_series.to_latex(
        index=True,
        multirow=True,
        bold_rows=True,
    )
)