# Air quality

#### Load Python tools

In [1]:
# Load the lab_black extension (presumably black auto-formatting of cells).
%load_ext lab_black
# Read the Datawrapper token from the `dw_api` shell variable. `!` capture
# returns a list of output lines, which is why later code uses `token[0]`.
# NOTE(review): if `dw_api` is unset this is presumably an empty string.
token = !echo $dw_api

In [2]:
import pandas as pd
import geopandas as gpd
import altair as alt
from datetime import timedelta
import numpy as np
import glob

In [3]:
# Raise pandas' display limits so wide frames, long frames and long text
# cells are rendered without truncation.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)

In [4]:
from datawrapper import Datawrapper

# Build the Datawrapper API client from the first captured line of the
# `echo $dw_api` output (the token itself).
dw = Datawrapper(access_token=token[0])

In [5]:
# Check that the token authenticates. NOTE(review): the rendered output
# includes the account id and email; clear it before sharing the notebook.
dw.account_info()

{'id': 393404,
 'email': 'mstiles@mipo.news',
 'name': None,
 'role': 'editor',
 'language': 'en-US',
 'teams': [{'id': '1NhUBkSc',
   'name': 'Grid',
   'url': '/v3/teams/1NhUBkSc',
   'active': True}],
 'chartCount': 176,
 'url': '/v3/users/393404',
 'activeTeam': '1NhUBkSc'}

---

#### Read data

In [6]:
# Load the per-place PM2.5 table (population, pm_2008/pm_2013/pm_2020,
# percent change and life-expectancy gain columns).
df = pd.read_csv("data/raw/china-air-aqli-feb2022.csv")

In [7]:
# Number of places in the table.
len(df)

50

In [8]:
# Preview the first rows to confirm the column layout.
df.head()

Unnamed: 0,place,pop,pm_2008,pm_2013,pm_2020,pct_change_08-22,life_gain_years_08_22
0,Chongqing,30.0,56.5,56.2,29.0,49,2.7
1,Shanghai,24.1,47.5,50.2,28.1,41,1.9
2,Beijing,20.5,75.7,85.2,37.9,50,3.7
3,Chengdu,13.9,66.3,70.3,35.4,47,3.0
4,Tianjin,13.6,72.5,77.6,47.4,35,2.5


In [9]:
# Highest 2008 PM2.5 value across all places.
df["pm_2008"].max()

81.4

In [10]:
# Lowest 2008 PM2.5 value across all places.
df["pm_2008"].min()

30.1

In [11]:
# Highest 2020 PM2.5 value across all places.
df["pm_2020"].max()

56.0

In [12]:
# Lowest 2020 PM2.5 value across all places.
df["pm_2020"].min()

17.7

In [13]:
# Ten most populous places, largest first.
df.sort_values("pop", ascending=False).head(10)

Unnamed: 0,place,pop,pm_2008,pm_2013,pm_2020,pct_change_08-22,life_gain_years_08_22
0,Chongqing,30.0,56.5,56.2,29.0,49,2.7
1,Shanghai,24.1,47.5,50.2,28.1,41,1.9
2,Beijing,20.5,75.7,85.2,37.9,50,3.7
3,Chengdu,13.9,66.3,70.3,35.4,47,3.0
4,Tianjin,13.6,72.5,77.6,47.4,35,2.5
5,Guangzhou,13.2,47.8,45.5,22.8,52,2.4
6,Baoding,11.6,79.3,101.2,46.2,42,3.2
7,Harbin,11.1,46.1,59.5,40.7,12,0.5
8,Suzhou,10.8,54.8,57.3,31.2,43,2.3
9,Nanyang,10.8,38.3,37.2,19.1,50,1.9


In [14]:
# Rank every place by population once (largest first); the five largest are
# simply the head of that ranking.
places = df.sort_values("pop", ascending=False)["place"].to_list()
largest_places = places[:5]

In [15]:
# Show the five place names that will be charted.
largest_places

['Chongqing', 'Shanghai', 'Beijing', 'Chengdu', 'Tianjin']

In [16]:
# Keep only the rows for the five most populous places.
large_df = df.loc[df["place"].isin(largest_places)]

In [17]:
# Inspect the five-row subset.
large_df

Unnamed: 0,place,pop,pm_2008,pm_2013,pm_2020,pct_change_08-22,life_gain_years_08_22
0,Chongqing,30.0,56.5,56.2,29.0,49,2.7
1,Shanghai,24.1,47.5,50.2,28.1,41,1.9
2,Beijing,20.5,75.7,85.2,37.9,50,3.7
3,Chengdu,13.9,66.3,70.3,35.4,47,3.0
4,Tianjin,13.6,72.5,77.6,47.4,35,2.5


#### Pivot the dataframe so it's wide for datawrapper

In [18]:
# Reshape to wide form: one column per place, one row per PM2.5 measure.
df_pivot = pd.pivot_table(
    large_df,
    values=["pm_2008", "pm_2013", "pm_2020"],
    columns="place",
)

In [19]:
# Add a "Year" column made from the index labels with the "pm_" prefix
# dropped, so the year is kept when the CSV is written without its index.
df_pivot["Year"] = df_pivot.index.str.replace("pm_", "", regex=False)

In [20]:
# Save the wide table; the index is omitted, so "Year" is the only year field.
df_pivot.to_csv("data/processed/air_quality_slope.csv", index=False)

In [21]:
# Strip the "pm_" prefix from the index labels too, before the frame is
# handed to Datawrapper in the next cell.
df_pivot.index = df_pivot.index.str.replace("pm_", "", regex=False)

In [22]:
# Send the wide table to Datawrapper chart "emXrI".
# NOTE(review): the recorded response is 404, so this upload did not succeed
# and the result is never checked; confirm the chart id and token access.
dw.add_data(chart_id="emXrI", data=df_pivot)

<Response [404]>

---

## Geography

#### China prefectures

In [23]:
# gdf = gpd.read_file("data/raw/prefectures/City0010.shp")

In [24]:
# gdf.rename(
#     columns={"A101004_10": "pop2010", "A101001_10": "households2010"}, inplace=True
# )

In [25]:
# gdf.columns = gdf.columns.str.lower()

In [26]:
# gdf_slim = gdf[~gdf["city_en"].str.contains("Counties")][
#     [
#         "objectid",
#         "gbcity",
#         "city_ch",
#         "city_en",
#         "households2010",
#         "pop2010",
#         "gbprov",
#         "prov_ch",
#         "prov_en",
#     ]
# ]

In [27]:
# gdf_slim = gdf[
#     [
#         "objectid",
#         "gbcity",
#         "city_ch",
#         "city_en",
#         "households2010",
#         "pop2010",
#         "gbprov",
#         "prov_ch",
#         "prov_en",
#         "geometry",
#     ]
# ].copy()

In [28]:
# gdf_slim["city_en"] = gdf_slim["city_en"].str.replace(" (Districts)", "", regex=False)

---

In [29]:
# merge = gpd.GeoDataFrame(pd.merge(df, gdf_slim, left_on="place", right_on="city_en"))

In [30]:
# len(merge)

In [31]:
# merge.to_file("data/processed/prefectures_merge_air_data.geojson", driver="GeoJSON")

---

## Specific places over time

In [32]:
# Load one air-quality CSV per location; the file names (including the comma
# and space in the Yanqing one) are used exactly as written.
beijing = pd.read_csv("data/raw/air/beijing-air-quality.csv")
yanqing = pd.read_csv("data/raw/air/yanqing-town, beijing-air-quality.csv")
zhangjiakou = pd.read_csv("data/raw/air/zhangjiakou-air-quality.csv")

In [33]:
# Tag each frame with its location name so rows stay identifiable once the
# three frames are stacked.
beijing = beijing.assign(city="Beijing")
yanqing = yanqing.assign(city="Yanqing")
zhangjiakou = zhangjiakou.assign(city="Zhangjiakou")

In [34]:
# Stack the three location frames into one. ignore_index=True gives the result
# a fresh 0..n-1 index; without it every file's own row labels are kept and
# repeat across cities, which makes label-based selection and alignment on the
# combined frame ambiguous.
air_olympics_src = pd.concat([beijing, yanqing, zhangjiakou], ignore_index=True)

In [35]:
# Remove leading/trailing whitespace from every column name.
air_olympics_src = air_olympics_src.rename(columns=str.strip)

In [36]:
# Work on an independent copy that holds only the date, PM2.5 and city columns.
air_olympics = air_olympics_src.loc[:, ["date", "pm25", "city"]].copy()

In [37]:
# Parse the date strings into datetime values so the .dt accessors work.
air_olympics = air_olympics.assign(date=pd.to_datetime(air_olympics["date"]))

In [38]:
# Derive calendar fields from the parsed date. "month-year" is passed through
# a dict because the hyphen makes it an invalid keyword name.
air_olympics = air_olympics.assign(
    year=air_olympics["date"].dt.year,
    month=air_olympics["date"].dt.month,
    day=air_olympics["date"].dt.day,
    **{"month-year": air_olympics["date"].dt.strftime("%Y-%m")},
)

In [39]:
# Turn empty or whitespace-only strings into NaN so they read as missing.
air_olympics = air_olympics.replace(
    to_replace=r"^\s*$",
    value=np.nan,
    regex=True,
)

In [40]:
# Cast the PM2.5 readings to float; other columns keep their dtypes.
air_olympics = air_olympics.astype({"pm25": float})

#### Mean PM 2.5 in February over the years

In [41]:
# Average PM2.5 per city for each February (month == 2), rounded to whole
# numbers, returned as a flat frame with "month-year", "city" and "pm25".
mean_air = (
    air_olympics.loc[air_olympics["month"].eq(2)]
    .groupby(["month-year", "city"])[["pm25"]]
    .mean()
    .round()
    .reset_index()
)

In [42]:
# Line chart of the February means: one line per city, x axis labelled by year.
alt.Chart(mean_air).mark_line().encode(
    alt.X("month-year:T", axis=alt.Axis(format="%Y", tickCount=6)),
    alt.Y("pm25:Q"),
    alt.Color("city"),
).properties(title="Average February PM 2.5 AQI", width=650)

In [43]:
# Load the AQI scale reference table (colour, rating, AQI range, description).
# NOTE(review): this is an absolute path into one user's home directory, so
# the cell fails on any other machine; consider keeping the workbook under
# the project's data/ folder.
aqi = pd.read_excel("/Users/stiles/data/aqi_scale.xlsx")

In [44]:
# Display the AQI bands used as thresholds by set_aqi_rating below.
aqi

Unnamed: 0,Color,Rating,AQI,Description
0,Green,Good,0 to 50,"Air quality is satisfactory, and air pollution poses little or no risk."
1,Yellow,Moderate,51 to 100,"Air quality is acceptable. However, there may be a risk for some people, particularly those who are unusually sensitive to air pollution."
2,Orange,Unhealthy for Sensitive Groups,101 to 150,Members of sensitive groups may experience health effects. The general public is less likely to be affected.
3,Red,Unhealthy,151 to 200,Some members of the general public may experience health effects; members of sensitive groups may experience more serious health effects.
4,Purple,Very Unhealthy,201 to 300,Health alert: The risk of health effects is increased for everyone.
5,Maroon,Hazardous,301 and higher,Health warning of emergency conditions: everyone is more likely to be affected.


In [45]:
# Preview the cleaned daily readings with the derived calendar columns.
air_olympics.head()

Unnamed: 0,date,pm25,city,year,month,day,month-year
0,2022-02-02,29.0,Beijing,2022,2,2,2022-02
1,2022-02-03,50.0,Beijing,2022,2,3,2022-02
2,2022-02-04,29.0,Beijing,2022,2,4,2022-02
3,2022-02-05,38.0,Beijing,2022,2,5,2022-02
4,2022-02-06,30.0,Beijing,2022,2,6,2022-02


In [46]:
def set_aqi_rating(row):
    """Return the AQI rating label for one row's ``pm25`` value.

    Parameters
    ----------
    row
        Any object exposing a numeric ``pm25`` attribute, e.g. a DataFrame
        row when called through ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    str or None
        "Good" (<= 50), "Moderate" (<= 100), "Unhealthy for Sensitive Groups"
        (<= 150), "Unhealthy" (<= 200), "Very Unhealthy" (<= 300) or
        "Hazardous" (> 300). ``None`` when ``pm25`` is NaN, because every
        comparison against NaN is false.
    """
    reading = row.pm25
    # Bands are tested in ascending order with inclusive upper bounds, so each
    # check only needs the upper limit of its band.
    if reading <= 50:
        return "Good"
    if reading <= 100:
        return "Moderate"
    if reading <= 150:
        return "Unhealthy for Sensitive Groups"
    if reading <= 200:
        return "Unhealthy"
    if reading <= 300:
        return "Very Unhealthy"
    if reading > 300:
        return "Hazardous"
    # Only reachable for NaN: a missing reading has no rating.
    return None

In [47]:
# Label every daily reading with its AQI rating (row-wise apply; rows with a
# missing pm25 get None).
air_olympics["rating"] = air_olympics.apply(set_aqi_rating, axis=1)

In [48]:
# Count the days in each (year, city, rating) group. Group sizes are already
# integers, so no rounding is needed, and grouping the full frame gives the
# same counts as selecting the three key columns first. Rows without a rating
# are left out by groupby's default handling of missing keys.
air_ratings = (
    air_olympics.groupby(["year", "city", "rating"])
    .size()
    .reset_index(name="count")
)

In [49]:
# Preview the long-format day counts.
air_ratings.head()

Unnamed: 0,year,city,rating,count
0,2014,Beijing,Good,15
1,2014,Beijing,Hazardous,25
2,2014,Beijing,Moderate,73
3,2014,Beijing,Unhealthy,106
4,2014,Beijing,Unhealthy for Sensitive Groups,93


In [50]:
# Spot-check coverage: number of Beijing rows recorded for 2019.
len(air_olympics[(air_olympics["city"] == "Beijing") & (air_olympics["year"] == 2019)])

364

In [51]:
# Spread the ratings into columns: one row per (year, city), one column per
# rating, with 0 where a city had no days in that rating for the year.
air_ratings_pivot = (
    pd.pivot_table(
        air_ratings,
        values="count",
        index=["year", "city"],
        columns="rating",
    )
    .round()
    .fillna(0)
    .reset_index()
)

In [52]:
# Rating columns that together make up a city's counted days in a year.
cols_to_sum = [
    "Good",
    "Moderate",
    "Unhealthy",
    "Unhealthy for Sensitive Groups",
    "Very Unhealthy",
    "Hazardous",
]

In [53]:
# Row-wise sum of the rating columns = days with a rating per year and city.
# Must run before the column names are title-cased in the next cell.
air_ratings_pivot["total_days"] = air_ratings_pivot[cols_to_sum].sum(axis=1)

In [54]:
# Title-case every column name for presentation ("year" -> "Year",
# "Unhealthy for Sensitive Groups" -> "Unhealthy For Sensitive Groups").
# The export cells below rely on these title-cased names.
air_ratings_pivot.columns = air_ratings_pivot.columns.str.title()

In [55]:
# Export the rating counts for 2015, 2017, 2019 and 2021 only, with the
# columns in presentation order.
air_ratings_pivot.loc[
    air_ratings_pivot["Year"].isin([2015, 2017, 2019, 2021]),
    [
        "City",
        "Year",
        "Good",
        "Moderate",
        "Unhealthy For Sensitive Groups",
        "Unhealthy",
        "Very Unhealthy",
        "Hazardous",
    ],
].to_csv(
    "data/processed/cities_olympics_days_air_ratings_limited_years.csv", index=False
)

In [56]:
# Export the rating counts for every year; `columns` selects and orders the
# fields written to the file.
air_ratings_pivot.to_csv(
    "data/processed/cities_olympics_days_air_ratings.csv",
    columns=[
        "City",
        "Year",
        "Good",
        "Moderate",
        "Unhealthy For Sensitive Groups",
        "Unhealthy",
        "Very Unhealthy",
        "Hazardous",
    ],
    index=False,
)

---

## Update to Datawrapper

In [57]:
# air_ratings_pivot[
#     [
#         "City",
#         "Year",
#         "Good",
#         "Moderate",
#         "Unhealthy For Sensitive Groups",
#         "Unhealthy",
#         "Very Unhealthy",
#         "Hazardous",
#     ]
# ]

In [58]:
# dw_api=<REDACTED> -- never paste the token into the notebook; export it in the shell instead.
# NOTE(review): the value previously stored on this line should be treated as exposed and rotated.