# Daily temperature averages in Melbourne

#### Load Python tools

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import altair as alt
import altair_stiles as altstiles
import glob
from pathlib import Path
import numpy as np

In [3]:
# Register the altair_stiles theme under the name "stiles", then make it the
# active Altair theme for every chart rendered after this cell.
alt.themes.register("stiles", altstiles.theme)
alt.themes.enable("stiles")

ThemeRegistry.enable('grid')

In [4]:
# Raise the pandas display limits to 1000 columns and 1000 rows.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)

# Turn off Altair's row-count guard on chart data.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

---

## Process weatherspark data

#### Make a list of all our decade-level temperature data files

In [27]:
# Recursively collect every "*Summary*.csv" under the Melbourne data folder.
# str(path) already yields the full path, so there is no need to glue the
# parent and file name back together by hand. Sorting makes the load order
# reproducible between runs instead of depending on filesystem order.
all_files = sorted(
    str(path)
    for path in Path("/Users/stiles/data/climate/melbourne/").rglob("*Summary*.csv")
)

#### Read each file into a list of dataframes

In [28]:
# Load every discovered CSV into its own dataframe.
# on_bad_lines="skip" makes pandas drop malformed rows instead of raising.
dfs_list = [pd.read_csv(csv_path, on_bad_lines="skip") for csv_path in all_files]

#### One big dataframe with all cities and decades

In [29]:
# Stack the per-file frames row-wise into one frame. ignore_index is left at
# its default, so each file's original row labels are kept.
src = pd.concat(objs=dfs_list)

#### Clean up the messy column headers

In [30]:
# Literal (non-regex) header substitutions, applied in the order listed.
# Order matters: the unit suffixes contain spaces, so they are handled before
# the remaining spaces are turned into underscores.
header_substitutions = [
    ("/", "_"),
    (" (°f)", ""),
    (" (%)", "_pct"),
    (" (in)", "_inch"),
    (" (hr)", "_hour"),
    (" (mph)", "_mph"),
    (" (string)", ""),
    (" ", "_"),
    ("temperature_sampled_", ""),
]

# Lowercase and trim first, run the substitutions, then trim once more.
tidy_columns = src.columns.str.lower().str.strip()
for old_text, new_text in header_substitutions:
    tidy_columns = tidy_columns.str.replace(old_text, new_text, regex=False)
src.columns = tidy_columns.str.strip()

#### Place names

In [31]:
# The time zone id column doubles as the place label from here on.
src = src.rename(columns={"time_zone_id": "place"})

In [32]:
# Split the "<continent>/<place>" id into its two parts. Taking the first and
# last components (rather than expanding into exactly two columns) keeps this
# working for ids with extra segments such as "America/Argentina/Buenos_Aires",
# which would otherwise produce three columns and fail on assignment.
tz_parts = src["place"].str.split("/")
src["continent"] = tz_parts.str[0]
src["place"] = tz_parts.str[-1]

In [33]:
# Relabel rows whose place is "Sao_Paulo" as "Rio".
# NOTE(review): presumably the Rio data carries the Sao_Paulo time zone id — confirm.
src["place"] = src["place"].replace("Sao_Paulo", "Rio")

#### Make a simple dataframe with just the temp columns we need

In [34]:
# Keep only the columns the temperature analysis uses; copy so later column
# assignments do not touch `src`.
temp_columns = ["date", "place", "low", "high"]
src_slim = src.loc[:, temp_columns].copy()

#### Process dates

In [35]:
# Parse the raw date column once and derive every string field from that
# single parsed series, instead of re-parsing the column for each field.
parsed_dates = pd.to_datetime(src_slim["date"])

# All four columns are stored as strings; month and day are zero-padded
# ("01", "12"), which the later month filter relies on.
src_slim["date"] = parsed_dates.dt.strftime("%Y-%m-%d")
src_slim["year"] = parsed_dates.dt.strftime("%Y")
src_slim["month"] = parsed_dates.dt.strftime("%m")
src_slim["day"] = parsed_dates.dt.strftime("%d")

#### Decade categories

In [36]:
# Decade label: first three characters of the year string plus "0s"
# (e.g. "2019" -> "2010s").
src_slim["decade"] = src_slim["year"].str.slice(stop=3) + "0s"

#### Get the mean temp

In [37]:
# Daily mean is the midpoint between the day's high and low.
src_slim["mean"] = src_slim["high"].add(src_slim["low"]).div(2)

#### Make a copy for analysis

In [38]:
# Analysis copy: drop the 2020s rows and order newest date first.
before_2020s = src_slim["decade"].ne("2020s")
df = src_slim.loc[before_2020s].sort_values(by="date", ascending=False).copy()

In [39]:
# Preview the first five rows; df is sorted by date descending, so these are
# the most recent dates.
df.head()

Unnamed: 0,date,place,low,high,year,month,day,decade,mean
3651,2019-12-31,Melbourne,57.2,71.6,2019,12,31,2010s,64.4
3650,2019-12-30,Melbourne,66.2,107.6,2019,12,30,2010s,86.9
3649,2019-12-29,Melbourne,60.8,95.0,2019,12,29,2010s,77.9
3648,2019-12-28,Melbourne,66.2,89.6,2019,12,28,2010s,77.9
3647,2019-12-27,Melbourne,53.6,82.4,2019,12,27,2010s,68.0


---

## Aggregate

#### Group by place and decade and get the mean temp for each month

In [88]:
# Average the daily means within each place / decade / month group, round to
# one decimal place, and flatten the group keys back into ordinary columns.
group_keys = ["place", "decade", "month"]
monthly_means = df.groupby(group_keys)["mean"].mean()
decades = monthly_means.round(1).reset_index()

In [89]:
# Keep only the January ("01") and December ("12") rows; month is a
# zero-padded string column.
open_months = decades[decades["month"].isin(["01", "12"])]

In [90]:
# Display the January and December monthly means for each decade.
open_months

Unnamed: 0,place,decade,month,mean
0,Melbourne,1970s,1,66.6
11,Melbourne,1970s,12,63.0
12,Melbourne,1980s,1,64.4
23,Melbourne,1980s,12,62.4
24,Melbourne,1990s,1,67.1
35,Melbourne,1990s,12,63.7
36,Melbourne,2000s,1,68.3
47,Melbourne,2000s,12,64.7
48,Melbourne,2010s,1,69.6
59,Melbourne,2010s,12,65.9


In [91]:
alt.Chart(open_months).mark_line().encode(
    x="decade",
    y=alt.Y(
        "mean", axis=alt.Axis(tickCount=5), scale=alt.Scale(domain=[60, 70]), title=" "
    ),
    color="month",
).properties(width=650, title="Mean temperature change during Australian open")