# Average LA cloud cover, by month
> This notebook fetches and processes average cloudiness data captured at US weather stations from the [National Centers for Environmental Information](https://www.ncei.noaa.gov/products/land-based-station/comparative-climatic-data). [The table](https://www.ncei.noaa.gov/pub/data/ccd-data/clpcdy20.dat) shows the historical mean number of days per category of cloudiness. The categories are determined for daylight hours only. Clear denotes zero to 3/10 average sky cover. Partly cloudy denotes 4/10 to 7/10 average sky cover. Cloudy denotes 8/10 to 10/10 average sky cover. The data are used to better understand the "May gray" and "June gloom" phenomena in Los Angeles.

In [1]:
import json
import requests
import pandas as pd
import jupyter_black
import altair as alt
import geopandas as gpd
import seaborn as sns

In [2]:
# Load the jupyter_black extension, then raise pandas' display limits so wide
# and long frames are shown without truncation
jupyter_black.load()
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)

In [3]:
# Current local date as a compact YYYYMMDD string
today = pd.Timestamp.today().strftime("%Y%m%d")

---

## Read data

In [4]:
# Column names for the dataset: two identifier columns followed by, for each
# month and then the annual total ("ANN"), one column per condition code
# (CL, PC, CD), in that order
column_names = ["station", "years"] + [
    f"{period}_{code}"
    for period in (
        "Jan",
        "Feb",
        "Mar",
        "Apr",
        "May",
        "Jun",
        "Jul",
        "Aug",
        "Sep",
        "Oct",
        "Nov",
        "Dec",
        "ANN",
    )
    for code in ("CL", "PC", "CD")
]

# Read the fixed-width file into a DataFrame: skip its first two rows and
# label the columns with column_names instead of reading a header from the file
url = "https://www.ncei.noaa.gov/pub/data/ccd-data/clpcdy20.dat"
src = pd.read_fwf(url, names=column_names, header=None, skiprows=2)

In [5]:
def _melt_condition(code):
    """Return `src` in long format for one condition code ("CL", "PC" or "CD").

    "station" and "years" are kept as identifier columns; the twelve monthly
    columns plus the annual ("ANN") column for `code` are stacked into a
    "month" column (holding the original column label) and a "days" column.
    """
    periods = (
        "Jan",
        "Feb",
        "Mar",
        "Apr",
        "May",
        "Jun",
        "Jul",
        "Aug",
        "Sep",
        "Oct",
        "Nov",
        "Dec",
        "ANN",
    )
    return src.melt(
        id_vars=["station", "years"],
        value_vars=[f"{period}_{code}" for period in periods],
        var_name="month",
        value_name="days",
    )


# Long-format frames for clear, partly cloudy and cloudy days
df_clear = _melt_condition("CL")
df_pc = _melt_condition("PC")
df_cd = _melt_condition("CD")

In [6]:
# Stack the three long-format frames. ignore_index=True gives the result a
# fresh 0..n-1 index; without it each input frame's own row labels are kept,
# so every label would appear three times in the combined frame.
df_melted = pd.concat([df_clear, df_pc, df_cd], ignore_index=True)

In [7]:
# Month abbreviation -> month number as a string; the annual total ("ANN")
# is mapped to "0"
months = {
    abbr: str(number)
    for number, abbr in enumerate(
        (
            "Jan",
            "Feb",
            "Mar",
            "Apr",
            "May",
            "Jun",
            "Jul",
            "Aug",
            "Sep",
            "Oct",
            "Nov",
            "Dec",
        ),
        start=1,
    )
}
months["ANN"] = "0"

In [8]:
# Two-letter condition code -> readable condition label
conditions = dict(CL="clear", PC="partly_cloudy", CD="cloudy")

In [9]:
# The melted "month" values are the original column labels (e.g. "Jan_CL");
# split them on the underscore into a month abbreviation and a condition code
df_melted[["month_abbr", "condition_code"]] = (
    df_melted["month"].str.split(pat="_", expand=True)
)

In [10]:
# Translate each condition code into its readable label
df_melted["condition"] = df_melted.condition_code.map(conditions)

In [11]:
# Replace the raw column label with the month number: look up the string
# number for each abbreviation, then cast the column to integers
df_melted["month"] = df_melted["month_abbr"].map(months)
df_melted["month"] = df_melted["month"].astype(int)

In [12]:
# The station code is the first five characters of the raw station string
df_melted["station_code"] = df_melted["station"].str.slice(stop=5).astype(str)

In [13]:
# Everything after the five-character code is split on the comma into the
# station location and the station state
df_melted[["station_location", "station_state"]] = (
    df_melted["station"].str.slice(start=5).str.split(pat=",", expand=True)
)

In [14]:
# Trim leading/trailing whitespace from the state value
df_melted["station_state"] = df_melted.station_state.str.strip()

In [15]:
# Final tidy frame: keep only the cleaned-up columns and drop the rows whose
# month is 0 (the value the annual "ANN" columns were mapped to)
df = (
    df_melted.loc[
        df_melted["month"] != 0,
        [
            "station_code",
            "station_location",
            "station_state",
            "years",
            "month",
            "days",
            "condition",
        ],
    ]
    .copy()
    .reset_index(drop=True)
)

In [16]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0,station_code,station_location,station_state,years,month,days,condition
0,13876,BIRMINGHAM AP,AL,37,1,7,clear
1,3856,HUNTSVILLE,AL,27,1,7,clear
2,13894,MOBILE,AL,47,1,8,clear
3,13895,MONTGOMERY,AL,51,1,7,clear
4,26451,ANCHORAGE,AK,44,1,7,clear


In [17]:
# Keep only the rows for station code 23174
df_ca = df.loc[df["station_code"] == "23174"].reset_index(drop=True)

In [18]:
# Reshape to one row per month with one "days" column per condition,
# then flatten the index and order the rows by month
df_ca_pivot = df_ca.pivot(
    columns="condition",
    values="days",
    index=["station_code", "station_location", "month"],
)
df_ca_pivot = df_ca_pivot.reset_index().sort_values("month")

In [19]:
# Days per month that were not clear: partly cloudy plus cloudy
df_ca_pivot["partly_cloudy_or_cloudy"] = df_ca_pivot["partly_cloudy"].add(
    df_ca_pivot["cloudy"]
)

In [20]:
# Reversed light palette built from the notebook's yellow, as a colormap
cm = sns.light_palette(color="#f8c153", reverse=True, as_cmap=True)

# Shade only the 'partly_cloudy_or_cloudy' column with that colormap
styled_df = df_ca_pivot.style.background_gradient(
    subset=["partly_cloudy_or_cloudy"],
    cmap=cm,
)

# Render the styled table as the cell output
styled_df

condition,station_code,station_location,month,clear,cloudy,partly_cloudy,partly_cloudy_or_cloudy
0,23174,LOS ANGELES AP,1,12,11,8,19
1,23174,LOS ANGELES AP,2,11,11,6,17
2,23174,LOS ANGELES AP,3,12,11,9,20
3,23174,LOS ANGELES AP,4,11,9,9,18
4,23174,LOS ANGELES AP,5,10,10,11,21
5,23174,LOS ANGELES AP,6,10,9,11,20
6,23174,LOS ANGELES AP,7,13,5,13,18
7,23174,LOS ANGELES AP,8,14,5,12,17
8,23174,LOS ANGELES AP,9,13,6,10,16
9,23174,LOS ANGELES AP,10,13,8,10,18


---

In [21]:
# Read the second fixed-width table (presumably percent of possible sunshine,
# going by the chart labels further down), name its unlabeled first column
# "station", and drop rows with any missing value
sun_src = pd.read_fwf("https://www.ncei.noaa.gov/pub/data/ccd-data/pctpos20.dat")
sun_src = sun_src.rename(columns={"Unnamed: 0": "station"}).dropna()

In [22]:
# Cast the twelve monthly columns and the annual column to integers
sun_src = sun_src.astype(
    {
        label: int
        for label in (
            "JAN",
            "FEB",
            "MAR",
            "APR",
            "MAY",
            "JUN",
            "JUL",
            "AUG",
            "SEP",
            "OCT",
            "NOV",
            "DEC",
            "ANN",
        )
    }
)

In [23]:
# The station code is the first five characters of the raw station string
sun_src["station_code"] = sun_src["station"].str.slice(stop=5).astype(str)

In [24]:
# Everything after the five-character code is split on the comma into the
# station location and the station state
sun_src[["station_location", "station_state"]] = (
    sun_src["station"].str.slice(start=5).str.split(pat=",", expand=True)
)

In [25]:
# Lower-case every column label
sun_src.columns = sun_src.columns.map(str.lower)

In [26]:
# Keep the station identifiers plus the monthly and annual columns, and drop
# any row that still has a missing value
sun_df = sun_src.loc[
    :,
    [
        "station_code",
        "station_location",
        "station_state",
        "jan",
        "feb",
        "mar",
        "apr",
        "may",
        "jun",
        "jul",
        "aug",
        "sep",
        "oct",
        "nov",
        "dec",
        "ann",
    ],
].dropna()

In [27]:
# Rows whose station location contains "LOS ANGELES"
sun_la = sun_df[sun_df["station_location"].str.contains("LOS ANGELES")]

In [28]:
# Annual value of the first matching row, converted from a percentage to a fraction
mean_sun_la = int(sun_la["ann"].iat[0]) / 100

In [29]:
# Long format: one row per station location and month, with the monthly
# value in "pct_sun" (the annual column is left out)
sun_la_melt = pd.melt(
    sun_la,
    id_vars=["station_location"],
    value_vars=[
        "jan",
        "feb",
        "mar",
        "apr",
        "may",
        "jun",
        "jul",
        "aug",
        "sep",
        "oct",
        "nov",
        "dec",
    ],
    var_name="month",
    value_name="pct_sun",
)

In [30]:
# Convert the percentage to a fraction
sun_la_melt["pct_sun"] = sun_la_melt["pct_sun"].div(100)

In [31]:
# Month number as an unpadded string ("1" .. "12"), parsed from the month
# abbreviation. Taken from .dt.month rather than strftime("%-m"): the "-"
# flag is a platform-specific extension and is not available everywhere.
sun_la_melt["month_num"] = (
    pd.to_datetime(sun_la_melt["month"], format="%b").dt.month.astype(str)
)

In [32]:
# Shared encodings for the monthly bars and their value labels
base = alt.Chart(sun_la_melt).encode(
    x=alt.X("month_num:T", axis=alt.Axis(format="%b", grid=False), title=""),
    y=alt.Y(
        "pct_sun:Q",
        axis=alt.Axis(format="%", grid=False, offset=15),
        title="Mean possible sunshine",
    ),
    text=alt.Text("pct_sun:Q", format=".0%"),
)

bars = base.mark_bar(color="#f8c153", width=35)
text = base.mark_text(align="center", dy=-10, dx=0)

# Mean line: dashed horizontal rule at the annual value
mean_line = (
    alt.Chart(pd.DataFrame({"y": [mean_sun_la]}))
    .mark_rule(color="#666", strokeDash=[4, 4])
    .encode(y="y:Q")
)

# Mean label: the text is formatted from mean_sun_la so it always agrees
# with the position of the rule instead of being a hard-coded percentage
mean_label = (
    alt.Chart(
        pd.DataFrame({"y": [mean_sun_la], "text": [f"Mean: {mean_sun_la:.0%}"]})
    )
    .mark_text(align="left", dx=-75, dy=-10, color="#666")
    .encode(y="y:Q", text="text:N")
)

# Layer bars, value labels, mean rule and mean label into one chart
chart = (
    (bars + text + mean_line + mean_label)
    .configure_view(strokeWidth=0)
    .properties(width=500, height=350, title="Percent sunshine in LA, by month")
)

chart

---

## Hourly

#### Fetch [normals](https://www.ncei.noaa.gov/maps/hourly/) from NCEI

In [33]:
# Download the 2006-2020 hourly normals CSV for station USW00023174
hourly_normals = pd.read_csv(
    filepath_or_buffer="https://www.ncei.noaa.gov/data/normals-hourly/2006-2020/access/USW00023174.csv"
)

In [34]:
# Columns to keep from the hourly normals: station metadata, the timestamp
# parts, and the five HLY-CLOD-PCT* cloud columns
columns = [
    "STATION",
    "NAME",
    "LATITUDE",
    "LONGITUDE",
    "ELEVATION",
    "DATE",
    "month",
    "day",
    "hour",
    *(f"HLY-CLOD-PCT{suffix}" for suffix in ("CLR", "FEW", "SCT", "BKN", "OVC")),
]

In [35]:
# Independent copy of just the selected columns
hourly_la = hourly_normals.loc[:, columns].copy()

In [36]:
# Preview the first five rows
hourly_la.head(n=5)

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,month,day,hour,HLY-CLOD-PCTCLR,HLY-CLOD-PCTFEW,HLY-CLOD-PCTSCT,HLY-CLOD-PCTBKN,HLY-CLOD-PCTOVC
0,USW00023174,"LOS ANGELES INTL AP, CA US",33.9381,-118.3889,29.6,01-01T00:00:00,1,1,0,35.9,25.0,20.1,12.5,6.5
1,USW00023174,"LOS ANGELES INTL AP, CA US",33.9381,-118.3889,29.6,01-01T01:00:00,1,1,1,36.8,24.2,19.2,13.2,6.6
2,USW00023174,"LOS ANGELES INTL AP, CA US",33.9381,-118.3889,29.6,01-01T02:00:00,1,1,2,38.5,23.6,19.8,13.2,4.9
3,USW00023174,"LOS ANGELES INTL AP, CA US",33.9381,-118.3889,29.6,01-01T03:00:00,1,1,3,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
4,USW00023174,"LOS ANGELES INTL AP, CA US",33.9381,-118.3889,29.6,01-01T04:00:00,1,1,4,31.0,21.6,16.0,18.8,12.7


In [37]:
# Treat the -9999.0 placeholder in the five cloud columns as missing.
# NaN is used as the replacement (rather than pd.NA) so the columns keep
# their float dtype and numeric aggregations skip the missing hours.
cloud_pct_columns = [
    "HLY-CLOD-PCTCLR",
    "HLY-CLOD-PCTFEW",
    "HLY-CLOD-PCTSCT",
    "HLY-CLOD-PCTBKN",
    "HLY-CLOD-PCTOVC",
]
hourly_la[cloud_pct_columns] = hourly_la[cloud_pct_columns].replace(
    -9999.0, float("nan")
)

In [38]:
# Average the hourly overcast column within each station name / month / day,
# then flatten to a frame with the mean in "overcast_pct" and the station
# name column renamed to "location"
daily_mean_overcast_pct_lax = hourly_la.groupby(["NAME", "month", "day"])[
    "HLY-CLOD-PCTOVC"
].mean()
daily_mean_overcast_pct_lax = daily_mean_overcast_pct_lax.reset_index(
    name="overcast_pct"
).rename(columns={"NAME": "location"})

In [39]:
# Make sure the daily mean is a float column, then round it to two decimals
daily_mean_overcast_pct_lax["overcast_pct"] = daily_mean_overcast_pct_lax[
    "overcast_pct"
].astype("float64")
daily_mean_overcast_pct_lax["overcast_pct"] = daily_mean_overcast_pct_lax[
    "overcast_pct"
].round(decimals=2)

In [40]:
# Build a real date from month/day so Altair time units can be applied to it.
# The year (2023) is only a placeholder: the chart and tooltip use the month
# and day-of-month parts alone.
daily_mean_overcast_pct_lax["date"] = pd.to_datetime(
    daily_mean_overcast_pct_lax[["month", "day"]].assign(year=2023)
)

# Color ramp for the overcast percentage, from yellow to gray
color_scheme = alt.Color(
    "overcast_pct:Q", title="Overcast %", scale=alt.Scale(range=["#f8c153", "#999"])
)

# Calendar heatmap: one row per month, one column per day of the month.
# The encodings are declared once here, in their final form.
heatmap = (
    alt.Chart(daily_mean_overcast_pct_lax)
    .mark_rect()
    .encode(
        x=alt.X("date(date):O", title="Day of the month"),
        y=alt.Y("month(date):O", title="Month"),
        color=color_scheme,
        tooltip=["monthdate(date):T", "overcast_pct"],
    )
)

# The title names the normals period the data was actually fetched for
heatmap.properties(
    width=600,
    height=250,
    title="Historical overcast percentage at LAX (2006-2020 normals), by month and day",
).configure_legend(orient="top")

---

## Exports

#### JSON

In [41]:
# df.to_json(
#     f"data/processed/NAME.json",
#     indent=4,
#     orient="records",
# )

#### CSV

In [42]:
# df.to_csv(
#     f"data/processed/NAME.csv", index=False
# )