<a href="https://colab.research.google.com/github/simasaadi/noaa-nyc-climate-2020-2025/blob/main/notebooks/01_eda_noaa_nyc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Setup and data loading

In [8]:
import pandas as pd

url = "https://raw.githubusercontent.com/simasaadi/noaa-nyc-climate-2020-2025/main/data/raw/noaa_nyc_annual_2020_2025.csv"

df = pd.read_csv(url)
df.head()


Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,AWND,CDSD,CLDD,DSND,...,PRCP,SNOW,TAVG,TMAX,TMIN,TSUN,WDF2,WDF5,WSF2,WSF5
0,US1NJHD0018,"KEARNY 1.7 NNW, NJ US",40.774342,-74.137109,25.6,2023,,,,,...,52.16,,,,,,,,,
1,US1NJHD0018,"KEARNY 1.7 NNW, NJ US",40.774342,-74.137109,25.6,2024,,,,,...,46.54,,,,,,,,,
2,US1NJES0018,"MAPLEWOOD TWP 0.9 SE, NJ US",40.724466,-74.259542,72.5,2020,,,,,...,43.05,,,,,,,,,
3,US1NJES0018,"MAPLEWOOD TWP 0.9 SE, NJ US",40.724466,-74.259542,72.5,2024,,,,,...,51.04,,,,,,,,,
4,USW00094728,"NY CITY CENTRAL PARK, NY US",40.77898,-73.96925,42.7,2020,,1306.0,1306.0,9.0,...,45.39,12.8,57.3,64.3,50.3,,,,,


## 2. Basic structure and data quality checks

In [9]:
# Shape: how many rows and columns
df.shape


(219, 31)

In [10]:
# Columns and data types
df.dtypes


Unnamed: 0,0
STATION,object
NAME,object
LATITUDE,float64
LONGITUDE,float64
ELEVATION,float64
DATE,int64
AWND,float64
CDSD,float64
CLDD,float64
DSND,float64


In [11]:
# Quick summary of numeric columns
df.describe().T


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
LATITUDE,219.0,40.753184,0.189493,40.296952,40.669501,40.752391,40.87645,41.12996
LONGITUDE,219.0,-74.01659,0.327623,-74.42972,-74.28284,-74.137109,-73.728943,-73.37309
ELEVATION,219.0,43.226484,43.327582,0.8,7.6,23.2,71.9,188.7
DATE,219.0,2022.255708,1.561698,2020.0,2021.0,2022.0,2024.0,2025.0
AWND,36.0,8.063889,2.235449,4.3,6.85,7.95,9.725,11.9
CDSD,67.0,1178.432836,228.139127,782.0,978.5,1206.0,1341.5,1632.0
CLDD,67.0,1178.432836,228.139127,782.0,978.5,1206.0,1341.5,1632.0
DSND,61.0,13.57377,9.604615,0.0,6.0,12.0,23.0,39.0
DSNW,82.0,4.243902,2.55585,0.0,2.0,4.0,6.0,11.0
DX32,67.0,9.41791,6.410572,1.0,3.0,8.0,14.0,26.0


In [12]:
# Missing values per column
df.isna().sum().sort_values(ascending=False)


Unnamed: 0,0
TSUN,219
WDF2,184
WSF2,184
WDF5,184
WSF5,184
AWND,183
DSND,158
EMSD,158
TMIN,152
EMNT,152


In [13]:
# Unique years
df["DATE"].unique(), df["DATE"].min(), df["DATE"].max()


(array([2023, 2024, 2020, 2021, 2022, 2025]), 2020, 2025)

In [14]:
# Number of unique stations
df["STATION"].nunique()


58

## 3. Feature engineering

In [15]:
df_fe = df.copy()

# Ensure DATE is integer (year)
df_fe["DATE"] = df_fe["DATE"].astype(int)

# Annual temperature range: max - min
df_fe["annual_range"] = df_fe["EMXT"] - df_fe["EMNT"]

# Total degree days (simple combo of heating degree day columns)
df_fe["total_degree_days"] = df_fe["HDSD"] + df_fe["HTDD"]

# Wetness index: precipitation + snow
df_fe["wetness_index"] = df_fe["PRCP"].fillna(0) + df_fe["SNOW"].fillna(0)

# Heat and cold extreme days (rename for clarity)
df_fe["heat_extreme_days"] = df_fe["DX90"]
df_fe["cold_extreme_days"] = df_fe["DX32"]

# Quick check of new columns
df_fe[[
    "DATE", "STATION", "TAVG", "PRCP", "SNOW",
    "annual_range", "total_degree_days",
    "wetness_index", "heat_extreme_days", "cold_extreme_days"
]].head()


Unnamed: 0,DATE,STATION,TAVG,PRCP,SNOW,annual_range,total_degree_days,wetness_index,heat_extreme_days,cold_extreme_days
0,2023,US1NJHD0018,,52.16,,,,52.16,,
1,2024,US1NJHD0018,,46.54,,,,46.54,,
2,2020,US1NJES0018,,43.05,,,,43.05,,
3,2024,US1NJES0018,,51.04,,,,51.04,,
4,2020,USW00094728,57.3,45.39,12.8,82.0,8666.0,58.19,20.0,6.0


## 4. Save processed dataset

In [16]:
# Save as Parquet (efficient for Python + Streamlit)
output_path_parquet = "/content/noaa_nyc_annual_clean.parquet"
df_fe.to_parquet(output_path_parquet, index=False)

# Optionally also save as CSV for inspection
output_path_csv = "/content/noaa_nyc_annual_clean.csv"
df_fe.to_csv(output_path_csv, index=False)

output_path_parquet, output_path_csv


('/content/noaa_nyc_annual_clean.parquet',
 '/content/noaa_nyc_annual_clean.csv')

## 5. Exploratory Data Analysis (EDA)
### 5.1 Summary statistics

In [17]:
# Quick high-level summary
df_fe[['TAVG','TMAX','TMIN','PRCP','SNOW','DX90','DX32','annual_range']].describe().T


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
TAVG,67.0,55.713433,1.602261,52.4,54.5,55.7,56.75,58.7
TMAX,67.0,64.11791,1.437477,61.4,63.0,64.2,65.2,67.2
TMIN,67.0,47.301493,2.190578,42.6,45.6,47.1,48.8,51.9
PRCP,198.0,50.709242,6.352835,34.31,46.5175,49.585,54.945,64.77
SNOW,82.0,16.221951,10.083528,0.0,9.025,14.95,24.0,43.0
DX90,67.0,22.298507,11.397787,3.0,13.0,21.0,31.5,49.0
DX32,67.0,9.41791,6.410572,1.0,3.0,8.0,14.0,26.0
annual_range,67.0,88.074627,4.550436,79.0,84.0,88.0,92.0,97.0


### 5.2 Yearly trends (across all stations)


In [18]:
import plotly.express as px

yearly = df_fe.groupby("DATE").agg({
    "TAVG": "mean",
    "PRCP": "sum",
    "DX90": "mean",
    "DX32": "mean"
}).reset_index()

fig = px.line(yearly, x="DATE", y="TAVG", markers=True, title="Average Annual Temperature (Across All Stations)")
fig.show()

fig = px.line(yearly, x="DATE", y="PRCP", markers=True, title="Total Annual Precipitation (Across All Stations)")
fig.show()

fig = px.line(yearly, x="DATE", y="DX90", markers=True, title="Annual Heat Extremes (Days â‰¥ 90Â°F)")
fig.show()

fig = px.line(yearly, x="DATE", y="DX32", markers=True, title="Annual Cold Extremes (Days â‰¤ 32Â°F)")
fig.show()


### 5.3 Compare stations


In [19]:
station_means = df_fe.groupby("NAME").agg({
    "TAVG": "mean",
    "PRCP": "mean",
    "DX90": "mean",
    "annual_range": "mean"
}).reset_index()

px.bar(station_means.sort_values("TAVG"),
       x="NAME", y="TAVG", title="Average Temperature by Station").show()


In [20]:
px.bar(station_means.sort_values("annual_range"),
       x="NAME", y="annual_range", title="Temperature Variability (Annual Range)").show()


### 5.4 Correlation heatmap


In [21]:
import plotly.express as px
import numpy as np

numeric_df = df_fe.select_dtypes(include=[np.number])

corr = numeric_df.corr()

fig = px.imshow(
    corr,
    text_auto=True,
    aspect="auto",
    color_continuous_scale="RdBu_r",
    title="Correlation Matrix of Climate Variables"
)
fig.show()


### 5.5 Geographic map of stations

In [22]:
# Use station average temperature as color
station_geo = df_fe.groupby(["NAME", "LATITUDE", "LONGITUDE"]).agg({
    "TAVG": "mean",
    "PRCP": "mean"
}).reset_index()

fig = px.scatter_mapbox(
    station_geo,
    lat="LATITUDE",
    lon="LONGITUDE",
    color="TAVG",
    size="PRCP",
    hover_name="NAME",
    zoom=8,
    height=600,
    mapbox_style="carto-positron",
    title="NOAA Stations in NYC Metro Area"
)

fig.show()
