# Libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Data import
Data was downloaded from NOAA's National Centers for Environmental Information (NCEI):
* https://www.ncei.noaa.gov/cdo-web/search?datasetid=GHCND

The data contains daily weather parameters recorded at Los Angeles International Airport, CA.

## Core values

* TMAX - Maximum temperature
* WT03 - Thunder
* WT04 - Ice pellets, sleet, snow pellets, or small hail
* PRCP - Precipitation

## Other values
* WT05 - Hail (may include small hail)
* WT06 - Glaze or rime
* WT07 - Dust, volcanic ash, blowing dust, blowing sand, or blowing obstruction
* WT08 - Smoke or haze
* SNWD - Snow depth
* WT09 - Blowing or drifting snow
* WDF1 - Direction of fastest 1-minute wind
* WDF2 - Direction of fastest 2-minute wind
* WDF5 - Direction of fastest 5-second wind
* WT10 - Tornado, waterspout, or funnel cloud
* PGTM - Peak gust time
* WT11 - High or damaging winds

* WT13 - Mist
* WSF2 - Fastest 2-minute wind speed
* FMTM - Time of fastest mile or fastest 1-minute wind
* ACMH - Average cloudiness midnight to midnight from manual observations
* WSF5 - Fastest 5-second wind speed
* SNOW - Snowfall
* WDFG - Direction of peak wind gust
* WT14 - Drizzle
* ACSH - Average cloudiness sunrise to sunset from manual observations
* WT16 - Rain (may include freezing rain, drizzle, and freezing drizzle)
* WT18 - Snow, snow pellets, snow grains, or ice crystals
* WSF1 - Fastest 1-minute wind speed
* AWND - Average wind speed
* WT21 - Ground fog
* WSFG - Peak gust wind speed
* WT01 - Fog, ice fog, or freezing fog (may include heavy fog)
* WESD - Water equivalent of snow on the ground
* WT02 - Heavy fog or heavy freezing fog (not always distinguished from fog)
* PSUN - Daily percent of possible sunshine for the period
* TAVG - Average Temperature.
* TMIN - Minimum temperature
* TSUN - Total sunshine for the period 

In [9]:
# Load the raw NOAA CSV export. The path is relative, so it resolves against
# the kernel's current working directory.
df = pd.read_csv('../data/raw/los_angeles_weather_noaa.csv')
# Bare last expression: lets the notebook render the frame as the cell output.
df

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,ACMH,ACSH,AWND,FMTM,...,WT10,WT11,WT13,WT14,WT16,WT18,WT21,WV01,WV03,WV20
0,USW00023174,"LOS ANGELES INTERNATIONAL AIRPORT, CA US",33.93816,-118.3866,29.7,1950-01-01,,,,,...,,,,,,,,,,
1,USW00023174,"LOS ANGELES INTERNATIONAL AIRPORT, CA US",33.93816,-118.3866,29.7,1950-01-02,,,,,...,,,,,1.0,,,,,
2,USW00023174,"LOS ANGELES INTERNATIONAL AIRPORT, CA US",33.93816,-118.3866,29.7,1950-01-03,,,,,...,,,,,,,,,,
3,USW00023174,"LOS ANGELES INTERNATIONAL AIRPORT, CA US",33.93816,-118.3866,29.7,1950-01-04,,,,,...,,,,,,,,,,
4,USW00023174,"LOS ANGELES INTERNATIONAL AIRPORT, CA US",33.93816,-118.3866,29.7,1950-01-05,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27746,USW00023174,"LOS ANGELES INTERNATIONAL AIRPORT, CA US",33.93816,-118.3866,29.7,2025-12-19,,,,,...,,,,,,,,,,
27747,USW00023174,"LOS ANGELES INTERNATIONAL AIRPORT, CA US",33.93816,-118.3866,29.7,2025-12-20,,,,,...,,,,,,,,,,
27748,USW00023174,"LOS ANGELES INTERNATIONAL AIRPORT, CA US",33.93816,-118.3866,29.7,2025-12-21,,,,,...,,,,,,,,,,
27749,USW00023174,"LOS ANGELES INTERNATIONAL AIRPORT, CA US",33.93816,-118.3866,29.7,2025-12-22,,,,,...,,,,,,,,,,


In [10]:
# List the column labels of the loaded dataset
df.columns

Index(['STATION', 'NAME', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'DATE', 'ACMH',
       'ACSH', 'AWND', 'FMTM', 'FRGT', 'PGTM', 'PRCP', 'SNOW', 'SNWD', 'TAVG',
       'TMAX', 'TMIN', 'TSUN', 'WDF1', 'WDF2', 'WDF5', 'WDFG', 'WDFI', 'WDFM',
       'WESD', 'WSF1', 'WSF2', 'WSF5', 'WSFG', 'WSFI', 'WSFM', 'WT01', 'WT02',
       'WT03', 'WT04', 'WT05', 'WT06', 'WT07', 'WT08', 'WT09', 'WT10', 'WT11',
       'WT13', 'WT14', 'WT16', 'WT18', 'WT21', 'WV01', 'WV03', 'WV20'],
      dtype='object')

In [11]:
# Show the dtype pandas inferred for each column. Note that DATE is read as
# `object` because read_csv was called without date parsing.
df.dtypes

STATION       object
NAME          object
LATITUDE     float64
LONGITUDE    float64
ELEVATION    float64
DATE          object
ACMH         float64
ACSH         float64
AWND         float64
FMTM         float64
FRGT         float64
PGTM         float64
PRCP         float64
SNOW         float64
SNWD         float64
TAVG         float64
TMAX         float64
TMIN         float64
TSUN         float64
WDF1         float64
WDF2         float64
WDF5         float64
WDFG         float64
WDFI         float64
WDFM         float64
WESD         float64
WSF1         float64
WSF2         float64
WSF5         float64
WSFG         float64
WSFI         float64
WSFM         float64
WT01         float64
WT02         float64
WT03         float64
WT04         float64
WT05         float64
WT06         float64
WT07         float64
WT08         float64
WT09         float64
WT10         float64
WT11         float64
WT13         float64
WT14         float64
WT16         float64
WT18         float64
WT21         

In [12]:
# Per-column count of missing values
def list_null_values(df):
    """Count the missing entries in every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    pandas.Series
        One integer per column (indexed by column label) giving the number
        of null cells found in that column.
    """
    missing_mask = df.isna()
    missing_per_column = missing_mask.sum()
    return missing_per_column

In [13]:
# Count the missing values in each column of the loaded dataset
list_null_values(df)

STATION          0
NAME             0
LATITUDE         0
LONGITUDE        0
ELEVATION        0
DATE             0
ACMH         16005
ACSH         16004
AWND         12541
FMTM         21682
FRGT         27750
PGTM          5737
PRCP             0
SNOW         10404
SNWD          9735
TAVG         20554
TMAX             0
TMIN             0
TSUN         26600
WDF1         27293
WDF2         17110
WDF5         17519
WDFG         10607
WDFI         24729
WDFM         27744
WESD         21907
WSF1         27293
WSF2         17110
WSF5         17519
WSFG         10602
WSFI         24729
WSFM         27744
WT01         18258
WT02         26374
WT03         27450
WT04         27745
WT05         27609
WT06         27750
WT07         27388
WT08         12131
WT09         27676
WT10         27747
WT11         27749
WT13         24783
WT14         27428
WT16         24070
WT18         27747
WT21         27653
WV01         27727
WV03         27749
WV20         27744
dtype: int64

# Preparing the dataset 

Preparation:

* Fill missing values with zeros.
* Do train/validation/test split with 60%/20%/20% distribution. 
* Use the `train_test_split` function and set the `random_state` parameter to 1.
* Use `DictVectorizer(sparse=True)` to turn the dataframes into matrices.

In [None]:
# Function to fill missing values with a value
def fill_null_values(df, cat_fill_value, num_fill_value):
    """Return a copy of ``df`` with missing values replaced.

    Columns of dtype ``object`` are filled with ``cat_fill_value``; columns of
    dtype ``int64`` or ``float64`` are filled with ``num_fill_value``. Columns
    of any other dtype are left as they are.

    The input frame is NOT modified: the work is done on a copy, so the cell
    calling this function can be re-run safely and frames that are slices of
    another frame (e.g. the output of a split) do not trigger chained
    assignment warnings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose missing values should be filled.
    cat_fill_value : scalar
        Replacement for missing values in ``object`` columns.
    num_fill_value : scalar
        Replacement for missing values in ``int64``/``float64`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``.
    """
    # Work on a copy so the caller's frame keeps its original NaNs.
    filled = df.copy()

    cat_columns = filled.select_dtypes(include=['object']).columns
    num_columns = filled.select_dtypes(include=['int64', 'float64']).columns

    # Fill NaNs for categorical columns with the provided value
    filled[cat_columns] = filled[cat_columns].fillna(cat_fill_value)

    # Fill NaNs for numerical columns with the provided value
    filled[num_columns] = filled[num_columns].fillna(num_fill_value)

    return filled