In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.layers import LSTM, Dense
from keras.models import Sequential
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from numpy import array

In [35]:
# Read the three yearly PM10 CSV exports (2020, 2021, 2022) into separate frames.
la_2020, la_2021, la_2022 = map(
    pd.read_csv,
    (
        './data/LA_pm10_2020.csv',
        './data/LA_pm10_2021.csv',
        './data/LA_pm10_2022.csv',
    ),
)

In [37]:
# subsetting: keep only the date and the daily mean PM10 reading of each year.
# .copy() turns every subset into an independent frame, so the column
# assignments done in later cells operate on a real frame rather than on a
# selection of the raw one (which makes pandas emit SettingWithCopyWarning).
PM10_COLUMNS = ['Date', 'Daily Mean PM10 Concentration']
la_2020 = la_2020[PM10_COLUMNS].copy()
la_2021 = la_2021[PM10_COLUMNS].copy()
la_2022 = la_2022[PM10_COLUMNS].copy()

In [38]:
# Parse the 'Date' column of every yearly frame into pandas datetime values.
for yearly_frame in (la_2020, la_2021, la_2022):
    yearly_frame['Date'] = pd.to_datetime(yearly_frame['Date'])

In [6]:
# # merging
# la_pm10 = pd.concat([la_2020, la_2021, la_2022])
# la_pm10.rename(columns = {'Daily Mean PM10 Concentration':'daily_pm10'}, inplace = True)

In [39]:
# merging: 2020 and 2021 are stacked into a single frame; 2022 is kept as its
# own frame. Both get the PM10 column renamed to the shorter 'daily_pm10'.
# rename() is used without inplace=True so each name is simply rebound to a
# new frame instead of mutating an object created in an earlier cell.
pm10_rename = {'Daily Mean PM10 Concentration': 'daily_pm10'}
la_pm10 = pd.concat([la_2020, la_2021]).rename(columns=pm10_rename)
la_2022 = la_2022.rename(columns=pm10_rename)

In [40]:
# Discard every row that has a missing value in any column.
la_pm10 = la_pm10.dropna(axis=0, how='any')

In [41]:
# Preview the first five rows of the merged frame.
la_pm10.head(n=5)

Unnamed: 0,Date,daily_pm10
0,2020-01-04,25
1,2020-01-10,20
2,2020-01-16,32
3,2020-01-22,21
4,2020-01-28,23


In [42]:
# Preview the last five rows of the merged frame.
la_pm10.tail(n=5)

Unnamed: 0,Date,daily_pm10
983,2021-12-27,6
984,2021-12-28,6
985,2021-12-29,5
986,2021-12-30,4
987,2021-12-31,7


In [11]:
# # finding row with max pm10 value
# la_pm10.loc[(la_pm10['daily_pm10'] == max(la_pm10['daily_pm10']))]

Unnamed: 0,Date,daily_pm10
906,2021-10-11,411


In [12]:
# #normalizing
# la_pm10['daily_pm10'] = (la_pm10['daily_pm10'] - la_pm10['daily_pm10'].mean()) / la_pm10['daily_pm10'].std()

In [66]:
# Show the first five rows again before the data is windowed.
la_pm10.head(n=5)

Unnamed: 0,Date,daily_pm10
0,2020-01-04,25
1,2020-01-10,20
2,2020-01-16,32
3,2020-01-22,21
4,2020-01-28,23


Data Split

In [67]:
def split_sequence(sequence, n_steps):
    """Cut a sequence into sliding input windows and their next-step targets.

    Window ``k`` holds ``sequence[k:k + n_steps]`` and its target is the
    element right after it, ``sequence[k + n_steps]``.

    Parameters
    ----------
    sequence : indexable, sliceable sequence of values
    n_steps : int
        Number of consecutive values in each input window.

    Returns
    -------
    tuple of numpy arrays ``(X, y)``
        ``X`` stacks the windows and ``y`` the matching targets. Both are
        empty when the sequence is too short to produce a single window.
    """
    total = len(sequence)
    # A start position is usable only while its target index stays inside
    # the sequence; never produce more start positions than elements.
    n_windows = max(0, min(total, total - n_steps))
    starts = range(n_windows)
    windows = [sequence[start:start + n_steps] for start in starts]
    targets = [sequence[start + n_steps] for start in starts]
    return array(windows), array(targets)

In [79]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_array, check_is_fitted
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

# Create a min-max scaler with target range [-1, 1] and fit it on the
# 'daily_pm10' column; fit() returns the fitted scaler itself.
scaler = MinMaxScaler(feature_range=(-1, 1)).fit(la_pm10[['daily_pm10']])

class DataScaler(BaseEstimator, TransformerMixin):
    """Pipeline step that applies an already-fitted scaler to its input.

    Parameters
    ----------
    scaler : object exposing ``transform(X)``
        The scaler to apply. ``fit`` on this step does not fit it, so it
        must have been fitted before ``transform`` is called.
    """

    def __init__(self, scaler) -> None:
        super().__init__()
        self.scaler = scaler

    def fit(self, X, y=None):
        """Validate ``X`` and return ``self``; nothing is learned here."""
        check_array(X)
        return self

    def transform(self, X):
        """Return ``X`` transformed by the scaler given to the constructor."""
        # Use the instance's scaler, not a module-level variable, so the
        # step works with whichever scaler it was constructed with.
        return self.scaler.transform(X)

class DataSplitter(BaseEstimator, TransformerMixin):
    """Pipeline step that turns a single-column array into windowed samples.

    Parameters
    ----------
    n_steps : int, default=10
        Number of consecutive observations in each input window.
    """

    def __init__(self, n_steps=10) -> None:
        super().__init__()
        self.n_steps = n_steps

    def fit(self, X, y=None):
        """Validate ``X`` and return ``self``; nothing is learned here."""
        check_array(X)
        return self

    def transform(self, X):
        """Split ``X`` into windows and next-step targets.

        ``X`` is flattened with ``X.reshape(X.shape[0],)``, so it must hold
        exactly one value per row.

        Returns
        -------
        tuple ``(x, y)``
            ``x`` has shape ``(n_windows, n_steps, 1)`` and ``y`` holds the
            value that follows each window.
        """
        values = X.reshape(X.shape[0],).tolist()
        # The final observation is dropped before windowing.
        # NOTE(review): the reason is not visible here — confirm it is intended.
        values = values[:-1]
        x, y = split_sequence(values, self.n_steps)
        # Reshape with the known window length rather than x.shape[1]: when
        # no window can be built, x is one-dimensional and x.shape[1] would
        # raise IndexError.
        x = x.reshape((x.shape[0], self.n_steps, 1))
        return x, y

# Two-stage preprocessing pipeline: scale the values first, then cut the
# scaled series into windows and targets.
data_pipeline = Pipeline(
    steps=[
        ("scaler", DataScaler(scaler=scaler)),
        ("splitter", DataSplitter()),
    ]
)

In [80]:
# Run the 'daily_pm10' column through the scaling + windowing pipeline to get
# the model inputs and targets. The scaler inside the pipeline was fitted when
# it was created, and the pipeline's own fit() is not called in the cells shown.
x_train, y_train = data_pipeline.transform(la_pm10.loc[:, ['daily_pm10']])

In [81]:
# Display the windowed inputs returned by the pipeline.
x_train

array([[[-0.8783455 ],
        [-0.9026764 ],
        [-0.84428224],
        ...,
        [-0.84428224],
        [-0.86861314],
        [-0.90754258]],

       [[-0.9026764 ],
        [-0.84428224],
        [-0.89781022],
        ...,
        [-0.86861314],
        [-0.90754258],
        [-0.8296837 ]],

       [[-0.84428224],
        [-0.89781022],
        [-0.88807786],
        ...,
        [-0.90754258],
        [-0.8296837 ],
        [-0.9756691 ]],

       ...,

       [[-0.90754258],
        [-0.88807786],
        [-0.8540146 ],
        ...,
        [-0.9756691 ],
        [-0.98053528],
        [-0.97080292]],

       [[-0.88807786],
        [-0.8540146 ],
        [-0.83941606],
        ...,
        [-0.98053528],
        [-0.97080292],
        [-0.97080292]],

       [[-0.8540146 ],
        [-0.83941606],
        [-0.8540146 ],
        ...,
        [-0.97080292],
        [-0.97080292],
        [-0.9756691 ]]])

In [82]:
# Display the targets returned by the pipeline (one value per input window).
y_train

array([-0.8296837 , -0.9756691 , -0.94647202, ..., -0.97080292,
       -0.9756691 , -0.98053528])

In [84]:
# define model: a 50-unit LSTM layer with ReLU activation reading windows of
# shape (n_steps, n_features), followed by a single-output Dense layer.
n_steps = 10
n_features = 1
model = Sequential([
    LSTM(50, activation='relu', input_shape=(n_steps, n_features)),
    Dense(1),
])
# Trained with the Adam optimizer on mean squared error.
model.compile(loss='mse', optimizer='adam')



In [85]:
# Make TensorFlow run tf.function code eagerly and put tf.data into debug mode.
# NOTE(review): both switches look like debugging aids left enabled before
# training, and presumably slow model.fit down — confirm they are still needed.
tf.config.run_functions_eagerly(True)
tf.data.experimental.enable_debug_mode()

In [86]:
# Train for 200 epochs with progress output silenced; the History object
# returned by fit() is the cell's displayed result.
model.fit(x=x_train, y=y_train, epochs=200, verbose=0)

2023-02-21 18:14:35.386984: I tensorflow/stream_executor/cuda/cuda_blas.cc:1614] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


<keras.callbacks.History at 0x7f0c5b4c2f80>

In [87]:
# Write the trained model to disk at the given .h5 path.
model.save(filepath='models/lstm_model_1.h5')

In [33]:
# import joblib
# from joblib import Parallel, delayed

# joblib.dump(model, './models/keras_lstm_1.pkl')