In [88]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFECV

# Read in data, split into train and test sets

In [89]:
# Load the cleaned pollution/weather table, parse the 'Date' strings into
# timestamps and use them as the row index.
df = (
    pd.read_csv('../air_weather_data/pollution_and_weather_cleaned.csv')
    .assign(Date=lambda table: pd.to_datetime(table['Date'], format='%Y-%m-%d %H:%M'))
    .set_index('Date')
)

# Create lagged values of PM10 and PM2.5

In [90]:
# Add a lag-1 copy of each particulate series: shift(1) moves the values down
# one row, so every row also carries the previous row's measurement.
for measured in ('PM$_{10}$, Torkel Knutssonsgatan', 'PM$_{2.5}$, Torkel Knutssonsgatan'):
    df[measured + ', lag 1'] = df[measured].shift(1)

# Rows containing any missing value (including the first row, whose lag is
# undefined) are removed in this derived frame; df itself keeps them.
df_lag1 = df.dropna()

In [91]:
# Split chronologically into a train and a test set.
# Slice the NaN-free frame (df_lag1) instead of df, so rows with a missing
# lag value can never reach the scaler or the models. `.loc` makes the
# label-based row slicing explicit; both bounds are inclusive.
# NOTE(review): rows after 2019-01-01 00:00 and up to 2021-01-01 00:00 end up
# in neither set -- presumably reserved for validation; confirm.
df_train = df_lag1.loc['2016-01-01 01:00:00':'2019-01-01 00:00:00']
df_test = df_lag1.loc['2021-01-01 01:00:00':]

# MinMax normalize train and test data

In [92]:
# Fit the scaler on the training rows only, then apply those same min/max
# values to the test rows, so no test-set statistics enter the scaling.
scaler = MinMaxScaler()

# The scaler works on bare arrays, so both normalized frames get a default
# integer row index; only the column labels are carried over.
df_train_norm = pd.DataFrame(
    scaler.fit_transform(df_train.values),
    # Pass the Index itself. Wrapping it in a list ([df_train.columns]) makes
    # pandas build a MultiIndex, which would not match df_test_norm's columns.
    columns=df_train.columns,
)
df_test_norm = pd.DataFrame(
    scaler.transform(df_test.values),
    columns=df_test.columns,
)
# To map scaled values back to the original units:
#     scaler.inverse_transform(df_train_norm)

# Split into X and y (two sets: one for PM10 and one for PM2.5)

In [93]:
PM10_COL = 'PM$_{10}$, Torkel Knutssonsgatan'
PM2P5_COL = 'PM$_{2.5}$, Torkel Knutssonsgatan'


def _features_and_target(frame, target, other_lag):
    """Split a normalized frame into a feature matrix and a target.

    X is `frame` without the `target` column and without `other_lag`
    (the lag column of the other pollutant); y is `frame[target]`.
    """
    features = frame.drop([target, other_lag], axis=1)
    return features, frame[target]


# Training data: each pollutant keeps its own lag-1 column as a feature.
X_pm10, y_pm10 = _features_and_target(df_train_norm, PM10_COL, PM2P5_COL + ', lag 1')
X_pm2p5, y_pm2p5 = _features_and_target(df_train_norm, PM2P5_COL, PM10_COL + ', lag 1')

# Test data, split the same way.
X_pm10_test, y_pm10_test = _features_and_target(df_test_norm, PM10_COL, PM2P5_COL + ', lag 1')
X_pm2p5_test, y_pm2p5_test = _features_and_target(df_test_norm, PM2P5_COL, PM10_COL + ', lag 1')

  X_pm10 = df_train_norm.drop(['PM$_{10}$, Torkel Knutssonsgatan', 'PM$_{2.5}$, Torkel Knutssonsgatan, lag 1'], axis=1)
  X_pm2p5 = df_train_norm.drop(['PM$_{2.5}$, Torkel Knutssonsgatan', 'PM$_{10}$, Torkel Knutssonsgatan, lag 1'], axis=1)


# Fit a regular MLR model

In [94]:
# Ordinary least-squares baseline for PM10; fit() returns the estimator itself,
# so construction and fitting can be chained.
regular_MLR = linear_model.LinearRegression(fit_intercept=True).fit(X_pm10, y_pm10)

# R^2 of the fitted model, evaluated on the same training data.
regular_MLR.score(X_pm10, y_pm10)



0.797882963294062

In [110]:
# RMSE of the PM10 model on the normalized data: (train error, test error).
# scikit-learn's signature is mean_squared_error(y_true, y_pred), so the
# observed targets go first and the model predictions second. The value is the
# same either way for MSE, but the documented order keeps the call correct if
# the metric is later swapped for an asymmetric one.
(
    np.sqrt(mean_squared_error(y_pm10, regular_MLR.predict(X_pm10))),  # train error
    np.sqrt(mean_squared_error(y_pm10_test, regular_MLR.predict(X_pm10_test))),  # test error
)



(0.024215270278270605, 0.020937835657629585)

# Fit a ridge regression model