In [3]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, cross_val_predict, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn import metrics
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from datetime import date
from datetime import datetime
import calendar
import pickle

In [4]:
df = pd.read_csv('data/combined_data_1hr_lags.csv')

In [5]:
df.shape

(2184, 104)

In [6]:
df.rename(columns = {"Unnamed: 0": "Date"}, inplace = True) 
df = df.set_index('Date (LT)')
df.index = pd.to_datetime(df.index)
df.dropna(inplace = True) 
df = pd.get_dummies(df, columns = ['Month', 'Hour', 'Day'], drop_first = True)
df.head(3)

Unnamed: 0_level_0,Raw Conc.,Temperature,Dew Point,Humidity,Wind,Wind Speed,Wind Gust,Pressure,Precip.,Condition,...,Hour_20,Hour_21,Hour_22,Hour_23,Day_Mon,Day_Sat,Day_Sun,Day_Thu,Day_Tue,Day_Wed
Date (LT),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-10-19 22:00:00,69.0,77.0,75.0,94.0,67.5,3.0,0.0,29.87,0.0,1.0,...,0,0,1,0,0,1,0,0,0,0
2019-10-19 23:00:00,75.0,77.0,75.0,94.0,22.5,3.0,0.0,29.87,0.0,1.0,...,0,0,0,1,0,1,0,0,0,0
2019-10-20 00:00:00,90.0,77.0,75.0,94.0,0.0,0.0,0.0,29.9,0.0,2.0,...,0,0,0,0,0,0,1,0,0,0


In [7]:
#regular old linear regression
reg = LinearRegression()
#remove variables I want to predict
X = df.drop(['Raw Conc.+1', 'Raw Conc.+2', 'Raw Conc.+3', 'Raw Conc.+4', 'Raw Conc.+5', 'Raw Conc.+6'], axis = 1)
#normalize with this method so I can normalize the input for predictions later on
X_norm = preprocessing.normalize(X)
#set y as the variables I want to predict
y = df[['Raw Conc.+1', 'Raw Conc.+2', 'Raw Conc.+3', 'Raw Conc.+4', 'Raw Conc.+5', 'Raw Conc.+6']]
#split data into test/train sets
X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size = 0.3, random_state = 21)
#fit model on training data
reg.fit(X_train, y_train)
#predict output from test set
y_pred = reg.predict(X_test)
#check how variable my results are with different test-train splits
cv_results = cross_val_score(reg, X, y, cv = 5)
print(cv_results)
#look at r-squared comparting predicted values to actual values
print(r2_score(y_test, y_pred))
#look at MSE, better for my purposed because there are harsher penalties for being off by a lot
#I want to capture outliers - high PM days - so use MSE
mean_squared_error(y_test, y_pred)

[0.63832799 0.56129214 0.56182336 0.50438627 0.57728804]
0.6732676175407878




1740.020755048043

In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [11]:
#get data set for test and train
X = df.drop(['Raw Conc.+1', 'Raw Conc.+2', 'Raw Conc.+3', 'Raw Conc.+4', 'Raw Conc.+5', 'Raw Conc.+6'], axis = 1)
y = df[['Raw Conc.+1', 'Raw Conc.+2', 'Raw Conc.+3', 'Raw Conc.+4', 'Raw Conc.+5', 'Raw Conc.+6']]
X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size = 0.3, random_state = 21)

In [None]:
#hyperparameter tuning - check for max depth and n estimators
rfr = RandomForestRegressor()
param_dist = {"max_depth": [3, 5, None],
              "n_estimators": [10, 50, 100, 500, 1000]}

n_iter_search = 5
random_search = RandomizedSearchCV(rfr, param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=5, iid=False, 
                                   random_state = 0, scoring = "neg_mean_squared_error")
random_search.fit(X_train, y_train)
random_search.best_params_
# results show:   {'n_estimators': 1000, 'max_depth': None}

In [12]:
from sklearn.tree import DecisionTreeRegressor as dtr

# Fit Decision Tree Model
exam_model = dtr(random_state = 1)
exam_model.fit(X_train,y_train)

# Predict using DTM
from sklearn.metrics import mean_absolute_error as mae
val_fin = exam_model.predict(X_test)

# Display MAE
print(mae(y_test, val_fin))

36.732791956689866


In [81]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Fit Random Forest Model
forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(X_train, y_train)

# Predict using RFM
melb_preds = forest_model.predict(X_test)

# Display MAE
print(mean_absolute_error(y_test, melb_preds))

#print(len(X_train[0]))
#print (type(X_test))
#X_train[0].shape



28.28054911059552


(126,)

In [41]:
# Save Model Using Pickle
# save the model to disk
filename = 'finalised_model.sav'
pickle.dump(forest_model, open(filename, 'wb'))

# some time later...

# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))

#result = loaded_model.score(X_test, y_test)
#print(result)


In [16]:
dt = pd.to_datetime('2019-10-19 23:00:00')
print (dt)

print (df.index.get_loc(dt, method='nearest'))

idx = df.index[df.index.get_loc(dt, method='nearest')]
print (idx)

#df.columns[-1]
#df.loc[dt,:'Day_Wed'].index#.index
#df.loc[dt].values[0]
#print(df.loc[[2019-10-19 23:00:00]])

input_datetime = pd.to_datetime('2019-11-21 07:00:00')
print(input_datetime)
type(input_datetime)
#input_datetime = datetime.strftime('2019-11-21 07:00:00', '%Y-%m-%d %H:%M:%S')

2019-10-19 23:00:00
1
2019-10-19 23:00:00


In [78]:
from datetime import timedelta

input_datetime = pd.to_datetime('2019-12-23 07:00:00')
print(input_datetime)


index = df.index.get_loc(input_datetime, method='nearest')
print(index)

#print(df.loc[input_datetime].values[0])

#st.write(df.loc[input_datetime].values)
#+ timedelta(hours=9)

for i in range(6):
    print(df.loc[input_datetime + timedelta(hours=i)].values[0])

print(df.loc[input_datetime].values)

type(df.loc[input_datetime].values)
len(df.loc[input_datetime].values)

input_datetime = pd.to_datetime('2019-11-21 07:00:00')
print(input_datetime)

df.index = pd.to_datetime(df.index)
print(df.index)

2019-12-23 07:00:00
895
179.0
173.0
162.0
202.0
202.0
218.0
[179.    70.    57.    64.    90.     7.     0.    29.98   0.     1.
 163.    66.    61.    83.   112.5    9.     0.    30.01   0.     2.
 157.    64.    59.    83.    90.     8.     0.    30.04   0.     2.
 168.    63.    57.    82.    67.5    9.     0.    30.04   0.     2.
 173.    70.    61.    73.    90.     9.     0.    29.96   0.     1.
 162.    70.    61.    73.   112.5    9.     0.    29.96   0.     1.
 202.    70.    61.    73.    90.     8.     0.    29.96   0.     1.
 202.    68.    61.    78.    90.     7.     0.    29.93   0.     1.
 218.    66.    59.    78.    90.     5.     0.    29.96   0.     1.
 228.    64.    61.    88.    90.     5.     0.    29.96   0.     2.
   0.     0.     0.     0.     0.     0.     0.     0.     0.     1.
   0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
   0.     0.     0.     0.     0.     0.     1.     0.     0.     0.
   0.     0.  ]
2019-11-21 07:00:00
Datetim

In [80]:
dt = pd.to_datetime('2019-10-19 23:00:00')

input_datetime = pd.to_datetime('2019-10-19 23:00:00')
print(input_datetime)

#From later 
#print (df.index.get_loc(dt, method='nearest'))

#idx = df.index[df.index.get_loc(dt, method='nearest')]
#print (idx)

#df.columns[-1]
#df.loc[dt,:'Day_Wed'].index#.index
#df.loc[dt].values[0]
#print(df.loc[[2019-10-19 23:00:00]])

input = np.delete(df.loc[input_datetime].values, [40,50,60,70,80,90])
output=loaded_model.predict([input])
print(output[0])

2019-10-19 23:00:00
[283.9 269.6 275.6 259.7 259.7 249.2]
