


# Build a prediction model for water level at main station

# Group: Huynh, Thuc Nhat Truong-Jehad Nasser-Asfynder Hashmi

In [122]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, TimeSeriesSplit
from sklearn.linear_model import LinearRegression, MultiTaskElasticNetCV
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor


### Load data

First of all, in order to have the data to process and construct a prediction model for this assignment we re-use the pre-processed data which has been provided and re-order the dataframe as following.


In [128]:

df = pd.read_csv('preprocessed_stations.csv',index_col = 'time')
df.reindex(columns=['a_temp','a_status','a_rain','c_temp','c_status','c_rain','main_level','main_flow'])
df.dropna(inplace=True)#make sure there will be no NaN value
df

Unnamed: 0_level_0,main_level,main_flow,a_temp,a_status,a_rain,c_temp,c_status,c_rain
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2014-01-01 01:00:00,182.0,7.19,2.6,2,0.0,2.8,2,0.0
2014-01-01 02:00:00,182.0,7.19,2.4,2,0.0,2.5,2,0.0
2014-01-01 03:00:00,182.0,7.19,1.9,2,0.0,1.9,2,0.0
2014-01-01 04:00:00,182.0,7.19,2.0,2,0.0,2.1,2,0.0
2014-01-01 05:00:00,182.0,7.19,1.7,2,0.0,2.1,2,0.0
...,...,...,...,...,...,...,...,...
2017-12-31 19:00:00,255.0,15.90,8.9,4,0.1,9.0,4,0.0
2017-12-31 20:00:00,259.0,16.40,8.9,4,0.0,9.3,3,0.0
2017-12-31 21:00:00,263.0,16.90,8.9,4,0.0,9.2,4,0.0
2017-12-31 22:00:00,268.0,17.50,8.7,4,0.2,8.9,3,0.5


In [129]:
def create_X_y(df):
    """
	X has the following format:
	One week per row
	row == sample
	9 sensor values (including timestamps) * 24 h * 7 days = 1512 entries per sample

	[t_0, w_station_A(t_0), w_station_B(t_0), w_station_C(t_0), 
	 t_1, w_station_A(t_1), w_station_B(t_1), w_station_C(t_1), 
	 
	 t_N, w_station_A(t_N), w_station_B(t_N), w_station_C(t_N)] 


	"""
    timestamps = df.shape[0]
    
    prediction_timestamps = 24
    prediction_step = 3
    
    timestamps_per_week = 24 * 7
    samples = timestamps - timestamps_per_week - prediction_timestamps

    X = []
    y = []
    for i in range(samples):
        X.append(df[i:i+timestamps_per_week].to_numpy().flatten())
        y.append(df['main_level'].iloc[i + timestamps_per_week : 
                                       i + timestamps_per_week + prediction_timestamps : 
                                           prediction_step])
    X = np.array(X)
    y = np.array(y)
    return X, y





def get_sample_dataframe(X, index):
    
    assert 0 <= index <= X.shape[0]
    
    sample_mat = X[index].reshape((24*7,8))
    sample_df = pd.DataFrame(sample_mat[:, 1:], index=sample_mat[:,0])

    return sample_df

def plot_in_2d(X, title=None):
    pca = PCA(n_components=2)
    pca.fit(X)
    X_2d = pca.transform(X)
    plt.scatter(X_2d[:,0], X_2d[:,1])
    
    if title is not None:
        plt.title(title)
    plt.show()    
    
    
def rmse(y_pred, y_true):  
    return np.sqrt(np.mean((y_pred - y_true)**2))

In [130]:
#### Load data in ML representation X, y
X, y = create_X_y(df)
# X = np.delete(X, slice(0, X.shape[1], 9), axis=1)


#Split test set and training set
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=0)
#Reduce the dimensions with PCA
dim_reducer = PCA(n_components=10).fit(X_train)
X_train_actual = X_train.copy()
X_train = dim_reducer.transform(X_train)
#train and test linear regression model
lr_regressor = LinearRegression()
lr_regressor.fit(X_train,y_train)

LinearRegression()

In [131]:
#get Mean Square Error >>> Loss 
y_predict = lr_regressor.predict(dim_reducer.transform(X_test))
print('Loss for Linear Regression: ', mean_squared_error(y_true=y_test, y_pred = y_predict, squared=False))

Loss for Linear Regression:  9.974016856744601


In [132]:
elastic_net_regressor = MultiTaskElasticNetCV()#this is used for parallel prcoessing
elastic_net_regressor.fit(X_train,y_train)

MultiTaskElasticNetCV()

In [92]:
y_pred = lr_regressor.predict(dim_reducer.transform(X_test))
print('Loss for Elastic Net: ', mean_squared_error(y_true=y_test, y_pred= y_pred, squared=False))

Loss for Elastic Net:  7.920131472283702


### In order to find a better model, we're going to build the second model: Random Forest.

In [112]:
rf_regressor = RandomForestRegressor()
rf_regressor.fit(X_train,y_train)

RandomForestRegressor()

In [113]:
y_predict = rf_regressor.predict(dim_reducer.transform(X_test))

print('Random Forest Loss : ', mean_squared_error(y_true=y_test, y_pred= y_predict, squared=False))

Random Forest Loss :  2.5574455587682112


In [99]:
### From the result above, obviously we see that Random Forest model is better than Linear Regression

from sklearn.model_selection import cross_val_score as cvs
##Applying the Cross Validation to test whether the Randon Forest's performance is good over all the data

score_lr = cvs(estimator=lr_regressor,X=dim_reducer.transform(X),y=y,scoring='neg_root_mean_squared_error',cv=6, n_jobs=-1)
score_lr

array([ -5.44711348,  -8.50201752,  -7.26920903,  -7.60144478,
        -7.70576487, -11.47864141])

In [100]:
score_rf = cvs(estimator=rf_regressor,X=dim_reducer.transform(X),y=y,scoring='neg_root_mean_squared_error',cv=6, n_jobs=-1)
score_rf

array([-12.52667616, -17.71926564, -15.40745063, -15.77689074,
       -11.24637719, -20.69484177])

In [84]:
score_rl = cvs(estimator=elastic_net_regressor,X=dim_reducer.transform(X),y=y,scoring='neg_root_mean_squared_error',cv=6, n_jobs=-1)
score_rl

array([ -5.51396016,  -9.50185444,  -8.07288254,  -8.21705023,
        -7.55161405, -12.77093067])

In [85]:
print('negative MSE Linear Regression( mean: {} std:{})'.format(score_lr.mean(),score_lr.std()))
print('negative MSE Random Forest Regression( mean: {} std:{})'.format(score_rf.mean(),score_rf.std()))
print('negative MSE Elastic Net( mean: {} std:{})'.format(score_rl.mean(),score_rl.std()))

negative MSE Linear Regression( mean: -8.00076965570607 std:1.8097084983140221)
negative MSE Random Forest Regression( mean: -15.321181684691794 std:2.993054415727865)
negative MSE Elastic Net( mean: -8.60471534714482 std:2.2082092677610676)
