In [67]:

import numpy as np
import pandas as pd
import glob
import random
import time
from multiprocessing import Pool

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from math import sqrt
import xgboost as xgb

# For data visualization
import matplotlib.pyplot as plt
# from pandas.tools.plotting import autocorrelation_plot
from bokeh.io import output_notebook, show
from bokeh.models import Title
from bokeh.plotting import figure, output_file, show

import seaborn as sns
%matplotlib inline

from datetime import datetime, timedelta, date
from tqdm import tqdm            #for .py version
# from tqdm import tqdm_notebook as tqdm     # for .ipynb version

pd.options.mode.chained_assignment = None  # default='warn'

In [68]:
# The dict 'params' consists of all the parameters used in the simulation software for ease of alteration
params = {
#         Set the regression model related parameters
          'train_start_dt':'2013-01',
          'train_stop_dt':'2013-12',
          'y_variable': 'trial_d',
          'X_variables':['trial_n', 'low', 'normal', 'high', 'WIND_DIRECTION',
                         'WIND_SPEED', 'VISIBILITY', 'MSL_PRESSURE',
                         'AIR_TEMPERATURE', 'DEWPOINT', 'WETB_TEMP', 
                         'STN_PRES', 'WMO_HR_SUN_DUR', 'hour', 'day'],
    
#         Set XGBoost regression parameters (for consumption model)
          'n_estimators': 2000,
          'early_stopping_rounds': 50,  #stop if 50 consequent rounds without decrease of error
          'verbose': False,             # Change verbose to True if you want to see it train
          'nthread': 4,
    
#         Set simulator parameters to default values
          'season': 3,
          'day_of_week': 3,
          'special_event': 0,
          'tariff_policy':[],
    
#         Set Occupant behaviour dynamics
          'active_users': 0.1,#.5,     # Set the % of users who are willing to engage in the experiments
          'avail_users': 0.1,#.5,       # Set the % of users who will be available to participate in specific experiment
          'user_latency': 0,         # Set the values which correspond to real life participation delay for users 
          'frac_users_exp':1,      # Fraction of users selected for a particular trial
          
#         Set parameters for active learning
          'total_iterations':10,
          'total_experiments':200,#100, #Total number of experiments allowed per trial
          'init_samples': 100,#50,      # Set the initial random samples to be chosen
          'test_samples':10,
          'test_size':.3,           # Set test data size for splitting data in train-test
    
    
          'X_var_activeL':['dow', 
                           'season', 
                           'hod', 
                           'AIR_TEMPERATURE', 
                           'DEWPOINT', 
                           'MSL_PRESSURE', 
                           'STN_PRES',
                           'VISIBILITY', 
                           'WETB_TEMP',
                           'WIND_DIRECTION',
                           'WIND_SPEED',
                           'WMO_HR_SUN_DUR',
                           'hod', 
                           'month',
                           'tariff'],
          'X_var_supervised':['dow', 
                           'season', 
                           'hod', 
                           'AIR_TEMPERATURE', 
                           'DEWPOINT', 
                           'MSL_PRESSURE', 
                           'STN_PRES',
                           'VISIBILITY', 
                           'WETB_TEMP',
                           'WIND_DIRECTION',
                           'WIND_SPEED',
                           'WMO_HR_SUN_DUR',
                           'hod', 
                           'month',
                           'tariff_original'],
    
          'y_var_activeL':['response', 'expected']
         }

In [69]:
df = pd.read_csv("../results/generated_data2019-06-03.csv", sep='\t', header=0, index_col=0, parse_dates=['GMT'], low_memory=False)


print("Reading generated data...")
path = r'../mod_datasets/gen_data' # use your path
all_files = glob.glob(path + "/*.csv")
print(all_files)
li = []

for filename in all_files:
    df_temp = pd.read_csv(filename, sep='\t', header=0, index_col=0, parse_dates=['GMT'], low_memory=False)
    li.append(df_temp)

df_test = pd.concat(li, axis=0, ignore_index=False)
df_test['tariff'] = 0
df_test.loc[(df_test.expected-df_test.response!=0),'tariff'] = 1

Reading generated data...
['../mod_datasets/gen_data/gen_data2.csv', '../mod_datasets/gen_data/gen_data1.csv', '../mod_datasets/gen_data/gen_data.csv', '../mod_datasets/gen_data/samples.csv']


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [70]:
class Generator(object):
    
    
    def __init__(self, df):
        self.df = df
        self.spring = [3, 4, 5]
        self.summer = [6, 7, 8]
        self.autumn = [9, 10, 11]
        self.winter = [1, 2, 12]


     
        
    def next_sample(self):
    
#         Sample a random day timestamp
        shortlist = self.df.sample(axis = 0).index
        random_index = random.choice(shortlist)
#         print(random_index)

        self.timestamp = str(random_index.year)+"-"+str(random_index.month)+"-"+str(random_index.day)
        df_sample = self.df.loc[self.timestamp,:]
        
        df_sample = df_sample.loc[~df_sample.index.duplicated(keep='first')]
#         print("next_sample size:", df_sample.shape)
        return df_sample
        
        
    
   

In [71]:
def get_test_samples(df, num):
        
    response = gen.next_sample()
    df = response.copy()

#        Create n number of datapoints from simulator (n=self.params["init_samples"])
    list_ = [i for i in range(num)]

    for i in tqdm(list_):

        response = gen.next_sample()           
        df = pd.concat([df, response], axis=0, sort=True)
    return df

In [73]:
if __name__ == '__main__':


    gen = Generator(df_test)
   


    df = get_test_samples(df_test, 100)
    y_train = df['response']
    X_train = df[params['X_var_activeL']]

    
    
    df_test = get_test_samples(df_test, 100)
    
    X_test = df_test[params['X_var_activeL']]
    y_test = df_test['response']
   

    regres = RandomForestRegressor(n_estimators=10)
    regres.fit(X_train, y_train)
    y_preds = regres.predict(X_test)
    
    print("RMS Prediction Error:", np.mean((y_test - y_preds)**2))

    
    
    list_ = [i for i in range(len(y_test))]
    
    output_notebook()
    file_name = "../temp/predictions_" + str(date.today()) + ".html"
    output_file(file_name) #Uncom`ment it to save the plot in html file
    p=figure(plot_width=800, plot_height=400,  x_axis_label = "time slot of day", y_axis_label = "energy consumption",)
    p.line(list_, y_test.values, line_width=1, color='blue')
    p.line(list_, y_preds, line_width=1, color='orange')

    show(p)

100%|██████████| 100/100 [00:00<00:00, 149.07it/s]
100%|██████████| 100/100 [00:00<00:00, 154.54it/s]


RMS Prediction Error: 0.00017478889881707661
