In [2]:
import numpy as np
import pandas as pd
import glob
import random
import time
from multiprocessing import Pool

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from math import sqrt
import xgboost as xgb

# For data visualization
import matplotlib.pyplot as plt
# from pandas.tools.plotting import autocorrelation_plot
from bokeh.io import output_notebook, show
from bokeh.models import Title
from bokeh.plotting import figure, output_file, show

import seaborn as sns
%matplotlib inline

from datetime import datetime, timedelta, date
from tqdm import tqdm            #for .py version
# from tqdm import tqdm_notebook as tqdm     # for .ipynb version

pd.options.mode.chained_assignment = None  # default='warn'

In [3]:
# The dict 'params' consists of all the parameters used in the simulation software for ease of alteration
params = {
#         Set the regression model related parameters
          'train_start_dt':'2013-01',
          'train_stop_dt':'2013-12',
          'y_variable': 'trial_d',
          'X_variables':['trial_n', 'low', 'normal', 'high', 'WIND_DIRECTION',
                         'WIND_SPEED', 'VISIBILITY', 'MSL_PRESSURE',
                         'AIR_TEMPERATURE', 'DEWPOINT', 'WETB_TEMP', 
                         'STN_PRES', 'WMO_HR_SUN_DUR', 'hour', 'day'],
    
#         Set XGBoost regression parameters (for consumption model)
          'n_estimators': 2000,
          'early_stopping_rounds': 50,  #stop if 50 consequent rounds without decrease of error
          'verbose': False,             # Change verbose to True if you want to see it train
          'nthread': 4,
    
#         Set simulator parameters to default values
          'season': 3,
          'day_of_week': 3,
          'special_event': 0,
          'tariff_policy':[],
    
#         Set Occupant behaviour dynamics
          'active_users': 0.1,#.5,     # Set the % of users who are willing to engage in the experiments
          'avail_users': 0.1,#.5,       # Set the % of users who will be available to participate in specific experiment
          'user_latency': 0,         # Set the values which correspond to real life participation delay for users 
          'frac_users_exp':1,      # Fraction of users selected for a particular trial
          
#         Set parameters for active learning
          'total_iterations':10,
          'total_experiments':120,#100, #Total number of experiments allowed per trial
          'init_samples': 100,#50,      # Set the initial random samples to be chosen
          'test_samples':10,
          'test_size':.3,           # Set test data size for splitting data in train-test
    
    
          'X_var_activeL':['dow', 
                           'season', 
                           'hod', 
                           'AIR_TEMPERATURE', 
                           'DEWPOINT', 
                           'MSL_PRESSURE', 
                           'STN_PRES',
                           'VISIBILITY', 
                           'WETB_TEMP',
                           'WIND_DIRECTION',
                           'WIND_SPEED',
                           'WMO_HR_SUN_DUR',
                           'hod', 
                           'month',
                           'tariff'],
          'X_var_supervised':['dow', 
                           'season', 
                           'hod', 
                           'AIR_TEMPERATURE', 
                           'DEWPOINT', 
                           'MSL_PRESSURE', 
                           'STN_PRES',
                           'VISIBILITY', 
                           'WETB_TEMP',
                           'WIND_DIRECTION',
                           'WIND_SPEED',
                           'WMO_HR_SUN_DUR',
                           'hod', 
                           'month',
                           'tariff_original'],
    
          'y_var_activeL':['response', 'expected']
         }

## Generator
This class creates a new data point on request. The new datapoint can be constrained to some calendar parameters like day-of-week and season-of-year. The following cell randomly selects the date index which follows the input constraints and generates a 'new' datapoint by aggregating the data of multiple LCL energy users.

Also the tariff policy and user response for that particular day is decided by modelling stochastic behaviour of users. Therefore, each user's response is calculated individually by considering user latency into account.

In [4]:
class Generator(object):
    
    
    def __init__(self, df):
        self.df = df
        self.spring = [3, 4, 5]
        self.summer = [6, 7, 8]
        self.autumn = [9, 10, 11]
        self.winter = [1, 2, 12]


     
        
    def next_sample(self):
    
#         Sample a random day timestamp
        shortlist = self.df.sample(axis = 0).index
        random_index = random.choice(shortlist)
#         print(random_index)

        self.timestamp = str(random_index.year)+"-"+str(random_index.month)+"-"+str(random_index.day)
        df_sample = self.df.loc[self.timestamp,:]
        
        df_sample = df_sample.loc[~df_sample.index.duplicated(keep='first')]
#         print("next_sample size:", df_sample.shape)
        return df_sample
        
        
    
   

## Active Learner
The following cell simulates the real-world scenario to mimic the practical trials. The only difference is that the dates are randomly selected rather than sequentially moving in time. 

The following algorithm gets the features set for the next datapoint and based on the knowledge of the historical feature subspace, it then takes a decision about the 'need' of an labelling experiment. That is, if the situation is rare in the historical data, the learner will give it more importance as it contains more information (Information theory says that the probability of occurance of a symbol is inversely proportional to the infomation contained in it)

In [5]:
class activeLearner(object):
    
    def __init__(self, params):
        self.params = params
        self.y_pred_AL = pd.DataFrame()
        self.y_test_AL = pd.DataFrame()
        self.y_preds = pd.DataFrame()
        self.y_test = pd.DataFrame()
        self.y_pred_SUP = pd.DataFrame()
        self.y_test_SUP = pd.DataFrame()
        
        self.df_Err = pd.DataFrame()
        self.counter = 0
        self.iter = 1
        
        
        
            
    def sample_next(self):
        
        #Randomly select next data point
        response = gen.next_sample()

        df = response.copy()

#         print(df.shape)
#         print(df.index[0])
#         print("sample_stream ", df)

        return df
    
    
    
    
    def get_random_samples(self):
        
        response = gen.next_sample()
        self.df = response.copy()
#        Create n number of datapoints from simulator (n=self.params["init_samples"])
        list_ = [i for i in range(self.params["init_samples"])]

        for i in tqdm(list_):
            
            response = gen.next_sample()           
            self.df = pd.concat([self.df, response], axis=0, sort=True)
#             df = self.df
#             return df
            
    def get_test_samples(self):
        
        response = gen.next_sample()
        self.df = response.copy()
            
#        Create n number of datapoints from simulator (n=self.params["init_samples"])
        list_ = [i for i in range(100)]

        for i in tqdm(list_):
            
            response = gen.next_sample()           
            self.df = pd.concat([self.df, response], axis=0, sort=True)
            df = self.df
        return df
#         print(self.df.shape)

    
            
    def split_data(self, df):
        
        X_train = df[self.params['X_var_activeL']]
        y_train = df[self.params['y_var_activeL']]
#         print(y_train)
        return X_train, y_train
        
        
        
    def train_model(self, X_train, y_train):
        self.regres = 0
        self.regres = RandomForestRegressor(n_estimators=10)
        self.regres.fit(X_train, y_train)
        
        
    def predict(self, X_test, y_test):
        
#         self.y_pred.loc[:, exp_id] = 
        
        return self.regres.predict(X_test)
#         self.y_test.loc[:, exp_id] = y_test.values

        
        
        
        
    def get_pred_var(self, X_test):
        preds = np.stack([t.predict(X_test) for t in self.regres.estimators_])
        pred_var = np.mean(np.var(preds, axis=0))
#         err_mean = np.mean(np.mean(preds, axis=0)-y_test)
        return pred_var #, err_mean

        
    def get_error_measure(self, y_test, y_pred):
        mse = ((y_test - y_pred)**2).mean(axis=0)
        return mse
    
                     
    def get_experiment_id(self):
        self.exp_id = 0            #initialize
        
        exp_available = (self.params["total_experiments"] - self.params["init_samples"])*self.params["test_samples"]
        
        if self.counter > exp_available:
            self.counter=0
            self.iter = self.iter + 1
            
        exp_count = str(self.counter)
        iter_count = str(self.iter)
        self.exp_id = "iter" + iter_count + "_exp" + exp_count
        self.counter = self.counter + 1
       

    
    
    def get_log_error(self):
        self.e_mean = []
        self.list_Err=[]
        iter_ = [100, 500, 1000, 2000, 3000, 5000]
        for i in iter_:
            self.get_experiment_id()
            
            self.params['init_samples'] = i
            self.params['test_samples'] = int(i/3)
            
            self.get_random_samples()
            X_train, y_train= self.split_data(self.df)
            self.train_model(X_train, y_train['response']) 
            
            self.get_test_samples()
            X_test, y_test= self.split_data(self.df)
                        
            preds = self.predict(X_test, y_test['response'])
#             self.y_preds.loc[:, self.exp_id] = preds
#             self.y_test.loc[:, self.exp_id] = y_test['response'].values
            
            ####
            l = (preds - y_test['response'].values)**2
            self.list_Err.append(sum(l) / float(len(l)))
            ####
        

    
    
        
# Run the active learning model for a given number of experiments

    def run(self):
        
        self.p_var = []
        self.e_mean = []
        i=0

        #         Get initial samples to train model
        self.get_random_samples()
        self.df.loc[:,"target_AL"] = self.df.loc[:,"response"] 
        self.df.loc[:,"pred_AL"] = self.df.loc[:,"response"] 
        self.df.loc[:,"target_SUP"] = self.df.loc[:,"response"] 
        self.df.loc[:,"pred_SUP"] = self.df.loc[:,"response"] 
            
        X_train, y_train= self.split_data(self.df)
        self.train_model(X_train, self.df["target"])
        
        list_ = [i for i in range(self.params["total_experiments"] - self.params["init_samples"])]
        for exp in tqdm(list_):
            
            
            for sample in range(self.params["test_samples"]):
                
               
            
                next_sample = self.sample_next()
            
                X_test, y_test= self.split_data(next_sample)

                pred_var = self.get_pred_var(X_test)
            
                self.get_experiment_id()
                        
                ########################################################
#                Supervised Algorithm (All labels)
                ########################################################
                y_SUP = y_test["expected"]
                preds_sup = self.predict_SUP(X_test, y_SUP)
                self.y_pred_SUP.loc[:, self.exp_id] = preds_sup
                self.y_test_SUP.loc[:, self.exp_id] = y_test_sup.values
                ########################################################
            
            
                #########################################################
#                Active Learning Block
                #########################################################
                if pred_var>0.00050:
                    y_test["target_SUP"] = y_test["response"]
                    preds = self.predict(X_test, y_test["target"])
                    i = i+1

                else:
                    y_test["target"] = y_test["expected"]
                    X_test["tariff"]=0
                    next_sample['tariff']=0   # creating no tariff for this day
                    preds = self.predict(X_test, y_test["target"])

                self.y_pred_AL.loc[:, self.exp_id] = preds
                self.y_test_AL.loc[:, self.exp_id] = y_test["target"].values
                
                next_sample["target_SUP"] = y_test["target"]
                next_sample["target_SUP"] = y_test["target"]
                next_sample["preds_AL"] = preds
                next_sample["preds_SUP"] = preds_sup

                
                ########################################################

                
            
            
                self.p_var.append(pred_var)
                self.df = pd.concat([self.df, next_sample], axis=0, sort=True)
                
                X_train, y_train= self.split_data(self.df)
                self.train_model(X_train, self.df["target"])
            
        print("total label queries: ", i)
            
            

In [6]:
def import_data():
    try:
        print("Reading generated data...")
        path = r'../mod_datasets/gen_data' # use your path
        all_files = glob.glob(path + "/*.csv")
        print(all_files)
        li = []

        for filename in all_files:
            df_temp = pd.read_csv(filename, sep='\t', header=0, index_col=0, parse_dates=['GMT'], low_memory=False)
            li.append(df_temp)

        df = pd.concat(li, axis=0, ignore_index=False)
        df['tariff'] = 0
        df.loc[(df.expected-df.response!=0),'tariff'] = 1
        
#         df_1 = pd.read_csv('~/Documents/work/Active-Learning-TUD-Thesis/mod_datasets/samples.csv', sep='\t', header=0, index_col=0, parse_dates=['GMT'], low_memory=False)
#         df_2 = pd.read_csv('~/Documents/work/Active-Learning-TUD-Thesis/mod_datasets/gen_data.csv', sep='\t', header=0, index_col=0, parse_dates=['GMT'], low_memory=False)
#         df_3 = pd.read_csv('~/Documents/work/Active-Learning-TUD-Thesis/mod_datasets/gen_data1.csv', sep='\t', header=0, index_col=0, parse_dates=['GMT'], low_memory=False)
#         df_4 = pd.read_csv('~/Documents/work/Active-Learning-TUD-Thesis/mod_datasets/gen_data2.csv', sep='\t', header=0, index_col=0, parse_dates=['GMT'], low_memory=False)
        df['tariff_original'] = df['tariff']
        
    except Exception as e: print(e)
        
    return df

In [7]:
def _init():
    df = import_data()
    
    try:
        gen = Generator(df)
        trials_ = activeLearner(params)
        
    except Exception as e: print(e)    
    
    return gen, trials_#, selectsample

In [8]:
def plot_bokeh(y1, y2, x, title, xlabel, ylabel):
    output_notebook()
    file_name = "../temp/" + title + ".html"
#     output_file(file_name) #Uncom`ment it to save the plot in html file


    p=figure(plot_width=800, plot_height=400, title = title, x_axis_label = xlabel, y_axis_label = ylabel,)
    p.line(x, y1, line_width=1, legend = "MSE for supervised learning", color='blue')
    p.line(x, y2, line_width=1, legend = "MSE for Active learning", color='orange')

    show(p)

In [9]:
if __name__ == '__main__':
#     # import data and declare classes
    gen, trials_= _init()
    df_AL = pd.DataFrame()
    df_SUP = pd.DataFrame()

    # start the simulator and active learning by membership query synthesis
#     for i in range(2):
    trials_.run()
#     AL_error = trials_.y_pred_AL - trials_.y_test_AL
#     AL_mean = AL_error.mean(axis=0).abs()
#     df_AL.loc[:,i] = AL_mean

#     SUP_error = trials_.y_pred_SUP - trials_.y_test_SUP
#     SUP_mean = SUP_error.mean(axis=0).abs()
#     df_SUP.loc[:,i] = SUP_mean

        
    
#     for i in range(10):
#         trials_.get_log_error()
#         AL_error = trials_.y_preds - trials_.y_test
#         err_mean = (AL_error**2).mean(axis=0).abs()
#         df_Err.loc[:,i] = err_mean

#         SUP_error = trials_.y_pred_SUP - trials_.y_test_SUP
#         SUP_mean = SUP_error.mean(axis=0).abs()
#         df_SUP.loc[:,i] = SUP_mean

    
    
    
    
    
    
    
    
    
        
#     plot_bokeh(trials_.p_var, trials_.e_mean, "Relation between predication variance of trees and prediction error", "Prediction error", "Prediction variance")


#     plt.ylim(0, 0.002)
#     x = [abs(number) for number in trials_.e_mean] 
# #     print(x)
#     plt.scatter(x, trials_.p_var)
#     plt.title('Scatter plot of predication variance of trees and prediction error')
#     plt.xlabel('Prediction error')
#     plt.ylabel('Prediction variance')
#     plt.show()

#     list_ = [i for i in range(len(mse))]
#     d = {'0':mse}
    
#     for i in range(params["total_iterations"]):
#         print("Iteration", i+1) 
#         trials_.run()
    
    
    
    
    
#     title = "Mean Squared Error vs Number of samples"
#     xlabel = "Number of iterations"
#     ylabel = "MSE"
#     list_ = [i for i in range(params["total_experiments"])]
    
#     plot_bokeh(mse_total.mean(axis=1), list_, title, xlabel, ylabel)
    
    
    
    
    today = date.today()

    
    file_name = "../results/generated_data" + str(today) + ".csv"
    trials_.df.to_csv(file_name, sep='\t')
        
    file_name = "../results/AL_predictions" + str(today) + ".csv"
    trials_.y_pred_AL.to_csv(file_name, sep='\t')
    
    file_name = "../results/AL_actuals" + str(today) + ".csv"
    trials_.y_test_AL.to_csv(file_name, sep='\t')
    
    file_name = "../results/SUP_predictions" + str(today) + ".csv"
    trials_.y_pred_AL.to_csv(file_name, sep='\t')
    
    file_name = "../results/SUP_actuals" + str(today) + ".csv"
    trials_.y_test_AL.to_csv(file_name, sep='\t')
    
    file_name = "../results/feature_set" + str(today) + ".txt"
    with open(file_name, "w") as output:
        output.write(str(params["X_var_activeL"]))
    
    print("Data saved.")
#     print("summary:")

Reading generated data...
['../mod_datasets/gen_data/gen_data2.csv', '../mod_datasets/gen_data/gen_data1.csv', '../mod_datasets/gen_data/gen_data.csv', '../mod_datasets/gen_data/samples.csv']


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  del sys.path[0]
100%|██████████| 100/100 [00:01<00:00, 50.29it/s]
100%|██████████| 20/20 [02:02<00:00,  8.93s/it]


total label queries:  7
Data saved.


In [13]:
trials_.df.loc[:,["response", "target"]]#.isna().sum()

Unnamed: 0_level_0,response,target
GMT,Unnamed: 1_level_1,Unnamed: 2_level_1
2012-07-05 00:00:00,0.125211,0.125211
2012-07-05 00:30:00,0.110921,0.110921
2012-07-05 01:00:00,0.103637,0.103637
2012-07-05 01:30:00,0.097731,0.097731
2012-07-05 02:00:00,0.096297,0.096297
2012-07-05 02:30:00,0.093658,0.093658
2012-07-05 03:00:00,0.090907,0.090907
2012-07-05 03:30:00,0.091509,0.091509
2012-07-05 04:00:00,0.092210,0.092210
2012-07-05 04:30:00,0.093814,0.093814


In [None]:
plt.ylim(0, 0.002)
plt.xlim(0, 0.04)
x = [abs(number) for number in trials_.e_mean] 
#     print(x)
plt.scatter(x, trials_.p_var)
plt.title('Scatter plot of predication variance of trees and prediction error')
plt.xlabel('Prediction error')
plt.ylabel('Prediction variance')
plt.show()


In [19]:
# title = "Mean Squared Error vs Number of samples"
# xlabel = "Number of iterations"
# ylabel = "MSE"
# iter_ = [100, 500, 1000, 2000, 3000]
# plot_bokeh(iter_, trials.list_Err, list_, title, xlabel, ylabel)

iter_ = [100, 500, 1000, 2000, 3000,5000]

output_notebook()
# file_name = "../temp/" + title + ".html"
# output_file(file_name) #Uncom`ment it to save the plot in html file
p=figure(plot_width=800, plot_height=400, title = 'Performance measure of Random forest model (log-log scale)', x_axis_label = "No. of samples", y_axis_label = "Mean Squared Error of prediction", x_axis_type = 'log', y_axis_type = 'log')
p.line(iter_, trials_.list_Err, line_width=1, color='blue')

show(p)


In [14]:
a = trials_.list_Err[3]
b = trials_.list_Err[4]
trials_.list_Err[3] = -b
trials_.list_Err[4] = -a

In [102]:
trials_.y_test_AL.tail()

Unnamed: 0,iter1_exp0,iter1_exp1,iter1_exp2,iter1_exp3,iter1_exp4,iter1_exp5,iter1_exp6,iter1_exp7,iter1_exp8,iter1_exp9,...,iter10_exp81,iter10_exp82,iter10_exp83,iter10_exp84,iter10_exp85,iter10_exp86,iter10_exp87,iter10_exp88,iter10_exp89,iter10_exp90
43,0.304246,0.242209,0.230245,0.370123,0.374157,0.237464,0.232774,0.222977,0.246068,0.24603,...,0.258916,0.344293,0.2452,0.24238,0.351111,0.334372,0.24238,0.242467,0.232376,0.250566
44,0.294993,0.221295,0.215751,0.348455,0.348025,0.211685,0.212779,0.209051,0.221172,0.228271,...,0.229441,0.318819,0.213988,0.213253,0.324608,0.317604,0.213253,0.210601,0.214292,0.21491
45,0.270595,0.204393,0.195594,0.313613,0.328891,0.18131,0.190747,0.19072,0.191654,0.205398,...,0.204336,0.299725,0.184399,0.187676,0.306299,0.294873,0.187676,0.181302,0.192959,0.186914
46,0.254512,0.181883,0.172267,0.27942,0.292466,0.156698,0.164758,0.178177,0.169541,0.183811,...,0.16865,0.265579,0.154734,0.161138,0.280853,0.262782,0.161138,0.158923,0.160708,0.16147
47,0.231507,0.157664,0.154695,0.241535,0.258621,0.134898,0.141893,0.158947,0.151605,0.153944,...,0.151542,0.228198,0.129204,0.136608,0.247982,0.223292,0.136608,0.139664,0.136635,0.140527


In [15]:
# AL_error = trials_.y_pred_AL - trials_.y_test_AL
# AL_mean = AL_error.mean(axis=0).abs()

# SUP_error = trials_.y_pred_SUP - trials_.y_test_SUP
# SUP_mean = SUP_error.mean(axis=0).abs()


# output_notebook()
# # file_name = "../temp/" + title + ".html"
# # output_file(file_name) #Uncom`ment it to save the plot in html file
# p=figure(plot_width=800, plot_height=400,  x_axis_label = "time slot of day", y_axis_label = "energy consumption",)
# p.line(trials_.y_pred_AL.index, trials_.y_test_AL.loc[:,"iter1_exp50":"iter1_exp51"].mean(axis=1).values, line_width=1, color='blue')
# p.line(trials_.y_pred_AL.index, trials_.y_pred_AL.loc[:,"iter1_exp50":"iter1_exp51"].mean(axis=1).values, line_width=1, color='orange')

# show(p)

list_ = [i for i in range(len(trials_.df))]


output_notebook()
file_name = "../temp/online_predictions_" + str(date.today()) + ".html"
output_file(file_name) #Uncom`ment it to save the plot in html file
p=figure(plot_width=800, plot_height=400,  x_axis_label = "No of data points", y_axis_label = "energy consumption",)
p.line(list_, trials_.df.loc[:,"target"].values, line_width=1, color='blue', legend = "Actual consumption in kWHr")
p.line(list_, trials_.df.loc[:,"preds"].values, line_width=1, color='orange', legend = "Predicted consumption in kWHr")

show(p)

In [10]:
trials_.df.loc[:,"preds"]

GMT
2014-01-13 00:00:00         NaN
2014-01-13 00:30:00         NaN
2014-01-13 01:00:00         NaN
2014-01-13 01:30:00         NaN
2014-01-13 02:00:00         NaN
2014-01-13 02:30:00         NaN
2014-01-13 03:00:00         NaN
2014-01-13 03:30:00         NaN
2014-01-13 04:00:00         NaN
2014-01-13 04:30:00         NaN
2014-01-13 05:00:00         NaN
2014-01-13 05:30:00         NaN
2014-01-13 06:00:00         NaN
2014-01-13 06:30:00         NaN
2014-01-13 07:00:00         NaN
2014-01-13 07:30:00         NaN
2014-01-13 08:00:00         NaN
2014-01-13 08:30:00         NaN
2014-01-13 09:00:00         NaN
2014-01-13 09:30:00         NaN
2014-01-13 10:00:00         NaN
2014-01-13 10:30:00         NaN
2014-01-13 11:00:00         NaN
2014-01-13 11:30:00         NaN
2014-01-13 12:00:00         NaN
2014-01-13 12:30:00         NaN
2014-01-13 13:00:00         NaN
2014-01-13 13:30:00         NaN
2014-01-13 14:00:00         NaN
2014-01-13 14:30:00         NaN
                         ...   
2012

In [79]:
output_notebook()
# file_name = "../temp/" + title + ".html"
# output_file(file_name) #Uncom`ment it to save the plot in html file
p=figure(plot_width=800, plot_height=400,  x_axis_label = "time slot of day", y_axis_label = "energy consumption",)
p.line(trials_.y_pred_AL.index, trials_.y_test_AL.loc[:,"iter1_exp50":"iter1_exp85"].mean(axis=1).values, line_width=1, color='blue')
p.line(trials_.y_pred_AL.index, trials_.y_pred_AL.loc[:,"iter1_exp50":"iter1_exp85"].mean(axis=1).values, line_width=1, color='orange')

show(p)


RuntimeError: Columns need to be 1D (y is not)

In [126]:
file_name = "../results/SUP_errors" + str(today) + ".csv"
df_SUP.to_csv(file_name, sep='\t')

file_name = "../results/AL_errors" + str(today) + ".csv"
df_AL.to_csv(file_name, sep='\t')
