In [1]:
library(tidyr)
library(dplyr)
library(lubridate)

load = "Load_history.csv"
temp = "temperature_history.csv" 
solution = "Load_solution.csv"

setClass("num.with.commas")
setAs("character", "num.with.commas", function(from) as.numeric(gsub(",", "", from) ) )

loadRawClasses = c('factor', rep("numeric", 3), rep("num.with.commas", 24))

df = read.csv(load, stringsAsFactors=FALSE, colClasses=loadRawClasses)

"package 'dplyr' was built under R version 3.6.3"
Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

"package 'lubridate' was built under R version 3.6.3"
Attaching package: 'lubridate'

The following objects are masked from 'package:base':

    date, intersect, setdiff, union



In [2]:
head(df)

zone_id,year,month,day,h1,h2,h3,h4,h5,h6,...,h15,h16,h17,h18,h19,h20,h21,h22,h23,h24
1,2004,1,1,16853,16450,16517,16873,17064,17727,...,13518,13138,14130,16809,18150,18235,17925,16904,16162,14750
1,2004,1,2,14155,14038,14019,14489,14920,16072,...,16127,15448,15839,17727,18895,18650,18443,17580,16467,15258
1,2004,1,3,14439,14272,14109,14081,14775,15491,...,13507,13414,13826,15825,16996,16394,15406,14278,13315,12424
1,2004,1,4,11273,10415,9943,9859,9881,10248,...,14207,13614,14162,16237,17430,17218,16633,15238,13580,11727
1,2004,1,5,10750,10321,10107,10065,10419,12101,...,13845,14350,15501,17307,18786,19089,19192,18416,17006,16018
1,2004,1,6,15742,15682,16132,16761,17909,20234,...,18762,19162,21509,25314,28060,28768,28919,28653,27406,26507


In [3]:
tidyLoadDf = df %>%  gather(Hour, Consumption, h1:h24) %>%
                 mutate(Hour=as.numeric(substr(Hour, 2, nchar(Hour)))-1) %>%
                 unite(Date, year, month, day, sep='-') %>%
                 unite(DateTime, Date, Hour, sep=' ') %>%
                 mutate(DateTime=as.POSIXct(strptime(DateTime, "%Y-%m-%d %H", tz = "GMT"))) %>%
                 arrange(DateTime, zone_id)

In [4]:
#Spead consumption for each zone into seperate columns, so that we can easily select consumption for each zone later
tidyLoadDf = tidyLoadDf %>% mutate(zone_id=paste0('zone.',as.character(zone_id))) %>%
                      spread(zone_id, Consumption, fill = NA, convert = FALSE)
#Remove 2008/June data, because it's not complete, Create Zone 21 as sum of 20 zones consumption
tidyLoadDf = tidyLoadDf %>% filter(DateTime<as.POSIXct(strptime("2008-07-07 23:00:00", "%Y-%m-%d %H:%M:%S")))
#Add total consumption to zone.21
tidyLoadDf %>% select(zone.1:zone.20) %>% rowSums(na.rm=TRUE) -> tidyLoadDf$zone.21

In [5]:
head(tidyLoadDf)

DateTime,zone.1,zone.10,zone.11,zone.12,zone.13,zone.14,zone.15,zone.16,zone.17,...,zone.2,zone.20,zone.3,zone.4,zone.5,zone.6,zone.7,zone.8,zone.9,zone.21
2004-01-01 00:00:00,16853,23339,90700,118378,20673,21791,65970,28752,30645,...,126259,79830,136233,484,6829,133088,136233,3124,75243,906434
2004-01-01 01:00:00,16450,22100,86699,112480,19666,21400,64600,27851,30461,...,123313,77429,133055,457,6596,129909,133055,2956,67368,878111
2004-01-01 02:00:00,16517,21376,84243,108435,19020,20998,63843,27631,30197,...,119192,75558,128608,450,6525,125717,128608,2953,64050,858831
2004-01-01 03:00:00,16873,21335,84285,107224,18841,21214,64023,27986,30264,...,117507,75709,126791,448,6654,124162,126791,2914,63861,856402
2004-01-01 04:00:00,17064,21564,86087,108870,19310,21830,65679,29160,30907,...,118343,77475,127692,444,6977,125320,127692,3221,75852,876919
2004-01-01 05:00:00,17727,22241,90210,112395,19415,21794,63305,29226,31617,...,121228,77854,130805,490,7330,128558,130805,3361,79989,891806


In [6]:
ensembleClasses = c('NULL', 'factor', rep("numeric", 27))
df = read.csv(solution, stringsAsFactors=FALSE, colClasses=ensembleClasses)
tidyEnsembleDf = df %>%  gather(Hour, Consumption, h1:h24) %>%
                 mutate(Hour=as.numeric(substr(Hour, 2, nchar(Hour)))-1) %>%
                 unite(Date, year, month, day, sep='-') %>%
                 unite(DateTime, Date, Hour, sep=' ') %>%
                 mutate(DateTime=as.POSIXct(strptime(DateTime, "%Y-%m-%d %H", tz = "GMT"))) %>%
                 arrange(DateTime) %>%
                 mutate(zone_id=paste0('zone.',as.character(zone_id))) %>%
                 spread(zone_id, Consumption, fill = NA, convert = FALSE) %>% #Remove forecast, only need backcast
                 filter(DateTime<as.POSIXct(strptime("2008-07-07 23:00:00", "%Y-%m-%d %H:%M:%S")))

In [7]:
head(tidyEnsembleDf)

DateTime,zone.1,zone.10,zone.11,zone.12,zone.13,zone.14,zone.15,zone.16,zone.17,...,zone.2,zone.20,zone.21,zone.3,zone.4,zone.5,zone.6,zone.7,zone.8,zone.9
2005-03-06 00:00:00,19964,26459,113552,140417,21302,27740,74218,40411,36845,...,162096,89800,1719688,174901,528,9061,171157,174901,4091,61215
2005-03-06 01:00:00,19544,25979,112515,137418,20466,27713,73397,40408,36745,...,160890,88325,1703132,173600,499,8697,169587,173600,3971,61131
2005-03-06 02:00:00,19390,25727,113244,137204,20255,27946,73522,40840,37248,...,160924,87680,1720151,173637,469,8595,169519,173637,3975,73038
2005-03-06 03:00:00,19442,25916,113990,138264,20117,28668,74331,41700,37825,...,158962,87966,1725012,171521,486,8669,167631,171521,3966,74193
2005-03-06 04:00:00,19755,26132,115297,140202,20381,29345,75470,42943,38986,...,163197,88730,1762394,176090,497,8941,172138,176090,4031,73710
2005-03-06 05:00:00,20008,26571,118413,144008,19468,27609,70692,41868,39069,...,165197,86399,1758234,178248,524,9360,174557,178248,4158,73689


In [8]:
fullDf = tidyLoadDf
fullDf[is.na(fullDf$zone.1), ] = tidyEnsembleDf[, names(fullDf)]

ERROR: Error in `[<-.data.frame`(`*tmp*`, is.na(fullDf$zone.1), , value = structure(list(: replacement element 1 has 1344 rows, need 1527


In [None]:
head(fullDf)

In [None]:
write.csv(fullDf, "complete_load.csv", row.names=FALSE)