# Format LCLid half-hourly data into a consistent dataset

The steps used to build the dataset loaded here can be found in the notebook `1_4_data_wrangling_4_6_forecast_NN_hh`.

Here we export individual household data with a consistent total number of sample points and consistent start/stop dates.



In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# (Re)load the autoreload extension and reload imported modules before each
# cell runs, so edits to local .py files are picked up without a kernel restart.
%reload_ext autoreload
%autoreload 2

In [3]:
# The star imports stay first so the explicit imports below always win on any
# name clash.
from fastai.structured import *
from fastai.column_data import *
import feather as ftr
from datetime import timedelta
import os

In [4]:
# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.options.display.max_columns = None
# Silence the chained-assignment (SettingWithCopy) warning.
pd.set_option('mode.chained_assignment', None)  # default='warn'

In [5]:
# Directory prefix for the merged input data; the trailing slash is required
# because file names are appended to it directly.
PATH = '../input/merged_data/'

In [6]:
from IPython.display import HTML, display

### Read in final pre-processed dataset

In [7]:
# Load the final pre-processed half-hourly dataset from the merged-data folder.
merged_csv_path = f'{PATH}hh_final_544_ids_735_days.csv'
df = pd.read_csv(merged_csv_path, low_memory=False)

In [8]:
# Peek at the first two rows to check the columns that were read.
df.head(2)

Unnamed: 0.1,Unnamed: 0,index,LCLid,energy(kWh/hh),dayYear,dayMonth,dayWeek,dayDay,dayDayofweek,dayDayofyear,dayIs_month_end,dayIs_month_start,dayIs_quarter_end,dayIs_quarter_start,dayIs_year_end,dayIs_year_start,dayElapsed,delta_minutes,visibility,windBearing,temperature,dewPoint,pressure,apparentTemperature,windSpeed,precipType,humidity,summary,is_bank_holiday,bank_holiday,day,cloudCover,uvIndex,moonPhase,from_sunrise,to_sunset,Afteris_bank_holiday,Beforeis_bank_holiday,stdorToU,Acorn,Acorn_grouped
0,0,2012-02-05 00:00:00,MAC000006,0.042,2012,2,5,5,6,36,False,False,False,False,False,False,1328400000,-360000,1.32,160.0,-0.12,-0.22,1024.21,-4.68,4.35,snow,0.99,Foggy,False,,2012-02-05,0.85,1.0,0.42,454.0,-1017.0,0,-87840,Std,ACORN-Q,Adversity
1,1,2012-02-05 00:00:00,MAC005178,0.561,2012,2,5,5,6,36,False,False,False,False,False,False,1328400000,-360000,1.32,160.0,-0.12,-0.22,1024.21,-4.68,4.35,snow,0.99,Foggy,False,,2012-02-05,0.85,1.0,0.42,454.0,-1017.0,0,-87840,Std,ACORN-E,Affluent


In [9]:
# The CSV's 'index' column holds the half-hourly timestamp; give it a
# descriptive name.
df.rename({'index': 'day_time'}, axis='columns', inplace=True)

In [10]:
# Drop the leftover 'Unnamed: 0' column that came in with the CSV.
# errors='ignore' keeps this cell safe to re-run: without it a second
# execution raises KeyError because the column has already been removed.
df.drop(columns=['Unnamed: 0'], errors='ignore', inplace=True)

In [11]:
# Convert the timestamp strings to datetimes using an explicit format.
day_time_format = '%Y-%m-%d %H:%M:%S'
df['day_time'] = pd.to_datetime(df['day_time'], format=day_time_format)

Count NaNs in our target column

In [12]:
# Number of half-hourly readings with a missing energy value.
df['energy(kWh/hh)'].isnull().sum()

6207

In [13]:
#which columns have nan
# Flag, per column, whether it contains at least one NaN.
df.isnull().any()

day_time                 False
LCLid                    False
energy(kWh/hh)            True
dayYear                  False
dayMonth                 False
dayWeek                  False
dayDay                   False
dayDayofweek             False
dayDayofyear             False
dayIs_month_end          False
dayIs_month_start        False
dayIs_quarter_end        False
dayIs_quarter_start      False
dayIs_year_end           False
dayIs_year_start         False
dayElapsed               False
delta_minutes            False
visibility               False
windBearing              False
temperature              False
dewPoint                 False
pressure                 False
apparentTemperature      False
windSpeed                False
precipType               False
humidity                 False
summary                  False
is_bank_holiday          False
bank_holiday              True
day                      False
cloudCover                True
uvIndex                   True
moonPhas

In [14]:
# Keep only the rows whose energy reading is missing, for inspection.
energy_missing_mask = df['energy(kWh/hh)'].isna()
nan_energy = df[energy_missing_mask]

In [15]:
# Show the first two rows that lack an energy reading.
nan_energy.head(2)

Unnamed: 0,day_time,LCLid,energy(kWh/hh),dayYear,dayMonth,dayWeek,dayDay,dayDayofweek,dayDayofyear,dayIs_month_end,dayIs_month_start,dayIs_quarter_end,dayIs_quarter_start,dayIs_year_end,dayIs_year_start,dayElapsed,delta_minutes,visibility,windBearing,temperature,dewPoint,pressure,apparentTemperature,windSpeed,precipType,humidity,summary,is_bank_holiday,bank_holiday,day,cloudCover,uvIndex,moonPhase,from_sunrise,to_sunset,Afteris_bank_holiday,Beforeis_bank_holiday,stdorToU,Acorn,Acorn_grouped
289,2012-02-05 00:00:00,MAC004954,,2012,2,5,5,6,36,False,False,False,False,False,False,1328400000,-360000,1.32,160.0,-0.12,-0.22,1024.21,-4.68,4.35,snow,0.99,Foggy,False,,2012-02-05,,,,,,0,-87840,Std,ACORN-E,Affluent
6100,2012-02-05 05:30:00,MAC000041,,2012,2,5,5,6,36,False,False,False,False,False,False,1328400000,-359670,4.165,101.5,0.38,0.015,1023.81,-1.47,1.605,snow,0.975,Mostly Cloudy,False,,2012-02-05,,,,,,0,-87840,Std,ACORN-Q,Adversity


Ideally we would investigate further and interpolate or subset the data to remove the NaNs, but we don't have time, so we just replace every NaN with zero.

In [16]:
# Replace every NaN with zero, in place. This applies to all columns that
# contain NaNs, not only the energy target.
df.fillna(value=0, inplace=True)

In [1]:
# Disabled inspection helper: uncomment the line below to display column dtypes.
#df.dtypes

Export each household as a separate file - for ML forecasting

We don't actually need all of these, as we only have time to forecast 10 or so households, but we export them all anyway.

In [17]:
# Write one CSV per household into {PATH}LCLid/clean/, named after the
# lower-cased LCLid, without the DataFrame index.
#
# A plain loop over the groups replaces groupby().apply(): apply() is meant
# for computing results, not for side effects. It builds (and displays) a
# throwaway result of None values, and depending on the pandas version it may
# call the function twice on the first group or omit the grouping column from
# the frame it passes in. Iterating always yields the full group, LCLid included.
household_dir = "{0}LCLid/clean/".format(PATH)
os.makedirs(household_dir, exist_ok=True)  # to_csv does not create missing directories

for lclid, household in df.groupby('LCLid'):
    household.to_csv("{0}{1}.csv".format(household_dir, lclid.lower()), index=False)

Generate a numeric-only dataset (plus the `day_time` and `LCLid` key columns) for KNN-based clustering

In [20]:
# Key columns (timestamp, household id) followed by the numeric energy and
# weather features kept for clustering.
clustering_columns = [
    'day_time',
    'LCLid',
    'energy(kWh/hh)',
    'dayElapsed',
    'visibility',
    'windBearing',
    'temperature',
    'dewPoint',
    'pressure',
    'apparentTemperature',
    'windSpeed',
    'humidity',
]
df1 = df[clustering_columns]

In [21]:
# Save the clustering subset next to the merged input data.
# NOTE(review): the DataFrame index is written as an extra unnamed column here,
# whereas the per-household export passes index=False — confirm this is intended
# before changing it, since a downstream reader may expect that column.
clustering_csv_path = f'{PATH}hh_for_clustering_544_ids_735_days.csv'
df1.to_csv(clustering_csv_path)