In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Load the raw readings from the Kaggle input directory.
csv_path = "/kaggle/input/house-hold-energy-data/D202.csv"
df_train = pd.read_csv(csv_path)

In [None]:
# Bare expression: the notebook renders the loaded frame as this cell's output.
df_train

# Data Description

In [None]:
# Print the column overview (names, non-null counts, dtypes) of the raw frame.
df_train.info()

In [None]:
# Preview the first rows of the raw frame.
df_train.head()

In [None]:
# For every non-numeric column, print its name followed by the frequency of each value.
non_numeric_cols = df_train.select_dtypes(exclude="number").columns
for col_name in non_numeric_cols:
    frequencies = df_train[col_name].value_counts()
    print("-----------------")
    print("{}\n".format(col_name))
    print("{}\n".format(frequencies))

* Type: only value is "electric usage"
* date not in datetime format
* start / end time: in hh:mm format but not datetime
* maybe a good idea to calculate the duration
* units: only value "kWh"
* cost: metric scaled variable is in string format

In [None]:
# One distribution plot (histogram + density) per numeric column, side by side.
numeric_cols = df_train.select_dtypes(include="number").columns
fig, sub = plt.subplots(1, 2, figsize=(16, 8))

for axis, col_name in zip(sub, numeric_cols):
    sns.distplot(df_train[col_name], ax=axis, hist_kws={"edgecolor": "black"})
    axis.grid()

* notes: no metric values in feature
* usage: right-skewed distribution

In [None]:
# Share of missing entries per column: null count divided by the number of rows.
row_total = df_train.shape[0]
df_train.isnull().sum().div(row_total)

* NOTES is missing 100 % 

# Data Processing

In [None]:
# Split off the non-numeric columns and show both ends of that subset.
categorical_feat = df_train.select_dtypes(exclude="number")
preview_parts = [categorical_feat.head(), categorical_feat.tail()]
pd.concat(preview_parts, axis=0)

* drop type and units, because only one value
* transform date, start time and end time to datetime
* transform cost to float

## drop type and units

In [None]:
# TYPE and UNITS are removed from the categorical subset.
categorical_feat = categorical_feat.drop(columns=["TYPE", "UNITS"])

## Transform date, start time and end time to datetime

In [None]:
# Combine the DATE string with each clock-time string and parse the result
# into a timestamp column (START from "START TIME", END from "END TIME").
timestamp_format = "%m/%d/%Y %H:%M"
for target_col, clock_col in (("START", "START TIME"), ("END", "END TIME")):
    combined_text = categorical_feat["DATE"] + ' ' + categorical_feat[clock_col]
    categorical_feat[target_col] = pd.to_datetime(combined_text, format=timestamp_format)

In [None]:
# The raw date/time strings are dropped once START and END exist.
categorical_feat = categorical_feat.drop(columns=["DATE", "START TIME", "END TIME"])

In [None]:
# Derive calendar components from the START timestamp.
# The vectorized `.dt` accessor replaces the row-wise `apply(..., axis=1)`,
# which invoked a Python lambda once per row to read the same attributes.
start_parts = categorical_feat["START"].dt
categorical_feat["year"] = start_parts.year
categorical_feat["month"] = start_parts.month
categorical_feat["day"] = start_parts.day
categorical_feat["hour"] = start_parts.hour

In [None]:
# Bare expression: display the frame after adding year / month / day / hour.
categorical_feat

## transform cost to float

In [None]:
# Convert COST from text to float.
# Stripping the currency symbol (and any thousands separator) explicitly is
# safer than slicing off the first character: it does not depend on the symbol
# being at position 0, and the cell can be re-run after COST is already numeric
# (the original `float(x[1:])` raised a TypeError on a float).
cost_text = categorical_feat["COST"].astype(str).str.replace(r"[$,]", "", regex=True).str.strip()
categorical_feat["COST"] = cost_text.astype(float)

## calculate duration

In [None]:
# Length of each reading interval: END timestamp minus START timestamp.
categorical_feat["duration"] = categorical_feat["END"].sub(categorical_feat["START"])

In [None]:
# Bare expression: display the frame after the COST conversion and the new duration column.
categorical_feat

## join cat and numerical data

In [None]:
# Put the transformed columns and the numeric USAGE column side by side.
frames_to_join = [categorical_feat, df_train["USAGE"]]
df_train_trans = pd.concat(frames_to_join, axis=1)

# Data Analysis

## in which time frame lies the data?

In [None]:
# Report the earliest and latest timestamp of the START and END columns.
start_stamps = df_train_trans["START"]
end_stamps = df_train_trans["END"]

print("first data point in time, START TIME: {}".format(start_stamps.min()))
print("last data point in time, START TIME: {}".format(start_stamps.max()))
print("\n")
print("first data point in time, END TIME: {}".format(end_stamps.min()))
print("last data point in time, END TIME: {}".format(end_stamps.max()))

## How long energy is consumed on average?

In [None]:
# Count how often each distinct interval length (END - START) occurs.
df_train_trans["duration"].value_counts()

* duration always 14 min 

## Usage throughout time 

### how are the values distributed on years, month, days, hours?

In [None]:
# Number of USAGE readings per year, month, day of month and hour,
# drawn as four bar charts in a single row.
usage_values = df_train_trans["USAGE"]
year_count = usage_values.groupby(df_train_trans["year"]).count()
month_count = usage_values.groupby(df_train_trans["month"]).count()
day_count = usage_values.groupby(df_train_trans["day"]).count()
hour_count = usage_values.groupby(df_train_trans["hour"]).count()

fig, sub = plt.subplots(1, 4, figsize=(35, 8))
count_series = [year_count, month_count, day_count, hour_count]
for axis, counts in zip(sub, count_series):
    sns.barplot(x=counts.index, y=counts, ax=axis, palette="Pastel1")
    axis.grid()
fig.tight_layout()

* not many values from 2016
* values equally distributed between month, day and hour

### is the energy consumption on average different depending on the year, month, day or hour?

In [None]:
# Mean USAGE (top row) and mean COST (bottom row) per year, month, day and hour.
usage_values = df_train_trans["USAGE"]
cost_values = df_train_trans["COST"]

year_usage_mean = usage_values.groupby(df_train_trans["year"]).mean()
month_usage_mean = usage_values.groupby(df_train_trans["month"]).mean()
day_usage_mean = usage_values.groupby(df_train_trans["day"]).mean()
hour_usage_mean = usage_values.groupby(df_train_trans["hour"]).mean()

year_cost_mean = cost_values.groupby(df_train_trans["year"]).mean()
month_cost_mean = cost_values.groupby(df_train_trans["month"]).mean()
day_cost_mean = cost_values.groupby(df_train_trans["day"]).mean()
hour_cost_mean = cost_values.groupby(df_train_trans["hour"]).mean()

list_mean = [
    year_usage_mean, month_usage_mean, day_usage_mean, hour_usage_mean,
    year_cost_mean, month_cost_mean, day_cost_mean, hour_cost_mean,
]

fig, sub = plt.subplots(2, 4, figsize=(35, 8))
# Row-major traversal: the first four axes take the usage means, the last four the cost means.
for axis, means in zip(sub.ravel(), list_mean):
    sns.barplot(x=means.index, y=means, ax=axis, palette="Pastel1")
    axis.grid()
fig.tight_layout()

* from 2016 to 2018 the energy consumption has on average been reduced (hypothesis: better insulation and more responsible usage?)
* in the winter months (Nov - March) the energy consumption is higher than in the summer months
* no difference observable between days
* higher energy consumption from 19 to 23 h (evening activities?) and 7 - 8 h (rush hour to work?)

In [None]:
import scipy.stats as stats

In [None]:
# Compare USAGE of one month (month == 5) under several common normalising
# transformations. Top row: distribution plot; bottom row: normal probability plot.
#
# The month filter is evaluated once and each transformation is computed once
# (the original repeated the filter ten times and fitted Box-Cox twice), and every
# panel is titled so the transformations can be told apart in the figure.
may_usage = df_train_trans.loc[df_train_trans["month"] == 5, "USAGE"]

transformed_usage = [
    ("raw", may_usage),
    ("log1p", np.log1p(may_usage)),
    ("square root", may_usage ** (1 / 2)),
    ("reciprocal", 1 / may_usage),
    # boxcox returns (transformed values, fitted lambda); only the values are plotted.
    ("Box-Cox", stats.boxcox(may_usage)[0]),
]

fig, sub = plt.subplots(2, len(transformed_usage), figsize=(20, 7))

for column, (label, values) in enumerate(transformed_usage):
    sns.distplot(values, ax=sub[0][column], hist_kws={"edgecolor": "black"})
    sub[0][column].set_title(label)

    stats.probplot(values, dist=stats.norm, plot=sub[1][column])
    # Set after probplot so the label is not overwritten by probplot's own title.
    sub[1][column].set_title("Probability plot: {}".format(label))

fig.tight_layout()

The distribution of usage is too skewed: even with common transformation techniques it is not possible to establish a normal distribution, so ANOVA / t-tests cannot be applied.

In [None]:
# Kruskal-Wallis H-test across the USAGE samples of each year.
usage_per_year = [
    df_train_trans.loc[df_train_trans["year"] == yr, "USAGE"]
    for yr in df_train_trans["year"].unique()
]
stats.kruskal(*usage_per_year)

In [None]:
from itertools import combinations
import scipy.stats as stats

# Pairwise Mann-Whitney U tests of USAGE between all pairs of years.
# The years come from the data instead of a hard-coded list, so a missing or
# additional year neither breaks the loop nor gets skipped.
year_list = sorted(df_train_trans["year"].unique())
usage_by_year = {
    year: df_train_trans.loc[df_train_trans["year"] == year, "USAGE"]
    for year in year_list
}

for feat1, feat2 in combinations(year_list, 2):
    # Unpack the result: the original printed the whole (statistic, p-value)
    # result under the "p-value" label. `alternative` is set explicitly so the
    # test is two-sided regardless of the installed SciPy's default.
    u_stat, p = stats.mannwhitneyu(usage_by_year[feat1], usage_by_year[feat2], alternative="two-sided")
    print("p-value of mann-Whitney-test between {} and {}: {}".format(feat1,feat2, p))

* significant change in average energy consumption over the years

In [None]:
# Pairwise Mann-Whitney U tests of USAGE between all pairs of months;
# only pairs with p < 0.05 are printed.
# NOTE(review): one test is run per pair and the 0.05 threshold is not adjusted
# for multiple comparisons - consider a correction before interpreting the list.
month_list = df_train_trans["month"].unique()
# Slice each month's usage once instead of re-filtering the frame for every pair.
usage_by_month = {
    month: df_train_trans.loc[df_train_trans["month"] == month, "USAGE"]
    for month in month_list
}

for feat1, feat2 in combinations(month_list, 2):
    # Explicitly two-sided: the question is whether the months differ at all,
    # and the result should not depend on the installed SciPy's default.
    t, p = stats.mannwhitneyu(usage_by_month[feat1], usage_by_month[feat2], alternative="two-sided")
    if p < 0.05:
        print("p-value of Mann-Whitney-Test between {} and {}: {}".format(feat1,feat2, p))

* there is a significant difference between most months: the pairs printed above are the significant ones (p < 0.05), while pairs that are not listed show no significant difference

In [None]:
# Kruskal-Wallis H-test across the USAGE samples of each day of the month.
usage_per_day = [
    df_train_trans.loc[df_train_trans["day"] == day_number, "USAGE"].values
    for day_number in df_train_trans["day"].unique()
]
stats.kruskal(*usage_per_day)

* there is a significant difference between the unique days in month
* quite surprising

In [None]:
# Kruskal-Wallis H-test across the USAGE samples of each hour of the day.
usage_per_hour = [
    df_train_trans.loc[df_train_trans["hour"] == hour_number, "USAGE"].values
    for hour_number in df_train_trans["hour"].unique()
]
stats.kruskal(*usage_per_hour)

In [None]:
# Pairwise Mann-Whitney U tests of USAGE between all pairs of hours;
# only pairs with p < 0.05 are printed.
# NOTE(review): one test is run per pair and the 0.05 threshold is not adjusted
# for multiple comparisons - consider a correction before interpreting the list.
hour_list = df_train_trans["hour"].unique()
# Slice each hour's usage once instead of re-filtering the frame for every pair.
usage_by_hour = {
    hour: df_train_trans.loc[df_train_trans["hour"] == hour, "USAGE"]
    for hour in hour_list
}

for feat1, feat2 in combinations(hour_list, 2):
    # Explicitly two-sided: the question is whether the hours differ at all,
    # and the result should not depend on the installed SciPy's default.
    t, p = stats.mannwhitneyu(usage_by_hour[feat1], usage_by_hour[feat2], alternative="two-sided")
    if p < 0.05:
        print("p-value of Mann-Whitney-Test between {} and {}: {}".format(feat1,feat2, p))

## Visualization

Rolling mean

In [None]:
# Raw USAGE over time with its rolling mean overlaid in red.
# NOTE(review): the window is a number of rows, not a duration - confirm that
# 1440 rows is the intended smoothing span.
reading_start = df_train_trans["START"]
usage_series = df_train_trans["USAGE"]
usage_rolling_mean = usage_series.rolling(window=1440).mean()

fig, sub = plt.subplots(1, 1, figsize=(16, 8))
sns.lineplot(x=reading_start, y=usage_series, ax=sub)
sns.lineplot(x=reading_start, y=usage_rolling_mean, ax=sub, color="red");
sub.grid()

Cumulated Usage

In [None]:
# Running total of USAGE in row order, indexed by the START timestamp.
usage_by_start = df_train_trans.set_index("START")["USAGE"]
cumSum_usage = usage_by_start.cumsum()

fig, sub = plt.subplots(1, 1, figsize=(12, 4))
sns.lineplot(x=cumSum_usage.index, y=cumSum_usage.values, ax=sub)
sub.set_ylabel("Usage", fontdict={"fontsize": 12})
sub.grid()

- from 2016 to 2017 high growth rate
- 2017 negative growth rate
- end 2017 to mid 2018 consumption grows

In [None]:
from statsmodels.distributions.empirical_distribution import ECDF

# Empirical cumulative distribution function of USAGE, drawn as a line.
ecdf = ECDF(df_train_trans["USAGE"])

fig, sub = plt.subplots(1, 1, figsize=(12, 4))
sns.lineplot(x=ecdf.x, y=ecdf.y, ax=sub)
sub.grid()

* 80 % of the energy consumption values are below ~ 0.2 kWh

In [None]:
# Average USAGE for each year.
df_train_trans["USAGE"].groupby(df_train_trans["year"]).mean()

In [None]:
# Average USAGE for each month.
df_train_trans["USAGE"].groupby(df_train_trans["month"]).mean()

In [None]:
# Raw COST over time with its rolling mean overlaid in red.
# NOTE(review): the window is a number of rows, not a duration - confirm that
# 1440 rows is the intended smoothing span.
reading_start = df_train_trans["START"]
cost_series = df_train_trans["COST"]
cost_rolling_mean = cost_series.rolling(window=1440).mean()

fig, sub = plt.subplots(1, 1, figsize=(16, 8))
sns.lineplot(x=reading_start, y=cost_series, ax=sub)
sns.lineplot(x=reading_start, y=cost_rolling_mean, ax=sub, color="red");
sub.grid()

In [None]:
# Rolling sums of USAGE (blue, left y-axis) and COST (red, right y-axis)
# on a shared time axis, each with its own legend.
reading_start = df_train_trans["START"]
usage_rolling_sum = df_train_trans["USAGE"].rolling(window=1440).sum()
cost_rolling_sum = df_train_trans["COST"].rolling(window=1440).sum()

fig, sub = plt.subplots(1, 1, figsize=(16, 8))
sub2 = sub.twinx()  # second y-axis sharing the same x-axis
sns.lineplot(x=reading_start, y=usage_rolling_sum, ax=sub, color="blue", label="Usage");
sns.lineplot(x=reading_start, y=cost_rolling_sum, ax=sub2, color="red", label="Cost");
sub.grid()
sub.legend(loc="upper right")
sub2.legend(loc="upper left")