In [46]:
import pandas as pd
import numpy as np
import pickle
import glob
from itertools import combinations_with_replacement, combinations
from scipy.stats import ks_2samp
from scipy.special import rel_entr

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN, MeanShift, AffinityPropagation, SpectralClustering
from sklearn.metrics import mean_squared_error

from matplotlib import pyplot as plt
import seaborn as sns

# Prceprocessing REDD

In [47]:
def hourly_data_redd(df, time_column='Time'):
    """
    This function returns the first reading of each ho|ur
    """
    df_copy = df.copy()
    df_copy[time_column] = pd.to_datetime(df_copy[time_column])
    df_copy.sort_values(by=time_column, ascending=True, inplace=True)
    #df_copy.set_index(time_column, inplace=True)
    #df_copy = df_copy.resample('H').first()
    df_copy['H'] =  df_copy[time_column].dt.floor('H')
    df_copy = df_copy.groupby('H').first()
    df_copy.index = [i for i in range(len(df_copy))]
    return df_copy

In [48]:
def filter_sort_date_redd(df_list, date_col_name='date'):
    """
    Filters dataframes of a list based on a roughly common date rangeXXXXX
    In the case of REDD, we can just align based on an hour range, we have insufficient data 
    """
    dates = [(min(df[date_col_name].tolist()), max(df[date_col_name].tolist())) for df in df_list]
    print([(i, d) for i, d in enumerate(dates)])
    min_date, max_date = max([x[0] for x in dates]), min([x[-1] for x in dates])
    #print(min_date, max_date)
    filtered_df_list = [df.loc[(df[date_col_name] >= min_date) & (df[date_col_name] <= max_date)].reset_index().sort_values(date_col_name) for df in df_list]
    return filtered_df_list

In [49]:
def harmonize_timestamps_redd(df_list, time_column='Time'):
    """
    This function filters datasets to keep hours in common
    No possibility of picking an hour range in common for REDD, small data
    """
    result = []
    lengths = []
    for df in df_list:
        # to fully harmonize
        start_index = df[time_column].dt.hour.eq(1).idxmax()
        #print(df['Time'].dt.hour.tolist())
        filtered = df.iloc[start_index:]
        result.append(filtered)
        lengths.append(len(df))
    return [df.iloc[:min(lengths)] for df in df_list]

In [50]:
#harmonize_timestamps_redd(df_list_new)[1]

In [51]:
def create_category_cols_redd(df, categories, remove_other=False):
    df_copy = df.copy()
    cols = df_copy.columns.tolist()
    cat_cols = {c: [col for col in cols if any(col.startswith(cat) for cat in categories[c])] for c in categories.keys()}

    for cat_col, val in cat_cols.items():
        df_copy[cat_col] = df_copy[val].sum(axis=1).values


    for col in df_copy.columns:
        if col not in ["Time", "mains_2"] + list(categories.keys()):
            try:
                df_copy.drop(col, axis=1, inplace=True)
            except KeyError:
                continue

    if remove_other and 'other' in df_copy.columns:
        df_copy.drop('other', axis=1, inplace=True)
    return df_copy

In [52]:
df_list_new = []
house_nbs = []
all_files_new = glob.glob("*.csv")


step_size = 1
for ind, f in enumerate(all_files_new):
    house_nb = int(f[f.index('House')+5])
    if house_nb != 5:
        house_nbs.append(house_nb)
        current_df = pd.read_csv(f)

        #current_df.columns = ["Time", "mains_2"] + real_col_names[str(house_nb)]
        current_df = hourly_data_redd(current_df)

        print(house_nb, ' dataframe size is ', len(current_df))
        #print('nans', (current_df.isna().sum()/len(current_df)).tolist())
        df_list_new.append(current_df.iloc[::step_size])
        del current_df



6  dataframe size is  315
4  dataframe size is  472
1  dataframe size is  447
2  dataframe size is  338
3  dataframe size is  429


In [53]:
set.intersection(*[set((pd.to_datetime(df_list_new[ind]['Time']).astype(int) / 10**9) // 3600 *3600) 
                   for ind in range(len(df_list_new)) if ind != 3])


{1306202400.0,
 1306206000.0,
 1306209600.0,
 1306213200.0,
 1306216800.0,
 1306220400.0,
 1306224000.0,
 1306227600.0,
 1306231200.0,
 1306234800.0,
 1306238400.0,
 1306242000.0,
 1306245600.0,
 1306249200.0,
 1306252800.0,
 1306256400.0,
 1306260000.0}

In [54]:
1306234800 in set((pd.to_datetime(df_list_new[3]['Time']).astype(int) / 10**9) // 3600 *3600)

False

In [55]:
[set((pd.to_datetime(d['Time']).astype(int) / 10**9) // 3600 *3600) for d in df_list_new]

[{1306004400.0,
  1306008000.0,
  1306011600.0,
  1306015200.0,
  1306018800.0,
  1306022400.0,
  1306026000.0,
  1306029600.0,
  1306033200.0,
  1306036800.0,
  1306040400.0,
  1306044000.0,
  1306047600.0,
  1306202400.0,
  1306206000.0,
  1306209600.0,
  1306213200.0,
  1306216800.0,
  1306220400.0,
  1306224000.0,
  1306227600.0,
  1306231200.0,
  1306234800.0,
  1306238400.0,
  1306242000.0,
  1306245600.0,
  1306249200.0,
  1306252800.0,
  1306256400.0,
  1306260000.0,
  1306288800.0,
  1306292400.0,
  1306296000.0,
  1306299600.0,
  1306303200.0,
  1306306800.0,
  1306310400.0,
  1306314000.0,
  1306317600.0,
  1306321200.0,
  1306324800.0,
  1306328400.0,
  1306332000.0,
  1306335600.0,
  1306339200.0,
  1306342800.0,
  1306346400.0,
  1306350000.0,
  1306353600.0,
  1306357200.0,
  1306360800.0,
  1306364400.0,
  1306368000.0,
  1306371600.0,
  1306375200.0,
  1306378800.0,
  1306382400.0,
  1306386000.0,
  1306389600.0,
  1306393200.0,
  1306396800.0,
  1306400400.0,
  130640

In [56]:
REDD_CATEGORIES = {
    'fridge' : ['refrigerator',], # house4 doesn't have it
    'washing_drying': ['washer_dryer',],
    #'dishwasher': ['dishwasher',],
    #'computer_tv': [],
    #'heating': ['kitchen_outlets', 'electric_heat',],
}

In [57]:
redd = harmonize_timestamps_redd(df_list_new)

In [58]:
redd = [create_category_cols_redd(df, REDD_CATEGORIES) for df in redd]

In [59]:
k = 3
#df_list_new[k].isna().sum()/len(df_list_new[k])

In [60]:
#with pd.option_context('display.max_rows', None,):
#   print(df_list_new[1])

In [61]:
redd

[                   Time  mains_2  fridge  washing_drying
 0   2011-05-21 19:39:51   255.62     1.0             2.0
 1   2011-05-21 20:00:00   409.37   164.0             2.0
 2   2011-05-21 21:00:05   222.18     0.0             3.0
 3   2011-05-21 22:00:01   202.21     0.0             3.0
 4   2011-05-21 23:00:05   361.92   142.0             3.0
 ..                  ...      ...     ...             ...
 310 2011-06-13 21:00:00   167.92     1.0             3.0
 311 2011-06-13 22:00:00   169.90     0.0             3.0
 312 2011-06-13 23:00:02   176.82     0.0             3.0
 313 2011-06-14 00:00:02   352.80   133.0             3.0
 314 2011-06-14 01:00:09   373.92   142.0             3.0
 
 [315 rows x 4 columns],
                    Time  mains_2  fridge  washing_drying
 0   2011-04-17 01:16:32    86.58     0.0             0.0
 1   2011-04-17 02:04:00   120.96     0.0             0.0
 2   2011-04-17 03:00:01    84.18     0.0             0.0
 3   2011-04-17 04:00:02    47.84     0.0    

## Harmonizing REFIT with REDD

In [62]:
with open('refit_data_by_category.pk', 'rb') as f:
    ref = pickle.load(f)

In [63]:
with open('refit_data_all_appliances.pk', 'rb') as f:
    refit = pickle.load(f)

In [64]:
REFIT_CATEGORIES = {
    'fridge' : ['Fridge-Freezer', 'fridge', 'Freezer', 'Chest Freezer'],
    'washing_drying': ['Washing Machine', 'Tumble Dryer'],
    #'dishwasher': ['Dishwasher',],
    #'heating': ['kitchen_outlets', 'electric_heat',],
}

In [65]:
def categorize_refit(df, categories):
    df_copy = df.copy()
    cols = df_copy.columns.tolist()
    cat_cols = {c: [col for col in cols if any(col.startswith(cat) for cat in categories[c])] for c in categories.keys()}

    print(cat_cols)
    if not all([v != [] for k, v in cat_cols.items()]):
        return None
    
    for cat_col, val in cat_cols.items():
        df_copy[cat_col] = df_copy[val].sum(axis=1).values


    for col in df_copy.columns:
        if col not in ["Unix", "Aggregate"] + list(categories.keys()):
            try:
                df_copy.drop(col, axis=1, inplace=True)
            except KeyError:
                continue

    return df_copy

In [66]:
cat_refit = [categorize_refit(df, REFIT_CATEGORIES) for df in refit]
cat_refit = [house for house in cat_refit if house is not None]

{'fridge': ['Fridge-Freezer', 'Freezer'], 'washing_drying': ['Tumble Dryer', 'Washing Machine']}
{'fridge': ['Chest Freezer'], 'washing_drying': ['Tumble Dryer', 'Washing Machine']}
{'fridge': ['Freezer', 'Chest Freezer (In Garage)', 'Fridge-Freezer'], 'washing_drying': ['Washing Machine']}
{'fridge': ['Freezer', 'Fridge-Freezer'], 'washing_drying': ['Washing Machine (1)', 'Washing Machine (2)']}
{'fridge': ['Fridge-Freezer'], 'washing_drying': ['Washing Machine']}
{'fridge': ['Fridge-Freezer'], 'washing_drying': ['Tumble Dryer 3', 'Washing Machine']}
{'fridge': ['Fridge-Freezer'], 'washing_drying': []}
{'fridge': ['Freezer (Utility Room)'], 'washing_drying': ['Washing Machine']}
{'fridge': [], 'washing_drying': ['Washing Machine', 'Tumble Dryer']}
{'fridge': ['Fridge-Freezer'], 'washing_drying': ['Tumble Dryer', 'Washing Machine']}
{'fridge': ['Freezer (Garage)', 'Freezer'], 'washing_drying': ['Tumble Dryer', 'Washing Machine']}
{'fridge': ['Fridge-Freezer (1)', 'Fridge-Freezer (2)'],

In [67]:
#length = len(refit['train'])


#refit_X = {cat: pd.concat([pd.concat([refit['train'][ind]['X'][cat], refit['test'][ind]['X'][cat]], axis=1)
#                for ind in range(length)
#                ])
#           for cat in refit['train'][0]['X'].keys()
#          }

#refit_X_bar = pd.concat([pd.concat([refit['train'][ind]['X_bar'], refit['test'][ind]['X_bar']], axis=1)
#                for ind in range(length)
#                ])



In [68]:
#with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#    print(redd[1])

In [69]:
# redd date range is May 22 to May 27. we don't have that in the harmonized refit.
#We'll pick 3 weeks after. June 12 to June 17.
date_range = [1402530000, 1402983000]

def harmonize_refit_with_redd(df, date_range, redd_df=None):
    df_copy = df.copy()
    #df_copy = df_copy[(df_copy.index >= date_range[0]) & (df_copy.index <= date_range[-1])]
    #df_copy['Time'] = df_copy['Time'] - pd.DateOffset(years=3, weeks=3)
    df_copy.columns = [col.replace('target', '') for col in df_copy.columns]
    return df_copy.iloc[:len(redd_df[0])]
    #redd_df['Time'].dt.hour
    #return df_copy


In [70]:
harmonized_refit = [harmonize_refit_with_redd(house, date_range, redd) for house in cat_refit]


In [71]:
#HOUSES = ['House_3', 'House_1', 'House_10', 'House_4', 'House_11', 'House_5', 'House_12', 'House_6', 'House_13',
# 'House_15', 'House_7', 'House_16', 'House_8', 'House_17', 'House_9', 'House_18', 'House_19', 'House_2',
# 'House_20', 'House_21']

def package_data(df_list, house_nb_list):
    cat_dict = {}
    cols = [col for col in df_list[0].columns if col not in ['Unix',]]
    for col in cols:
        if col != 'Aggregate':
            df = pd.DataFrame([house[col].tolist() for house in harmonized_refit]).T
            df.columns = ['house_' + str(col) for col in df.columns] #HOUSES
            cat_dict[col] = df
    
    x_bar = pd.DataFrame([house['Aggregate'].tolist() for house in harmonized_refit]).T
    x_bar.columns = ['house_' + str(col) for col in x_bar.columns] #HOUSES
    return cat_dict, x_bar

In [72]:
X, X_bar = package_data(harmonized_refit, HOUSES)

In [73]:
#X = {cat: harmonize_refit_with_redd(refit_X[cat], date_range, redd) for cat in refit['train'][0]['X'].keys()}
#X_bar = harmonize_refit_with_redd(refit_X_bar, date_range, redd)

## Create final data (transfer might help with the inconsistent timestamps)

In [74]:
def merge_refit_redd(refit, redd, column, house_nbs):
    source_data = refit.reset_index()
    target_data = pd.DataFrame([df[column].tolist() for df in redd]).T
    target_data.columns = ['house_' + str(house_nbs[ind]) + '_target' for ind in range(len(house_nbs))]

    data = pd.concat([source_data, target_data], axis = 1)
    data.drop('index', axis=1, inplace=True)
    return data

In [44]:
################
## DEPRECATED ##
################

from collections import defaultdict

final_data_train = defaultdict(lambda: defaultdict())
final_data_test = defaultdict(lambda: defaultdict())

X_dict = dict()
for cat in refit['train'][0]['X'].keys():
    if cat != 'computer_tv':
        final_data = merge_refit_redd(X[cat], redd, cat, house_nbs)
        final_data_train['X'][cat] = final_data.iloc[:,:-3]
        final_data_test['X'][cat] = final_data.iloc[:,-3:]
        
    
final_x_bar = merge_refit_redd(X_bar, redd, 'mains_2', house_nbs)
final_data_train['X_bar'] = final_x_bar.iloc[:,:-3]
final_data_test['X_bar'] = final_x_bar.iloc[:,-3:]

final_data_train = dict(final_data_train)
final_data_test = dict(final_data_test)

final_data = {
    'train': final_data_train,
    'test': final_data_test,
}

TypeError: list indices must be integers or slices, not str

In [75]:
from collections import defaultdict

final_data_train = defaultdict(lambda: defaultdict())
final_data_test = defaultdict(lambda: defaultdict())

X_dict = dict()
for cat in X.keys():
    final_data = merge_refit_redd(X[cat], redd, cat, house_nbs)
    final_data_train['X'][cat] = final_data.iloc[:,:-3]
    final_data_test['X'][cat] = final_data.iloc[:,-3:]
        
    
final_x_bar = merge_refit_redd(X_bar, redd, 'mains_2', house_nbs)
final_data_train['X_bar'] = final_x_bar.iloc[:,:-3]
final_data_test['X_bar'] = final_x_bar.iloc[:,-3:]

final_data_train = dict(final_data_train)
final_data_test = dict(final_data_test)

final_data = {
    'train': final_data_train,
    'test': final_data_test,
}

In [76]:
#redd[0]

In [77]:
#final_data['train']['X_bar']

In [78]:
#merge_refit_redd(X, redd, 'fridge', house_nbs)

In [79]:
with open('refit_redd_2nd_attempt.pk', 'wb') as f:
    pickle.dump(dict(final_data), f)

In [381]:
#merge_refit_redd(X[cat], redd, cat, house_nbs)

In [202]:
#redd[0]