In [1]:
import pandas as pd
import pickle

In [2]:
# Household metadata as a comma-separated table; the first line is the header.
# Tabs and spaces are alignment padding only: the parsing cell below removes
# every one of them, including the spaces inside values and column names.
# Note: the house numbers jump from 13 to 15, so there is no row for house 14.
s = """House, Occupancy, Construction Year, Appliances Owned, Type, Size
1	,	2	,	1975-1980				, 35 , Detached			, 4 bed
2	,	4	,	-						, 15 , Semi-detached	, 3 bed
3	,	2	,	1988					, 27 , Detached			, 3 bed
4	,	2	,	1850-1899 				, 33 , Detached			, 4 bed
5	,	4	,	1878					, 44 , Mid-terrace		, 4 bed
6	,	2	,	2005					, 49 , Detached			, 4 bed
7	,	4	,	1965-1974				, 25 , Detached			, 3 bed
8	,	2	,	1966					, 35 , Detached			, 2 bed
9	,	2	,	1919-1944				, 24 , Detached			, 3 bed
10	,	4	,	1919-1944				, 31 , Detached			, 3 bed
11	,	1	,	1945-1964				, 25 , Detached			, 3 bed
12	,	3	,	1991-1995				, 26 , Detached			, 3 bed
13	,	4	,	post 2002				, 28 , Detached			, 4 bed
15	,	1	,	1965-1974				, 19 , Semi-detached	, 3 bed
16	,	6	,	1981-1990				, 48 , Detached			, 5 bed
17	,	3	,	mid 60s					, 22 , Detached			, 3 bed
18	,	2	,	1965-1974				, 34 , Detached			, 3 bed
19	,	4	,	1945-1964				, 26 , Semi-detached	, 3 bed
20	,	2	,	1965-1974				, 39 , Detached			, 3 bed
21	,	4	,	1981-1990				, 23 , Detached			, 3 bed"""

In [4]:
# Turn the raw table into a list of rows, each row a list of cell strings.
# Every tab and space is deleted before splitting on commas, so inner spaces
# disappear as well (e.g. "Appliances Owned" -> "AppliancesOwned").
l = [
    row.translate(str.maketrans('', '', '\t ')).split(',')
    for row in s.split('\n')
]

In [5]:
# The first parsed row supplies the column names; all following rows are records.
# Every cell is still a string at this point.
df = pd.DataFrame(data=l[1:], columns=l[0])

In [6]:
# Drop houses 12 and 13 from the metadata table. House ids are strings here.
# NOTE(review): the reason for excluding them is not recorded in this notebook —
# presumably they are absent from the pickled data; confirm.
# .copy() makes the filtered result an independent frame, so later in-place
# column assignments on `df` cannot raise SettingWithCopyWarning.
df = df[~df['House'].isin(['12', '13'])].copy()

In [7]:
# Convert the two numeric columns from strings to integers in a single call.
# astype returns a new frame, so nothing is assigned into the row-filtered frame
# (that assignment raises SettingWithCopyWarning on pandas without copy-on-write).
df = df.astype({'AppliancesOwned': int, 'Occupancy': int})

In [10]:
# Display the metadata table after row filtering and integer conversion.
df

Unnamed: 0,House,Occupancy,ConstructionYear,AppliancesOwned,Type,Size
0,1,2,1975-1980,35,Detached,4bed
1,2,4,-,15,Semi-detached,3bed
2,3,2,1988,27,Detached,3bed
3,4,2,1850-1899,33,Detached,4bed
4,5,4,1878,44,Mid-terrace,4bed
5,6,2,2005,49,Detached,4bed
6,7,4,1965-1974,25,Detached,3bed
7,8,2,1966,35,Detached,2bed
8,9,2,1919-1944,24,Detached,3bed
9,10,4,1919-1944,31,Detached,3bed


In [8]:
# Houses that own more than 30 appliances (strict comparison).
df.query('AppliancesOwned > 30')

Unnamed: 0,House,Occupancy,ConstructionYear,AppliancesOwned,Type,Size
0,1,2,1975-1980,35,Detached,4bed
3,4,2,1850-1899,33,Detached,4bed
4,5,4,1878,44,Mid-terrace,4bed
5,6,2,2005,49,Detached,4bed
7,8,2,1966,35,Detached,2bed
9,10,4,1919-1944,31,Detached,3bed
14,16,6,1981-1990,48,Detached,5bed
16,18,2,1965-1974,34,Detached,3bed
18,20,2,1965-1974,39,Detached,3bed


In [9]:
# Houses that own fewer than 30 appliances (strict comparison, so a house with
# exactly 30 would show up in neither this table nor the "> 30" one).
df.query('AppliancesOwned < 30')

Unnamed: 0,House,Occupancy,ConstructionYear,AppliancesOwned,Type,Size
1,2,4,-,15,Semi-detached,3bed
2,3,2,1988,27,Detached,3bed
6,7,4,1965-1974,25,Detached,3bed
8,9,2,1919-1944,24,Detached,3bed
10,11,1,1945-1964,25,Detached,3bed
13,15,1,1965-1974,19,Semi-detached,3bed
15,17,3,mid60s,22,Detached,3bed
17,19,4,1945-1964,26,Semi-detached,3bed
19,21,4,1981-1990,23,Detached,3bed


# Data permutations

In [11]:
# Read the pickled dataset from disk.
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
with open('data/refit_data_by_category.pk', 'rb') as pickle_file:
    data = pickle.load(pickle_file)

# The loaded object is indexed by 'train' and 'test'.
train = data['train']
test = data['test']

In [14]:
# Shape of every per-category frame in the first entry of the training split.
[frame.shape for frame in train[0]['X'].values()]

[(168, 13), (168, 13), (168, 13), (168, 13)]

In [10]:
def concatenate_data(train, test):
    """Join the train and test splits column-wise, entry by entry.

    Parameters
    ----------
    train, test : sequences indexed by week
        Each entry is a mapping with an 'X_bar' frame and an 'X' mapping of
        category -> frame. `test` is indexed with the positions of `train`.

    Returns
    -------
    list of dict
        One dict per entry of `train`, with keys 'X_bar' and 'X'. Each frame is
        the train frame with the test frame's columns appended to its right
        (``pd.concat(..., axis=1)``).
    """
    merged = []
    for week, train_week in enumerate(train):
        test_week = test[week]
        # Aggregate frame first, then one frame per category found in train.
        x_bar = pd.concat([train_week['X_bar'], test_week['X_bar']], axis=1)
        x_by_category = {}
        for cat in train_week['X']:
            x_by_category[cat] = pd.concat(
                [train_week['X'][cat], test_week['X'][cat]], axis=1
            )
        merged.append({'X_bar': x_bar, 'X': x_by_category})

    return merged

In [11]:
# Rebind `data` (until now the object loaded from the pickle) to the list
# returned by concatenate_data, where train and test columns sit side by side.
# NOTE(review): later cells overwrite `train` and `test` with permuted splits, so
# re-running this cell after them would concatenate those instead of the originals.
data = concatenate_data(train, test)

In [12]:
def _train_renames(source_houses, target_houses_in_train):
    """Build the column renames for a training split: source houses lose the
    '_target' tag, target houses gain it (names already in the right form are
    left out of the mapping)."""
    renames = {
        house: house.replace('_target', '')
        for house in source_houses if '_target' in house
    }
    renames.update({
        house: house + '_target'
        for house in target_houses_in_train if '_target' not in house
    })
    return renames


def _select_houses(week_data, houses, renames):
    """Pick the `houses` columns out of one week's frames and relabel them."""
    return {
        'X_bar': week_data['X_bar'][houses].rename(columns=renames),
        'X': {
            cat: frame[houses].rename(columns=renames)
            for cat, frame in week_data['X'].items()
        },
    }


def permute_concatenated_data(data, source_houses, target_houses_in_train, test_houses):
    """Re-split concatenated weekly data into train and test by house.

    Parameters
    ----------
    data : sequence of dict
        One entry per week, each with an 'X_bar' frame and an 'X' mapping of
        category -> frame, as produced by ``concatenate_data``.
    source_houses : list of str
        Column labels (as they appear in `data`) to use as source houses in
        train. A '_target' tag in the label is removed in the output.
    target_houses_in_train : list of str
        Column labels to use as target houses in train. '_target' is appended
        in the output unless the label already contains it.
    test_houses : list of str
        Column labels for the test split. A '_target' tag is removed.

    Returns
    -------
    (train, test) : tuple of list of dict
        Same per-week structure as `data`. Train columns are the source houses
        followed by the target houses; test columns follow `test_houses`.
        The frames in `data` are not modified.

    Raises
    ------
    KeyError
        If a requested label is not a column of the frames in `data`.
    """
    train_houses = list(source_houses) + list(target_houses_in_train)
    test_houses = list(test_houses)

    # The renames do not depend on the week, so build them once. Applying each
    # mapping in a single pass relabels every selected column exactly once; a
    # source column stripped to 'x' can therefore never be picked up again by
    # the rename of a target column that is also called 'x'.
    train_renames = _train_renames(source_houses, target_houses_in_train)
    test_renames = {
        house: house.replace('_target', '')
        for house in test_houses if '_target' in house
    }

    train = [_select_houses(week_data, train_houses, train_renames) for week_data in data]
    test = [_select_houses(week_data, test_houses, test_renames) for week_data in data]

    return train, test

In [13]:
#len(source_houses + target_houses_in_train + test_houses), len(df)

In [33]:
#sorted(source_houses + target_houses_in_train + test_houses)

### Fewer target houses in train

In [32]:
# Split with eight source houses, three target houses in train and seven test houses.
# Labels must match the column names in `data`. permute_concatenated_data removes
# '_target' from source and test labels and appends it to target-in-train labels
# that lack it (here all three already carry it).
source_houses = ['house_21', 'house_3', 'house_10', 'house_5', 'house_7', 'house_16', 'house_8', 'house_9']
target_houses_in_train = ['house_2_target', 'house_15_target', 'house_18_target']
test_houses = ['house_19_target', 'house_4_target', 'house_20', 'house_1', 'house_11', 'house_6', 'house_17']

# Rebinds `train` and `test` to the permuted splits.
train, test = permute_concatenated_data(data, source_houses, target_houses_in_train, test_houses)

# Preview the 'heating' category frame of the first training entry.
train[0]['X']['heating'].head()

Unnamed: 0_level_0,house_21,house_3,house_10,house_5,house_7,house_16,house_8,house_9,house_2_target,house_15_target,house_18_target
Unix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1401793200,0,0,0,2,0,1,0,4,2,0,1
1401796800,0,0,0,2,0,1,0,4,2,0,1
1401800400,0,0,0,2,0,1,0,4,2,0,1
1401804000,0,0,0,2,0,1,0,4,2,0,1
1401807600,0,0,0,2,0,0,0,4,1,0,1


In [35]:
# Persist the current `train` / `test` splits under their own file name.
with open('data/refit_data_by_category_less_target.pk', 'wb') as out_file:
    pickle.dump(dict(train=train, test=test), out_file)

### Based on total owned appliances

In [14]:
# Source houses are the ones with fewer than 30 owned appliances (the list matches
# the "< 30" table above); the remaining houses are divided between
# target-in-train and test. Labels are spelled as they appear in `data`, so some
# carry a '_target' tag that permute_concatenated_data strips or adds per role.
#source_houses = df[df['AppliancesOwned'] < 30]['House'].tolist()
source_houses = ['house_2_target',
 'house_3',
 'house_7',
 'house_9',
 'house_11',
 'house_15_target',
 'house_17',
 'house_19_target',
 'house_21']
target_houses_in_train = ['house_1',
 'house_4_target',
 'house_5',
 'house_6']

test_houses = ['house_8',
 'house_10',
 'house_16',
 'house_18_target',
 'house_20']


# Rebinds `train` and `test` to the permuted splits.
train, test = permute_concatenated_data(data, source_houses, target_houses_in_train, test_houses)

# Preview the aggregate frame of the first training entry.
train[0]['X_bar'].head()

Unnamed: 0_level_0,house_2,house_3,house_7,house_9,house_11,house_15,house_17,house_19,house_21,house_1_target,house_4_target,house_5_target,house_6_target
Unix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1401793200,212,4418,270,245,1777,171,145,383,632,211,229,400,378
1401796800,202,2446,277,292,1351,198,336,181,1256,151,3094,268,398
1401800400,181,590,280,228,2302,171,289,148,1532,223,171,516,408
1401804000,181,346,213,205,920,199,351,542,465,221,95,534,529
1401807600,2395,213,288,346,1039,173,174,136,830,175,159,458,563


In [34]:
# Persist the current `train` / `test` splits under their own file name.
with open('data/refit_data_by_category_owned_appliances.pk', 'wb') as out_file:
    pickle.dump(dict(train=train, test=test), out_file)

#### Fewer target houses in train

In [16]:
# Same source and test houses as the owned-appliances split, but house_1 is the
# only target house in train. Houses 4, 5 and 6 are commented out below and so
# appear in none of the three lists.
#source_houses = df[df['AppliancesOwned'] < 30]['House'].tolist()
source_houses = ['house_2_target',
 'house_3',
 'house_7',
 'house_9',
 'house_11',
 'house_15_target',
 'house_17',
 'house_19_target',
 'house_21']
target_houses_in_train = ['house_1',
 #'house_4_target'
]

test_houses = [
#'house_5',
 #'house_6',
 'house_8',
 'house_10',
 'house_16',
 'house_18_target',
 'house_20']


# Rebinds `train` and `test` to the permuted splits.
train, test = permute_concatenated_data(data, source_houses, target_houses_in_train, test_houses)

# Preview the aggregate frame of the first training entry.
train[0]['X_bar'].head()

Unnamed: 0_level_0,house_2,house_3,house_7,house_9,house_11,house_15,house_17,house_19,house_21,house_1_target
Unix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1401793200,212,4418,270,245,1777,171,145,383,632,211
1401796800,202,2446,277,292,1351,198,336,181,1256,151
1401800400,181,590,280,228,2302,171,289,148,1532,223
1401804000,181,346,213,205,920,199,351,542,465,221
1401807600,2395,213,288,346,1039,173,174,136,830,175


In [17]:
# Persist the current `train` / `test` splits under their own file name.
with open('data/refit_data_by_category_owned_appliances_less_target_2.pk', 'wb') as out_file:
    pickle.dump(dict(train=train, test=test), out_file)

### Based on Occupancy

In [38]:
# House ids with occupancy below 3, and with occupancy of 3 or more, as a pair of lists.
(
    df.loc[df['Occupancy'] < 3, 'House'].tolist(),
    df.loc[df['Occupancy'] >= 3, 'House'].tolist(),
)

(['1', '3', '4', '6', '8', '9', '11', '15', '18', '20'],
 ['2', '5', '7', '10', '16', '17', '19', '21'])

In [39]:
# Source houses are the ones with occupancy below 3 (the list matches the first
# list printed above); the houses with occupancy of 3 or more are divided between
# target-in-train and test. Labels are spelled as they appear in `data`, so some
# carry a '_target' tag that permute_concatenated_data strips or adds per role.
source_houses = ['house_1',
 'house_3',
 'house_4_target',
 'house_6',
 'house_8',
 'house_9',
 'house_11',
 'house_15_target',
 'house_18_target',
 'house_20']
target_houses_in_train = ['house_2_target',
 'house_5',
 'house_7',
 'house_10']

test_houses = ['house_16',
 'house_17',
 'house_19_target',
 'house_21']


# Rebinds `train` and `test` to the permuted splits.
train, test = permute_concatenated_data(data, source_houses, target_houses_in_train, test_houses)

# Preview the aggregate frame of the first training entry.
train[0]['X_bar'].head()

Unnamed: 0_level_0,house_1,house_3,house_4,house_6,house_8,house_9,house_11,house_15,house_18,house_20,house_2_target,house_5_target,house_7_target,house_10_target
Unix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1401793200,211,4418,229,378,108,245,1777,171,435,365,212,400,270,1120
1401796800,151,2446,3094,398,197,292,1351,198,677,219,202,268,277,367
1401800400,223,590,171,408,256,228,2302,171,378,206,181,516,280,623
1401804000,221,346,95,529,224,205,920,199,539,261,181,534,213,605
1401807600,175,213,159,563,127,346,1039,173,581,230,2395,458,288,479


In [40]:
# Persist the current `train` / `test` splits under their own file name.
with open('data/refit_data_by_category_occupancy.pk', 'wb') as out_file:
    pickle.dump(dict(train=train, test=test), out_file)