In [33]:
import pandas as pd
import numpy as np

In [34]:
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rcParams['patch.force_edgecolor'] = True
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [35]:
energy_data = pd.read_csv("energydata_complete_revised.csv")
day_name_encoding = pd.get_dummies(energy_data['day_name'], drop_first=True)
data = pd.concat([energy_data,day_name_encoding],axis=1 )
data.head()

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,week,day_name,day_of_week,weekday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,2,Monday,0,0,1,0,0,0,0,0
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,2,Monday,0,0,1,0,0,0,0,0
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,2,Monday,0,0,1,0,0,0,0,0
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,2,Monday,0,0,1,0,0,0,0,0
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,2,Monday,0,0,1,0,0,0,0,0


# Boruta

Boruta is an all relevant feature selection method 

it tries to find all features carrying information usable for prediction, rather than finding a possibly compact subset of features on which some classifier has a minimal error.

In [36]:
data_1 = data.drop(['Appliances', 'date', 'day_name'],axis=1)
X_boruta = data_1
X_boruta = X_boruta.values

y_boruta = data['Appliances']
y_boruta = y_boruta.values
y_boruta

array([ 60,  60,  50, ..., 270, 420, 430], dtype=int64)

In [37]:
from sklearn.ensemble import RandomForestClassifier
rf1 = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)

In [38]:
from boruta import BorutaPy
feat_selector = BorutaPy(rf1, n_estimators='auto', verbose=5, random_state=1)
feat_selector.fit(X_boruta,y_boruta)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	42
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	42
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	42
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	42
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	42
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	42
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	42
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	3
Rejected: 	39
Iteration: 	9 / 100
Confirmed: 	0
Tentative: 	3
Rejected: 	39
Iteration: 	10 / 100
Confirmed: 	0
Tentative: 	3
Rejected: 	39
Iteration: 	11 / 100
Confirmed: 	0
Tentative: 	3
Rejected: 	39
Iteration: 	12 / 100
Confirmed: 	0
Tentative: 	3
Rejected: 	39
Iteration: 	13 / 100
Confirmed: 	0
Tentative: 	3
Rejected: 	39
Iteration: 	14 / 100
Confirmed: 	0
Tentative: 	3
Rejected: 	39
Iteration: 	15 / 100
Confirmed: 	0
Tentative: 	3
Rejected: 	39
Iteration: 	16 / 100
Confirmed: 	0
Tentative: 	2
Rejected: 	40
I

BorutaPy(alpha=0.05,
     estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=5, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=28, n_jobs=-1, oob_score=False,
            random_state=<mtrand.RandomState object at 0x000001F649DEA870>,
            verbose=0, warm_start=False),
     max_iter=100, n_estimators='auto', perc=100,
     random_state=<mtrand.RandomState object at 0x000001F649DEA870>,
     two_step=True, verbose=5)

In [39]:
feat_selector.support_
print(feat_selector.ranking_)

[29  8  6 12 18  1 19  9 17 10  2  7 22 15 24 12 26 15 25 10 14 28 23  3
 26  4  5 41 35 20 30 31 41 20 34 39 36 32 37 33 39 39]


In [16]:
ranking = pd.DataFrame({'Index':data_1.columns, 'Ranking': feat_selector.ranking_})
ranking

Unnamed: 0,Index,Ranking
0,lights,29
1,T1,8
2,RH_1,6
3,T2,12
4,RH_2,18
5,T3,1
6,RH_3,19
7,T4,9
8,RH_4,17
9,T5,10


# TPOT

TPOT is a Python Automated Machine Learning tool that optimizes machine learning pipelines using genetic programming.

In [48]:
X = data.drop(['Appliances', 'date', 'day_name'],axis=1)
y = data['Appliances']

In [50]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [52]:
from tpot import TPOTRegressor

tpot = TPOTRegressor(generations=10, population_size=10,offspring_size=None, mutation_rate=0.9, verbosity=3)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_pipeline.py')

28 operators have been imported by TPOT.


Optimization Progress:  18%|█▊        | 20/110 [02:21<09:04,  6.05s/pipeline]

Generation 1 - Current Pareto front scores:
-1	-6160.052847414862	RandomForestRegressor(input_matrix, RandomForestRegressor__bootstrap=False, RandomForestRegressor__max_features=0.25, RandomForestRegressor__min_samples_leaf=16, RandomForestRegressor__min_samples_split=6, RandomForestRegressor__n_estimators=100)



Optimization Progress:  19%|█▉        | 21/110 [02:21<06:26,  4.34s/pipeline]

Pipeline encountered that has previously been evaluated during the optimization process. Using the score from the previous evaluation.


Optimization Progress:  27%|██▋       | 30/110 [04:55<23:30, 17.63s/pipeline]

Generation 2 - Current Pareto front scores:
-1	-5955.735069288081	RandomForestRegressor(input_matrix, RandomForestRegressor__bootstrap=False, RandomForestRegressor__max_features=0.15, RandomForestRegressor__min_samples_leaf=11, RandomForestRegressor__min_samples_split=6, RandomForestRegressor__n_estimators=100)



Optimization Progress:  36%|███▋      | 40/110 [09:19<22:27, 19.25s/pipeline]  

Generation 3 - Current Pareto front scores:
-1	-5924.9334463632185	RandomForestRegressor(input_matrix, RandomForestRegressor__bootstrap=False, RandomForestRegressor__max_features=0.25, RandomForestRegressor__min_samples_leaf=12, RandomForestRegressor__min_samples_split=6, RandomForestRegressor__n_estimators=100)



Optimization Progress:  36%|███▋      | 40/110 [09:20<22:27, 19.25s/pipeline]

_pre_test decorator: _random_mutation_operator: num_test=0 Found array with 0 feature(s) (shape=(50, 0)) while a minimum of 1 is required.
_pre_test decorator: _random_mutation_operator: num_test=1 __init__() got an unexpected keyword argument 'max_depth'


Optimization Progress:  45%|████▌     | 50/110 [13:05<10:53, 10.89s/pipeline]

Generation 4 - Current Pareto front scores:
-1	-5924.9334463632185	RandomForestRegressor(input_matrix, RandomForestRegressor__bootstrap=False, RandomForestRegressor__max_features=0.25, RandomForestRegressor__min_samples_leaf=12, RandomForestRegressor__min_samples_split=6, RandomForestRegressor__n_estimators=100)



Optimization Progress:  55%|█████▍    | 60/110 [18:17<23:58, 28.77s/pipeline]

Generation 5 - Current Pareto front scores:
-1	-5698.956283861636	RandomForestRegressor(input_matrix, RandomForestRegressor__bootstrap=False, RandomForestRegressor__max_features=0.15, RandomForestRegressor__min_samples_leaf=8, RandomForestRegressor__min_samples_split=6, RandomForestRegressor__n_estimators=100)

_pre_test decorator: _random_mutation_operator: num_test=0 Expected n_neighbors <= n_samples,  but n_samples = 50, n_neighbors = 54


Optimization Progress:  64%|██████▎   | 70/110 [27:45<28:33, 42.83s/pipeline]

Generation 6 - Current Pareto front scores:
-1	-5698.956283861636	RandomForestRegressor(input_matrix, RandomForestRegressor__bootstrap=False, RandomForestRegressor__max_features=0.15, RandomForestRegressor__min_samples_leaf=8, RandomForestRegressor__min_samples_split=6, RandomForestRegressor__n_estimators=100)
-2	-5698.880219592264	RandomForestRegressor(RobustScaler(input_matrix), RandomForestRegressor__bootstrap=False, RandomForestRegressor__max_features=0.15, RandomForestRegressor__min_samples_leaf=8, RandomForestRegressor__min_samples_split=6, RandomForestRegressor__n_estimators=100)



Optimization Progress:  64%|██████▎   | 70/110 [27:46<28:33, 42.83s/pipeline]

_pre_test decorator: _random_mutation_operator: num_test=0 Found array with 0 feature(s) (shape=(50, 0)) while a minimum of 1 is required.


Optimization Progress:  65%|██████▍   | 71/110 [27:46<19:48, 30.47s/pipeline]

Pipeline encountered that has previously been evaluated during the optimization process. Using the score from the previous evaluation.


Optimization Progress:  73%|███████▎  | 80/110 [31:54<18:10, 36.36s/pipeline]

Generation 7 - Current Pareto front scores:
-1	-5064.940050386366	RandomForestRegressor(input_matrix, RandomForestRegressor__bootstrap=False, RandomForestRegressor__max_features=0.25, RandomForestRegressor__min_samples_leaf=3, RandomForestRegressor__min_samples_split=6, RandomForestRegressor__n_estimators=100)

_pre_test decorator: _random_mutation_operator: num_test=0 __init__() got an unexpected keyword argument 'max_depth'


Optimization Progress:  74%|███████▎  | 81/110 [31:56<12:32, 25.95s/pipeline]

Pipeline encountered that has previously been evaluated during the optimization process. Using the score from the previous evaluation.


Optimization Progress:  82%|████████▏ | 90/110 [39:25<13:24, 40.22s/pipeline]

Generation 8 - Current Pareto front scores:
-1	-5064.940050386366	RandomForestRegressor(input_matrix, RandomForestRegressor__bootstrap=False, RandomForestRegressor__max_features=0.25, RandomForestRegressor__min_samples_leaf=3, RandomForestRegressor__min_samples_split=6, RandomForestRegressor__n_estimators=100)



Optimization Progress:  82%|████████▏ | 90/110 [39:27<13:24, 40.22s/pipeline]

_pre_test decorator: _random_mutation_operator: num_test=0 __init__() got an unexpected keyword argument 'max_depth'
_pre_test decorator: _random_mutation_operator: num_test=0 __init__() got an unexpected keyword argument 'max_depth'


Optimization Progress:  91%|█████████ | 100/110 [47:05<07:20, 44.05s/pipeline]

Generation 9 - Current Pareto front scores:
-1	-5064.940050386366	RandomForestRegressor(input_matrix, RandomForestRegressor__bootstrap=False, RandomForestRegressor__max_features=0.25, RandomForestRegressor__min_samples_leaf=3, RandomForestRegressor__min_samples_split=6, RandomForestRegressor__n_estimators=100)



Optimization Progress:  91%|█████████ | 100/110 [47:07<07:20, 44.05s/pipeline]

_pre_test decorator: _random_mutation_operator: num_test=0 precomputed was provided as affinity. Ward can only work with euclidean distances.


                                                                              

Generation 10 - Current Pareto front scores:
-1	-5064.940050386366	RandomForestRegressor(input_matrix, RandomForestRegressor__bootstrap=False, RandomForestRegressor__max_features=0.25, RandomForestRegressor__min_samples_leaf=3, RandomForestRegressor__min_samples_split=6, RandomForestRegressor__n_estimators=100)

-4177.2478239


True

In [4]:
!pip install features

Collecting features
  Downloading features-0.5.8-py2.py3-none-any.whl
Collecting concepts<1.0,>=0.7 (from features)
  Downloading concepts-0.7.12-py2.py3-none-any.whl
Collecting fileconfig<1.0,>=0.5 (from features)
  Downloading fileconfig-0.5.6-py2.py3-none-any.whl
Collecting graphviz<1.0,>=0.3 (from features)
  Downloading graphviz-0.8.2-py2.py3-none-any.whl
Collecting bitsets<1.0,>=0.7 (from concepts<1.0,>=0.7->features)
  Downloading bitsets-0.7.12-py2.py3-none-any.whl
Installing collected packages: bitsets, graphviz, concepts, fileconfig, features
Successfully installed bitsets-0.7.12 concepts-0.7.12 features-0.5.8 fileconfig-0.5.6 graphviz-0.8.2


In [6]:
!pip install featuretools

Collecting featuretools
  Downloading featuretools-0.1.18.tar.gz (128kB)
Collecting s3fs>=0.1.2 (from featuretools)
  Downloading s3fs-0.1.4.tar.gz
Collecting tqdm>=4.19.2 (from featuretools)
  Downloading tqdm-4.19.8-py2.py3-none-any.whl (52kB)
Collecting future>=0.16.0 (from featuretools)
  Downloading future-0.16.0.tar.gz (824kB)
Collecting pympler>=0.5 (from featuretools)
  Downloading Pympler-0.5.tar.gz (170kB)
Collecting boto3 (from s3fs>=0.1.2->featuretools)
  Downloading boto3-1.6.11-py2.py3-none-any.whl (128kB)
Collecting botocore<1.10.0,>=1.9.11 (from boto3->s3fs>=0.1.2->featuretools)
  Downloading botocore-1.9.11-py2.py3-none-any.whl (4.1MB)
Collecting s3transfer<0.2.0,>=0.1.10 (from boto3->s3fs>=0.1.2->featuretools)
  Downloading s3transfer-0.1.13-py2.py3-none-any.whl (59kB)
Collecting jmespath<1.0.0,>=0.7.1 (from boto3->s3fs>=0.1.2->featuretools)
  Downloading jmespath-0.9.3-py2.py3-none-any.whl
Building wheels for collected packages: featuretools, s3fs, future, pympler
  

In [17]:
import featuretools as ft

In [18]:
energy_data = energy_data.drop(['rv1','rv2'],axis=1)

y_featuretools = energy_data[['date','Appliances']]
X_featuretools = energy_data.drop(['Appliances'],axis=1)
entities ={"appliances" :(y_featuretools,"date"),
          "rest" :(X_featuretools,"date")}
relationships = [("appliances","date","rest","date")]

In [19]:
feature_matrix_app,features_defs = ft.dfs(entities=entities,relationships=relationships,target_entity="appliances")

In [20]:
feature_matrix_app1 = feature_matrix_app

In [21]:
feature_matrix_app1.dtypes

Appliances                     int64
SUM(rest.lights)               int64
SUM(rest.T1)                 float64
SUM(rest.RH_1)               float64
SUM(rest.T2)                 float64
SUM(rest.RH_2)               float64
SUM(rest.T3)                 float64
SUM(rest.RH_3)               float64
SUM(rest.T4)                 float64
SUM(rest.RH_4)               float64
SUM(rest.T5)                 float64
SUM(rest.RH_5)               float64
SUM(rest.T6)                 float64
SUM(rest.RH_6)               float64
SUM(rest.T7)                 float64
SUM(rest.RH_7)               float64
SUM(rest.T8)                 float64
SUM(rest.RH_8)               float64
SUM(rest.T9)                 float64
SUM(rest.RH_9)               float64
SUM(rest.T_out)              float64
SUM(rest.Press_mm_hg)        float64
SUM(rest.RH_out)             float64
SUM(rest.Windspeed)          float64
SUM(rest.Visibility)         float64
SUM(rest.Tdewpoint)          float64
SUM(rest.year)                 int64
S

In [32]:
y_featuretools = y_featuretools.drop('date',axis=1)
y_featuretools = y_featuretools.values

AttributeError: 'numpy.ndarray' object has no attribute 'drop'

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(feature_matrix_app1, y_featuretools, test_size=0.25, random_state=42)