In [15]:
import numpy as np
import pandas as pd
import os
import sys
import matplotlib.pyplot as plt
%matplotlib inline

In [16]:
df = pd.read_csv('../Data/external/datatest.txt', parse_dates=True, index_col=1 )
df.drop(columns=['date'], inplace=True)

In [17]:
df.head()

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
2015-02-02 14:19:00,23.7,26.272,585.2,749.2,0.004764,1
2015-02-02 14:19:59,23.718,26.29,578.4,760.4,0.004773,1
2015-02-02 14:21:00,23.73,26.23,572.666667,769.666667,0.004765,1
2015-02-02 14:22:00,23.7225,26.125,493.75,774.75,0.004744,1
2015-02-02 14:23:00,23.754,26.2,488.6,779.0,0.004767,1


In [18]:
def add_time_features(df):
    df['hour'] = df.index.hour
    df['dayofweek'] = df.index.dayofweek
    df['dayofmonth'] = df.index.day
    
    df['sine_hr'] = np.sin(df.index.hour/6)
    df['cos_hr'] = np.cos(df.index.hour/6)
    return df

In [19]:
df_features = add_time_features(df)

In [20]:
df_features.head()

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy,hour,dayofweek,dayofmonth,sine_hr,cos_hr
2015-02-02 14:19:00,23.7,26.272,585.2,749.2,0.004764,1,14,0,2,0.723086,-0.690758
2015-02-02 14:19:59,23.718,26.29,578.4,760.4,0.004773,1,14,0,2,0.723086,-0.690758
2015-02-02 14:21:00,23.73,26.23,572.666667,769.666667,0.004765,1,14,0,2,0.723086,-0.690758
2015-02-02 14:22:00,23.7225,26.125,493.75,774.75,0.004744,1,14,0,2,0.723086,-0.690758
2015-02-02 14:23:00,23.754,26.2,488.6,779.0,0.004767,1,14,0,2,0.723086,-0.690758


In [21]:
df_features.rename(columns={'Occupancy': 'class'}, inplace=True)

In [22]:
df_features.head()

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,class,hour,dayofweek,dayofmonth,sine_hr,cos_hr
2015-02-02 14:19:00,23.7,26.272,585.2,749.2,0.004764,1,14,0,2,0.723086,-0.690758
2015-02-02 14:19:59,23.718,26.29,578.4,760.4,0.004773,1,14,0,2,0.723086,-0.690758
2015-02-02 14:21:00,23.73,26.23,572.666667,769.666667,0.004765,1,14,0,2,0.723086,-0.690758
2015-02-02 14:22:00,23.7225,26.125,493.75,774.75,0.004744,1,14,0,2,0.723086,-0.690758
2015-02-02 14:23:00,23.754,26.2,488.6,779.0,0.004767,1,14,0,2,0.723086,-0.690758


In [23]:
df_features.isnull().mean()

Temperature      0.0
Humidity         0.0
Light            0.0
CO2              0.0
HumidityRatio    0.0
class            0.0
hour             0.0
dayofweek        0.0
dayofmonth       0.0
sine_hr          0.0
cos_hr           0.0
dtype: float64

In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(df_features.drop(columns=['class'], axis=1), df_features['class'], test_size=0.2, stratify=df_features['class'])

In [26]:
from tpot import TPOTClassifier
tpot = TPOTClassifier(verbosity=2, max_time_mins=60*2, max_eval_time_mins=5, population_size=40)
tpot.fit(X_train.values, Y_train.values)

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=40.0, style=ProgressStyle(des…


Generation 1 - Current best internal CV score: 0.9854614022935427
Generation 2 - Current best internal CV score: 0.9854614022935427
Generation 3 - Current best internal CV score: 0.9878055216545173
Generation 4 - Current best internal CV score: 0.9878055216545173
Generation 5 - Current best internal CV score: 0.9878055216545173
Generation 6 - Current best internal CV score: 0.9878066211476509
Generation 7 - Current best internal CV score: 0.9878110191201855
Generation 8 - Current best internal CV score: 0.9882794031951271
Generation 9 - Current best internal CV score: 0.9882794031951271
Generation 10 - Current best internal CV score: 0.9882794031951271
Generation 11 - Current best internal CV score: 0.9882794031951271
Generation 12 - Current best internal CV score: 0.9892172708381436
Generation 13 - Current best internal CV score: 0.9892172708381436
Generation 14 - Current best internal CV score: 0.9892172708381436
Generation 15 - Current best internal CV score: 0.9892172708381436
Gen

TPOTClassifier(config_dict=None, crossover_rate=0.1, cv=5,
               disable_update_check=False, early_stop=None, generations=100,
               log_file=<ipykernel.iostream.OutStream object at 0x000002A24E8CCC88>,
               max_eval_time_mins=5, max_time_mins=120, memory=None,
               mutation_rate=0.9, n_jobs=1, offspring_size=None,
               periodic_checkpoint_folder=None, population_size=40,
               random_state=None, scoring=None, subsample=1.0, template=None,
               use_dask=False, verbosity=2, warm_start=False)

In [27]:
tpot.export('best_pipeline.py')