## Solar

Project Solar is an attempt to equire information about sourounding envrionment in a living space, analize it, and build predictions in order to answer:

> Determine value of artificial light to counteract its natural deficit

+ Given time of the day, provide an answer about the level of light
+ Given time of the day (and day of the week) - provide an answer if light should be on

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_pickle('data/sensing_numeric.pkl')
df.columns

Index(['motion', 'light_scaled', 'sound_scaled', 'location_black',
       'location_blue', 'location_green', 'location_orange', 'location_purple',
       'sun_evening', 'sun_morning', 'sun_night', 'sun_noon', 'sun_sunrise',
       'sun_sunset', 'dot_week', 'sound_log_scaled', 'light_log_scaled'],
      dtype='object')

#### Adding classification feature (predictable benchmark)

In [3]:
df['light_log_scaled'].describe()

count    373632.000000
mean          3.119393
std           2.613058
min           0.000000
25%           0.803937
50%           1.984868
75%           6.535657
max           9.339017
Name: light_log_scaled, dtype: float64

Calculate 25 percentile of light at each location.

In [4]:
def get_daily_light_percentile(dataframe, q):
    light_log_scaled_percentile = lambda x: np.percentile(x, q = q)
    
    light_daily_banchmark = dataframe.groupby([
        dataframe.index.get_level_values('timestamp').day,
        'location_black', 
        'location_blue', 
        'location_green', 
        'location_orange', 
        'location_purple']).agg({
            'light_log_scaled': light_log_scaled_percentile
        }).rename(columns={'light_log_scaled': 'light_log_scaled_percentile_{}'.format(q)})
    return light_daily_banchmark

daily_light_percentile_25 = get_daily_light_percentile(df, 25)

Add percentile to data set.

In [5]:
def add_quantile(dataframe):
    location_indexer = [
        'location_black', 
        'location_blue', 
        'location_green', 
        'location_orange', 
        'location_purple'
    ]

    for time in dataframe.index.get_level_values('timestamp').unique():
        day_group = dataframe.xs(time, level='timestamp', axis=0)
        for location_name in location_indexer:
            percentile = day_group.xs(1, level=location_name)['light_log_scaled_percentile_25']
            df.loc[((df.index.day == time) & (df[location_name] == 1)), 'daily_light_percentile_25'] = percentile.values[0] if len(percentile.values) else 0
        
add_quantile(daily_light_percentile_25)

Hourly motion deviation per day of the week at each location 

In [6]:
motion_hourly_banchmark = df.groupby([
    df.index.get_level_values('timestamp').hour,
    'dot_week',
    'location_black', 
    'location_blue', 
    'location_green', 
    'location_orange', 
    'location_purple']).agg({'motion': ['sum', 'std']})

In [7]:
def add_motion_std(dataframe):
    location_indexer = [
        'location_black', 
        'location_blue', 
        'location_green', 
        'location_orange', 
        'location_purple'
    ]

    for time in dataframe.index.get_level_values('timestamp').unique():
        day_hour_group = dataframe.xs(time, level='timestamp', axis=0)
        for dotw in day_hour_group.index.get_level_values('dot_week').unique():
            day_group = day_hour_group.xs(dotw, level='dot_week', axis=0)
            for location_name in location_indexer:
                std_dev = day_group.xs(1, level=location_name)['motion']['std']
                df.loc[((df.index.day == time) & (df['dot_week'] == dotw) & (df[location_name] == 1)), 'daily_motion_std_dev'] = std_dev.values[0] if len(std_dev.values) else 0
        
add_motion_std(motion_hourly_banchmark)

In [8]:
MINIMUM_DAILY_HOUR_MOTION_DEVIATION = .15

def get_light_expected(row):
    return 1 if ((row['daily_motion_std_dev'] > MINIMUM_DAILY_HOUR_MOTION_DEVIATION) &
        (row['light_log_scaled'] < row['daily_light_percentile_25'])) else 0

df['light_expected'] = df.apply(get_light_expected, axis=1)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 373632 entries, 2018-05-01 01:09:00 to 2018-05-11 23:29:00
Data columns (total 20 columns):
motion                       373632 non-null int64
light_scaled                 373632 non-null float64
sound_scaled                 373632 non-null float64
location_black               373632 non-null uint8
location_blue                373632 non-null uint8
location_green               373632 non-null uint8
location_orange              373632 non-null uint8
location_purple              373632 non-null uint8
sun_evening                  373632 non-null uint8
sun_morning                  373632 non-null uint8
sun_night                    373632 non-null uint8
sun_noon                     373632 non-null uint8
sun_sunrise                  373632 non-null uint8
sun_sunset                   373632 non-null uint8
dot_week                     373632 non-null int64
sound_log_scaled             373632 non-null float64
light_log_scaled             3736

#### Feature selection (including sensing)

In [10]:
y = df['light_expected']
X = df[[
    'motion',
    'light_log_scaled', 
    'sound_log_scaled', 
    'location_black', 
    'location_blue', 
    'location_green', 
    'location_orange', 
    'location_purple', 
    'sun_evening', 
    'sun_morning', 
    'sun_night', 
    'sun_noon', 
    'sun_sunrise', 
    'sun_sunset',
    'dot_week']]

#### Train split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

#### Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression
from helpers import print_predict_scores
from helpers import fit_predict
logReg = LogisticRegression()

In [13]:
(model, prediction) = fit_predict(X_train, y_train, logReg)

p-values:
[  3.36373750e-135   0.00000000e+000   5.02931700e-050   1.11887020e-116
   2.51581127e-147   0.00000000e+000   0.00000000e+000   0.00000000e+000
   1.61331000e-185   0.00000000e+000   0.00000000e+000   0.00000000e+000
   6.87339696e-209   9.08480429e-082   0.00000000e+000]

coefficients:
[[-0.12354448 -1.27128969  1.0061933   2.06246922  0.37854194 -2.50466148
   0.13207178  0.47877156 -1.50853732 -0.97097978 -1.10801083  0.66569838
  -0.7275598  -1.83053609 -0.19143677]]

intercept: [ 0.54719303]

score: 0.8920970366563892

mean square root: 0.3284858647546509



In [14]:
print_predict_scores(y_train, prediction)

mean_squared_error:
0.10790296334361082

precision_score:
0.6917362594678579

recall_score:
0.3735938114592894

classification_report:
             precision    recall  f1-score   support

          0       0.91      0.97      0.94    242089
          1       0.69      0.37      0.49     38135

avg / total       0.88      0.89      0.88    280224


mean square root: 0.3284858647546509



In [15]:
print('Test score r2:', logReg.score(X_test,y_test))

Test score r2: 0.89391700925


#### Stochastic Gradient Descent classifier

In [16]:
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier()

In [17]:
(model, prediction) = fit_predict(X_train, y_train, logReg)

p-values:
[  3.36373750e-135   0.00000000e+000   5.02931700e-050   1.11887020e-116
   2.51581127e-147   0.00000000e+000   0.00000000e+000   0.00000000e+000
   1.61331000e-185   0.00000000e+000   0.00000000e+000   0.00000000e+000
   6.87339696e-209   9.08480429e-082   0.00000000e+000]

coefficients:
[[-0.12354448 -1.27128969  1.0061933   2.06246922  0.37854194 -2.50466148
   0.13207178  0.47877156 -1.50853732 -0.97097978 -1.10801083  0.66569838
  -0.7275598  -1.83053609 -0.19143677]]

intercept: [ 0.54719303]

score: 0.8920970366563892

mean square root: 0.3284858647546509



In [18]:
print_predict_scores(y_train, prediction)

mean_squared_error:
0.10790296334361082

precision_score:
0.6917362594678579

recall_score:
0.3735938114592894

classification_report:
             precision    recall  f1-score   support

          0       0.91      0.97      0.94    242089
          1       0.69      0.37      0.49     38135

avg / total       0.88      0.89      0.88    280224


mean square root: 0.3284858647546509



In [19]:
print('Test score:', model.score(X_test,y_test))

Test score: 0.89391700925


#### Parameters optimization - cross validation

In [20]:
from sklearn.model_selection import GridSearchCV, KFold

In [21]:
param_grid = [
    {'C': [10**-i for i in range(-5, 5)], 'class_weight': [None, 'balanced']}
]
grid = GridSearchCV(
    estimator=logReg,
    param_grid=param_grid,
    cv=7,
    scoring = 'neg_mean_squared_error'
)

In [22]:
grid.fit(X_train, y_train)
grid.best_estimator_

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [23]:
(model, prediction) = fit_predict(X_train, y_train, grid.best_estimator_)

p-values:
[  3.36373750e-135   0.00000000e+000   5.02931700e-050   1.11887020e-116
   2.51581127e-147   0.00000000e+000   0.00000000e+000   0.00000000e+000
   1.61331000e-185   0.00000000e+000   0.00000000e+000   0.00000000e+000
   6.87339696e-209   9.08480429e-082   0.00000000e+000]

coefficients:
[[-0.12354448 -1.27128969  1.0061933   2.06246922  0.37854194 -2.50466148
   0.13207178  0.47877156 -1.50853732 -0.97097978 -1.10801083  0.66569838
  -0.7275598  -1.83053609 -0.19143677]]

intercept: [ 0.54719303]

score: 0.8920970366563892

mean square root: 0.3284858647546509



In [24]:
print_predict_scores(y_train, prediction)

mean_squared_error:
0.10790296334361082

precision_score:
0.6917362594678579

recall_score:
0.3735938114592894

classification_report:
             precision    recall  f1-score   support

          0       0.91      0.97      0.94    242089
          1       0.69      0.37      0.49     38135

avg / total       0.88      0.89      0.88    280224


mean square root: 0.3284858647546509



In [25]:
print('Test score r2:', model.score(X_test,y_test))

Test score r2: 0.89391700925


#### Testing Lasso, Ridge, ElasticNet

In [383]:
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error

In [384]:
params = [
    {'alpha': [0.1, 0.2, 0.3, 0.5]}
]
lasso = Lasso()
grid_search_lasso = GridSearchCV(lasso, params, cv=5, scoring='neg_mean_squared_error')
grid_search_lasso.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'alpha': [0.1, 0.2, 0.3, 0.5]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [385]:
grid_search_lasso.best_estimator_
lasso_model = grid_search_lasso.best_estimator_
lasso_model.fit(X_train, y_train)

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [388]:
prediction2 = lasso_model.predict(X_train)
mse2 = mean_squared_error(prediction2, y_train)
print('mean square root error :\n{}\n'.format(np.sqrt(mse2)))

mean square root error :
0.3302167456607731



In [389]:
print('Train score r2:', lasso_model.score(X_train,y_train))
print('Test score r2:', lasso_model.score(X_test,y_test))

Train score r2: 0.0686572129826
Test score r2: 0.0693025889915


In [390]:
ridge = Ridge()
grid_seaerch_ridge = GridSearchCV(ridge, params, cv = 3, scoring='neg_mean_squared_error')
grid_seaerch_ridge.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'alpha': [0.1, 0.2, 0.3, 0.5]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [391]:
grid_seaerch_ridge.best_score_
ridge_model = grid_seaerch_ridge.best_estimator_
ridge_model.fit(X_train, y_train)

Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [392]:
prediction3 = ridge_model.predict(X_train)
mse3 = mean_squared_error(prediction3, y_train)
print('square root:\n{}\n'.format(np.sqrt(mse3)))

square root:
0.299346314449808



In [393]:
print('Train score r2:', ridge_model.score(X_train,y_train))
print('Test score r2:', ridge_model.score(X_test,y_test))

Train score r2: 0.234651561786
Test score r2: 0.238879212989


In [394]:
en = ElasticNet()
grid_seaerch_en = GridSearchCV(en, params, cv = 3, scoring='neg_mean_squared_error')
grid_seaerch_en.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'alpha': [0.1, 0.2, 0.3, 0.5]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [395]:
grid_seaerch_en.best_score_
en_model = grid_seaerch_en.best_estimator_
en_model.fit(X_train, y_train)

ElasticNet(alpha=0.1, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [396]:
prediction4 = en_model.predict(X_train)
mse4 = mean_squared_error(prediction4, y_train)
print('square root:\n{}\n'.format(np.sqrt(mse4)))

square root:
0.3277927369762515



In [397]:
print('Train score r2:', en_model.score(X_train,y_train))
print('Test score r2:', en_model.score(X_test,y_test))

Train score r2: 0.0822803678317
Test score r2: 0.0835774566158


#### KNeighbors

In [398]:
df.columns

Index(['motion', 'light_scaled', 'sound_scaled', 'location_black',
       'location_blue', 'location_green', 'location_orange', 'location_purple',
       'sun_evening', 'sun_morning', 'sun_night', 'sun_noon', 'sun_sunrise',
       'sun_sunset', 'dot_week', 'sound_log_scaled', 'light_log_scaled',
       'daily_light_percentile_25', 'daily_motion_std_dev', 'light_expected'],
      dtype='object')

In [407]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
knn = KNeighborsClassifier(n_neighbors=5)

In [408]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [409]:
prediction5 = knn.predict(X_train)

In [410]:
print_predict_scores(y_train, prediction5)

mean_squared_error:
0.0012525693730729701

precision_score:
0.9958325640281698

recall_score:
0.9949140929693264

classification_report:
             precision    recall  f1-score   support

          0       1.00      1.00      1.00    242276
          1       1.00      0.99      1.00     37948

avg / total       1.00      1.00      1.00    280224


mean square root: 0.03539165682859408



In [411]:
print(confusion_matrix(y_train, prediction5))

[[242118    158]
 [   193  37755]]


In [412]:
print('Train score r2:', knn.score(X_train,y_train))
print('Test score r2:', knn.score(X_test,y_test))

Train score r2: 0.998747430627
Test score r2: 0.997537684138


In [413]:
from sklearn.model_selection import cross_val_predict

In [414]:
y_pred = cross_val_predict(knn, X_train, y_train)

In [415]:
conf_mx = confusion_matrix(y_train, y_pred)
conf_mx

array([[241899,    377],
       [   443,  37505]])

Regression models:
    - linear support regressor
    - linear vs non-linear kernel
    - random forest
    - XG Boost model

Keras - Python library for LSTMs modeling
- PyTorch
