In [50]:
import pandas as pd
import numpy as np


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
import tsfresh


from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling, margin_sampling, entropy_sampling

In [46]:
# 1) Load the old dataset
df = pd.read_csv('./2020-06-13Box0PeopleCounting.csv')
df.sort_values(by='time', inplace=True)
df.drop('Unnamed: 0', axis=1, inplace=True)

df


Unnamed: 0,box_name,channel_name,time,value,label
0,Box0,Accel X,2020-06-13 15:40:00,-0.060,12345
17,Box0,B,2020-06-13 15:40:00,397.000,12345
16,Box0,Pressure,2020-06-13 15:40:00,1020.746,12345
15,Box0,Audio,2020-06-13 15:40:00,0.557,12345
14,Box0,Magnet Z,2020-06-13 15:40:00,-77.250,12345
...,...,...,...,...,...
48601,Box0,Pressure,2020-06-13 16:25:00,1020.597,1245
48600,Box0,G,2020-06-13 16:25:00,1591.000,1245
48616,Box0,Color Temp (K),2020-06-13 16:25:00,4216.000,1245
48607,Box0,B,2020-06-13 16:25:00,1183.000,1245


In [39]:
df['label'].unique()

array(['1,2,3,4,5', '1,2,4,5', '2,3,4', '2,3,4,5', '2,3'], dtype=object)

In [10]:
# 2) get the segments by seeing where the labels change

start_indices = [df.iloc[0]['time']]

for i in range(1, len(df)):
    if df.iloc[i]['label'] != df.iloc[i-1]['label']:
        start_indices.append(df.iloc[i]['time'])

start_indices.append(df.iloc[-1]['time'])

start_indices

['2020-06-13 15:40:00',
 '2020-06-13 15:52:00',
 '2020-06-13 15:56:00',
 '2020-06-13 16:01:00',
 '2020-06-13 16:13:00',
 '2020-06-13 16:16:00',
 '2020-06-13 16:17:00',
 '2020-06-13 16:21:00',
 '2020-06-13 16:25:00']

In [11]:
segments = []
for i in range(1, len(start_indices)):
    segments.append((start_indices[i-1], start_indices[i]))
    
print(segments)

[('2020-06-13 15:40:00', '2020-06-13 15:52:00'), ('2020-06-13 15:52:00', '2020-06-13 15:56:00'), ('2020-06-13 15:56:00', '2020-06-13 16:01:00'), ('2020-06-13 16:01:00', '2020-06-13 16:13:00'), ('2020-06-13 16:13:00', '2020-06-13 16:16:00'), ('2020-06-13 16:16:00', '2020-06-13 16:17:00'), ('2020-06-13 16:17:00', '2020-06-13 16:21:00'), ('2020-06-13 16:21:00', '2020-06-13 16:25:00')]


In [47]:
# 3) Relabel and normalize all labels to numerals (0, 1, 2, 3+)
for segment in segments:
    print(df.loc[df['time'] == segment[0], 'label'].iloc[0])
    numPeopleinSegment = len(str(df.loc[df['time'] == segment[0], 'label'].iloc[0]).split(","))
    print(numPeopleinSegment)
    df.loc[((segment[0] <= df['time']) & (df['time'] < segment[1])), 'label'] = numPeopleinSegment

# fix for last time stamp
df.loc[df['time'] == start_indices[-1], 'label'] = 4


df['label'].unique()
    


1,2,3,4,5
5
1,2,4,5
4
1,2,3,4,5
5
2,3,4
3
1,2,3,4,5
5
2,3,4,5
4
2,3
2
1,2,4,5
4


array([5, 4, 3, 2], dtype=int64)

In [52]:
df

Unnamed: 0,box_name,channel_name,time,value,label
0,Box0,Accel X,2020-06-13 15:40:00,-0.060,5
17,Box0,B,2020-06-13 15:40:00,397.000,5
16,Box0,Pressure,2020-06-13 15:40:00,1020.746,5
15,Box0,Audio,2020-06-13 15:40:00,0.557,5
14,Box0,Magnet Z,2020-06-13 15:40:00,-77.250,5
...,...,...,...,...,...
48601,Box0,Pressure,2020-06-13 16:25:00,1020.597,4
48600,Box0,G,2020-06-13 16:25:00,1591.000,4
48616,Box0,Color Temp (K),2020-06-13 16:25:00,4216.000,4
48607,Box0,B,2020-06-13 16:25:00,1183.000,4


In [54]:
# 4) get features for each segment

segment_features = np.empty((0, 13734), float)

for index, segment in enumerate(segments):
    if index == len(segments) - 1:
        # only last segment fix, must include the last index
        # note the change in <= from < in second line
        segment_df = df.loc[((segment[0] <= df['time']) & 
                             (df['time'] <= segment[1]))]
    else:
        segment_df = df.loc[((segment[0] <= df['time']) & 
                             (df['time'] < segment[1]))]
        
    features = tsfresh.extract_features(
                    segment_df, 
                    column_id="box_name",
                    column_sort="time", 
                    column_kind="channel_name", 
                    column_value="value")
    print(features)
    print(len(features))
    print(type(features))
    print(features.to_numpy().shape)
    segment_features = np.append(segment_features, 
                        features.to_numpy(), axis=0)
    print(segment_features)
    
print("Done computing the features: ", segment_features.shape)



Feature Extraction: 100%|████████████████████████████████████████████████████████████████| 9/9 [00:17<00:00,  1.96s/it]
Feature Extraction:   0%|                                                                        | 0/9 [00:00<?, ?it/s]

variable  Accel X__abs_energy  Accel X__absolute_sum_of_changes  \
id                                                                
Box0                   2.9647                              6.06   

variable  Accel X__agg_autocorrelation__f_agg_"mean"__maxlag_40  \
id                                                                
Box0                                              -0.003101       

variable  Accel X__agg_autocorrelation__f_agg_"median"__maxlag_40  \
id                                                                  
Box0                                               0.003086         

variable  Accel X__agg_autocorrelation__f_agg_"var"__maxlag_40  \
id                                                               
Box0                                               0.001537      

variable  Accel X__agg_linear_trend__attr_"intercept"__chunk_len_10__f_agg_"max"  \
id                                                                                 
Box0                 

Feature Extraction: 100%|████████████████████████████████████████████████████████████████| 9/9 [00:07<00:00,  1.17it/s]
Feature Extraction:   0%|                                                                        | 0/9 [00:00<?, ?it/s]

variable  Accel X__abs_energy  Accel X__absolute_sum_of_changes  \
id                                                                
Box0                   0.8797                              8.35   

variable  Accel X__agg_autocorrelation__f_agg_"mean"__maxlag_40  \
id                                                                
Box0                                              -0.010761       

variable  Accel X__agg_autocorrelation__f_agg_"median"__maxlag_40  \
id                                                                  
Box0                                              -0.015203         

variable  Accel X__agg_autocorrelation__f_agg_"var"__maxlag_40  \
id                                                               
Box0                                               0.006724      

variable  Accel X__agg_linear_trend__attr_"intercept"__chunk_len_10__f_agg_"max"  \
id                                                                                 
Box0                 

Feature Extraction: 100%|████████████████████████████████████████████████████████████████| 9/9 [00:08<00:00,  1.07it/s]
Feature Extraction:   0%|                                                                        | 0/9 [00:00<?, ?it/s]

variable  Accel X__abs_energy  Accel X__absolute_sum_of_changes  \
id                                                                
Box0                  16.4503                             15.25   

variable  Accel X__agg_autocorrelation__f_agg_"mean"__maxlag_40  \
id                                                                
Box0                                               0.446345       

variable  Accel X__agg_autocorrelation__f_agg_"median"__maxlag_40  \
id                                                                  
Box0                                               0.419491         

variable  Accel X__agg_autocorrelation__f_agg_"var"__maxlag_40  \
id                                                               
Box0                                                0.02817      

variable  Accel X__agg_linear_trend__attr_"intercept"__chunk_len_10__f_agg_"max"  \
id                                                                                 
Box0                 

Feature Extraction: 100%|████████████████████████████████████████████████████████████████| 9/9 [00:16<00:00,  1.85s/it]
Feature Extraction:   0%|                                                                        | 0/9 [00:00<?, ?it/s]

variable  Accel X__abs_energy  Accel X__absolute_sum_of_changes  \
id                                                                
Box0                  27.4645                             22.16   

variable  Accel X__agg_autocorrelation__f_agg_"mean"__maxlag_40  \
id                                                                
Box0                                               0.592864       

variable  Accel X__agg_autocorrelation__f_agg_"median"__maxlag_40  \
id                                                                  
Box0                                               0.591403         

variable  Accel X__agg_autocorrelation__f_agg_"var"__maxlag_40  \
id                                                               
Box0                                               0.000342      

variable  Accel X__agg_linear_trend__attr_"intercept"__chunk_len_10__f_agg_"max"  \
id                                                                                 
Box0                 

Feature Extraction: 100%|████████████████████████████████████████████████████████████████| 9/9 [00:06<00:00,  1.43it/s]
Feature Extraction:   0%|                                                                        | 0/9 [00:00<?, ?it/s]

variable  Accel X__abs_energy  Accel X__absolute_sum_of_changes  \
id                                                                
Box0                   3.0537                              7.35   

variable  Accel X__agg_autocorrelation__f_agg_"mean"__maxlag_40  \
id                                                                
Box0                                              -0.014377       

variable  Accel X__agg_autocorrelation__f_agg_"median"__maxlag_40  \
id                                                                  
Box0                                              -0.007964         

variable  Accel X__agg_autocorrelation__f_agg_"var"__maxlag_40  \
id                                                               
Box0                                               0.012098      

variable  Accel X__agg_linear_trend__attr_"intercept"__chunk_len_10__f_agg_"max"  \
id                                                                                 
Box0                 

Feature Extraction: 100%|████████████████████████████████████████████████████████████████| 9/9 [00:06<00:00,  1.44it/s]
Feature Extraction:   0%|                                                                        | 0/9 [00:00<?, ?it/s]

variable  Accel X__abs_energy  Accel X__absolute_sum_of_changes  \
id                                                                
Box0                   0.9402                               2.6   

variable  Accel X__agg_autocorrelation__f_agg_"mean"__maxlag_40  \
id                                                                
Box0                                               0.009492       

variable  Accel X__agg_autocorrelation__f_agg_"median"__maxlag_40  \
id                                                                  
Box0                                               0.002965         

variable  Accel X__agg_autocorrelation__f_agg_"var"__maxlag_40  \
id                                                               
Box0                                               0.029966      

variable  Accel X__agg_linear_trend__attr_"intercept"__chunk_len_10__f_agg_"max"  \
id                                                                                 
Box0                 

Feature Extraction: 100%|████████████████████████████████████████████████████████████████| 9/9 [00:07<00:00,  1.28it/s]
Feature Extraction:   0%|                                                                        | 0/9 [00:00<?, ?it/s]

variable  Accel X__abs_energy  Accel X__absolute_sum_of_changes  \
id                                                                
Box0                   6.8465                              6.05   

variable  Accel X__agg_autocorrelation__f_agg_"mean"__maxlag_40  \
id                                                                
Box0                                               0.025343       

variable  Accel X__agg_autocorrelation__f_agg_"median"__maxlag_40  \
id                                                                  
Box0                                              -0.006043         

variable  Accel X__agg_autocorrelation__f_agg_"var"__maxlag_40  \
id                                                               
Box0                                                0.01226      

variable  Accel X__agg_linear_trend__attr_"intercept"__chunk_len_10__f_agg_"max"  \
id                                                                                 
Box0                 

Feature Extraction: 100%|████████████████████████████████████████████████████████████████| 9/9 [00:07<00:00,  1.23it/s]


variable  Accel X__abs_energy  Accel X__absolute_sum_of_changes  \
id                                                                
Box0                   5.8112                              4.39   

variable  Accel X__agg_autocorrelation__f_agg_"mean"__maxlag_40  \
id                                                                
Box0                                               0.010794       

variable  Accel X__agg_autocorrelation__f_agg_"median"__maxlag_40  \
id                                                                  
Box0                                               0.016192         

variable  Accel X__agg_autocorrelation__f_agg_"var"__maxlag_40  \
id                                                               
Box0                                               0.008708      

variable  Accel X__agg_linear_trend__attr_"intercept"__chunk_len_10__f_agg_"max"  \
id                                                                                 
Box0                 

In [59]:
#5) train classifier on these features

labels = [5, 4, 5, 3, 5, 4, 2, 4]
X_training, y_training = np.nan_to_num(segment_features.astype('float32')), labels

rf = RandomForestClassifier(random_state=1)

#initialize learner
learner = ActiveLearner(estimator= rf,  ## nn works awful, rf is best
                    query_strategy = margin_sampling,  ## margin sampling worked best
                    X_training=X_training, y_training=y_training)




In [58]:
segment_features.dtype

dtype('float64')