# Feature mapping using ML

## Loading user's data

In [62]:
import numpy as np;
import gzip;
import io as StringIO;

def parse_header_of_csv(csv_str):
    # Isolate the headline columns:

    headline = csv_str[:csv_str.index('\n')]
    columns = headline.split(',')

    # The first column should be timestamp:
    assert columns[0] == 'timestamp'
    # The last column should be label_source:
    assert columns[-1] == 'label_source'
    
#     print('columns', columns, len(columns))
    
    # Search for the first column that starts with 'label:''
    for (ci,col) in enumerate(columns):
        if col.startswith('label:'):
            first_label_ind = ci
            break
        pass

    # The columns that we use as 'features' come after timestamp and before the labels:
    feature_names = columns[1:first_label_ind]
    
    # The columns that we use as labels: 
    # From the first label" column' all the way to the penultimate column "label source"
    label_names = columns[first_label_ind:-1]
    
    # Remove the redundant prefix 'label:' for all the selected columns
    for (li, label) in enumerate(label_names):
        
        assert label.startswith('label:')
        label_names[li] = label.replace('label:','')
        pass
    
    return (feature_names,label_names)

def parse_body_of_csv(csv_str,n_features):
    # Read the entire CSV body into a single numeric matrix:
    full_table = np.loadtxt(StringIO.StringIO(csv_str),delimiter=',',skiprows=1)
    
    # Timestamp is the primary key for the records (examples):
    timestamps = full_table[:,0].astype(int)
    
    # Read the sensor features:
    X = full_table[:,1:(n_features+1)]
    
    # Read the binary label values, and the 'missing label' indicators:
    # This should have values of either 0., 1. or NaN
    trinary_labels_mat = full_table[:,(n_features+1):-1]
    
    # M is the missing label matrix
    M = np.isnan(trinary_labels_mat)
    
    # Y is the label matrix
    Y = np.where(M, 0, trinary_labels_mat) > 0.
    
    return (X,Y,M,timestamps)

'''
Read the data (precomputed sensor-features and labels) for a user.
This function assumes the user's data file is present.
'''
def read_user_data(uuid):
    user_data_file = 'ExtraSensoryData/%s.features_labels.csv.gz' % uuid

    # Read the entire csv file of the user:
    with gzip.open(user_data_file,'rb') as fid:
        csv_str = fid.read().decode()
        pass

    feature_names, label_names = parse_header_of_csv(csv_str)
    n_features = len(feature_names)
    X, Y, M, timestamps  = parse_body_of_csv(csv_str,n_features)

    return (X,Y,M,timestamps,feature_names,label_names)

In [63]:
import os 
uuid_list = []

for file in os.listdir('ExtraSensoryData/'):
    if file.split('.')[-1] == 'gz':
        
        uuid_list.append(file.split('.')[0])
print('We have %d users data in total.' %(len(uuid_list)) )

We have 60 users data in total.


In [64]:
uuid = uuid_list[0]
print('Load the first user: %s' %(uuid))
X,Y,M,timestamps,feature_names,label_names = read_user_data(uuid)
print('Number of instance: %d (~1 min per instance)' %X.shape[0])
print('Features: %d' %(X.shape[1]))
print('Labels: %d' %(Y.shape[1]))
print('Feature matrix X: (%d , %d)' % (X.shape[0], X.shape[1]))
print('Label matrix Y: (%d , %d)' % (Y.shape[0], Y.shape[1]))
print('Missing label matrix M: (%d , %d)' % (M.shape[0], M.shape[1]))

Load the first user: 00EABED2-271D-49D8-B599-1D4A09240601
Number of instance: 2287 (~1 min per instance)
Features: 225
Labels: 51
Feature matrix X: (2287 , 225)
Label matrix Y: (2287 , 51)
Missing label matrix M: (2287 , 51)


In [65]:
n_examples_per_label = np.sum(Y, axis = 0)
labels_and_counts = zip(label_names,n_examples_per_label)

sorted_labels_and_counts = sorted(labels_and_counts,reverse=True,key=lambda pair:pair[1])
print("Total context labels the user has and the time spent on each of them:")
print("-"*20)
for (label,count) in sorted_labels_and_counts:
    print("label %s - %d minutes" % (label, count))
    pass
print(len(sorted_labels_and_counts))

Total context labels the user has and the time spent on each of them:
--------------------
label PHONE_ON_TABLE - 1594 minutes
label OR_indoors - 1284 minutes
label SITTING - 1106 minutes
label LOC_home - 995 minutes
label LYING_DOWN - 657 minutes
label SLEEPING - 477 minutes
label WITH_CO-WORKERS - 331 minutes
label IN_A_MEETING - 306 minutes
label IN_CLASS - 214 minutes
label LOC_main_workplace - 197 minutes
label OR_standing - 193 minutes
label FIX_walking - 163 minutes
label COMPUTER_WORK - 89 minutes
label COOKING - 55 minutes
label EATING - 54 minutes
label ON_A_BUS - 49 minutes
label CLEANING - 38 minutes
label SHOPPING - 19 minutes
label DRESSING - 17 minutes
label BATHING_-_SHOWER - 15 minutes
label WATCHING_TV - 2 minutes
label FIX_running - 0 minutes
label BICYCLING - 0 minutes
label LAB_WORK - 0 minutes
label OR_outside - 0 minutes
label IN_A_CAR - 0 minutes
label DRIVE_-_I_M_THE_DRIVER - 0 minutes
label DRIVE_-_I_M_A_PASSENGER - 0 minutes
label FIX_restaurant - 0 minutes
l

In [66]:
# The labels are provided with standardized names
# we can prettify them in a very straightforward way

def get_label_pretty_name(label):
    if label == 'FIX_walking':
        return 'Walking';
    if label == 'FIX_running':
        return 'Running';
    if label == 'LOC_main_workplace':
        return 'At main workplace';
    if label == 'OR_indoors':
        return 'Indoors';
    if label == 'OR_outside':
        return 'Outside';
    if label == 'LOC_home':
        return 'At home';
    if label == 'FIX_restaurant':
        return 'At a restaurant';
    if label == 'OR_exercise':
        return 'Exercise';
    if label == 'LOC_beach':
        return 'At the beach';
    if label == 'OR_standing':
        return 'Standing';
    if label == 'WATCHING_TV':
        return 'Watching TV'
    
    if label.endswith('_'):
        label = label[:-1] + ')';
        pass;
    
    label = label.replace('__',' (').replace('_',' ');
    label = label[0] + label[1:].lower();
    label = label.replace('i m','I\'m');
    return label;

In [67]:
def get_sensor_names_from_features(feature_names):
    feat_sensor_names = np.array([None for feat in feature_names]);
    for (fi,feat) in enumerate(feature_names):
        if feat.startswith('raw_acc'):
            feat_sensor_names[fi] = 'Acc';
            pass;
        elif feat.startswith('proc_gyro'):
            feat_sensor_names[fi] = 'Gyro';
            pass;
        elif feat.startswith('raw_magnet'):
            feat_sensor_names[fi] = 'Magnet';
            pass;
        elif feat.startswith('watch_acceleration'):
            feat_sensor_names[fi] = 'WAcc';
            pass;
        elif feat.startswith('watch_heading'):
            feat_sensor_names[fi] = 'Compass';
            pass;
        elif feat.startswith('location'):
            feat_sensor_names[fi] = 'Loc';
            pass;
        elif feat.startswith('location_quick_features'):
            feat_sensor_names[fi] = 'Loc';
            pass;
        elif feat.startswith('audio_naive'):
            feat_sensor_names[fi] = 'Aud';
            pass;
        elif feat.startswith('audio_properties'):
            feat_sensor_names[fi] = 'AP';
            pass;
        elif feat.startswith('discrete'):
            feat_sensor_names[fi] = 'PS';
            pass;
        elif feat.startswith('lf_measurements'):
            feat_sensor_names[fi] = 'LF';
            pass;
        else:
            raise ValueError("!!! Unsupported feature name: %s" % feat);

        pass;

    return feat_sensor_names; 

In [68]:
feature_translation={"accelerometer":['Acc'],
"gyroscopic":['Gyro'],
"magnetometer":['Magnet'],
"watch_accelerometer":['WAcc'],
"watch_compass":['Compass'],
"location":['Loc'],
"audio_magnitude":['Aud','AP'],
"phone_state":['PS'],
"additional":['LF']}

In [69]:
feat_sensor_names = get_sensor_names_from_features(feature_names);

for (fi,feature) in enumerate(feature_names):
    print("%3d) %s %s" % (fi,feat_sensor_names[fi].ljust(10),feature));
    pass;

  0) Acc        raw_acc:magnitude_stats:mean
  1) Acc        raw_acc:magnitude_stats:std
  2) Acc        raw_acc:magnitude_stats:moment3
  3) Acc        raw_acc:magnitude_stats:moment4
  4) Acc        raw_acc:magnitude_stats:percentile25
  5) Acc        raw_acc:magnitude_stats:percentile50
  6) Acc        raw_acc:magnitude_stats:percentile75
  7) Acc        raw_acc:magnitude_stats:value_entropy
  8) Acc        raw_acc:magnitude_stats:time_entropy
  9) Acc        raw_acc:magnitude_spectrum:log_energy_band0
 10) Acc        raw_acc:magnitude_spectrum:log_energy_band1
 11) Acc        raw_acc:magnitude_spectrum:log_energy_band2
 12) Acc        raw_acc:magnitude_spectrum:log_energy_band3
 13) Acc        raw_acc:magnitude_spectrum:log_energy_band4
 14) Acc        raw_acc:magnitude_spectrum:spectral_entropy
 15) Acc        raw_acc:magnitude_autocorrelation:period
 16) Acc        raw_acc:magnitude_autocorrelation:normalized_ac
 17) Acc        raw_acc:3d:mean_x
 18) Acc        raw_acc:3d:mean_y


## Data pipeline

In [70]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

class select_features_by_sensors(BaseEstimator, TransformerMixin):
    
    def __init__(self, sensors_to_use, feature_names):
        self.sensors_to_use = sensors_to_use
        self.feature_names = feature_names
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        fi = []
        for i, feature in enumerate(self.feature_names):
            if sensor_name_abbriviation[feature.split(':')[0]] in self.sensors_to_use:
                fi.append(i)
    
        return X[:, fi]
    
def select_target_label(y, target_label, label_names):
    for i, label_name in enumerate(label_names):
        if target_label == label_name:
            break
    return y[:, i]

def split_by_users(X, y, test_uuid, user_index):
    X_train, y_train = [], []
    X_test, y_test = [], []

    for i in range(60):
        if i in test_uuid:
            X_test.append(X[user_index[i]:user_index[i+1], :])
            y_test.append(y[user_index[i]:user_index[i+1]])
        else:
            X_train.append(X[user_index[i]:user_index[i+1], :])
            y_train.append(y[user_index[i]:user_index[i+1]])
    X_train = np.concatenate(X_train)
    y_train = np.concatenate(y_train)
    X_test = np.concatenate(X_test)
    y_test = np.concatenate(y_test)
    
    return X_train, y_train, X_test, y_test

In [71]:
def load_all_data(uuid_list):
    X, y = [], []
    user_index = [0]
    for i, uuid in enumerate(uuid_list):
        X_i,y_i,M,timestamps,feature_names,label_names = read_user_data(uuid)
        user_index.append(user_index[i]+X_i.shape[0])
        X.append(X_i)
        y.append(y_i)
        
    X = np.concatenate(X)
    y = np.concatenate(y)

    return X, y, user_index, feature_names, label_names

In [72]:
import pandas as pd
import numpy as np
df1=pd.read_csv('Combined labels.csv')

In [73]:
feature_importances=[]
myDict={}
columns=['raw_acc:magnitude_stats:mean', 'raw_acc:magnitude_stats:std', 'raw_acc:magnitude_stats:moment3', 'raw_acc:magnitude_stats:moment4', 'raw_acc:magnitude_stats:percentile25', 'raw_acc:magnitude_stats:percentile50', 'raw_acc:magnitude_stats:percentile75', 'raw_acc:magnitude_stats:value_entropy', 'raw_acc:magnitude_stats:time_entropy', 'raw_acc:magnitude_spectrum:log_energy_band0', 'raw_acc:magnitude_spectrum:log_energy_band1', 'raw_acc:magnitude_spectrum:log_energy_band2', 'raw_acc:magnitude_spectrum:log_energy_band3', 'raw_acc:magnitude_spectrum:log_energy_band4', 'raw_acc:magnitude_spectrum:spectral_entropy', 'raw_acc:magnitude_autocorrelation:period', 'raw_acc:magnitude_autocorrelation:normalized_ac', 'raw_acc:3d:mean_x', 'raw_acc:3d:mean_y', 'raw_acc:3d:mean_z', 'raw_acc:3d:std_x', 'raw_acc:3d:std_y', 'raw_acc:3d:std_z', 'raw_acc:3d:ro_xy', 'raw_acc:3d:ro_xz', 'raw_acc:3d:ro_yz', 'proc_gyro:magnitude_stats:mean', 'proc_gyro:magnitude_stats:std', 'proc_gyro:magnitude_stats:moment3', 'proc_gyro:magnitude_stats:moment4', 'proc_gyro:magnitude_stats:percentile25', 'proc_gyro:magnitude_stats:percentile50', 'proc_gyro:magnitude_stats:percentile75', 'proc_gyro:magnitude_stats:value_entropy', 'proc_gyro:magnitude_stats:time_entropy', 'proc_gyro:magnitude_spectrum:log_energy_band0', 'proc_gyro:magnitude_spectrum:log_energy_band1', 'proc_gyro:magnitude_spectrum:log_energy_band2', 'proc_gyro:magnitude_spectrum:log_energy_band3', 'proc_gyro:magnitude_spectrum:log_energy_band4', 'proc_gyro:magnitude_spectrum:spectral_entropy', 'proc_gyro:magnitude_autocorrelation:period', 'proc_gyro:magnitude_autocorrelation:normalized_ac', 'proc_gyro:3d:mean_x', 'proc_gyro:3d:mean_y', 'proc_gyro:3d:mean_z', 'proc_gyro:3d:std_x', 'proc_gyro:3d:std_y', 'proc_gyro:3d:std_z', 'proc_gyro:3d:ro_xy', 'proc_gyro:3d:ro_xz', 'proc_gyro:3d:ro_yz', 'raw_magnet:magnitude_stats:mean', 'raw_magnet:magnitude_stats:std', 'raw_magnet:magnitude_stats:moment3', 'raw_magnet:magnitude_stats:moment4', 'raw_magnet:magnitude_stats:percentile25', 'raw_magnet:magnitude_stats:percentile50', 'raw_magnet:magnitude_stats:percentile75', 'raw_magnet:magnitude_stats:value_entropy', 'raw_magnet:magnitude_stats:time_entropy', 'raw_magnet:magnitude_spectrum:log_energy_band0', 'raw_magnet:magnitude_spectrum:log_energy_band1', 'raw_magnet:magnitude_spectrum:log_energy_band2', 'raw_magnet:magnitude_spectrum:log_energy_band3', 'raw_magnet:magnitude_spectrum:log_energy_band4', 'raw_magnet:magnitude_spectrum:spectral_entropy', 'raw_magnet:magnitude_autocorrelation:period', 'raw_magnet:magnitude_autocorrelation:normalized_ac', 'raw_magnet:3d:mean_x', 'raw_magnet:3d:mean_y', 'raw_magnet:3d:mean_z', 'raw_magnet:3d:std_x', 'raw_magnet:3d:std_y', 'raw_magnet:3d:std_z', 'raw_magnet:3d:ro_xy', 'raw_magnet:3d:ro_xz', 'raw_magnet:3d:ro_yz', 'raw_magnet:avr_cosine_similarity_lag_range0', 'raw_magnet:avr_cosine_similarity_lag_range1', 'raw_magnet:avr_cosine_similarity_lag_range2', 'raw_magnet:avr_cosine_similarity_lag_range3', 'raw_magnet:avr_cosine_similarity_lag_range4', 'watch_acceleration:magnitude_stats:mean', 'watch_acceleration:magnitude_stats:std', 'watch_acceleration:magnitude_stats:moment3', 'watch_acceleration:magnitude_stats:moment4', 'watch_acceleration:magnitude_stats:percentile25', 'watch_acceleration:magnitude_stats:percentile50', 'watch_acceleration:magnitude_stats:percentile75', 'watch_acceleration:magnitude_stats:value_entropy', 'watch_acceleration:magnitude_stats:time_entropy', 'watch_acceleration:magnitude_spectrum:log_energy_band0', 'watch_acceleration:magnitude_spectrum:log_energy_band1', 'watch_acceleration:magnitude_spectrum:log_energy_band2', 'watch_acceleration:magnitude_spectrum:log_energy_band3', 'watch_acceleration:magnitude_spectrum:log_energy_band4', 'watch_acceleration:magnitude_spectrum:spectral_entropy', 'watch_acceleration:magnitude_autocorrelation:period', 'watch_acceleration:magnitude_autocorrelation:normalized_ac', 'watch_acceleration:3d:mean_x', 'watch_acceleration:3d:mean_y', 'watch_acceleration:3d:mean_z', 'watch_acceleration:3d:std_x', 'watch_acceleration:3d:std_y', 'watch_acceleration:3d:std_z', 'watch_acceleration:3d:ro_xy', 'watch_acceleration:3d:ro_xz', 'watch_acceleration:3d:ro_yz', 'watch_acceleration:spectrum:x_log_energy_band0', 'watch_acceleration:spectrum:x_log_energy_band1', 'watch_acceleration:spectrum:x_log_energy_band2', 'watch_acceleration:spectrum:x_log_energy_band3', 'watch_acceleration:spectrum:x_log_energy_band4', 'watch_acceleration:spectrum:y_log_energy_band0', 'watch_acceleration:spectrum:y_log_energy_band1', 'watch_acceleration:spectrum:y_log_energy_band2', 'watch_acceleration:spectrum:y_log_energy_band3', 'watch_acceleration:spectrum:y_log_energy_band4', 'watch_acceleration:spectrum:z_log_energy_band0', 'watch_acceleration:spectrum:z_log_energy_band1', 'watch_acceleration:spectrum:z_log_energy_band2', 'watch_acceleration:spectrum:z_log_energy_band3', 'watch_acceleration:spectrum:z_log_energy_band4', 'watch_acceleration:relative_directions:avr_cosine_similarity_lag_range0', 'watch_acceleration:relative_directions:avr_cosine_similarity_lag_range1', 'watch_acceleration:relative_directions:avr_cosine_similarity_lag_range2', 'watch_acceleration:relative_directions:avr_cosine_similarity_lag_range3', 'watch_acceleration:relative_directions:avr_cosine_similarity_lag_range4', 'watch_heading:mean_cos', 'watch_heading:std_cos', 'watch_heading:mom3_cos', 'watch_heading:mom4_cos', 'watch_heading:mean_sin', 'watch_heading:std_sin', 'watch_heading:mom3_sin', 'watch_heading:mom4_sin', 'watch_heading:entropy_8bins', 'location:num_valid_updates', 'location:log_latitude_range', 'location:log_longitude_range', 'location:min_altitude', 'location:max_altitude', 'location:min_speed', 'location:max_speed', 'location:best_horizontal_accuracy', 'location:best_vertical_accuracy', 'location:diameter', 'location:log_diameter', 'location_quick_features:std_lat', 'location_quick_features:std_long', 'location_quick_features:lat_change', 'location_quick_features:long_change', 'location_quick_features:mean_abs_lat_deriv', 'location_quick_features:mean_abs_long_deriv', 'audio_naive:mfcc0:mean', 'audio_naive:mfcc1:mean', 'audio_naive:mfcc2:mean', 'audio_naive:mfcc3:mean', 'audio_naive:mfcc4:mean', 'audio_naive:mfcc5:mean', 'audio_naive:mfcc6:mean', 'audio_naive:mfcc7:mean', 'audio_naive:mfcc8:mean', 'audio_naive:mfcc9:mean', 'audio_naive:mfcc10:mean', 'audio_naive:mfcc11:mean', 'audio_naive:mfcc12:mean', 'audio_naive:mfcc0:std', 'audio_naive:mfcc1:std', 'audio_naive:mfcc2:std', 'audio_naive:mfcc3:std', 'audio_naive:mfcc4:std', 'audio_naive:mfcc5:std', 'audio_naive:mfcc6:std', 'audio_naive:mfcc7:std', 'audio_naive:mfcc8:std', 'audio_naive:mfcc9:std', 'audio_naive:mfcc10:std', 'audio_naive:mfcc11:std', 'audio_naive:mfcc12:std', 'audio_properties:max_abs_value', 'audio_properties:normalization_multiplier', 'discrete:app_state:is_active', 'discrete:app_state:is_inactive', 'discrete:app_state:is_background', 'discrete:app_state:missing', 'discrete:battery_plugged:is_ac', 'discrete:battery_plugged:is_usb', 'discrete:battery_plugged:is_wireless', 'discrete:battery_plugged:missing', 'discrete:battery_state:is_unknown', 'discrete:battery_state:is_unplugged', 'discrete:battery_state:is_not_charging', 'discrete:battery_state:is_discharging', 'discrete:battery_state:is_charging', 'discrete:battery_state:is_full', 'discrete:battery_state:missing', 'discrete:on_the_phone:is_False', 'discrete:on_the_phone:is_True', 'discrete:on_the_phone:missing', 'discrete:ringer_mode:is_normal', 'discrete:ringer_mode:is_silent_no_vibrate', 'discrete:ringer_mode:is_silent_with_vibrate', 'discrete:ringer_mode:missing', 'discrete:wifi_status:is_not_reachable', 'discrete:wifi_status:is_reachable_via_wifi', 'discrete:wifi_status:is_reachable_via_wwan', 'discrete:wifi_status:missing', 'lf_measurements:light', 'lf_measurements:pressure', 'lf_measurements:proximity_cm', 'lf_measurements:proximity', 'lf_measurements:relative_humidity', 'lf_measurements:battery_level', 'lf_measurements:screen_brightness', 'lf_measurements:temperature_ambient', 'discrete:time_of_day:between0and6', 'discrete:time_of_day:between3and9', 'discrete:time_of_day:between6and12', 'discrete:time_of_day:between9and15', 'discrete:time_of_day:between12and18', 'discrete:time_of_day:between15and21', 'discrete:time_of_day:between18and24', 'discrete:time_of_day:between21and3', 'labels']
for i in range(51):
    row_list = df1.loc[i, :].values.flatten().tolist()
    for i in range(len(columns)):
        myDict[columns[i]]=row_list[i]
    feature_importances.append(myDict)
    myDict={}

In [75]:
feature_groups={"accelerometer":26,
"gyroscopic":26,
"magnetometer":31,
"watch_accelerometer":46,
"watch_compass":9,
"location":17,
"audio_magnitude":28,
"phone_state":34,
"additional":8,
}

In [76]:
group_importance={}
label_importance={}
list2=[]
for i in range(51):
    x=feature_importances[i]
    list1=[]
    for k,v in x.items():
        list1.append(v)
    count=0
    for a,b in feature_groups.items():
        sum1=0
        for j in range(count,count+b):
            sum1+=list1[j]
            count+=1
        group_importance[a]=sum1
    label_importance[list1[-1]]=dict(group_importance)
    list2.append(dict(label_importance))
print(len(list2))

51


In [77]:
x=list2[-1]
from collections import Counter
count=0
dict2={}
for k,v in x.items():
    print(k)
    print(v)
    print("")
    m = Counter(v)
    high = m.most_common(3)
    list1=[]
    for i in high:
        print(i[0]," :",i[1]," ")
        list1.append(i[0])
    dict2[k]=list1
    print("*************************************************************************************************************************")
    count+=1
print(dict2)


label:LYING_DOWN
{'accelerometer': 0.09379395600000001, 'gyroscopic': 0.040052418, 'magnetometer': 0.09718521799999998, 'watch_accelerometer': 0.09375775299999999, 'watch_compass': 0.0067001249999999995, 'location': 0.041370646999999997, 'audio_magnitude': 0.12989850200000003, 'phone_state': 0.43092785400000005, 'additional': 0.066313677}

phone_state  : 0.43092785400000005  
audio_magnitude  : 0.12989850200000003  
magnetometer  : 0.09718521799999998  
*************************************************************************************************************************
label:SITTING
{'accelerometer': 0.12620790299999998, 'gyroscopic': 0.119565303, 'magnetometer': 0.129539885, 'watch_accelerometer': 0.13023277600000002, 'watch_compass': 0.01356418, 'location': 0.064194493, 'audio_magnitude': 0.13502479200000003, 'phone_state': 0.215925041, 'additional': 0.065745755}

phone_state  : 0.215925041  
audio_magnitude  : 0.13502479200000003  
watch_accelerometer  : 0.13023277600000002  
**

In [78]:
list2=[]
for k,v in dict2.items():
    list1=[]
    for m in v:
        for i,j in feature_translation.items():
            if(m==i):
                for f in j:
                    list1.append(f)
    dict2[k]=list1
print(dict2)


{'label:LYING_DOWN': ['PS', 'Aud', 'AP', 'Magnet'], 'label:SITTING': ['PS', 'Aud', 'AP', 'WAcc'], 'label:FIX_walking': ['Gyro', 'Acc', 'Magnet'], 'label:FIX_running': ['Magnet', 'Loc', 'Aud', 'AP'], 'label:BICYCLING': ['Magnet', 'Loc', 'Acc'], 'label:SLEEPING': ['PS', 'Magnet', 'Acc'], 'label:LAB_WORK': ['Magnet', 'Loc', 'Aud', 'AP'], 'label:IN_CLASS': ['Magnet', 'Acc', 'Loc'], 'label:IN_A_MEETING': ['Magnet', 'LF', 'Aud', 'AP'], 'label:LOC_main_workplace': ['Magnet', 'Loc', 'LF'], 'label:OR_indoors': ['Acc', 'Magnet', 'Aud', 'AP'], 'label:OR_outside': ['Magnet', 'Acc', 'Aud', 'AP'], 'label:IN_A_CAR': ['Magnet', 'Acc', 'Loc'], 'label:ON_A_BUS': ['Magnet', 'Aud', 'AP', 'Loc'], 'label:DRIVE_-_I_M_THE_DRIVER': ['Magnet', 'Loc', 'Acc'], 'label:DRIVE_-_I_M_A_PASSENGER': ['Magnet', 'Acc', 'WAcc'], 'label:LOC_home': ['Loc', 'Aud', 'AP', 'Magnet'], " 'label:FIX_restaurant'": ['Magnet', 'LF', 'Aud', 'AP'], " 'label:PHONE_IN_POCKET'": ['Acc', 'Magnet', 'Aud', 'AP'], " 'label:OR_exercise'": ['Mag

In [82]:
from sklearn.pipeline import Pipeline
for k,v in dict2.items():
    parameters = {
        'sensors_to_use': v,
        'target_label': str(k),
        'test_uuid': list(range(56, 61)),
        'Imputation_strategy': 'mean'
    }

    pipe_line = Pipeline([
        ('feature_selector', select_features_by_sensors(parameters['sensors_to_use'], feature_names)),
        ('imputer', SimpleImputer(strategy=parameters['Imputation_strategy'])),
        ('std_scaler', StandardScaler())
    ])

    sensor_name_abbriviation = {
        'raw_acc': 'Acc',
        'proc_gyro': 'Gyro', 
        'raw_magnet': 'Magnet',
        'watch_acceleration': 'WAcc',
        'watch_heading': 'Compass',
        'location': 'Loc',
        'location_quick_features': 'Loc',
        'audio_naive': 'Aud',
        'audio_properties': 'AP',
        'discrete': 'PS', 
        'lf_measurements': 'LF'
    }

    X, y, user_index, feature_names, lable_names = load_all_data(uuid_list)
    y_new = select_target_label(y, parameters['target_label'], label_names)
    X_new = pipe_line.fit_transform(X, y_new)
    print(k)
    print("Features used:",v)
    print(X_new.shape, y_new.shape)
    X_train, y_train, X_test, y_test = split_by_users(X_new, y_new, parameters['test_uuid'], user_index)
    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
    import sklearn.linear_model as linear_model
    import sklearn.ensemble as ensemble
    #lr_clf = linear_model.LogisticRegression(class_weight='balanced')
    #lr_clf.fit(X_train, y_train)
    rf_clf = ensemble.RandomForestClassifier(class_weight='balanced')
    rf_clf.fit(X_train, y_train)
    #print('Logistic Regression: accuracy on training data %f' %lr_clf.score(X_train, y_train))
    #print('Logistic Regression: accuracy on test data %f' %lr_clf.score(X_test, y_test))
    print('Random Forest: accuracy on training data %f' %rf_clf.score(X_train, y_train))
    print('Random Forest: accuracy on test data %f' %rf_clf.score(X_test, y_test))
    list1.append(k)
    list1.append(v)
    list1.append(rf_clf.score(X_train, y_train))
    list1.append(rf_clf.score(X_test, y_test))
    print("*************************************************************************************************************************")

label:LYING_DOWN
Features used: ['PS', 'Aud', 'AP', 'Magnet']
(377346, 93) (377346,)
(363136, 93) (14210, 93) (363136,) (14210,)
Random Forest: accuracy on training data 0.999882
Random Forest: accuracy on test data 0.945250
*************************************************************************************************************************
label:SITTING
Features used: ['PS', 'Aud', 'AP', 'WAcc']
(377346, 108) (377346,)
(363136, 108) (14210, 108) (363136,) (14210,)
Random Forest: accuracy on training data 0.999725
Random Forest: accuracy on test data 0.944335
*************************************************************************************************************************
label:FIX_walking
Features used: ['Gyro', 'Acc', 'Magnet']
(377346, 83) (377346,)
(363136, 83) (14210, 83) (363136,) (14210,)
Random Forest: accuracy on training data 0.999992
Random Forest: accuracy on test data 0.944968
**************************************************************************************

label:BATHING_-_SHOWER
Features used: ['Magnet', 'LF', 'Aud', 'AP']
(377346, 67) (377346,)
(363136, 67) (14210, 67) (363136,) (14210,)
Random Forest: accuracy on training data 0.999920
Random Forest: accuracy on test data 0.945320
*************************************************************************************************************************
label:CLEANING
Features used: ['Magnet', 'Acc', 'Gyro']
(377346, 83) (377346,)
(363136, 83) (14210, 83) (363136,) (14210,)
Random Forest: accuracy on training data 0.999986
Random Forest: accuracy on test data 0.944828
*************************************************************************************************************************
label:DOING_LAUNDRY
Features used: ['Magnet', 'Gyro', 'Acc']
(377346, 83) (377346,)
(363136, 83) (14210, 83) (363136,) (14210,)
Random Forest: accuracy on training data 0.999989
Random Forest: accuracy on test data 0.944968
**********************************************************************************

label:PHONE_ON_TABLE
Features used: ['Acc', 'Magnet', 'Aud', 'AP']
(377346, 85) (377346,)
(363136, 85) (14210, 85) (363136,) (14210,)
Random Forest: accuracy on training data 0.999989
Random Forest: accuracy on test data 0.945320
*************************************************************************************************************************
label:WITH_CO-WORKERS
Features used: ['Magnet', 'Acc', 'LF']
(377346, 65) (377346,)
(363136, 65) (14210, 65) (363136,) (14210,)
Random Forest: accuracy on training data 0.999994
Random Forest: accuracy on test data 0.945320
*************************************************************************************************************************
label:WITH_FRIENDS
Features used: ['Magnet', 'Acc', 'Aud', 'AP']
(377346, 85) (377346,)
(363136, 85) (14210, 85) (363136,) (14210,)
Random Forest: accuracy on training data 0.999994
Random Forest: accuracy on test data 0.945320
**************************************************************************

SyntaxError: invalid syntax (2558433233.py, line 1)