In [None]:
!git clone https://github.com/mitll/MIT-Matrix-Data.git

In [None]:
#Import modules
import pandas as pd
from collections import defaultdict
import string
import datetime
from datetime import timedelta
import os
from tqdm.notebook import tqdm
import time

# --- Input location and timestamp layout ---
# Root of the cloned data repository; it is walked recursively for .txt logs.
directory = 'MIT-Matrix-Data'
# Not referenced by the rest of the visible notebook.
grandparent = 'LoggedMatrixData'
# strptime layout of the (normalised) timestamp found in the first CSV field of each log line.
datetimeFormat = '%Y-%m-%d_%H:%M:%S.%f'

# Hyperparameter for the size of each "bucket": elapsed seconds are
# floor-divided by this value to obtain a bucket id.
TIME_INTERVAL = 1

# --- Accumulators filled while parsing ---
# d[distance][bucket_id][feature_name] -> list of rows (each row is a list of strings)
d = defaultdict(dict)
# Number of accepted rows per feature name.
counts = {}
# Paths of every .txt file found under `directory`.
filenames = []
# Bluetooth rows are kept only when their device-name field is listed here.
ALLOW_LIST = ["BlueProxTx"]

# --- Scratch variables (kept for compatibility with the rest of the notebook) ---
r = False
total = 0
start = ""
cur = ""
bucket = 0
# Gather every .txt file below `directory` (os.walk recurses into sub-folders).
# The paths are collected into a list first so that tqdm can show progress
# against a known total when the files are parsed.
for root, dirs, files in os.walk(directory):
    filenames.extend(
        os.path.join(root, fn) for fn in files if fn.endswith(".txt")
    )

# Parse every log file into d[dist][bucket_id][feature_name] -> list of rows.
for fn in tqdm(filenames):
    # Experiment metadata is taken from the underscore-separated pieces of the
    # full path (directory components included).
    exp = fn.split('_')
    exp1 = exp[0]  # not used by the visible cells; kept for compatibility
    exp2 = exp[1]  # not used by the visible cells; kept for compatibility
    # Distance label: the fourth piece with its last two characters dropped
    # (presumably a unit suffix -- TODO confirm against the file naming scheme).
    dist = exp[3][:-2]
    d.setdefault(dist, {})

    # The first timestamp of each file is the origin for that file's bucket ids.
    start = None
    start_dt = None
    # `with` guarantees the handle is closed even if a malformed line raises.
    with open(fn) as f:
        for line in f:
            stripped = line.strip()
            # Blank lines carry no timestamp and would make strptime fail.
            if not stripped:
                continue
            line_split = stripped.split(',')
            # Normalise the timestamp to `datetimeFormat`: lower-case it, turn the
            # 't' / space separator into '_' and drop a trailing 'z'.
            tstamp = line_split[0].lower()
            tstamp = tstamp.replace('t', '_').replace('z', '').replace(' ', '_')
            tstamp_dt = datetime.datetime.strptime(tstamp, datetimeFormat)
            # If start is None, this is the first data line of the file.
            if start is None:
                start = tstamp
                start_dt = tstamp_dt  # parsed once instead of on every line
            # This difference determines the bucket that the line goes in.
            diff = tstamp_dt - start_dt
            # NOTE(review): timedelta.seconds is only the seconds component
            # (0..86399); it wraps for gaps of a day or more and for timestamps
            # earlier than the first one. Kept as-is so bucket ids stay
            # non-negative; switch to total_seconds() if logs can span a day.
            bucket_id = diff.seconds // TIME_INTERVAL
            # The bucket is created before the Bluetooth filter below, so a bucket
            # exists (possibly empty) for every time slot that had any line.
            bucket_features = d[dist].setdefault(bucket_id, {})
            feature_name = line_split[1]
            # Bluetooth rows are kept only for allow-listed device names (field 4).
            if feature_name == "Bluetooth" and line_split[4] not in ALLOW_LIST:
                continue
            # Taking some statistics on the counts of features.
            counts[feature_name] = counts.get(feature_name, 0) + 1
            # Store the row without its timestamp; index 0 is the feature name.
            bucket_features.setdefault(feature_name, []).append(line_split[1:])

In [None]:
#last_found_value
#Take in the bucket_dictionary and the feature name and the bucket id
def last_found_value(bucket_dictionary, feature_name, bucket_id):
    """Return the rows of `feature_name` from the bucket nearest to `bucket_id`.

    Despite the name, the search looks in both directions: every bucket that
    contains `feature_name` is a candidate and the one with the smallest
    absolute id distance wins. On a tie, the candidate encountered first while
    iterating `bucket_dictionary` is kept.

    Parameters:
        bucket_dictionary: mapping of bucket id -> {feature name -> rows}.
        feature_name: the feature to look up.
        bucket_id: the id of the bucket that is missing the feature.

    Returns:
        The stored rows for the feature, or None (after printing an error)
        when no bucket contains it.
    """
    # None means "nothing found yet". A numeric sentinel must not be used here:
    # it would take part in the distance comparison and reject real candidates
    # that are farther away than the sentinel itself.
    best = None
    for new_bucket_id, features in bucket_dictionary.items():
        if feature_name not in features:
            continue
        if best is None or abs(new_bucket_id - bucket_id) < abs(best - bucket_id):
            best = new_bucket_id
    #This feature is not in any bucket
    if best is None:
        print("ERROR: KEY ", feature_name, "IS MISSING")
        return None
    return bucket_dictionary[best][feature_name]

In [None]:
#The Normal mean for a column
def mean(feature_values, col_index):
    """Average column `col_index` over every row of `feature_values`.

    Each cell is converted with float() before being accumulated.
    Raises ZeroDivisionError when `feature_values` has no rows.
    """
    running = 0
    for row in feature_values:
        running += float(row[col_index])
    return running / len(feature_values) * 1.0
#The specialized mean for a column of RSSI measurements
ALLOW_LIST = ["BlueProxTx"]
def bluetooth_mean(feature_values, col_index):
    """Average column `col_index` over the rows whose next column is allow-listed.

    A row contributes only when the cell at `col_index + 1` is in ALLOW_LIST;
    all other rows are ignored. When no row qualifies, an error is printed
    and None is returned instead of dividing by zero.
    """
    accepted = [
        float(row[col_index])
        for row in feature_values
        if row[col_index + 1] in ALLOW_LIST
    ]
    # Prevents a divide by zero issue
    if not accepted:
        print("ERROR: Empty List of Valid Bluetooth")
        return None
    running = 0
    for value in accepted:
        running += value
    return running / len(accepted) * 1.0
def UUID(feature_values, col_index):
    """Return the `col_index` cell of the first allow-listed row.

    A row is allow-listed when its cell at `col_index + 2` is in ALLOW_LIST.
    Rows are examined in order; None is returned when no row qualifies.
    """
    return next(
        (row[col_index] for row in feature_values if row[col_index + 2] in ALLOW_LIST),
        None,
    )

In [None]:
# Aggregation rule for each column of each feature row.
# Index 0 of every list corresponds to the feature-name cell of a row and is
# never aggregated; the remaining entries name the rule applied to that column:
#   'Mean'          -> plain average of the column
#   'BluetoothMean' -> average restricted to allow-listed devices
#   'UUID'          -> value taken from the first allow-listed row
#   'Device_Name'   -> the allow-listed device name itself
#   'None'          -> the value from the first row, passed through unchanged
# The key order also fixes the column order of the generated CSV.
feature_functions = {
    'beacon_subject': ['None'] * 3,
    'Range':          ['None', 'Mean'],
    'Angle':          ['None', 'Mean'],
    'Bluetooth':      ['None', 'UUID', 'BluetoothMean', 'Device_Name', 'Mean', 'Mean'],
    'Heading':        ['None'] + ['Mean'] * 6,
    'Accelerometer':  ['None'] + ['Mean'] * 3,
    'Gyroscope':      ['None'] + ['Mean'] * 3,
    'Attitude':       ['None'] + ['Mean'] * 3,
    'Gravity':        ['None'] + ['Mean'] * 3,
}

#A function that calls the appropriate function for a column of a feature
def call_appropriate_function(feature_values, feature_name, column_head):
    """Aggregate every data column of one feature into a ", "-terminated string.

    Parameters:
        feature_values: list of rows for the feature (index 0 of each row is
            the feature name). Anything that is not a list yields an empty
            result.
        feature_name: key into `feature_functions` selecting the per-column rules.
        column_head: passed through and returned unchanged.

    Returns:
        (text, column_head) where text holds one value per column from index 1
        up to the width of the first row, each followed by ", ".
    """
    # Non-list input (e.g. None from a failed lookup) produces no columns.
    if type(feature_values) is not list:
        return "", column_head

    def render(col_index):
        # Pick the aggregation named for this column of this feature.
        rule = feature_functions[feature_name][col_index]
        if rule == 'Mean':
            return str(mean(feature_values, col_index))
        if rule == 'UUID':
            return str(UUID(feature_values, col_index))
        if rule == 'Device_Name':
            return ALLOW_LIST[0]
        if rule == 'BluetoothMean':
            return str(bluetooth_mean(feature_values, col_index))
        # Any other rule: pass the first row's cell through (unwrapping a nested list).
        cell = feature_values[0][col_index]
        return cell[0] if type(cell) is list else cell

    pieces = []
    # The number of columns is taken from the first row.
    for col_index in range(1, len(feature_values[0])):
        text = render(col_index)
        pieces.append(text + ", ")
        #Make sure that no columns are empty
        if not len(text):
            print("ERROR: ", feature_name, " COLUMN: ", col_index, "IS EMPTY")
    return "".join(pieces), column_head
        
            

In [None]:
#proccess_bucket
#Take in the bucket and the bucket_dictionary and the bucket ID
# For each feature, the set of field counts observed across all processed buckets.
ranges = defaultdict(set)
def proccess_bucket(bucket, bucket_dictionary, bucket_id):
    """Build the CSV fragment for one bucket by aggregating every known feature.

    Features are visited in the order of `feature_functions`. When a feature is
    absent from `bucket`, its rows are borrowed from the nearest bucket in
    `bucket_dictionary` via `last_found_value`.

    Parameters:
        bucket: mapping of feature name -> rows for this time slot.
        bucket_dictionary: mapping of bucket id -> bucket for the same
            distance; used as the fallback search space.
        bucket_id: id of `bucket` inside `bucket_dictionary`.

    Returns:
        The concatenated aggregated values of all features (each value
        followed by ", ").

    Side effects:
        Records the field count of every feature fragment in the module-level
        `ranges`.
    """
    output = ""
    column_head = defaultdict(int)
    #Go through each feature
    for feature_name in feature_functions.keys():
        if feature_name in bucket:
            feature_values = bucket[feature_name]
        else:
            #The feature is not in the bucket, so find values by calling last_found_value.
            #Search the dictionary that was passed in rather than notebook globals,
            #so the result does not depend on a loop variable of another cell.
            feature_values = last_found_value(bucket_dictionary, feature_name, bucket_id)
        #Call the appropriate method on the data with call_appropriate_function
        addition, column_head = call_appropriate_function(feature_values, feature_name, column_head)
        field_count = len(addition.split(","))
        column_head[feature_name] = field_count
        ranges[feature_name].add(field_count)
        #Add the addition to the general output string (a no-op when it is empty)
        output += addition
    #Return the general output string
    return output


In [None]:
# Header: one "<feature>-<i>" column for every aggregated column of every
# feature (index 0, the feature name itself, is skipped), with underscores in
# feature names replaced by hyphens, followed by the bucket id and distance.
column_head = "".join(
    feature_name.replace("_", "-") + "-" + str(i) + ","
    for feature_name, feature_function in feature_functions.items()
    for i in range(1, len(feature_function))
)
column_head += "Bucket-Id,Distance"
#Open a main CSV for all files to be put in. The context manager closes the
#file even if processing a bucket raises part-way through.
with open("Logged-Matrix-Data" + str(TIME_INTERVAL) + ".csv", "w") as f:
    f.write(column_head + "\n")
    #Go through each of the different distances in the dictionary
    for distance, buckets in tqdm(d.items()):
        #Go through each of the buckets in the distances
        for bucket_id, bucket in buckets.items():
            #Process the bucket, then append its id and the distance label
            result = proccess_bucket(bucket, buckets, bucket_id) + str(bucket_id) + ", " + str(distance)
            #Write the processed row
            f.write(result + "\n")