In [None]:
########################################################################
## This file builds the dissimilarity matrices from one day of data
## Required files: 'locations.csv' and 'associations_times.csv'
## Produces files:
##                - Average Building Time Apart Matrix: buildings_dissimilarity_mat.csv 
##                - Building Labels: building_names.csv
##                - Average Building Time Apart Scaled Matrix: scaled_buildings_dissimilarity_mat.csv 
##                - Subset of Average Access Point Time Apart Matrix: test_dissimilarity_mat.csv
##                - Access Point Labels: test_ids.csv 
########################################################################

In [7]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import csv
import sys
from numpy import inf

In [8]:
# Load the access point location table into locations_df.
# Schema: installation_id, name, latitude, longitude, accuracy
# Later cells read only the 'installation_id' and 'name' columns.
locations_df = pd.read_csv('locations.csv')
#locations_df

In [9]:
# Load the pairwise association times generated by the preprocessing notebook.
# Schema: (first column is to be ignored - mistake in DataPreprocessing),
#         ap_1, ap_2, total_time, frequency
associations_times_df = pd.read_csv('associations_times.csv')
#associations_times_df

## DISSIMILARITY MATRIX B (each building to building connection)
Uses all of the data.  
Generates the files for the building dissimilarity matrix and its building labels.

In [12]:
# Generate lookup tables so that building the dissimilarity matrix is fast.
#
# A building receives its index the first time its name is seen, so every
# building (including the last one in the file, and buildings whose access
# points are not listed contiguously) gets exactly one index and one label.

map_id_names = {}      # installation_id -> building name
building_ids = {}      # building name -> index between 0 and n_buildings - 1
building_indices = {}  # index -> building name
building_names = []    # building names ordered by index (matrix labels)

# iterate over locations dataframe
for _, row in locations_df.iterrows():
    # the building name is the part of the access point name before the first '-'
    curr_name = row['name'].split('-')[0]
    curr_id = row['installation_id']

    # first time this building is seen: register it under the next free index
    if curr_name not in building_ids:
        new_index = len(building_names)
        building_ids[curr_name] = new_index
        building_indices[new_index] = curr_name
        building_names.append(curr_name)

    # maintain map of all access point ids to respective buildings
    # (the first building seen for an id wins)
    if curr_id not in map_id_names:
        map_id_names[curr_id] = curr_name

# n_buildings is number of unique buildings that have sensors
n_buildings = len(building_names)

In [225]:
# Allocate the zero-filled n_buildings x n_buildings work matrices:
#   ds_buildings_sums - summed total_time per building pair
#   ds_buildings_freq - summed frequency per building pair
#   ds_buildings_mat  - the dissimilarity matrix computed from the two above
ds_buildings_sums, ds_buildings_freq, ds_buildings_mat = (
    np.zeros((n_buildings, n_buildings)) for _ in range(3)
)

In [226]:
# Accumulate total time and frequency for every pair of distinct buildings.
# Each association row links two access points; both are translated to their
# building index and the row is added to the upper-triangle cell
# (smaller index, larger index).
for _, assoc in associations_times_df.iterrows():
    ap_a, ap_b = assoc['ap_1'], assoc['ap_2']
    # skip rows that mention an access point without a known building
    if ap_a not in map_id_names or ap_b not in map_id_names:
        continue
    bldg_a, bldg_b = map_id_names[ap_a], map_id_names[ap_b]
    # connections within the same building are not counted
    if bldg_a == bldg_b:
        continue
    idx_a, idx_b = building_ids[bldg_a], building_ids[bldg_b]
    if idx_a == idx_b:
        continue
    lo, hi = (idx_a, idx_b) if idx_a < idx_b else (idx_b, idx_a)
    ds_buildings_sums[lo][hi] += assoc['total_time']
    ds_buildings_freq[lo][hi] += assoc['frequency']

In [237]:
# Compute the dissimilarity matrix: average time = summed time / frequency
# for every building pair (upper triangle) with a non-zero frequency.
# Pairs with zero frequency are left untouched.
pair_rows, pair_cols = np.nonzero(np.triu(ds_buildings_freq != 0, k=1))
pair_avg = ds_buildings_sums[pair_rows, pair_cols] / ds_buildings_freq[pair_rows, pair_cols]
# write the value into both triangles so the matrix is symmetric
ds_buildings_mat[pair_rows, pair_cols] = pair_avg
ds_buildings_mat[pair_cols, pair_rows] = pair_avg

In [251]:
# Replace every NaN entry of the building dissimilarity matrix with 0
nan_cells = np.isnan(ds_buildings_mat)
ds_buildings_mat[nan_cells] = 0

In [301]:
# Sanity check: print the row and column index of every upper-triangle entry
# that is not >= 0 (i.e. negative or NaN). No output means no such entry.
invalid_cells = np.triu(~(ds_buildings_mat >= 0), k=1)
for bad_row, bad_col in zip(*np.nonzero(invalid_cells)):
    print(bad_row, bad_col)

In [14]:
# Save the building names as text, one per line, in the order of the
# building_names list (the labels for the building dissimilarity matrices).
np.savetxt("building_names.csv", building_names, delimiter=',', fmt='%s')

In [265]:
#building_names

In [254]:
# Write the building dissimilarity matrix to a comma-separated file.
# NOTE: fmt='%d' writes every value as an integer, so the fractional part
# of the average times is not kept in the file.
np.savetxt('buildings_dissimilarity_mat.csv', ds_buildings_mat, delimiter=',', fmt='%d')

## DISSIMILARITY MATRIX B_1 (each building to building connection)
Try a tf-idf style adaptation, because some building connections are much more frequent than others.  
Generates: scaled_buildings_dissimilarity_mat.csv
  
    tf = term frequency (number of times the connection between building a, b appears)
    idf = inverse document frequency = log(N/df)
    N = total number of connections between all buildings
    df = document frequency (sum of freq of union of connections that building a and building b each have)


In [276]:
# Set up the tf-idf style scaling of the building connections.
buildings_TF_IDF = np.zeros((n_buildings,n_buildings))

# N: total number of connections between all buildings
N = ds_buildings_freq.sum()

# buildings_DF[b]: summed frequency of every connection that involves building b.
# ds_buildings_freq is only filled above the diagonal (row index < column index),
# so a building's connections are split between its row (partners with a higher
# index) and its column (partners with a lower index); both must be added.
# The diagonal is never filled, so nothing is counted twice.
buildings_DF = ds_buildings_freq.sum(axis=0) + ds_buildings_freq.sum(axis=1)

In [298]:
# Calculate the tf-idf weight of every building pair (upper triangle only):
#   tf  = frequency of the connection between building i and building j
#   df  = buildings_DF[i] + buildings_DF[j] - tf
#   weight = tf * log(N / df)
for i in range(n_buildings):
    for j in range(i+1,n_buildings):
        pair_tf = ds_buildings_freq[i][j]
        pair_df = buildings_DF[i] + buildings_DF[j] - pair_tf
        # The weight is only defined for pairs that actually have connections
        # and a positive document frequency; all other pairs keep weight 0.
        # This avoids dividing by zero and producing NaN / inf values.
        if pair_tf != 0 and pair_df > 0:
            buildings_TF_IDF[i][j] = pair_tf * np.log(N / pair_df)
# safety net: any non-finite weight (NaN, +inf, -inf) is reset to 0
buildings_TF_IDF[~np.isfinite(buildings_TF_IDF)] = 0

  """
  """


In [299]:
# Scale the average time of each building pair by its tf-idf weight and store
# the product in both triangles (symmetric matrix); the diagonal stays 0.
scaled_buildings_ds_mat = np.zeros((n_buildings, n_buildings))
upper_rows, upper_cols = np.triu_indices(n_buildings, k=1)
scaled_pairs = buildings_TF_IDF[upper_rows, upper_cols] * ds_buildings_mat[upper_rows, upper_cols]
scaled_buildings_ds_mat[upper_rows, upper_cols] = scaled_pairs
scaled_buildings_ds_mat[upper_cols, upper_rows] = scaled_pairs


In [300]:
# Replace every NaN entry of the scaled building dissimilarity matrix with 0
scaled_nan_cells = np.isnan(scaled_buildings_ds_mat)
scaled_buildings_ds_mat[scaled_nan_cells] = 0

In [305]:
# Write the scaled building dissimilarity matrix to a comma-separated file.
# NOTE: fmt='%d' writes every value as an integer, so the fractional part
# of the scaled values is not kept in the file.
np.savetxt('scaled_buildings_dissimilarity_mat.csv', scaled_buildings_ds_mat, delimiter=',', fmt='%d')

## DISSIMILARITY MATRIX A (each access point to access point connection)
Generates: the test dissimilarity matrix and the labels for its access points,
including only access points that have more than 1000 connections to other access points (the threshold used in the code below).

In [224]:
# Build the two lookup tables used when filling the access point matrices:
#   indices: row index of locations_df -> installation_id
#   ids:     installation_id -> row index of locations_df
# If an installation_id occurs more than once, its last row index is kept.
indices = {row_index: ap_id for row_index, ap_id in locations_df['installation_id'].items()}
ids = {ap_id: row_index for row_index, ap_id in indices.items()}

In [38]:
# Allocate the zero-filled n x n work matrices, with one row and one column
# per row of locations_df:
#   ds_sums - summed total_time per access point pair
#   ds_freq - summed frequency per access point pair
#   ds_mat  - average time per access point pair
n = len(locations_df)
ds_sums, ds_freq, ds_mat = (np.zeros((n, n)) for _ in range(3))

In [39]:
# Accumulate total time and frequency for every pair of distinct access points.
# Values are stored in the upper-triangle cell (smaller index, larger index).
for _, assoc in associations_times_df.iterrows():
    id1, id2 = assoc['ap_1'], assoc['ap_2']
    # skip rows that mention an access point missing from locations_df
    if id1 not in ids or id2 not in ids:
        continue
    first, second = ids[id1], ids[id2]
    # an access point paired with itself is not counted
    if first == second:
        continue
    lo, hi = (first, second) if first < second else (second, first)
    ds_sums[lo][hi] += assoc['total_time']
    ds_freq[lo][hi] += assoc['frequency']

In [44]:
# Check what fraction of the transitions between sensors exist in the test data:
# number of non-zero cells of ds_sums divided by n*n - n
cnt = int(np.count_nonzero(ds_sums))
print(cnt/(n*n - n))

3916113


In [46]:
# Compute the average time (summed time / frequency) for every access point
# pair in the upper triangle that has a non-zero frequency; all other cells
# of ds_mat are left unchanged.
has_connections = np.triu(ds_freq != 0, k=1)
ds_mat[has_connections] = ds_sums[has_connections] / ds_freq[has_connections]
        

In [51]:
# Check the summed connection frequency of each access point in the test data.
# NOTE(review): ds_freq is only filled above the diagonal (row index < column
# index), so this column sum only includes pairs in which the access point is
# the higher-indexed one - confirm whether adding ds_freq.sum(axis=1) was intended.
index_freq = ds_freq.sum(axis=0)
index_freq

array([0., 0., 0., ..., 0., 0., 0.])

In [150]:
# Count, per access point (column of ds_mat), the number of non-zero average times,
# used as the number of connections that access point has to other access points.
# NOTE(review): ds_mat is only filled above the diagonal, so each column only
# counts partners with a lower index - confirm whether the row counts
# (axis=1) should be added before applying the threshold below.
connections = np.count_nonzero(ds_mat, axis=0)

In [171]:
# Number of access points whose connection count is above the threshold (1000)
n_connected = int(np.count_nonzero(connections > 1000))

9009 942


In [57]:
# PREVIOUS methods for trimming dataset down to most common connections
#functioning_ids = {}
#count = 0
#for i in range(n):
#    if index_freq[i] != 0:
#        functioning_ids[count] = indices[i]
#        count += 1

#accurate_test = 0
#for i in range(count):
#    if index_freq[i] > 1500000:
#        accurate_test += 1
#accurate_test

In [185]:
# Collect the installation ids of the access points with more than 1000
# connections into test_ids, in increasing row-index order.
test_ids = np.zeros((n_connected,))
selected_rows = np.nonzero(connections > 1000)[0]
for slot, ap_row in enumerate(selected_rows):
    ap_row = int(ap_row)
    # every selected row index is expected to be present in the index map
    if ap_row not in indices:
        print('error')
    test_ids[slot] = indices[ap_row]

# array --> dataframe --> csv file
test_ids_df = pd.DataFrame(test_ids)
test_ids_df.to_csv("test_ids.csv", index=False)

In [227]:
# Build the symmetric matrix of average times between the selected access
# points: cell (i, j) holds ds_mat for the pair (test_ids[i], test_ids[j]).
# The loops cover every pair i < j up to and including the last selected
# access point, so its row and column are filled as well.
test_ds_mat = np.zeros((n_connected,n_connected))
for i in range(n_connected):
    temp_i1 = ids[test_ids[i]]
    for j in range(i+1,n_connected):
        # a 0 entry means the slot of test_ids was never filled
        if test_ids[j] == 0:
            print('error')
            break
        temp_i2 = ids[test_ids[j]]
        # mirror the value so the matrix is symmetric
        test_ds_mat[i][j] = test_ds_mat[j][i] = ds_mat[temp_i1][temp_i2]

# matrix to csv file (fmt='%d' writes the values as integers)
np.savetxt('test_dissimilarity_mat.csv', test_ds_mat, delimiter=',', fmt='%d')

In [190]:
# Check the size of the test set produced: the number of selected access
# point ids and the shape of the test dissimilarity matrix.
print(len(test_ids))
print(np.shape(test_ds_mat))

(942, 942)