In [2]:
import os, datetime
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from chissl import chissl_mongo as cm
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Download location: http://vacommunity.org/dl428
# The dataset directories can be overridden with the VAST_MC2_PATH and
# VAST_EXTRAS_PATH environment variables so the notebook is runnable on other
# machines; when they are unset the original hard-coded locations are used.
VAST_PATH = os.environ.get(
    'VAST_MC2_PATH',
    '/Users/aren438/data/VAST Challenge 2014/VASTChal2014MC2-20140430')
EXTRAS_PATH = os.environ.get(
    'VAST_EXTRAS_PATH',
    '/Users/aren438/data/VAST Challenge 2014/extras')

In [4]:
# Load the raw GPS log and index it by time.
#
# The Timestamp column is parsed with a single vectorised pd.to_datetime call
# and an explicit format instead of read_csv's `date_parser` with a per-value
# strptime lambda: `date_parser` is deprecated in pandas 2.x and the lambda is
# much slower on a large file. A value that does not match the format still
# raises, as strptime did.
GPS_TIMESTAMP_FORMAT = '%m/%d/%Y %H:%M:%S'

gps_raw = pd.read_csv(os.path.join(VAST_PATH, 'gps.csv'))

# kind='mergesort' makes the sort stable, so rows that share a timestamp
# (the file contains such rows) keep their order from the CSV.
gps_df = gps_raw\
    .assign(Timestamp=pd.to_datetime(gps_raw['Timestamp'],
                                     format=GPS_TIMESTAMP_FORMAT))\
    .set_index('Timestamp')\
    .sort_index(kind='mergesort')

gps_df.head()

Unnamed: 0_level_0,id,lat,long
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014-01-06 06:28:01,35,36.076225,24.874689
2014-01-06 06:28:01,35,36.07622,24.874596
2014-01-06 06:28:03,35,36.076211,24.874443
2014-01-06 06:28:05,35,36.076217,24.874253
2014-01-06 06:28:06,35,36.076214,24.874167


# Resample
Resample each vehicle's GPS track to 1-minute time chunks and fill in the missing times.

In [5]:
# Put every id's track on a regular 1-minute grid. For each minute keep the
# last logged row; minutes with no rows yield None and so show up as all-NaN
# rows. 'min' is the canonical minute alias (the older 't' spelling is
# deprecated in recent pandas releases).
gps_df_resampled = gps_df.groupby('id', group_keys=False, as_index=False)\
    .resample('min')\
    .apply(lambda x: x.iloc[-1] if len(x) else None)

# Drop the minutes where data is logged because we're interested in when the
# vehicle was off (not logging). The mask is taken BEFORE the forward fill;
# the fill then copies the most recent preceding row into each kept minute.
not_logging = gps_df_resampled['id'].isnull()
gps_df_expanded = gps_df_resampled.ffill()[not_logging]

gps_df_expanded.head()

Unnamed: 0_level_0,id,lat,long
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014-01-06 07:23:00,1.0,36.063658,24.885866
2014-01-06 07:24:00,1.0,36.063658,24.885866
2014-01-06 07:25:00,1.0,36.063658,24.885866
2014-01-06 07:26:00,1.0,36.063658,24.885866
2014-01-06 07:27:00,1.0,36.063658,24.885866


# Location Metadata

In [6]:
from sklearn.neighbors import KDTree

# Table of named places (name, lat/long, canonical flag, category) that the
# GPS positions are matched against further down.
named_places_path = os.path.join(EXTRAS_PATH, 'NamedPlacesWithCategories.csv')
named_places_df = pd.read_csv(named_places_path)

named_places_df.head()

Unnamed: 0,name,lat,long,canonical,category
0,Industrial Supplies Consolidated,36.046021,24.901365,True,Shopping
1,GAStech-Kronos,36.048021,24.879565,True,GASTech
2,Kronos International Airport,36.05092,24.825863,True,Public
3,Ouzeri Elian,36.05192,24.870764,True,Dining
4,Shoppers' Delight,36.05282,24.868564,True,Shopping


In [7]:
# For every row of gps_df_expanded find the single nearest named place.
# NOTE(review): the metric is Euclidean distance on raw (long, lat) degrees,
# so `location_distance` is in degrees, not metres, and a degree of longitude
# is treated the same as a degree of latitude -- confirm this is acceptable.
place_tree = KDTree(named_places_df[['long', 'lat']], metric='euclidean')
dist, nei = place_tree.query(gps_df_expanded[['long', 'lat']])

# query() returns 2-D (n, 1) arrays for the default k=1, so both results are
# flattened to 1-D before becoming columns. assign() builds a new frame rather
# than writing columns into the boolean-filtered frame in place, which avoids
# SettingWithCopy warnings and keeps the cell safe to re-run.
gps_df_expanded = gps_df_expanded.assign(
    location_distance=dist.ravel(),
    location_index=nei.ravel(),
)

gps_df_expanded.head()

Unnamed: 0_level_0,id,lat,long,location_distance,location_index
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-01-06 07:23:00,1.0,36.063658,24.885866,3.7e-05,46
2014-01-06 07:24:00,1.0,36.063658,24.885866,3.7e-05,46
2014-01-06 07:25:00,1.0,36.063658,24.885866,3.7e-05,46
2014-01-06 07:26:00,1.0,36.063658,24.885866,3.7e-05,46
2014-01-06 07:27:00,1.0,36.063658,24.885866,3.7e-05,46


In [8]:
# Attach the matched place's columns to every row. `location_index` holds row
# labels of named_places_df, so the join is column-on-the-left against
# index-on-the-right. Clashing place columns get the '_y' suffix and are then
# discarded together with `canonical`.
place_columns_to_discard = ['lat_y', 'long_y', 'canonical']

gps_df_expanded_with_locations = gps_df_expanded.merge(
    named_places_df,
    left_on='location_index',
    right_index=True,
    suffixes=('', '_y'),
).drop(place_columns_to_discard, axis=1)

gps_df_expanded_with_locations.head()

Unnamed: 0_level_0,id,lat,long,location_distance,location_index,name,category
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014-01-06 07:23:00,1.0,36.063658,24.885866,3.7e-05,46,Hallowed Grounds,Coffee
2014-01-06 07:24:00,1.0,36.063658,24.885866,3.7e-05,46,Hallowed Grounds,Coffee
2014-01-06 07:25:00,1.0,36.063658,24.885866,3.7e-05,46,Hallowed Grounds,Coffee
2014-01-06 07:26:00,1.0,36.063658,24.885866,3.7e-05,46,Hallowed Grounds,Coffee
2014-01-06 07:27:00,1.0,36.063658,24.885866,3.7e-05,46,Hallowed Grounds,Coffee


In [9]:
# Feature matrix: one row per (id, day), one column per
# (two-hour bin, place category); each cell is the number of rows of
# gps_df_expanded_with_locations that fall into that combination.
#
# Integer-dividing the hour of day by 2 gives the two-hour bin (0..11).
two_hour_bin = gps_df_expanded_with_locations.index.hour // 2

# 'D' is the canonical daily frequency alias; the lowercase 'd' spelling is
# deprecated in recent pandas releases.
rows_per_bin_and_category = gps_df_expanded_with_locations.groupby([
    'id',
    pd.Grouper(freq='D'),
    two_hour_bin,
    'category',
]).size()

# Levels 2 and 3 (bin, category) are addressed by position because the day
# level and the bin level both carry the name of the time index.
# Combinations that never occur become 0 rather than NaN.
features_by_hour_category = rows_per_bin_and_category\
    .unstack(level=[2, 3])\
    .fillna(0)\
    .sort_index(axis=1)

features_by_hour_category.head()

Unnamed: 0_level_0,Timestamp,0,0,0,0,0,0,0,0,1,1,...,10,11,11,11,11,11,11,11,11,11
Unnamed: 0_level_1,category,Apartment,Dining,GASTech,Home,Industrial,Lodging,Recreation,Shopping,Apartment,Dining,...,Shopping,Apartment,Dining,GASTech,Gas,Home,Industrial,Lodging,Recreation,Shopping
id,Timestamp,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1.0,2014-01-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,27.0,0.0,0.0,103.0,0.0,11.0,0.0,0.0,0.0,0.0
1.0,2014-01-07,0.0,0.0,70.0,45.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,120.0,0.0,0.0,0.0,0.0
1.0,2014-01-08,0.0,0.0,0.0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,113.0,0.0,2.0,0.0,0.0,0.0,0.0
1.0,2014-01-09,0.0,0.0,0.0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,120.0,0.0,0.0,0.0,0.0
1.0,2014-01-10,0.0,0.0,0.0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,112.0,0.0,0.0,0.0,0.0


In [10]:
# Feature matrix: one row per (id, day), one column per place name; each cell
# is the number of rows of gps_df_expanded_with_locations matched to that
# place on that day. Places not seen on a given day become 0 rather than NaN.
#
# 'D' is the canonical daily frequency alias; the lowercase 'd' spelling is
# deprecated in recent pandas releases.
rows_per_place = gps_df_expanded_with_locations.groupby([
    'id',
    pd.Grouper(freq='D'),
    'name',
]).size()

features_by_location = rows_per_place.unstack().fillna(0)

features_by_location.head()

Unnamed: 0_level_0,name,Abila Scrapyard,Abila Zacharo,Ada Campo-Corrente home,Ahaggo Museum,Al's Affordable Apartments,Albert's Fine Clothing,Bean There Done That,Brew've Been Served,Brewed Awakenings,Carlyle Chemical Inc.,...,Park,Roberts and Sons,Seaside Living,Shoppers' Delight,Stenig Fusil home,Stewart and Sons Fabrication,U-Pump,Vira Frente home,Willem Vasco-Pais home,Windward Apartments
id,Timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1.0,2014-01-06,0.0,0.0,0.0,0.0,0.0,43.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.0,2014-01-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.0,2014-01-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.0,2014-01-09,0.0,87.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.0,2014-01-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# One entry per (id, day): the time-ordered list of [long, lat] pairs logged
# by that id on that day, taken from the raw (non-resampled) GPS frame.
#
# 'D' is the canonical daily frequency alias; the lowercase 'd' spelling is
# deprecated in recent pandas releases.
trajectory = gps_df.groupby([
    'id',
    pd.Grouper(freq='D'),
])[['long', 'lat']]\
    .apply(lambda points: points.sort_index().values.tolist())

trajectory.head()

id  Timestamp 
1   2014-01-06    [[24.88258237, 36.066457], [24.88258921, 36.06...
    2014-01-07    [[24.8795682, 36.0480284], [24.879568799999998...
    2014-01-08    [[24.88265237, 36.0664288], [24.88265805, 36.0...
    2014-01-09    [[24.88261133, 36.06645717], [24.88256875, 36....
    2014-01-10    [[24.88265407, 36.0665048], [24.88260906, 36.0...
dtype: object

In [12]:
# Car assignments: who drives which CarID.
# Rows with any missing value are discarded first.
# NOTE(review): presumably that removes the truck-driver rows, which have no
# CarID in the CSV, and is why they are re-added by hand below -- confirm.
employee_cars = pd.read_csv(os.path.join(VAST_PATH, 'car-assignments.csv'))\
    .dropna(axis=0)\
    .assign(
        CarID=lambda df: df.CarID.astype(int),
        # "LastName, FirstName"
        FullName=lambda df: df.LastName.str.cat(df.FirstName, sep=', '),
    )

# CarID -> "LastName, FirstName" lookup for the individually assigned cars.
full_names_dict = dict(employee_cars[['CarID', 'FullName']].values)

# Placeholder record shared by the truck ids listed below.
truck_profile = {
    'LastName': 'Driver',
    'FirstName': 'Truck',
    'FullName': 'Truck Driver',
    'CurrentEmploymentType': 'Facilities',
    'CurrentEmploymentTitle': 'Truck Driver',
}
truck_ids = [101, 104, 105, 106, 107]
truck_drivers = pd.DataFrame([{'CarID': truck_id, **truck_profile}
                              for truck_id in truck_ids])

# Final lookup table, keyed by CarID, covering cars and trucks.
cars_df = pd.concat((employee_cars, truck_drivers)).set_index('CarID')
cars_df

Unnamed: 0_level_0,CurrentEmploymentTitle,CurrentEmploymentType,FirstName,FullName,LastName
CarID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,IT Helpdesk,Information Technology,Lucas,"Alcazar, Lucas",Alcazar
2,Engineer,Engineering,Lars,"Azada, Lars",Azada
3,Engineer,Engineering,Felix,"Balas, Felix",Balas
4,SVP/CFO,Executive,Ingrid,"Barranco, Ingrid",Barranco
5,IT Technician,Information Technology,Isak,"Baza, Isak",Baza
6,IT Group Manager,Information Technology,Linnea,"Bergen, Linnea",Bergen
7,Drill Technician,Engineering,Isande,"Borrasca, Isande",Borrasca
8,IT Technician,Information Technology,Nils,"Calixto, Nils",Calixto
9,Drill Technician,Engineering,Axel,"Calzas, Axel",Calzas
10,SVP/CIO,Executive,Ada,"Campo-Corrente, Ada",Campo-Corrente


In [13]:
def as_dict(row):
    """Return the strictly positive entries of ``row`` as a list of records.

    Despite the name, the result is a list of lists, not a dict. Each inner
    list holds the index label(s) of one entry followed by its value; a
    MultiIndex label contributes one element per level.

    Parameters
    ----------
    row : pandas.Series
        Numeric series; entries that are not greater than zero are dropped.

    Returns
    -------
    list of list
        ``[[label..., value], ...]`` in the order the entries appear in
        ``row``.
    """
    positive_entries = row[row > 0]
    # reset_index() turns the index level(s) into ordinary columns so that
    # labels and values end up side by side in every record.
    records = positive_entries.reset_index()
    return records.values.tolist()
        
def get_person_day(x):
    """Build the document describing one vehicle on one day.

    Parameters
    ----------
    x : tuple
        ``(vehicle, date)`` key. ``vehicle`` is looked up in ``cars_df``;
        ``date`` must provide ``strftime``. The same key is used to index
        ``features_by_hour_category``, ``features_by_location`` and
        ``trajectory``.

    Returns
    -------
    dict
        The ``cars_df`` row for the vehicle plus the keys ``_id``
        (``"<vehicle>-<YYYY-MM-DD>"``), ``date``, ``hour_category``,
        ``location`` and ``trajectory``.

    Raises
    ------
    KeyError
        If any of the module-level tables has no entry for the key.
    """
    vehicle, date = x
    date_str = date.strftime('%Y-%m-%d')

    # Start from the driver / vehicle metadata, then layer the per-day
    # fields on top of it.
    doc = cars_df.loc[vehicle].to_dict()
    doc.update({
        '_id': f'{vehicle}-{date_str}',
        'date': date_str,
        'hour_category': as_dict(features_by_hour_category.loc[x]),
        'location': as_dict(features_by_location.loc[x]),
        'trajectory': trajectory[x],
    })
    return doc

# One document per (id, day) key present in `trajectory`.
docs = list(map(get_person_day, trajectory.index))

In [14]:
# Sanity check: show which fields the first generated document carries.
docs[0].keys()

dict_keys(['CurrentEmploymentTitle', 'CurrentEmploymentType', 'FirstName', 'FullName', 'LastName', '_id', 'date', 'hour_category', 'location', 'trajectory'])

In [15]:
# Bounding box of all logged positions: x is longitude, y is latitude.
# It is handed to the application below as the `domain` prop.
positions = gps_df[['long', 'lat']]
xmin, ymin = positions.min(axis=0)
xmax, ymax = positions.max(axis=0)

domain = {'x': [xmin, xmax], 'y': [ymin, ymax]}

In [17]:
# Publish the documents and register the application that consumes them.
# NOTE(review): drop=True presumably replaces an existing collection /
# application of the same name -- confirm against chissl_mongo before running
# against a shared database.
collection_name = 'VAST 2014 MC2'
application_name = 'VAST Movement Analysis'

chissl = cm.ChisslMongo(verbose=True)

chissl.create_collection(collection_name, docs, drop=True)

application_options = dict(
    props={'domain': domain},
    params={'nmf__n_components': 24},
    drop=True,
)

# Last expression of the cell, so the created application record is displayed.
chissl.create_application(application_name,
                          collection_name,
                          'VastHistogramComponent',
                          'chissl.pipelines.vast.pipeline',
                          **application_options)

{'_id': 'VAST Movement Analysis',
 'collection': 'VAST 2014 MC2',
 'component': 'VastHistogramComponent',
 'params': {'nmf__n_components': 24},
 'pipeline': 'chissl.pipelines.vast.pipeline',
 'props': {'domain': {'x': [24.82508806, 24.90848537],
   'y': [36.04802098, 36.08995956]}}}

In [None]:
# Destructive: removes every document of the `transduction_` collection whose
# `_id.application` field equals this application's name.
application_filter = {'_id.application': 'VAST Movement Analysis'}
chissl.db.transduction_.delete_many(application_filter)