In [1]:
import os
import re
#import glob

import numpy as np
import pandas as pd
import skmob
from skmob.measures.individual import jump_lengths
from skmob.measures.individual import radius_of_gyration
from tqdm.notebook import tqdm
#import warnings
#warnings.filterwarnings('ignore')

In [4]:
# Peek at the first five rows of the raw input file to check its schema.
# NOTE(review): the columns shown in the recorded output (unix_start_t, user_ID,
# orig_lat, orig_long, orig_unc) differ from the names the metric functions in
# this notebook read (time, user_id, lat, lon, acc) -- presumably a loader that
# is not shown here renames them; confirm.
print(pd.read_csv("input_file_new_columns.csv", nrows = 5))

   unix_start_t                                            user_ID   orig_lat  \
0    1552522827  f98247efb013243c91c247aa6c6e119e71404a698005d1...  47.651589   
1    1552850582  f98247efb013243c91c247aa6c6e119e71404a698005d1...  47.999315   
2    1552851399  f98247efb013243c91c247aa6c6e119e71404a698005d1...  47.999042   
3    1553827223  f98247efb013243c91c247aa6c6e119e71404a698005d1...  47.590755   
4    1553840035  f98247efb013243c91c247aa6c6e119e71404a698005d1...  47.998033   

    orig_long  orig_unc  
0 -122.327397       900  
1 -122.221824       122  
2 -122.221574       104  
3 -122.330222       103  
4 -122.221373       128  


In [4]:
def get_daily_metrics(df):
    """Compute per-user, per-day record counts and temporal occupancy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``user_id`` column and a datetime-typed ``time`` column
        (the ``.dt`` accessor is used on it). The frame is NOT modified.

    Returns
    -------
    pandas.DataFrame
        One row per ``(user_id, date)`` with the columns ``user_id``, ``date``,
        ``num_of_records`` (number of rows on that day) and
        ``intra_day_temporal_occupancy`` (number of distinct half-hour slots of
        the day that contain at least one row, i.e. a value between 1 and 48).
    """
    group_keys = ['user_id', 'date']

    # Derive the helper columns on a private copy so the caller's DataFrame
    # does not silently gain 'date' / 'half_hour_index' columns.
    work = df[['user_id', 'time']].copy()
    work['date'] = work['time'].dt.date
    # Half-hour slot of the day: 0 for 00:00-00:29, 1 for 00:30-00:59, ... 47.
    work['half_hour_index'] = work['time'].dt.hour * 2 + work['time'].dt.minute // 30

    # number of records each day each user
    num_of_records_df = (
        work.groupby(group_keys).size().reset_index(name='num_of_records')
    )

    # temporal occupancy each day each user: count each half-hour slot once
    temporal_occupancy_df = (
        work.drop_duplicates(group_keys + ['half_hour_index'])
        .groupby(group_keys)
        .size()
        .reset_index(name='intra_day_temporal_occupancy')
    )

    return num_of_records_df.merge(temporal_occupancy_df, how='left', on=group_keys)

In [5]:
def get_acc(df):
    """Return the ``acc`` column of ``df`` as a one-column DataFrame.

    An explicit copy is returned so that callers can add columns to the result
    without touching ``df`` and without pandas emitting a
    ``SettingWithCopyWarning`` for writing to a slice.
    """
    return df[['acc']].copy()

In [6]:
def get_jump_length(df):
    """Return the jump lengths of every user in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``lat``, ``lon``, ``time`` and ``user_id``;
        they are mapped onto a scikit-mobility ``TrajDataFrame``.

    Returns
    -------
    pandas.DataFrame
        A single column ``jump_length`` with one row per user, holding whatever
        ``skmob``'s ``jump_lengths`` returns for that user.
        NOTE(review): per the scikit-mobility docs this is a list of distances
        between consecutive points (great-circle, in km, rather than
        Euclidean) -- confirm against the installed version.
    """
    tdf = skmob.TrajDataFrame(df, latitude='lat', longitude='lon', datetime='time', user_id='user_id')
    jump_length_df = jump_lengths(tdf, show_progress=False)
    # The result has exactly two columns (user id, jump lengths); give them
    # the names used in the rest of this notebook.
    jump_length_df.columns = ['user_id', 'jump_length']
    # Copy so the caller can add columns without writing to a slice.
    return jump_length_df[['jump_length']].copy()

In [7]:
def get_longterm_metrics(df, acc_threshold=100):
    """Compute per-user long-term metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``user_id``, ``acc``, ``lat``, ``lon`` and
        ``time``.
    acc_threshold : numeric, default 100
        A record counts as "high accuracy" when its ``acc`` value is strictly
        below this threshold.
        NOTE(review): unit of ``acc`` is not visible here (presumably metres of
        location uncertainty) -- confirm.

    Returns
    -------
    pandas.DataFrame
        One row per user with the columns ``user_id``, ``acc_rate`` (share of
        the user's records with ``acc < acc_threshold``),
        ``radius_of_gyration`` and ``euclidean_distance_mean`` (mean of the
        user's jump lengths, NaN when the user has none). Users missing from
        any of the three intermediate tables are dropped (inner merges).
    """
    # high acc rate: vectorized comparison first, then the per-user mean of the
    # resulting booleans (missing acc values compare as False).
    high_acc_df = (
        df['acc'].lt(acc_threshold)
        .groupby(df['user_id'])
        .mean()
        .rename('acc_rate')
        .reset_index()
    )

    # radius of gyration
    tdf = skmob.TrajDataFrame(df, latitude='lat', longitude='lon', datetime='time', user_id='user_id')
    radius_of_gyration_df = radius_of_gyration(tdf, show_progress=False)
    radius_of_gyration_df.columns = ['user_id', 'radius_of_gyration']

    # mean jump length per user
    # NOTE(review): the output column keeps its historical name
    # 'euclidean_distance_mean'; scikit-mobility documents jump lengths as
    # great-circle distances -- confirm before interpreting as Euclidean.
    distance_mean_df = jump_lengths(tdf, show_progress=False)
    distance_mean_df['jump_lengths'] = distance_mean_df['jump_lengths'].apply(
        lambda jumps: np.mean(jumps) if len(jumps) > 0 else np.nan
    )
    distance_mean_df.columns = ['user_id', 'euclidean_distance_mean']

    # merge
    return (
        high_acc_df
        .merge(radius_of_gyration_df, on='user_id')
        .merge(distance_mean_df, on='user_id')
    )

In [8]:
def write_to_csv(dataframe, output_path, mode='a', header=False):
    """Write ``dataframe`` to ``output_path`` as CSV, without the index.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to write.
    output_path : str
        Destination file. Missing parent directories are created.
    mode : str, default 'a'
        File mode passed to ``DataFrame.to_csv`` ('a' appends, 'w' overwrites).
    header : bool, default False
        Whether to write the column names. Forced to True whenever the file is
        being started from scratch: it does not exist yet, it exists but is
        empty, or ``mode == 'w'``.
    """
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        # to_csv does not create directories; do it here so the first write
        # into a fresh output folder succeeds.
        os.makedirs(parent_dir, exist_ok=True)

    starts_new_file = (
        mode == 'w'
        or not os.path.exists(output_path)
        # An empty file (e.g. left behind by an interrupted run) has no header yet.
        or os.path.getsize(output_path) == 0
    )
    if starts_new_file:
        header = True
    dataframe.to_csv(output_path, mode=mode, header=header, index=False)

In [9]:
# The metric files below are written in append mode, so re-running this cell
# adds duplicate rows unless the previous outputs are removed first.
# os.remove('Metrics/daily_metrics.csv')
# os.remove('Metrics/longterm_metrics.csv')
# os.remove('Metrics/acc.csv')
# os.remove('Metrics/jump_length.csv')

# Make sure the output folder exists before the first append.
os.makedirs('Metrics', exist_ok=True)

for zip_file in tqdm(zip_files):
    if 'Sample_MSAs_Ind' in zip_file:
        continue
    # MSA name = archive file name without directory and extension. Unlike a
    # regex anchored on '/', this also works for bare file names and for
    # platform-specific path separators.
    MSA = os.path.splitext(os.path.basename(zip_file))[0]

    # NOTE(review): the meaning of the second argument (100) is defined by
    # read_concat_csv_from_zip, which is not shown in this notebook -- confirm.
    df = read_concat_csv_from_zip(zip_file,100)
    # daily_metrics_df = get_daily_metrics(df)
    # daily_metrics_df['MSA'] = MSA
    # write_to_csv(daily_metrics_df,'Metrics/daily_metrics.csv')
    # longterm_metrics_df = get_longterm_metrics(df)
    # longterm_metrics_df['MSA'] = MSA
    # write_to_csv(longterm_metrics_df,'Metrics/longterm_metrics.csv')

    # .assign returns a new frame, so the MSA column is never written into a
    # slice of df (avoids SettingWithCopyWarning).
    acc_df = get_acc(df).assign(MSA=MSA)
    write_to_csv(acc_df,'Metrics/acc.csv')
    jump_length_df = get_jump_length(df).assign(MSA=MSA)
    write_to_csv(jump_length_df,'Metrics/jump_length.csv')

  0%|          | 0/12 [00:00<?, ?it/s]