In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
import pickle
from os import path
from time import sleep, time
from datetime import datetime
from shapely.geometry import Point
from geopy.distance import vincenty
%matplotlib inline

In [None]:
# Load the cached 'offline_gdf' object from its pickle file into `obj`; the
# following cells index it by column name and group it like a pandas DataFrame.
# NOTE(review): pickle.load can execute arbitrary code while deserialising, so
# this is only safe if data/cache/offline_gdf.pkl was written by a trusted run
# of this project — confirm the file's provenance before running.
with open('data/cache/' + 'offline_gdf' + '.pkl', 'rb') as f:
    obj = pickle.load(f)

In [None]:
# Break every tweet's timestamp into calendar components so that tweets can be
# grouped per user and per calendar day.
obj['day'] = [stamp.day for stamp in obj['createdAt']]
obj['month'] = [stamp.month for stamp in obj['createdAt']]
obj['year'] = [stamp.year for stamp in obj['createdAt']]

# Grouping key identifying one user on one calendar day.
daily_user = ['userId', 'day', 'month', 'year']

# Size of each user-day group, repeated on every row that belongs to it.
obj['daily_tweets'] = obj.groupby(daily_user)['userId'].transform('count')

In [None]:
# Drop every user-day that has fewer than `threshold_tweets` tweets; with the
# value 2, each remaining user-day contains at least two tweets.
threshold_tweets = 2
obj = obj.loc[obj['daily_tweets'].ge(threshold_tweets)].reset_index(drop=True)

In [None]:
def _speed_mps(p_from, p_to, t_from, t_to):
    """Return the speed, in metres per second, implied by two consecutive fixes.

    Parameters
    ----------
    p_from, p_to : tuple
        ``(latitude, longitude)`` pairs.
    t_from, t_to : datetime-like
        Timestamps of the two fixes; their difference must offer
        ``total_seconds()``.

    Returns
    -------
    float
        Distance divided by the absolute elapsed time, or ``inf`` when the
        two fixes share the same timestamp.
    """
    # The distance is evaluated first so that invalid coordinates still raise
    # even when the elapsed time is zero.
    distance = vincenty(p_to, p_from).meters
    # Speed is a magnitude: taking the absolute elapsed time stops rows that
    # are not in chronological order from producing a negative speed, which
    # would always slip under the threshold.
    elapsed = abs((t_to - t_from).total_seconds())
    return distance / elapsed if elapsed else float('inf')


def data_denoising(sub_df, crt_speed = 60):
    """Blank out the states of tweets whose implied travel speed is implausible.

    Parameters
    ----------
    sub_df : Series-like
        One group of rows (here: one user on one day). ``sub_df.tolist()``
        must yield tuples whose first four items are
        ``(latitude, longitude, timestamp, state)``.
    crt_speed : float, optional
        Critical speed in metres per second (distances are taken in metres
        and elapsed times in seconds). Default 60.

    Returns
    -------
    list
        One entry per input row: the original state, or NaN where the row is
        judged to be noise. An empty group yields an empty list.

    Notes
    -----
    * Exactly two rows: both are kept, or both are blanked when the speed
      between them exceeds ``crt_speed``.
    * Otherwise a window of three consecutive rows (p0, p1, p2) slides over
      the group, with v1 = speed(p0->p1), v2 = speed(p1->p2) and
      v3 = speed(p0->p2):
        - any infinite speed (two rows sharing a timestamp) blanks the whole
          group;
        - v1 and v2 too fast but v3 fine: p1 is blanked;
        - v1, v2 and v3 all too fast: p0 and p1 are blanked, plus p2 on the
          last window;
        - only v1 too fast: p0 is blanked.

    NOTE(review): when only the final hop of a group is too fast (v1 within
    the limit, v2 above it on the last window) nothing is blanked — confirm
    whether the last row should be dropped in that case.
    NOTE(review): ``vincenty`` was removed in geopy 2.0 in favour of
    ``geopy.distance.geodesic`` — confirm the pinned geopy version.
    """
    nan = float('NaN')

    rows = sub_df.tolist()
    if not rows:
        # Nothing to unpack; the original code raised IndexError here.
        return []

    lat, lng, tw_time, states = list(zip(*rows))[:4]
    points = list(zip(lat, lng))

    if len(points) == 2:
        # A single hop gives no third point to arbitrate, so keep or drop both.
        if _speed_mps(points[0], points[1], tw_time[0], tw_time[1]) > crt_speed:
            return [nan, nan]
        return list(states)

    denoised = list(states)
    last_window = len(points) - 3

    # A group of one row produces an empty range and is returned unchanged.
    for index in range(len(points) - 2):
        p0, p1, p2 = points[index], points[index + 1], points[index + 2]
        t0, t1, t2 = tw_time[index], tw_time[index + 1], tw_time[index + 2]

        v1 = _speed_mps(p0, p1, t0, t1)
        v2 = _speed_mps(p1, p2, t1, t2)
        v3 = _speed_mps(p0, p2, t0, t2)

        if np.isinf(v1) or np.isinf(v2) or np.isinf(v3):
            # Two rows with the same timestamp: discard the whole group.
            return [nan] * len(denoised)

        if v1 > crt_speed and v2 > crt_speed:
            if v3 <= crt_speed:
                # p0 and p2 are mutually reachable, so p1 is the outlier.
                denoised[index + 1] = nan
            else:
                denoised[index] = nan
                denoised[index + 1] = nan
                if index == last_window:
                    # No later window will get the chance to judge p2.
                    denoised[index + 2] = nan
        elif v1 > crt_speed and v2 <= crt_speed:
            denoised[index] = nan

    return denoised

In [None]:
# Bundle the four fields consumed by data_denoising into one tuple per row, so
# that a single column carries everything into the group-wise transform.
obj['new'] = list(zip(*(obj[col] for col in ('latitude', 'longitude', 'createdAt', 'state'))))

In [None]:
# Denoise each user-day independently: rows judged to be noise get a NaN state
# and are then removed from the frame.
obj['state'] = obj.groupby(daily_user)['new'].transform(data_denoising)
obj = obj.dropna(subset=['state']).reset_index(drop=True)

We still need to restrict the data further in order to obtain reasonable results. Some of the remaining travel patterns do not make sense because the number of commutes between different cantons is implausibly high. We therefore remove all tweets belonging to a user on a given day when that user's number of canton changes on that day exceeds a threshold value. User-days with no canton change at all are dropped as well, since they contain no commute.

In [None]:
def find_path(sub_df, cmt_num = 10):
    """Summarise one group's sequence of states as a ``'A->B->C'`` route string.

    Parameters
    ----------
    sub_df : Series-like
        States of one group in row order; ``sub_df.values`` must be an array
        supporting slicing, element-wise ``!=`` and boolean-mask indexing.
    cmt_num : int, optional
        Largest number of state changes accepted for one group. Default 10.

    Returns
    -------
    str or float
        The states joined with ``'->'`` at every change between consecutive
        rows, or NaN when the group has no change at all or more than
        ``cmt_num`` changes.
    """
    states = sub_df.values
    departures, arrivals = states[:-1], states[1:]

    # True wherever two consecutive rows lie in different states.
    moved = departures != arrivals
    n_moves = moved.sum()

    if n_moves == 0 or n_moves > cmt_num:
        return float('NaN')

    # Every departure state of a change, followed by the final arrival state.
    hops = list(departures[moved])
    hops.append(arrivals[moved][-1])
    return '->'.join(hops)

In [None]:
# Attach each user-day's route string to all of its rows, then discard the
# user-days for which find_path returned NaN.
obj['daily_pattern'] = obj.groupby(daily_user)['state'].transform(find_path)
obj = obj.dropna(subset=['daily_pattern']).reset_index(drop=True)