In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
import pickle
from os import path
from time import sleep, time
from datetime import datetime
from shapely.geometry import Point
from geopy.distance import vincenty
%matplotlib inline

In [2]:
# Restore the cached object (presumably a GeoDataFrame, judging by the file name).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load cache files that were produced by a trusted run of this project.
cache_file = 'data/cache/' + 'offline_gdf' + '.pkl'
with open(cache_file, 'rb') as f:
    obj = pickle.load(f)

In [3]:
# Split each row's 'createdAt' timestamp into its calendar components.
for component in ('day', 'month', 'year'):
    obj[component] = obj['createdAt'].map(lambda stamp, name=component: getattr(stamp, name))
# Columns that together identify one user on one calendar date.
daily_user = ['userId', 'day', 'month', 'year']
# Attach to every row the number of rows sharing its (user, date) key.
rows_by_user_day = obj.groupby(daily_user)['userId']
obj['daily_tweets'] = rows_by_user_day.transform('count')

In [4]:
# Keep only rows whose (user, day) group has at least `threshold_tweets` rows.
threshold_tweets = 2
has_enough_tweets = obj['daily_tweets'] >= threshold_tweets
obj = obj.loc[has_enough_tweets].reset_index(drop=True)

In [5]:
def calculate_speed(sub_df, distance_fn=None):
    """Compute a speed (metres per second) for every row of one track.

    Parameters
    ----------
    sub_df : pandas.Series
        Series whose elements are tuples starting with
        ``(latitude, longitude, timestamp)``. Subtracting two timestamps
        must yield an object exposing ``total_seconds()``. Rows are used in
        the order given (presumably chronological -- confirm with caller).
    distance_fn : callable, optional
        ``distance_fn(origin, destination)`` returning the distance in metres
        between two ``(latitude, longitude)`` pairs. Defaults to geopy's
        ``vincenty`` as imported by this notebook; pass another callable
        (for example one built on ``geopy.distance.geodesic``) when running
        with a geopy release that no longer provides ``vincenty``.

    Returns
    -------
    list of float
        One value per input row. Entry ``i`` (``i >= 1``) is the speed of the
        move from row ``i - 1`` to row ``i``; entry 0 repeats entry 1 so the
        result has the same length as the input. A move with zero elapsed
        time yields ``inf``. A single-row input yields ``[nan]`` and an empty
        input yields ``[]`` (previously both raised ``IndexError``).
    """
    records = sub_df.tolist()
    if not records:
        return []
    if len(records) == 1:
        # No movement can be measured from a single observation.
        return [float('nan')]

    if distance_fn is None:
        distance_fn = lambda start, end: vincenty(start, end).meters

    velocity = []
    for previous, current in zip(records, records[1:]):
        distance = distance_fn((previous[0], previous[1]), (current[0], current[1]))
        elapsed = (current[2] - previous[2]).total_seconds()
        # Two rows with the same timestamp: treat the speed as infinite.
        velocity.append(distance / elapsed if elapsed else float('inf'))

    # Pad the front so the output aligns row-for-row with the input.
    return [velocity[0]] + velocity

In [6]:
def find_path(sub_df):
    """Summarise a sequence of labels as the chain of changes between neighbours.

    Each element of ``sub_df.values`` is compared with its successor. Every
    element that differs from the next one is kept, followed by the label
    reached by the last change, and the kept labels are joined with ``'->'``.

    Returns the joined string, or ``NaN`` when no two neighbouring elements
    differ (including inputs with fewer than two elements).
    """
    values = sub_df.values
    before, after = values[:-1], values[1:]
    changed = before != after
    if not changed.any():
        return float('NaN')
    route = '->'.join(before[changed])
    return '->'.join([route, after[changed][-1]])

In [49]:
def data_denoising(sub_df, crt_speed = 100, distance_fn=None):
    """Blank out state labels that imply a speed above ``crt_speed``.

    Parameters
    ----------
    sub_df : pandas.Series
        Series whose elements are tuples starting with
        ``(latitude, longitude, timestamp, state)``. Subtracting two
        timestamps must yield an object exposing ``total_seconds()``.
    crt_speed : float, optional
        Speed threshold, in the unit produced by ``distance_fn`` per second
        (metres per second with the default distance function).
    distance_fn : callable, optional
        ``distance_fn(point_a, point_b)`` returning the distance in metres
        between two ``(latitude, longitude)`` pairs. Defaults to geopy's
        ``vincenty`` as imported by this notebook.

    Returns
    -------
    list
        The state labels, one per input row, with rejected entries replaced
        by ``NaN``. An empty input yields ``[]``.

    Rules (speeds with zero elapsed time count as ``inf``)
    -----
    * Exactly two rows: both labels become ``NaN`` when the speed between
      them exceeds ``crt_speed``; otherwise the labels are returned as is.
    * Otherwise, for each consecutive triple (a, b, c) with speeds
      ``v1`` (a->b), ``v2`` (b->c) and ``v3`` (a->c):
        - ``v1`` and ``v2`` both too fast, ``v3`` acceptable: drop b;
        - ``v1``, ``v2`` and ``v3`` all too fast: drop a and b, and also c
          when this is the last triple;
        - only ``v1`` too fast: drop a.
    """
    records = sub_df.tolist()
    if not records:
        return []

    if distance_fn is None:
        distance_fn = lambda start, end: vincenty(start, end).meters

    points = [(rec[0], rec[1]) for rec in records]
    tw_time = [rec[2] for rec in records]
    # A mutable list: the original sliced a tuple and then assigned into it.
    states = [rec[3] for rec in records]
    nan = float('NaN')

    def speed(i, j):
        """Speed implied by moving from record ``i`` to record ``j``."""
        metres = distance_fn(points[j], points[i])
        # Use the elapsed time of THIS pair (the original read an undefined
        # name, ``delta_t``, here).
        seconds = (tw_time[j] - tw_time[i]).total_seconds()
        return metres / seconds if seconds else float('inf')

    if len(records) == 2:
        if speed(0, 1) > crt_speed:
            return [nan, nan]
        return states

    denoised = list(states)
    n_triples = max(len(records) - 2, 0)
    for index in range(n_triples):
        v1 = speed(index, index + 1)
        v2 = speed(index + 1, index + 2)

        if v1 > crt_speed and v2 > crt_speed:
            v3 = speed(index, index + 2)
            if v3 <= crt_speed:
                denoised[index + 1] = nan
            else:
                denoised[index] = nan
                denoised[index + 1] = nan
                if index == n_triples - 1:
                    denoised[index + 2] = nan
        elif v1 > crt_speed and v2 <= crt_speed:
            denoised[index] = nan

    return denoised

In [7]:
# Summarise each (user, day) group's 'state' sequence as a "A->B->..." string.
# The original called `daily_travel`, which is not defined anywhere in the
# visible notebook; `find_path` is the helper defined above for this purpose.
obj['daily_pattern'] = obj.groupby(by=daily_user)['state'].transform(find_path)

In [9]:
# Discard rows without a travel pattern and renumber the index.
obj = obj.dropna(subset=['daily_pattern']).reset_index(drop=True)

In [10]:
# Display the 'daily_pattern' column (one pattern string per remaining row).
obj['daily_pattern']

0                                         Obwalden->Vaud
1                                         Obwalden->Vaud
2                              Ticino->lombardia->Ticino
3           Obwalden->Valais->Obwalden->Valais->Obwalden
4           Obwalden->Valais->Obwalden->Valais->Obwalden
5                              Ticino->lombardia->Ticino
6                              Ticino->lombardia->Ticino
7                           Genève->Auvergne-Rhone-Alpes
8           Obwalden->Valais->Obwalden->Valais->Obwalden
9           Obwalden->Valais->Obwalden->Valais->Obwalden
10          Obwalden->Valais->Obwalden->Valais->Obwalden
11          Obwalden->Valais->Obwalden->Valais->Obwalden
12                          Genève->Auvergne-Rhone-Alpes
13     Graubünden->Bourgogne-Franche-Comte->Baden-Wur...
14     Graubünden->Bourgogne-Franche-Comte->Baden-Wur...
15     Graubünden->Bourgogne-Franche-Comte->Baden-Wur...
16     Graubünden->Bourgogne-Franche-Comte->Baden-Wur...
17     Graubünden->Bourgogne-Fr

In [15]:
# Pack position and timestamp into one tuple per row so that a single column
# can be handed to groupby(...).transform.
position_time_cols = ['latitude', 'longitude', 'createdAt']
obj['new'] = list(zip(*(obj[col] for col in position_time_cols)))

In [16]:
# Per (user, day) group, compute one speed value per row from the packed tuples.
packed_by_user_day = obj.groupby(by=daily_user)['new']
obj['speed'] = packed_by_user_day.transform(calculate_speed)

In [None]:
# Spot-check calculate_speed on one (userId, day, month, year) group.
grouped = obj.groupby(by=daily_user)
idm = (79017633, 16, 9, 2016)
# calculate_speed calls .tolist() on its argument, so it needs the Series of
# packed (latitude, longitude, createdAt) tuples rather than the whole
# DataFrame returned by get_group.
calculate_speed(grouped.get_group(idm)['new'])


In [None]:
# Recompute the per-(user, day) travel pattern string.
# `daily_travel` is not defined in the visible notebook; `find_path` is the
# helper defined above that produces these "A->B" strings.
obj['daily_pattern'] = obj.groupby(by=daily_user)['state'].transform(find_path)

In [21]:
# Print every (user, day) group: its key, then time, state and speed per row.
grouped = obj.groupby(by=daily_user)
columns_to_show = ['createdAt', 'state', 'speed']
for group_key, group_rows in grouped:
    print(group_key)
    print(group_rows[columns_to_show])

(12582, 16, 9, 2016)
              createdAt              state    speed
358 2016-09-16 09:51:12        Basel-Stadt        0
399 2016-09-16 10:19:44        Basel-Stadt        0
437 2016-09-16 10:58:15        Basel-Stadt        0
469 2016-09-16 11:37:13  Baden-Wurttemberg  2.32758
518 2016-09-16 12:37:49  Baden-Wurttemberg        0
543 2016-09-16 12:48:52  Baden-Wurttemberg        0
633 2016-09-16 14:06:09  Baden-Wurttemberg        0
639 2016-09-16 14:09:56  Baden-Wurttemberg        0
739 2016-09-16 15:46:36  Baden-Wurttemberg        0
(997121, 16, 9, 2016)
              createdAt   state     speed
141 2016-09-16 06:08:22  Aargau  0.828777
529 2016-09-16 12:41:33  Zürich  0.828777
641 2016-09-16 14:10:47  Zürich         0
(12064702, 16, 9, 2016)
              createdAt       state    speed
275 2016-09-16 08:26:54  St. Gallen  7.54414
314 2016-09-16 09:04:56      Zürich  7.54414
(14604977, 16, 9, 2016)
              createdAt      state    speed
134 2016-09-16 06:04:44  lombardia  12.803

In [None]:
# List the distinct values of the 'daily_pattern' column.
obj['daily_pattern'].unique()

In [None]:
# Show the rows whose daily pattern is exactly 'Obwalden->Vaud'.
obj.loc[obj['daily_pattern'].eq('Obwalden->Vaud')]

In [None]:
# Print the timestamp and state of every row, one (user, day) group at a time.
for _, day_rows in obj.groupby(by=daily_user):
    print(day_rows[['createdAt', 'state']])

In [None]:
# Number of rows per (user, day, month, year) group, as a plain array.
user_ids_by_day = obj.groupby(['userId', 'day', 'month', 'year'])['userId']
tweets_per_day_by_user = user_ids_by_day.count().values

In [None]:
# Number of (user, day) groups.
len(tweets_per_day_by_user)

In [None]:
# How many (user, day) groups contain at least 5 rows.
(tweets_per_day_by_user >= 5).sum()

In [None]:
import numpy as np
# Scratch check: np.unique returns the distinct labels of a state sequence.
state_sequence = [
    'Solothurn', 'Zürich', 'Zürich', 'Zürich', 'Zürich',
    'Zürich', 'Zürich', 'Aargau', 'Solothurn', 'Aargau', 'Bern',
]
A = np.array(state_sequence)
np.unique(A)

In [None]:
# Display the per-(user, day) row counts.
tweets_per_day_by_user

In [None]:
# Scratch prototype of the transition string: keep each element that differs
# from its successor, then append the label reached by the last change.
lst = np.array(['A', 'A', 'B', 'A', 'C', 'C'])
a, b = lst[:-1], lst[1:]
c = a != b
d = "-".join(list(a[c]) + [b[c][-1]])
d

In [None]:
# Inspect the two pieces being joined: the changed origins and the final destination.
[a[c],b[c][-1]]

In [None]:
# Display the current value of c.
c

In [None]:
# `all` is a method: without the call parentheses the condition tests the
# bound method object itself, which is always truthy.
if c.all():
    print('yes')

In [None]:
# Display the grouped 'state' column object produced by the (user, day) grouping.
obj.groupby(by=daily_user)['state']

In [None]:
# Toy frame for experimenting with groupby(...).apply.
df = pd.DataFrame({'Date':[1,1,1,2,2,2],'col1':[1,2,3,4,5,6],'col2':[1,2,3,4,5,6]})
col1 = 'col1'
col2 = 'col2'
def calc(dfg):
    """Return the values of column ``col1`` of ``dfg`` minus their mean.

    ``col1`` is the module-level column name defined next to this function.
    The result is a NumPy array with one entry per row of ``dfg``.
    """
    # The original also built an unused array from `col2`, which made the
    # function fail on frames lacking that unrelated column.
    nparray = np.array(dfg[col1])
    return nparray - nparray.mean()

In [None]:
def calc2(dfg):
    """Return the "col1" column of ``dfg`` with its mean subtracted."""
    values = dfg["col1"]
    centre = values.mean()
    return values - centre

In [None]:
# Apply calc separately to each 'Date' group of the toy frame.
df.groupby('Date', as_index=True).apply(calc)

In [None]:
from geopy.distance import vincenty
# Scratch check of the geopy distance call on two (latitude, longitude) pairs.
# NOTE(review): despite the variable names, both pairs have positive (eastern)
# longitudes -- confirm which places are actually intended.
newport_ri = (52.2296756, 21.0122287)
cleveland_oh = (52.406374, 16.9251681)
separation = vincenty(newport_ri, cleveland_oh)
print(separation.kilometers)

In [None]:
# Scratch check: pairing two equal-length lists element by element.
a = [1, 2, 3]
b = [4, 5, 6]
[(left, right) for left, right in zip(a, b)]

In [None]:
# Scratch check: elapsed seconds between the first three 'createdAt' values.
stamps = obj['createdAt']
a = [stamps[1] - stamps[0], stamps[2] - stamps[1]]
[gap.total_seconds() for gap in a]

In [None]:
# Grab the first 'createdAt' value by position for inspection.
A = obj['createdAt'].iloc[0]

In [None]:
# Display the current value of A.
A

In [None]:
# Scratch check: duplicate the first element at the front of a list
# (the padding used at the end of calculate_speed).
a = [1, 2, 3]
b = a[:1] + a

In [None]:
# Display the current value of b.
b

In [None]:
# Number of rows currently in obj.
len(obj)

In [29]:
# Demonstration: indexing an empty list with [-1] raises IndexError
# (see the recorded output), so the condition is never evaluated.
lst = []
if lst[-1]:
    print('yes')

IndexError: list index out of range

In [37]:
# Scratch check of the zero-guarded division used for speeds:
# a falsy (zero) duration maps to infinity instead of dividing.
d = 10
t = 0.01
if t:
    v = d / t
else:
    v = float('inf')
v

1000.0

In [41]:
# Scratch check of a two-part condition; nothing is printed because c < a is false.
a, b, c = 10, 10, 20
if a == b and c < a:
    print('yes')

In [44]:
# Scratch check: make an independent shallow copy of a list.
a = [1, 3, 4, 1, 5]
b = list(a)

In [45]:
# Overwrite one element of a in place and display the result.
a[3]=2
a

[1, 3, 4, 2, 5]

In [48]:
# Display the last three elements of b.
b[-3::]

[4, 1, 5]