In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
import pickle
from os import path
from time import sleep, time
from datetime import datetime
from shapely.geometry import Point
from geopy.distance import vincenty
%matplotlib inline

In [2]:
# Loading the saved file is as easy as running these lines of code
with open('data/cache/' + 'gdf_join' + '.pkl', 'rb') as f:
    obj = pickle.load(f)

In [3]:
obj['day'] = obj['createdAt'].map(lambda x: x.day)
obj['month'] = obj['createdAt'].map(lambda x: x.month)
obj['year'] = obj['createdAt'].map(lambda x: x.year)
daily_user = ['userId', 'day', 'month', 'year']
obj['daily_tweets'] = obj.groupby(by=daily_user)['userId'].transform('count')

In [4]:
# Remove rows corresponding to people who have less than a threshold value in one day
threshold_tweets = 2
obj = obj[obj['daily_tweets'] >= threshold_tweets].reset_index(drop=True)

In [57]:
def calculate_speed(sub_df):
    lat = sub_df['placeLatitude']
    lng = sub_df['placeLongitude']
    points = list(zip(lat,lng))
    origin = points[0:-1]
    destination = points[1::]
    velocity = []
    for i in range(len(origin)-1):
        distance = vincenty(origin[i], destination[i]).meters
        delta_t = sub_df['createdAt'].iloc[i+2] - sub_df['createdAt'].iloc[i]
        delta_t = delta_t.total_seconds()
        try:
            velocity.append(distance / delta_t)
        except ZeroDivisionError:
            velocity.append(float('Inf'))
    return [velocity[0]] + velocity

In [7]:
def daily_travel(sub_df):       
    origin = sub_df.values[0:-1]
    destination = sub_df.values[1::]
    index = origin != destination
    path = float('NaN')
    if index.any():
        path = '->'.join(origin[index])
        path = '->'.join([path, destination[index][-1]])
    return path    

In [58]:
grouped = obj.groupby(by=daily_user)
idm = (79017633, 16, 9, 2016)
calculate_speed(grouped.get_group(idm))


[0.0,
 0.0,
 0.0,
 10.400062151010257,
 9.773188304752807,
 21.81372139698954,
 13.16702748353838,
 0.0,
 0.0,
 16.252921446703887,
 7.3807592217058255,
 9.66604190401445]

In [18]:
obj['daily_pattern'] = obj.groupby(by=daily_user)['state'].transform(lambda x: daily_travel(x))

In [19]:
obj = obj[obj['daily_pattern'].notnull()].reset_index(drop=True)

In [31]:
grouped = obj.groupby(by=daily_user)
for a in grouped:
    print(a[0])
    print(a[1]['state'])

(12582, 16, 9, 2016)
356          Basel-Stadt
397          Basel-Stadt
435          Basel-Stadt
467    Baden-Wurttemberg
515    Baden-Wurttemberg
539    Baden-Wurttemberg
629    Baden-Wurttemberg
635    Baden-Wurttemberg
734    Baden-Wurttemberg
Name: state, dtype: object
(997121, 16, 9, 2016)
141    Aargau
526    Zürich
637    Zürich
Name: state, dtype: object
(12064702, 16, 9, 2016)
275    St. Gallen
313        Zürich
Name: state, dtype: object
(14604977, 16, 9, 2016)
134    lombardia
137    lombardia
152       Valais
Name: state, dtype: object
(15062968, 16, 9, 2016)
102    Zürich
108    Zürich
120    Zürich
122    Zürich
125    Zürich
126    Zürich
128    Zürich
248    Luzern
278    Luzern
354    Luzern
362    Luzern
380    Luzern
419    Luzern
430    Luzern
483    Luzern
484    Luzern
489    Luzern
490    Luzern
491    Luzern
550    Luzern
552    Luzern
Name: state, dtype: object
(17365179, 16, 9, 2016)
111    Luzern
406    Zürich
Name: state, dtype: object
(17672379, 16, 9, 2016)

In [None]:
obj['daily_pattern'].unique()

In [27]:
obj[obj['daily_pattern']=='Obwalden->Vaud']

Unnamed: 0,id,userId,createdAt,placeLatitude,placeLongitude,geometry,index_right,state,country,day,month,year,daily_tweets,daily_pattern
0,776523000636203010,2741685639,2016-09-15 20:48:05,46.8131,8.22414,POINT (8.22414 46.8131),5,Obwalden,CH,15,9,2016,2,Obwalden->Vaud
1,776523269361074176,2741685639,2016-09-15 20:49:09,46.4757,6.92537,POINT (6.92537 46.4757),21,Vaud,CH,15,9,2016,2,Obwalden->Vaud


In [None]:
# Remove rows corresponding to people who have less than a threshold value in one day
threshold_tweets = 5
obj = obj[obj['daily_tweets'] >= threshold_tweets].reset_index(drop=True)

In [None]:
for sub_obj in obj.groupby(by=daily_user):
    print(sub_obj[1][['createdAt', 'state']])

In [None]:
tweets_per_day_by_user = obj.groupby(['userId', 'day', 'month', 'year'])['userId'].count().values

In [None]:
len(tweets_per_day_by_user)

In [None]:
sum(tweets_per_day_by_user >=5)

In [None]:
import numpy as np
A = np.array(['Solothurn', 'Zürich', 'Zürich', 'Zürich', 'Zürich',
              'Zürich', 'Zürich', 'Aargau', 'Solothurn', 'Aargau', 'Bern'])
np.unique(A)

In [None]:
tweets_per_day_by_user

In [None]:
lst = np.array(['A', 'A', 'B', 'A', 'C', 'C'])
a = lst[0:-1]
b = lst[1::]
c = a!=b
d = "-".join(a[c])
d = "-".join([d, b[c][-1]])
d

In [None]:
[a[c],b[c][-1]]

In [None]:
c

In [None]:
if c.all:
    print('yes')

In [None]:
obj.groupby(by=daily_user)['state']

In [None]:
df = pd.DataFrame({'Date':[1,1,1,2,2,2],'col1':[1,2,3,4,5,6],'col2':[1,2,3,4,5,6]})
col1 = 'col1'
col2 = 'col2'
def calc(dfg):
    nparray = np.array(dfg[col1])
    somecalc = np.array(dfg[col2])
    return nparray - nparray.mean()

In [None]:
def calc2(dfg):
    return dfg["col1"] - dfg["col1"].mean()

In [None]:
df.groupby('Date', as_index=True).apply(calc)

In [None]:
from geopy.distance import vincenty
newport_ri = (52.2296756, 21.0122287)
cleveland_oh = (52.406374, 16.9251681)
print(vincenty(newport_ri, cleveland_oh).kilometers)

In [None]:
a=[1,2,3]
b=[4,5,6]
list(zip(a,b))

In [None]:
a = [obj['createdAt'][1]-obj['createdAt'][0], obj['createdAt'][2]-obj['createdAt'][1]]
list(map(lambda i: i.total_seconds(), a))

In [None]:
A = obj['createdAt'].iloc[0]

In [None]:
A

In [None]:
a=[1,2,3]
b = [a[0]] + a

In [None]:
b

In [5]:
len(obj)

7022