In [1]:
# Import libraries
import pickle
from os import path
from time import time
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from geopy.distance import vincenty

In [2]:
# Settings: input folders, both located below the relative 'data' directory
DIR_DATA, DIR_GEO = (
    path.join('data', folder) for folder in ('twitter data', 'geofiles')
)

In [3]:
# Load the pickled tweets from DIR_DATA; the following cells use `df` as a
# pandas DataFrame.
# NOTE: unpickling can execute arbitrary code -- only load a file you trust.
clean_data_file = path.join(DIR_DATA, 'clean_data.pkl')
with open(clean_data_file, 'rb') as in_file:
    df = pickle.load(in_file)

In [4]:
def _segment_speed(point_a, point_b, delta):
    """Return the speed (meters per second) implied by covering the distance
    between two (latitude, longitude) points within the time span `delta`.

    A zero time span yields infinity so the caller can detect tweets that
    share a timestamp.
    """
    meters = vincenty(point_a, point_b).meters
    seconds = delta.total_seconds()
    return meters / seconds if seconds else float('inf')


def data_denoising(sub_df, crt_speed=60):
    """Identify noisy tweets, i.e. tweets whose reported location is noisy.

    Every window of three consecutive tweets (origin, middle, last) is
    inspected. Three speeds are derived from distance / elapsed time:
    v1 (origin -> middle), v2 (middle -> last) and v3 (origin -> last).

    * If any of the three speeds is infinite (two tweets of the window share
      a timestamp), every tweet of the input is flagged and processing stops.
    * If v1 and v2 both exceed `crt_speed`: when v3 is within the limit only
      the middle tweet is flagged; otherwise origin and middle are flagged,
      and in the final window the last tweet is flagged as well.
    * If only v1 exceeds `crt_speed` (v2 is within the limit), the origin is
      flagged.

    Inputs with fewer than three tweets contain no window, so nothing is
    flagged. An empty input returns an empty list.

    Parameters
    ----------
    sub_df : pandas.Series (anything offering ``tolist()``)
        One ``(latitude, longitude, created_at)`` tuple per tweet. Subtracting
        two ``created_at`` values must give an object with ``total_seconds()``.
        The entries are presumably in chronological order -- this is not
        checked here; verify against the caller.
    crt_speed : float, default 60
        Critical speed in meters per second.

    Returns
    -------
    list of bool
        One flag per tweet: True means keep, False means noisy.

    NOTE(review): `vincenty` comes from `geopy.distance`; it is believed to
    have been removed in geopy 2.0 in favour of `geodesic` -- confirm against
    the installed geopy version.
    """
    records = sub_df.tolist()
    denoised = [True] * len(records)
    if not records:
        # Nothing to unzip: the column extraction below would fail on an
        # empty input.
        return denoised

    columns = list(zip(*records))
    points = list(zip(columns[0], columns[1]))
    tw_time = columns[2]

    last_window = len(points) - 3
    for index in range(len(points) - 2):
        orig, dest1, dest2 = points[index:index + 3]
        v1 = _segment_speed(dest1, orig, tw_time[index + 1] - tw_time[index])
        v2 = _segment_speed(dest2, dest1, tw_time[index + 2] - tw_time[index + 1])
        v3 = _segment_speed(dest2, orig, tw_time[index + 2] - tw_time[index])

        if np.isinf(v1) or np.isinf(v2) or np.isinf(v3):
            # Identical timestamps make the speeds meaningless: reject the
            # whole input.
            return [False] * len(denoised)

        if v1 > crt_speed and v2 > crt_speed:
            if v3 <= crt_speed:
                # Only the middle tweet is out of place.
                denoised[index + 1] = False
            else:
                denoised[index] = False
                denoised[index + 1] = False
                if index == last_window:
                    # No later window will look at the last tweet again.
                    denoised[index + 2] = False
        elif v1 > crt_speed and v2 <= crt_speed:
            denoised[index] = False

    return denoised

In [5]:
# Drop noisy tweets; noise is judged separately for every user and calendar day
daily_user = ['userId', 'year', 'month', 'day']
# Pack the three fields data_denoising reads into one temporary column
df['new'] = tuple(zip(df['latitude'], df['longitude'], df['createdAt']))
not_noisy = df.groupby(by=daily_user)['new'].transform(data_denoising)
# Keep the rows flagged True and discard the temporary column again
df = df[not_noisy].drop('new', axis=1).reset_index(drop=True)

In [6]:
# Load geofiles
def _load_states(file_name, country, name_column='name'):
    """Read one geofile from DIR_GEO and reduce it to three columns.

    Parameters
    ----------
    file_name : str
        File name inside DIR_GEO.
    country : str
        Country code stored in the 'country' column of every row.
    name_column : str, default 'name'
        Column of the file that holds the region name; it is renamed to 'name'.

    Returns
    -------
    GeoDataFrame with the columns 'geometry', 'name' and 'country'.
    """
    gdf = gpd.read_file(path.join(DIR_GEO, file_name))
    gdf = gdf[['geometry', name_column]].rename(columns={name_column: 'name'})
    # assign() returns a new frame instead of writing into the column subset,
    # which avoids pandas' SettingWithCopyWarning.
    return gdf.assign(country=country)


# One frame per country, all with the same layout so they can be stacked
ch_gdf = _load_states('ch-cantons.json', 'CH')
fr_gdf = _load_states('france-states.geojson', 'FR')
it_gdf = _load_states('italy-states.json', 'IT')
de_gdf = _load_states('germany-states.geojson', 'DE', name_column='NAME_1')
at_gdf = _load_states('austria-states.geojson', 'AT')
li_gdf = _load_states('liechtenstein.geojson', 'LI', name_column='NAME')

# Concatenate the dataframes
df_poly = pd.concat([ch_gdf, fr_gdf, it_gdf, de_gdf, at_gdf, li_gdf], ignore_index=True)
df_poly = df_poly.rename(columns={'name': 'state'})

In [7]:
# Convert our dataframe to a geopandas dataframe.
# The points are built with a plain comprehension over the two coordinate
# columns: a row-wise df.apply(..., axis=1) creates a Series per tweet and is
# much slower. Point takes (x, y), i.e. (longitude, latitude).
# The tweets get the same CRS as the polygons they are joined with.
df = gpd.GeoDataFrame(
    df,
    geometry=[Point(lng, lat) for lng, lat in zip(df['longitude'], df['latitude'])],
    crs=df_poly.crs,
)

In [8]:
# Locate the tweets offline: spatially join the tweet points with the state
# polygons. how='left' keeps every tweet, matched or not; the run is timed.
print('Start spatial merging process...')
t = time()
df = gpd.tools.sjoin(df, df_poly, how='left')
elapsed = time() - t
print('Elapsed time is ', round(elapsed, 2), ' seconds.', sep='')

Start spatial merging process...
Elapsed time is 3795.78 seconds.


In [9]:
# Keep only the tweets that were matched to a state, and remove the
# 'index_right' column (presumably added by the spatial join).
df = (
    df.dropna(subset=['state'])
      .drop('index_right', axis=1)
      .reset_index(drop=True)
)
# NOTE(review): a tweet lying on a shared boundary may be matched to two
# different regions and would then appear twice. De-duplication is currently
# disabled -- confirm whether it should be turned back on:
# df = df.drop_duplicates(subset='id')
# df = df.reset_index(drop=True)

In [10]:
def find_path(sub_df, cmt_num=10):
    """Summarise a sequence of states as a path such as 'A->B->A'.

    Consecutive entries of `sub_df` are compared; every position where the
    state changes counts as one transition.

    Parameters
    ----------
    sub_df : pandas.Series
        Sequence of state names (read through ``.values``).
    cmt_num : int, default 10
        Maximum number of transitions for which a path is still reported.

    Returns
    -------
    str or float
        The visited states joined with '->' when there is at least one and at
        most `cmt_num` transitions; NaN otherwise.
    """
    states = sub_df.values
    before, after = states[:-1], states[1:]
    changed = before != after

    if not changed.any() or changed.sum() > cmt_num:
        return float('NaN')

    # Departure state of every transition, followed by the final arrival state.
    visited = list(before[changed]) + [after[changed][-1]]
    return '->'.join(visited)

In [11]:
# One path per user and day; groups for which find_path yields NaN are dropped
daily_patterns = (
    df.groupby(by=daily_user)['state']
      .apply(find_path)
      .dropna()
)

In [12]:
# Persist the daily patterns as a pickle file inside the 'data' folder
with open(path.join('data', 'daily_patterns.pkl'), 'wb') as out_file:
    pickle.dump(daily_patterns, out_file, protocol=pickle.HIGHEST_PROTOCOL)