In [1]:
# Import libraries
import pickle
from os import path
from time import time
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

In [2]:
# Settings -- input folders, given relative to the working directory
DIR_DATA, DIR_GEO = (path.join('data', sub) for sub in ('twitter data', 'geofiles'))

In [3]:
# Read the cleaned tweets back from their pickle file.
# pickle can execute arbitrary code while loading: only open files you trust.
clean_data_file = path.join(DIR_DATA, 'clean_data.pkl')
with open(clean_data_file, 'rb') as pkl_file:
    df = pickle.load(pkl_file)

In [6]:
# Preview the first rows of the loaded data.
# NOTE(review): in the saved output below, `id` and most `userId` values read
# -2147483648 (the int32 minimum), which looks like an integer overflow
# introduced before the data was pickled -- confirm upstream.
df.head()

1,id,userId,createdAt,longitude,latitude,text,day,month,year,daily_tweets
0,-2147483648,-2147483648,2016-09-15 20:48:01,8.96044,46.0027,se lo dici tu... https://t.co/x7Qm1VHBKL,15,9,2016,4159
1,-2147483648,-2147483648,2016-09-15 20:48:05,8.22414,46.8131,https://t.co/noYrTnqmg9,15,9,2016,4159
2,-2147483648,435239151,2016-09-15 20:48:15,5.94082,47.201,@BesacTof @Leonid_CCCP Tu dois t'engager en si...,15,9,2016,16
3,-2147483648,-2147483648,2016-09-15 20:48:27,8.96044,46.0027,dillo https://t.co/hScjeZbi4c,15,9,2016,4159
4,-2147483648,-2147483648,2016-09-15 20:48:29,9.64878,45.8865,Miii le voci nere.. Che meraviglia.. #XF10,15,9,2016,4159


In [None]:
# Remove noisy tweets with the `data_denoising` function.
# NOTE(review): neither `data_denoising` nor `daily_user` is defined in the
# cells shown here -- make sure their defining cells exist and run before this
# one, otherwise a fresh "Restart & Run All" fails at this point.
# Pack latitude, longitude and timestamp into one tuple per tweet so that the
# per-group function receives everything through a single column.
df['new'] = tuple(zip(df['latitude'], df['longitude'], df['createdAt']))
# The result is used as a row mask below, so `data_denoising` is presumably
# expected to return one True/False per tweet of the group -- TODO confirm.
not_noisy = df.groupby(by=daily_user)['new'].transform(lambda x: data_denoising(x))
df = df[not_noisy].reset_index(drop=True)
# Remove the generated column
del df['new']

In [None]:
# Load geofiles
def load_states(filename, country, name_col='name'):
    """Read one boundary file and reduce it to geometry, name and country.

    Parameters
    ----------
    filename : str
        Name of the file inside DIR_GEO.
    country : str
        Country code written to the new 'country' column.
    name_col : str, default 'name'
        Column of the source file that holds the region name; it is renamed
        to 'name' so that every country ends up with the same columns.

    Returns
    -------
    The reduced frame with the columns 'geometry', 'name' and 'country'.
    """
    gdf = gpd.read_file(path.join(DIR_GEO, filename))
    # Work on an explicit copy: adding a column to a column-slice of the
    # freshly read frame is what triggers pandas' SettingWithCopyWarning.
    states = gdf[['geometry', name_col]].copy()
    states = states.rename(columns={name_col: 'name'})
    states['country'] = country
    return states


# One frame per country, all with identical columns so they can be stacked
ch_gdf = load_states('ch-cantons.json', 'CH')
fr_gdf = load_states('france-states.geojson', 'FR')
it_gdf = load_states('italy-states.json', 'IT')
de_gdf = load_states('germany-states.geojson', 'DE', name_col='NAME_1')
at_gdf = load_states('austria-states.geojson', 'AT')
li_gdf = load_states('liechtenstein.geojson', 'LI', name_col='NAME')

# Concatenate the dataframes
df_poly = pd.concat([ch_gdf, fr_gdf, it_gdf, de_gdf, at_gdf, li_gdf], ignore_index=True)
df_poly = df_poly.rename(columns={'name': 'state'})

In [None]:
# Convert our dataframe to a geopandas dataframe
df = gpd.GeoDataFrame(df)
# Build one point per tweet straight from the two coordinate columns; this
# gives the same geometry as a row-wise DataFrame.apply without creating a
# Series object for every row.
df['geometry'] = [Point(lon, lat) for lon, lat in zip(df['longitude'], df['latitude'])]
# Give the tweets the same CRS as the polygons they are joined with below
df.crs = df_poly.crs

In [None]:
# Locate the tweets offline: spatial left join of the tweet points with the
# state polygons (left join keeps every tweet, matched or not), timed.
print('Start spatial merging process...')
t = time()
df = gpd.tools.sjoin(df, df_poly, how='left')
elapsed = time() - t
print('Elapsed time is {} seconds.'.format(round(elapsed, 2)))

In [None]:
# Keep only the tweets that were matched to a state
located = df['state'].notnull()
df = df[located].reset_index(drop=True)
# Remove the index_right column
df = df.drop('index_right', axis=1)
# Drop duplicate tweets. It might be possible that we locate boundaries into two different cantons
# NOTE(review): the de-duplication below is disabled. The `id` values in the
# saved df.head() output all read -2147483648, so de-duplicating on `id` would
# presumably collapse distinct tweets -- confirm before re-enabling.
# df = df.drop_duplicates(subset='id')
# df = df.reset_index(drop=True)

In [None]:
def find_path(sub_df, cmt_num=10):
    """Summarise an ordered sequence of locations as an 'A->B->C' string.

    Consecutive entries are compared pairwise; every position where the value
    changes counts as one move.

    Parameters
    ----------
    sub_df : pandas.Series or any sequence of str
        Ordered location labels (e.g. the 'state' column of one group).
    cmt_num : int, default 10
        Maximum number of moves allowed; sequences with more moves are
        discarded.

    Returns
    -------
    str or float
        The origin of every move followed by the destination of the last
        move, joined with '->', when there is at least one move and no more
        than `cmt_num`. Otherwise NaN (no move, too many moves, or fewer
        than two entries).
    """
    # np.asarray accepts a Series as well as plain lists/tuples/arrays
    states = np.asarray(sub_df)
    origin = states[:-1]
    destination = states[1:]
    moved = origin != destination
    n_moves = np.count_nonzero(moved)
    if n_moves == 0 or n_moves > cmt_num:
        return float('NaN')
    # Origins of every move, then the place where the last move ended
    stops = list(origin[moved]) + [destination[moved][-1]]
    return '->'.join(stops)

In [None]:
# One path per `daily_user` group; groups for which find_path returned NaN
# are dropped.
daily_patterns = df.groupby(by=daily_user)['state'].apply(find_path).dropna()

In [None]:
# Persist the daily patterns with pickle
patterns_file = path.join('data', 'daily_patterns.pkl')
with open(patterns_file, 'wb') as out_file:
    pickle.dump(daily_patterns, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Number of groups produced by grouping the located tweets by `daily_user`
len(df.groupby(by=daily_user))

In [None]:
# Display the per-group paths (NaN entries were already dropped above)
daily_patterns