In [1]:
# Import libraries
import pickle
from difflib import get_close_matches
from os import path
from time import time, sleep

import numpy as np
import pandas as pd
import geopandas as gpd
from geopy.distance import vincenty
from geopy.exc import GeocoderTimedOut
from geopy.geocoders import Nominatim
from shapely.geometry import Point

In [2]:
# Settings: both folders live under the same '../data' root
_DATA_ROOT = path.join('..', 'data')
DIR_DATA = path.join(_DATA_ROOT, 'sample data')
DIR_GEO = path.join(_DATA_ROOT, 'geofiles')

In [3]:
# Load the previously saved (pickled) dataframe.
# NOTE: unpickling can execute arbitrary code, so only load a file you trust.
clean_data_path = path.join(DIR_DATA, 'clean_data.pkl')
with open(clean_data_path, 'rb') as pkl_file:
    df = pickle.load(pkl_file)

In [4]:
def data_denoising(sub_df, crt_speed=60):
    """Flag the tweets whose reported location looks noisy.

    ``sub_df`` holds one ``(latitude, longitude, timestamp)`` tuple per tweet.
    Every run of three consecutive tweets is inspected: the speed of the two
    hops (1st->2nd and 2nd->3rd) and of the direct 1st->3rd jump is computed
    in meters per second and compared with ``crt_speed``.

    Returns a list with one boolean per tweet; ``False`` marks a tweet that is
    considered noisy. As soon as one of the speeds is infinite (for instance
    when two tweets share a timestamp) every tweet of ``sub_df`` is flagged.
    """
    records = sub_df.tolist()
    columns = list(zip(*records))
    points = list(zip(columns[0], columns[1]))
    stamps = columns[2]
    keep = [True] * len(sub_df)

    def speed(start, end):
        """Speed in m/s from tweet ``start`` to tweet ``end`` (inf if no time elapsed)."""
        meters = vincenty(points[end], points[start]).meters
        seconds = (stamps[end] - stamps[start]).total_seconds()
        return meters / seconds if seconds else float('inf')

    last_triple = len(points) - 3
    for first in range(len(points) - 2):
        middle, last = first + 1, first + 2
        v_first_hop = speed(first, middle)
        v_second_hop = speed(middle, last)
        v_direct = speed(first, last)

        # An infinite speed invalidates the whole group
        if any(np.isinf(v) for v in (v_first_hop, v_second_hop, v_direct)):
            keep = [False] * len(keep)
            break

        if v_first_hop > crt_speed:
            if v_second_hop > crt_speed:
                if v_direct <= crt_speed:
                    # Fast there and back, slow overall: only the middle tweet is off
                    keep[middle] = False
                else:
                    keep[first] = False
                    keep[middle] = False
                    # The last tweet is never the start of a later triple,
                    # so it is flagged here
                    if first == last_triple:
                        keep[last] = False
            elif v_second_hop <= crt_speed:
                # Only the first hop is too fast: the first tweet is flagged
                keep[first] = False

    return keep

In [5]:
# Drop the tweets flagged as noisy by data_denoising, one user-day at a time
daily_user = ['userId', 'year', 'month', 'day']
# Temporary column bundling what data_denoising needs for each tweet
df['new'] = tuple(zip(df['latitude'], df['longitude'], df['createdAt']))
not_noisy = df.groupby(by=daily_user)['new'].transform(data_denoising)
# Keep the clean tweets, renumber them and discard the temporary column
df = df[not_noisy].reset_index(drop=True).drop('new', axis=1)

In order to recover the cities' locations from latitude-longitude pairs, we use two different strategies:

1. **online strategy:** we use the geopy API to send a request containing the longitude and latitude of a place. The main drawback here is that all online APIs of this kind have some request rate limit, and as suggested on [its website](http://wiki.openstreetmap.org/wiki/Nominatim_usage_policy), the time between two consecutive requests should be more than 1 second. This makes the online approach very slow. One remedy to accelerate the process is to save each (longitude-latitude, location) pair in a dictionary. Thus, before sending a request, we first check whether the location is already in our dictionary.

2. **offline strategy:** we can also use the geojson or topojson files for Switzerland and its neighboring countries. The corresponding geofiles were downloaded from the following GitHub repositories:
    1. Switzerland topojson file from [swiss_map repo](https://github.com/interactivethings/swiss-maps).
    2. France geojson file from [france-geojson repo](https://github.com/gregoiredavid/france-geojson).
    3. Italy geojson file from [leaflet-geojson-selector repo](https://github.com/stefanocudini/leaflet-geojson-selector).
    4. Germany geojson file from [deutschlandGeoJSON repo](https://github.com/isellsoap/deutschlandGeoJSON)
    5. Austria geojson file from [click_that_hood repo](https://github.com/codeforamerica/click_that_hood).
    6. Liechtenstein geojson file from [CountryGeoJSONCollection repo](https://github.com/LonnyGomes/CountryGeoJSONCollection).

In general, for the offline strategy, one can also follow this [stackoverflow response](http://stackoverflow.com/questions/6159074/given-the-lat-long-coordinates-how-can-we-find-out-the-city-country/6355183#6355183) or this [one](http://stackoverflow.com/a/24871449/5267664). The first one relies on the GeoNames database, while the second one gives a procedure to find geojson files for any country.

## 1. Online strategy:
We start with the online strategy. The following function finds the country of a location together with its state/canton. We will see later that we can do the same thing with the offline approach.

In [6]:
# Function for finding a location from the latitude-longitude information using online API
geolocator = Nominatim()
# Cache of the coordinates resolved so far: 'lat,lng' -> {'country': ..., 'state': ...}
locations = dict()


def online_locating(data, max_retries=3):
    """Reverse-geocode one tweet with Nominatim and cache the answer.

    Parameters
    ----------
    data : object exposing ``latitude`` and ``longitude`` attributes
        Typically one dataframe row.
    max_retries : int, optional
        How many times the request is attempted when the service times out.

    Returns
    -------
    pandas.Series
        ``country`` (upper-cased country code) and ``state`` (the state, or
        the country name when the address has no state). A field that cannot
        be determined is NaN.

    Notes
    -----
    Successful lookups are stored in the module-level ``locations`` dict so
    that the same coordinates are never requested twice. A lookup that timed
    out on every attempt is *not* cached, so a later call can try again.
    """
    lookup = ','.join([str(data.latitude), str(data.longitude)])

    # Membership test directly on the dict: O(1) instead of rebuilding a set
    if lookup not in locations:
        location = None
        timed_out = True
        for _ in range(max_retries):
            try:
                location = geolocator.reverse(lookup, language='en')
                timed_out = False
                break
            except GeocoderTimedOut:
                sleep(1)  # wait before asking again

        # `location` may be None (no result) and its payload may lack an address
        raw = getattr(location, 'raw', None)
        address = raw.get('address') if isinstance(raw, dict) else None
        if not isinstance(address, dict):
            address = {}

        country_code = address.get('country_code')
        country = country_code.upper() if country_code else float('NaN')
        # Fall back to the country name when the address carries no state
        state = address.get('state', address.get('country', float('NaN')))

        if timed_out:
            # Do not poison the cache with a failed request
            return pd.Series({'country': country, 'state': state})

        locations[lookup] = {'country': country, 'state': state}
        sleep(1) # sleep for 1 sec (required by Nominatim usage policy)
    return pd.Series({'country': locations[lookup]['country'],
                      'state': locations[lookup]['state']})

In [7]:
# Locate every tweet through the online API (on a copy of df) and time the run
online_df = df.copy()
t = time()
online_df[['country', 'state']] = online_df.apply(online_locating, axis=1)
elapsed = time() - t
print('Elapsed time is {} seconds.'.format(round(elapsed, 2)))

Elapsed time is 312.38 seconds.


As we can see, it took a very long time to recover the locations even for the sample file. Hence, it does not make sense to follow the online approach for the actual problem.

## 2. Offline approach:
As we mentioned before, it is necessary to download the required geojson/topojson files to run the offline approach. All files are available in the data/geofiles folder. In this part, we use [geopandas](http://geopandas.org/) for further analysis. The resulting dataframes can be used to find the location of tweets.

In [8]:
# Read the state/canton polygons of each country from the geofiles folder
# (DIR_GEO is defined in the settings cell)
ch_gdf = gpd.read_file(path.join(DIR_GEO, 'ch-cantons.json'))
fr_gdf = gpd.read_file(path.join(DIR_GEO, 'france-states.geojson'))
it_gdf = gpd.read_file(path.join(DIR_GEO, 'italy-states.json'))
de_gdf = gpd.read_file(path.join(DIR_GEO, 'germany-states.geojson'))
at_gdf = gpd.read_file(path.join(DIR_GEO, 'austria-states.geojson'))
li_gdf = gpd.read_file(path.join(DIR_GEO, 'liechtenstein.geojson'))

In [9]:
# Modify dataframes for merging
def _to_state_frame(gdf, country_code, name_col='name'):
    """Reduce ``gdf`` to its ``geometry`` and ``name`` columns and tag it with a country.

    ``name_col`` is the column of ``gdf`` holding the state names; it is
    renamed to ``name``. The selection is copied before the ``country`` column
    is added, so no value is ever set on a slice of the input frame.
    """
    # When this cell is re-run the frame has already been renamed: use 'name'
    if name_col not in gdf.columns and 'name' in gdf.columns:
        name_col = 'name'
    states = gdf[['geometry', name_col]].copy()
    if name_col != 'name':
        states = states.rename(columns={name_col: 'name'})
    states['country'] = country_code
    return states


ch_gdf = _to_state_frame(ch_gdf, 'CH')
fr_gdf = _to_state_frame(fr_gdf, 'FR')
it_gdf = _to_state_frame(it_gdf, 'IT')
de_gdf = _to_state_frame(de_gdf, 'DE', name_col='NAME_1')
at_gdf = _to_state_frame(at_gdf, 'AT')
li_gdf = _to_state_frame(li_gdf, 'LI', name_col='NAME')

# One frame with every polygon; the name column becomes 'state'
df_poly = pd.concat([ch_gdf, fr_gdf, it_gdf, de_gdf, at_gdf, li_gdf], ignore_index=True)
df_poly = df_poly.rename(columns={'name': 'state'})

The R-tree structure in geopandas dataframes enables us to find the tweets' locations very fast. The following code finds which state/canton each location lies in. A tutorial showing how to take advantage of the R-tree structure is available [here](http://geoffboeing.com/2016/10/r-tree-spatial-index-python/#more-2183).

In [10]:
# Turn the tweets into a GeoDataFrame so that they can be spatially joined with df_poly
df = gpd.GeoDataFrame(df)
# One shapely Point per tweet; the coordinates are passed as (longitude, latitude)
df['geometry'] = df.apply(lambda row: Point(row.longitude, row.latitude), axis=1)
# Give the tweets the same CRS as the polygons
df.crs = df_poly.crs

In [11]:
# Left spatial join of the tweets with the state polygons, timed for comparison
# with the online approach
t = time()
offline_gdf = gpd.tools.sjoin(df, df_poly, how='left')
elapsed = time() - t
print('Elapsed time is {} seconds.'.format(round(elapsed, 4)))

Elapsed time is 0.7876 seconds.


In [12]:
# Keep a single row per tweet id (the first one) and renumber the rows
offline_gdf = offline_gdf.drop_duplicates(subset='id').reset_index(drop=True)

However, there are still some tweets whose location we are not able to find. For these tweets, we call the online_locating function in order to find their location.

In [13]:
# Tweets left without a state by the offline join are located through the online API
null_index = offline_gdf['state'].isnull()
unresolved = offline_gdf.loc[null_index]
offline_gdf.loc[null_index, ['country', 'state']] = unresolved.apply(online_locating, axis=1)

There are also some inconsistencies between the geolocation obtained by the offline approach and by the online approach. The following function aims to remove these inconsistencies.

In [14]:
# Spellings returned by the online API -> names to look for in the geofiles
_STATE_ALIASES = {
    'Great East': 'Alsace-Champagne-Ardenne-Lorraine',
    'Grisons': 'Graubünden',
    'Aosta Valley': "valle d'aosta",
}


def modify_dataframe(row):
    """Align a row located online with the naming used in ``df_poly``.

    * If the row's country has no polygon in ``df_poly``, its ``state`` is
      replaced by the country value and nothing else changes.
    * Otherwise ``state`` is replaced by the closest state name of that
      country in ``df_poly`` (known aliases are translated first), and
      ``index_right`` is set to the index of that state in ``df_poly``.
    * A row whose ``state`` is not a string (e.g. NaN after a failed lookup)
      is returned unchanged instead of raising.

    The row is modified in place and returned.
    """
    country = row['country']
    in_country = df_poly['country'] == country
    if not in_country.any():
        row['state'] = country
        return row

    state = row['state']
    if not isinstance(state, str):
        return row

    sub_gdf = df_poly[in_country]
    states = sub_gdf['state'].values
    if 'Bavaria' in state:
        wanted = 'Bayern'
    else:
        wanted = _STATE_ALIASES.get(state, state)

    # Always resolve against the names really present in df_poly: an exact
    # name matches itself, and a slightly different spelling or encoding of
    # an alias target still maps to an existing state (cutoff 0 guarantees
    # one result)
    matched = get_close_matches(wanted, states, 1, 0)[0]
    row['state'] = matched
    row['index_right'] = sub_gdf[sub_gdf['state'] == matched].index.values[0]
    return row

In [15]:
# Harmonise the rows that were located online with the names used in df_poly
online_rows = offline_gdf.loc[null_index]
offline_gdf.loc[null_index, :] = online_rows.apply(modify_dataframe, axis=1)

In [16]:
# Harmonise the purely online result, then compare it with the offline one
online_df = online_df.apply(modify_dataframe, axis=1)


def _mismatch_percentage(column):
    """Percentage of rows whose ``column`` differs between online_df and offline_gdf."""
    mismatches = sum(online_df[column] != offline_gdf[column])
    return round(100 * mismatches / online_df.shape[0], 2)


print('The error in locating country is %' + str(_mismatch_percentage('country')))
print('The error in locating state is %' + str(_mismatch_percentage('state')))

The error in locating country is %0.0
The error in locating state is %0.46


In [17]:
# Keep only the tweets matched to a polygon (tweets from countries which are
# not in our list have no index_right), renumber the rows and drop the
# index_right column
located = offline_gdf['index_right'].notnull()
offline_gdf = offline_gdf[located].reset_index(drop=True).drop('index_right', axis=1)

In [18]:
def find_path(sub_df, cmt_num = 10):
    """Summarise a sequence of states as the chain of moves between them.

    Consecutive values of ``sub_df`` are compared; every position where the
    value changes counts as one move. The result is the visited states joined
    by ``'->'`` (e.g. ``'A->B->C'``). NaN is returned when there is no move at
    all or when there are more than ``cmt_num`` moves.
    """
    states = sub_df.values
    departures, arrivals = states[:-1], states[1:]
    moved = departures != arrivals

    if not (moved.any() and moved.sum() <= cmt_num):
        return float('NaN')

    # Origin of every move, followed by the destination of the last one
    stops = list(departures[moved]) + [arrivals[moved][-1]]
    return '->'.join(stops)

In [19]:
# One path per user and day; user-days for which find_path returns NaN are dropped
daily_patterns = offline_gdf.groupby(by=daily_user)['state'].apply(find_path).dropna()

In [20]:
# Save the results with pickle
patterns_path = path.join(DIR_DATA, 'daily_patterns.pkl')
with open(patterns_path, 'wb') as out_file:
    pickle.dump(daily_patterns, out_file, protocol=pickle.HIGHEST_PROTOCOL)