In [1]:
import csv
import pickle
from datetime import datetime
from difflib import get_close_matches
from os import path
from time import sleep, time

import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
from geopy.exc import GeocoderTimedOut
from geopy.geocoders import Nominatim
from shapely.geometry import Point

%matplotlib inline

In [2]:
# Load the raw tab-separated tweet sample; the file has no header row, so the
# columns are numbered until they are renamed from the schema file.
print('Reading sample.tsv file...')
sample_path = path.join('data', 'sample.tsv')
df = pd.read_csv(
    sample_path,
    header=None,
    sep="\t",
    quoting=csv.QUOTE_NONE,
    escapechar='\\',
    na_values='N',
    encoding='utf-8',
)
print('is done!')

Reading sample.tsv file...
is done!


In [3]:
# Load the whitespace-separated schema file; its column 1 holds the names that
# are assigned to the tweet dataframe below.
print('Reading schema.txt file...')
schema = pd.read_csv(
    path.join('data', 'schema.txt'),
    # Raw string: "\s" is an invalid escape sequence in an ordinary string literal
    sep=r"\s+",
    header=None
)
print('is done!')
# Rename the dataframe columns
df.columns = schema[1]

Reading schema.txt file...
is done!


In [4]:
# Keep the 'latitude'/'longitude' values of the tweets that have them and fall
# back to 'placeLatitude'/'placeLongitude' for the others.
# The result is assigned back rather than calling fillna(inplace=True) on a
# column selection: the in-place form works on an intermediate object and does
# not update df when pandas Copy-on-Write is active.
df['latitude'] = df['latitude'].fillna(df['placeLatitude'])
df['longitude'] = df['longitude'].fillna(df['placeLongitude'])

In [5]:
# Discard every tweet that misses at least one of the required fields
required_columns = ['createdAt', 'id', 'latitude', 'longitude', 'userId']
df = df.dropna(subset=required_columns, how='any')

In [6]:
# Parse the 'createdAt' strings into datetimes; values that do not follow the
# expected format are coerced to NaT instead of raising.
created_at = pd.to_datetime(
    df['createdAt'],
    errors='coerce',
    format='%Y-%m-%d %H:%M:%S',
)
df['createdAt'] = created_at

In [7]:
# Keep the first row of every tweet id, restrict the frame to the columns used
# afterwards, and renumber the rows from zero.
kept_columns = ['id', 'userId', 'createdAt', 'text', 'longitude', 'latitude']
df = (
    df.drop_duplicates(subset='id')[kept_columns]
    .reset_index(drop=True)
)

In order to recover the location of each tweet from its latitude-longitude pair, we use two different strategies:

1. **online strategy:** we use the geopy API to send a request containing the latitude and longitude of a place. The main drawback is that all online APIs of this kind impose a request rate limit; as stated in the [Nominatim usage policy](http://wiki.openstreetmap.org/wiki/Nominatim_usage_policy), the time between two consecutive requests should be at least 1 second. This makes the online approach very slow. One remedy to accelerate the process is to save each latitude-longitude: location pair in a dictionary. Thus, before sending a request, we first check whether the location is already in our dictionary.

2. **offline strategy:** we can also use the geojson or topojson files for Switzerland and its neighbor countries. The corresponding geofiles are downloaded from the following github repositories:
    1. Switzerland topojson file from [swiss_map repo](https://github.com/interactivethings/swiss-maps).
    2. France geojson file from [france-geojson repo](https://github.com/gregoiredavid/france-geojson).
    3. Italy geojson file from [leaflet-geojson-selector repo](https://github.com/stefanocudini/leaflet-geojson-selector).
    4. Germany geojson file from [deutschlandGeoJSON repo](https://github.com/isellsoap/deutschlandGeoJSON)
    5. Austria geojson file from [click_that_hood repo](https://github.com/codeforamerica/click_that_hood).
    6. Liechtenstein geojson file from [CountryGeoJSONCollection repo](https://github.com/LonnyGomes/CountryGeoJSONCollection).

In general, for the offline strategy, one can also follow this [stackoverflow response](http://stackoverflow.com/questions/6159074/given-the-lat-long-coordinates-how-can-we-find-out-the-city-country/6355183#6355183) or this [one](http://stackoverflow.com/a/24871449/5267664). The first one relies on the GeoNames database, while the second one describes a procedure for finding geojson files for any country.

## 1. Online strategy:
We start with the online strategy. The following function finds the country of a location together with its state/canton. We will see later that the same can be done with the offline approach.

In [8]:
# Function for finding a location from the latitude-longitude information using online API.
# The Nominatim usage policy asks for an identifying user agent, and recent
# geopy releases refuse to build the geocoder without one.
geolocator = Nominatim(user_agent='tweet-geolocation-notebook')
# Cache of already resolved coordinates, keyed by the 'lat,lng' lookup string
locations = dict()


def online_locating(data, max_retries=3):
    """Reverse-geocode one row and return its country code and state/canton.

    Results are stored in the module-level ``locations`` dict, so every
    distinct coordinate pair is requested from the API at most once.

    Parameters
    ----------
    data : row-like
        Object exposing ``latitude`` and ``longitude`` attributes.
    max_retries : int, optional
        Number of additional attempts made after a geocoder time-out.

    Returns
    -------
    pandas.Series
        Keys 'country' (upper-cased country code) and 'state' (state name,
        falling back to the country name). A value is NaN when the API
        response does not provide it or when every attempt timed out.
    """
    nan = float('NaN')
    lookup = ','.join([str(data.latitude), str(data.longitude)])
    if lookup not in locations:
        location = None
        timed_out = True
        for _ in range(max(max_retries, 0) + 1):
            try:
                location = geolocator.reverse(lookup, language='en')
                timed_out = False
            except GeocoderTimedOut:
                timed_out = True
            finally:
                # sleep for 1 sec after every request (required by Nominatim usage policy)
                sleep(1)
            if not timed_out:
                break
        if timed_out:
            # Do not cache the failure, so a later call can try this lookup again
            return pd.Series({'country': nan, 'state': nan})

        # reverse() may return nothing, and the payload may lack an 'address' entry
        address = {}
        if location is not None:
            address = location.raw.get('address', {})
        country_code = address.get('country_code')
        country = country_code.upper() if isinstance(country_code, str) else nan
        state = address.get('state', address.get('country', nan))
        locations[lookup] = {'country': country, 'state': state}
    return pd.Series({'country': locations[lookup]['country'],
                      'state': locations[lookup]['state']})

In [9]:
# Geocode every tweet through the online API on a copy of the data and time the run
online_df = df.copy()
started = time()
online_df[['country', 'state']] = online_df.apply(online_locating, axis=1)
elapsed = time() - started
print('Elapsed time is {} seconds.'.format(round(elapsed, 4)))

Elapsed time is 1962.436 seconds.


As we can see, it took very long to even recover locations for the sample file. Hence, it does not make sense to follow the online approach for the actual problem.

## 2. Offline approach:
As we mentioned before, it is necessary to download the required geojson/topojson files to run the offline approach. All files are available in the data/geofiles folder. In this part, we use [geopandas](http://geopandas.org/) for further analysis. The resulting dataframes can be used to find the location of tweets.

In [10]:
# Read one boundary file per country from the geofiles folder
geo_dir = 'data/geofiles'
ch_gdf = gpd.read_file(path.join(geo_dir, 'ch-cantons.json'))
fr_gdf = gpd.read_file(path.join(geo_dir, 'france-states.geojson'))
it_gdf = gpd.read_file(path.join(geo_dir, 'italy-states.json'))
de_gdf = gpd.read_file(path.join(geo_dir, 'germany-states.geojson'))
at_gdf = gpd.read_file(path.join(geo_dir, 'austria-states.geojson'))
li_gdf = gpd.read_file(path.join(geo_dir, 'liechtenstein.geojson'))

In [11]:
# Modify dataframes for merging
def _select_state_columns(gdf, name_column, country_code):
    """Return a new frame holding only 'geometry', 'name' and 'country'.

    ``name_column`` is the column of ``gdf`` that carries the state name; it is
    renamed to 'name'. ``country_code`` is written to every row of the new
    'country' column. Building a new frame with rename/assign avoids setting a
    column on a slice of the input (SettingWithCopyWarning).
    """
    return (
        gdf[['geometry', name_column]]
        .rename(columns={name_column: 'name'})
        .assign(country=country_code)
    )


ch_gdf = _select_state_columns(ch_gdf, 'name', 'CH')
fr_gdf = _select_state_columns(fr_gdf, 'name', 'FR')
it_gdf = _select_state_columns(it_gdf, 'name', 'IT')
de_gdf = _select_state_columns(de_gdf, 'NAME_1', 'DE')
at_gdf = _select_state_columns(at_gdf, 'name', 'AT')
li_gdf = _select_state_columns(li_gdf, 'NAME', 'LI')

# Stack all countries into one polygon table whose name column is called 'state'
gdf_poly = pd.concat([ch_gdf, fr_gdf, it_gdf, de_gdf, at_gdf, li_gdf], ignore_index=True)
gdf_poly = gdf_poly.rename(columns={'name': 'state'})

The R-tree structure in geopandas dataframes enables us to find the tweets' locations very fast. The following cells determine which state/canton each location lies in. A tutorial showing how to take advantage of the R-tree structure is available [here](http://geoffboeing.com/2016/10/r-tree-spatial-index-python/#more-2183).

In [12]:
# Wrap the tweets in a GeoDataFrame, give each one a point geometry
# (x = longitude, y = latitude) and reuse the CRS of the polygon table
gdf_point = gpd.GeoDataFrame(df)
tweet_points = df.apply(lambda tweet: Point(tweet.longitude, tweet.latitude), axis=1)
gdf_point['geometry'] = tweet_points
gdf_point.crs = gdf_poly.crs

In [13]:
# Left spatial join of the tweet points with the polygon table, timed
started = time()
offline_gdf = gpd.tools.sjoin(gdf_point, gdf_poly, how="left")
elapsed = time() - started
print('Elapsed time is {} seconds.'.format(round(elapsed, 4)))

Elapsed time is 2.206 seconds.


In [14]:
# The join can return several rows for one tweet id; keep the first of each
# and renumber the rows from zero
offline_gdf = offline_gdf.drop_duplicates(subset='id').reset_index(drop=True)

However, there are still some tweets whose location we are not able to find. For these tweets, we call the online_locating function in order to find their location.

In [15]:
# Rows that the spatial join left without a state are resolved with the online geocoder
null_index = offline_gdf['state'].isnull()
unresolved_rows = offline_gdf[null_index]
offline_gdf.loc[null_index, ['country', 'state']] = unresolved_rows.apply(
    online_locating, axis=1)

There are also some inconsistencies between the geolocation results of the offline approach and the online approach. The following function aims to remove these inconsistencies.

In [16]:
def modify_dataframe(row, polygons=None):
    """Align an online-geocoded row with the state names of the polygon table.

    Parameters
    ----------
    row : pandas.Series
        Row with at least 'country' and 'state' entries. It is modified in
        place and returned.
    polygons : pandas.DataFrame, optional
        Table with 'country' and 'state' columns. Defaults to the module-level
        ``gdf_poly``.

    Returns
    -------
    pandas.Series
        * Country not in ``polygons``: 'state' is replaced by the country value.
        * Country in ``polygons`` and 'state' is a string: 'state' becomes the
          matching polygon state name (known alias first, closest textual match
          otherwise) and 'index_right' the index label of that polygon row.
        * 'state' is not a string (e.g. NaN): the row is returned unchanged,
          since there is nothing to match.
    """
    if polygons is None:
        polygons = gdf_poly

    country = row['country']
    if country not in set(polygons['country'].unique()):
        row['state'] = country
        return row

    state = row['state']
    if not isinstance(state, str):
        # The substring test and the fuzzy match below both require a string
        return row

    # English names returned by the online API -> names used in the geofiles
    aliases = {
        'Great East': 'Alsace-Champagne-Ardenne-Lorraine',
        'Grisons': 'Graubünden',
        'Aosta Valley': "valle d'aosta",
    }
    sub_gdf = polygons[polygons['country'] == country]
    known_states = sub_gdf['state'].tolist()

    if 'Bavaria' in state:
        candidate = 'Bayern'
    else:
        candidate = aliases.get(state, state)
    if candidate not in known_states:
        # Cutoff 0 always yields the best-scoring name, so an alias missing
        # from the polygon table still maps onto an existing row
        candidate = get_close_matches(candidate, known_states, 1, 0)[0]

    row['state'] = candidate
    row['index_right'] = sub_gdf[sub_gdf['state'] == candidate].index.values[0]
    return row

In [17]:
# Normalise the rows that were resolved through the online geocoder
fallback_rows = offline_gdf[null_index]
offline_gdf.loc[null_index, :] = fallback_rows.apply(modify_dataframe, axis=1)

Now we can measure the difference between online/exact and offline/approximate approaches.

In [18]:
# Normalise the online results, then report for each level the percentage of
# rows where the online and the offline results disagree
online_df = online_df.apply(modify_dataframe, axis=1)
n_tweets = online_df.shape[0]
for level in ('country', 'state'):
    n_mismatch = sum(online_df[level] != offline_gdf[level])
    print('The error in locating {} is %{}'.format(
        level, round(100 * n_mismatch / n_tweets, 2)))

The error in locating country is %0.49
The error in locating state is %0.77


As you can see the error is not significant. Thus, the proposed mixed offline-online mapping is both fast and accurate.

In [19]:
# Keep only the tweets that were matched to a polygon row, renumber them, and
# store the polygon index as an integer instead of a float
has_polygon = offline_gdf['index_right'].notnull()
offline_gdf = offline_gdf[has_polygon].reset_index(drop=True)
offline_gdf['index_right'] = offline_gdf['index_right'].astype(int)

In [20]:
# Persist the final frame to the cache folder as a pickle file
cache_file = 'data/cache/' + 'offline_gdf' + '.pkl'
with open(cache_file, 'wb') as out_file:
    pickle.dump(offline_gdf, out_file, protocol=pickle.HIGHEST_PROTOCOL)