In [10]:
from os import path
from time import sleep, time
import csv
from urllib.request import urlretrieve
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from geopy.geocoders import Nominatim
import geopandas as gpd
from shapely.geometry import Point, Polygon, MultiPolygon
%matplotlib inline

In [2]:
# Load the raw sample into `df`: tab-separated, no header row, quoting
# disabled, backslash as the escape character and 'N' read as missing.
sample_path = path.join('data', 'sample.tsv')
print('Reading sample.tsv file...')
df = pd.read_csv(
    sample_path,
    header=None,
    sep="\t",
    quoting=csv.QUOTE_NONE,
    escapechar='\\',
    na_values='N',
    encoding='utf-8'
)
print('is done!')

Reading sample.tsv file...
is done!


In [3]:
# Load the schema file (whitespace-separated, no header row) and use its
# second column as the column names of `df`.
print('Reading schema.txt file...')
schema = pd.read_csv(
    path.join('data', 'schema.txt'),
    sep=r"\s+",  # raw string: "\s" is not a valid escape in a plain literal
    header=None
)
print('is done!')
# Rename the dataframe columns; passing a plain list keeps the name of the
# schema Series from ending up as `df.columns.name`.
df.columns = schema[1].tolist()

Reading schema.txt file...
is done!


In [4]:
# Keep only the rows in which every essential field is present
required_columns = ['createdAt', 'placeLatitude', 'placeLongitude', 'userId', 'id']
df.dropna(subset=required_columns, how='any', inplace=True)

In [5]:
# Parse the 'createdAt' strings into datetimes; values that do not match the
# expected layout are coerced to NaT instead of raising
created_at_format = '%Y-%m-%d %H:%M:%S'
df['createdAt'] = pd.to_datetime(df['createdAt'], errors='coerce', format=created_at_format)

In order to recover the city locations from latitude-longitude pairs, we use two different strategies:

1. **online strategy:** we use the geopy API to send a request containing the latitude and longitude of a place. The main drawback is that all online APIs of this kind enforce some request rate limit, and as suggested in the [Nominatim usage policy](http://wiki.openstreetmap.org/wiki/Nominatim_usage_policy), the time between two consecutive requests should be at least 1 second. This makes the online approach very slow. One remedy to accelerate the process is to save each (latitude, longitude) → location pair in a dictionary. Thus, before sending a request, we first check whether the location is already in our dictionary.

2. **offline strategy:** we can also use the geojson or topojson files for Switzerland and its neighboring countries. The corresponding geofiles are downloaded from the following GitHub repositories:
    1. Switzerland topojson file from [swiss_map repo](https://github.com/d-qn/swiss-maps).
    2. France geojson file from [france-geojson repo](https://github.com/gregoiredavid/france-geojson).
    3. Italy geojson file from [leaflet-geojson-selector repo](https://github.com/stefanocudini/leaflet-geojson-selector).
    4. Germany geojson file from [deutschlandGeoJSON repo](https://github.com/isellsoap/deutschlandGeoJSON)
    5. Austria geojson file from [click_that_hood repo](https://github.com/codeforamerica/click_that_hood).

In general, for the offline strategy, one can also follow this [stackoverflow response](http://stackoverflow.com/questions/6159074/given-the-lat-long-coordinates-how-can-we-find-out-the-city-country/6355183#6355183) or this [one](http://stackoverflow.com/a/24871449/5267664). The first one relies on the GeoNames database, while the second one gives a procedure to find geojson files for any country.

## 1. Online strategy:
We start with the online strategy. The following function finds the state/canton of a location together with its county. We will see later that we can do the same thing with the offline approach.

In [9]:
# Function for finding a location from the latitude-longitude information using online API
# geopy 2.x refuses to build a Nominatim geocoder without an explicit
# user_agent that identifies the application.
geolocator = Nominatim(user_agent='twitter-swiss-geolocation-notebook')
# Cache of resolved places keyed by the rounded "lat,lng" string, so that each
# distinct coordinate pair is requested at most once.
locations = dict()
settlements = {'city', 'town', 'village', 'hamlet', 'isolated_dwelling'}
# Fixed order in which the settlement keys are tried, so that the result does
# not depend on set iteration order when an address has several of them.
settlement_priority = ('city', 'town', 'village', 'hamlet', 'isolated_dwelling')


def online_locating(data):
    """Return the state and settlement of one row using the Nominatim API.

    The coordinates are rounded to two decimals before the lookup, so nearby
    points share one cache entry (and one request).

    Parameters
    ----------
    data : row-like object exposing ``placeLatitude`` and ``placeLongitude``.

    Returns
    -------
    pandas.Series with the entries 'state' and 'settlement'. Each entry is NaN
    when the service returned no result or the address has no matching field.
    The settlement is the address 'county' when present, otherwise the first
    available key of ``settlement_priority``.
    """
    lat = str(round(data.placeLatitude, 2))
    lng = str(round(data.placeLongitude, 2))
    lookup = ','.join([lat, lng])
    if lookup not in locations:  # direct dict membership, no temporary set
        location = geolocator.reverse(lookup, language='en')
        # reverse() can yield None when nothing is found; treat that like an
        # empty address instead of failing on `None.raw`.
        raw = location.raw if location is not None else {}
        address = raw.get('address', {})
        state = address.get('state', float('NaN'))
        if 'county' in address:
            settlement = address['county']
        else:
            settlement = next(
                (address[key] for key in settlement_priority
                 if key in settlements and key in address),
                float('NaN')
            )
        locations[lookup] = {'state': state, 'settlement': settlement}
        sleep(1)  # sleep for 1 sec (required by Nominatim usage policy)
    cached = locations[lookup]
    return pd.Series({'state': cached['state'],
                      'settlement': cached['settlement']})

In [None]:
# Time the reverse geocoding of every row of `df`
start = time()
df[['state', 'settlement']] = df.apply(online_locating, axis=1)
elapsed = time() - start
print('Elapsed time is {} seconds.'.format(round(elapsed, 4)))

As we can see, it took a very long time to recover the locations even for the sample file. Hence, it does not make sense to follow the online approach for the actual problem.

## 2. Offline approach:
As we mentioned before, it is necessary to download the required geojson/topojson files to run the offline approach. All files are available in the data/geofiles folder. In this part, we use [geopandas](http://geopandas.org/) for further analysis. The resulting dataframes can be used to find the location of tweets.

In [39]:
# Load one boundary file per country from the geofiles folder
geofiles_dir = 'data/geofiles'
ch_gdf, fr_gdf, it_gdf, de_gdf, at_gdf = (
    gpd.read_file(path.join(geofiles_dir, fname))
    for fname in (
        'ch-cantons.json',
        'france-states.geojson',
        'italy-states.json',
        'germany-states.geojson',
        'austria-states.geojson',
    )
)

In [40]:
# Modify dataframes for merging
def _name_and_country(frame, country, name_col='name'):
    """Return a copy of `frame` reduced to geometry/name plus a country code.

    `name_col` is the column holding the region name in the source file; it
    is renamed to 'name'. When that column is absent (for instance because
    this cell has already been run once), the existing 'name' column is used
    instead, so re-running the cell does not fail.
    """
    if name_col not in frame.columns:
        name_col = 'name'
    subset = frame[['geometry', name_col]].rename(columns={name_col: 'name'})
    # assign() works on a copy, which avoids the chained-assignment warning
    # that setting a column on a column-selection can trigger.
    return subset.assign(country=country)


ch_gdf = _name_and_country(ch_gdf, 'CH')
fr_gdf = _name_and_country(fr_gdf, 'FR')
it_gdf = _name_and_country(it_gdf, 'IT')
de_gdf = _name_and_country(de_gdf, 'DE', name_col='NAME_1')
at_gdf = _name_and_country(at_gdf, 'AT')
# Stack all countries into one frame with a fresh 0..n-1 index
gdf = pd.concat([ch_gdf, fr_gdf, it_gdf, de_gdf, at_gdf], ignore_index=True)

The R-tree structure in a geopandas dataframe enables us to find the tweets' locations very quickly. The following function finds which state/canton each location lies in.

In [None]:
# Read the per-country place tables (tab-separated, no header row) and stack
# them into a single frame.
# NOTE(review): `col_names` is not defined in any visible cell; it has to
# exist before this cell runs — confirm where it is meant to come from.
neighbor_frames = [
    pd.read_csv(
        path.join('data/geonames', fname + '.txt'),
        header=None,
        encoding='utf8',
        delimiter='\t',
        dtype={9: str},
        names=col_names,
        low_memory=False
    )
    for fname in ['DE', 'FR', 'IT', 'AT']
]
neighbors_df = pd.concat(neighbor_frames, ignore_index=True)
# Keep the rows whose feature class contains 'P'. na=False turns a missing
# feature class into False; without it the mask contains NaN and the boolean
# selection raises.
neighbors_df = neighbors_df[neighbors_df['feature class'].str.contains('P', na=False)]

We can also reduce the size of our search space for the neighboring countries by considering only the states that share a border with Switzerland.

In [None]:
# Codes of the neighboring states to keep, written as '<country>.<state code>'
imp_states = [
    'FR.84', 'FR.27', 'FR.44', # Auvergne-Rhône-Alpes, Bourgogne-Franche-Comte, and Grand Est 
    'DE.01', 'DE.02', # Bavaria and Baden-Wuerttemberg
    'AT.07', 'AT.08', # Tyrol and Vorarlberg
    'IT.19', 'IT.09', 'IT.12', 'IT.17' # Aosta Valley, Lombardy, Piedmont, and Trentino-Alto Adige
]
# Preview the country codes.
# NOTE(review): presumably the country code was meant to be combined with a
# state-code column to build keys comparable with `imp_states`; the name of
# that column is not visible here — TODO confirm and complete.
neighbors_df['country code'].head()

Useful links:
http://stackoverflow.com/questions/17267248/how-where-do-i-get-geojson-data-for-states-provinces-and-administrative-region

In [None]:
# Scratch check: a non-empty set is truthy, so an intersection can be used
# directly as a condition.
a = {1, 2, 3}
b = {3, 4}
c = a.intersection(b)
if c:
    print('True')
else:
    print('False')

In [None]:
import osmnx as ox
import geopandas as gpd
from descartes import PolygonPatch
from shapely.geometry import Point, Polygon, MultiPolygon

In [None]:
# Fetch the drivable street network for 'brig' and plot its projection.
# The graph gets its own name so that it does not overwrite the merged
# boundary GeoDataFrame `gdf` built earlier in the notebook.
brig_graph = ox.graph_from_place('brig', network_type='drive')
ox.plot_graph(ox.project_graph(brig_graph))

In [None]:
# Fetch the 'brig' boundary, save it as a shapefile (before projecting),
# then project it and plot the shape
brig_boundary = ox.gdf_from_place('brig')
ox.save_gdf_shapefile(brig_boundary)
city = ox.project_gdf(brig_boundary)
fig, ax = ox.plot_shape(city)

In [None]:
# boundary of Switzerland as a geodataframe
gdf = ox.gdf_from_place('Switzerland')

# street network inside the bounding box of the boundary, padded by a 0.1 buffer
padded_bounds = gdf.unary_union.buffer(0.1).bounds
west, south, east, north = padded_bounds
G = ox.graph_from_bbox(north, south, east, west, network_type='drive', retain_all=True)

# collect the (x, y) attributes of every node, then split them into two tuples
xy = []
for node_id, node_attrs in G.nodes(data=True):
    xy.append((node_attrs['x'], node_attrs['y']))
x, y = zip(*xy)

In [None]:
# build a geodataframe holding one point per node coordinate pair
def _row_to_point(row):
    """Create a shapely Point from the 'x' and 'y' entries of one row."""
    return Point((row['x'], row['y']))


node_coords = {'x': x, 'y': y}
gdf_nodes = gpd.GeoDataFrame(data=node_coords)
gdf_nodes.crs = gdf.crs  # reuse the coordinate reference system of `gdf`
gdf_nodes.name = 'nodes'
gdf_nodes['geometry'] = gdf_nodes.apply(_row_to_point, axis=1)

In [None]:
# Pick the entry at index 0 of the 'geometry' column of `gdf`
geometry = gdf['geometry'][0]

In [None]:
# Wrap a single Polygon in a MultiPolygon; anything else is left untouched
geometry = MultiPolygon([geometry]) if isinstance(geometry, Polygon) else geometry

In [None]:
# Unpack the bounds of the union of all geometries in `gdf` into the four
# map edges (the same statement is repeated at the top of the plotting cell)
west, south, east, north = gdf.unary_union.bounds

In [None]:
# Draw every polygon of `geometry` within the bounds of `gdf`
west, south, east, north = gdf.unary_union.bounds
fig, ax = plt.subplots(figsize=(6,6))
# Multi-part geometries expose their parts through `.geoms`; iterating the
# geometry object itself is not supported by Shapely 2.x. A plain Polygon has
# no `.geoms`, so it is treated as a one-element list.
polygons = getattr(geometry, 'geoms', [geometry])
for polygon in polygons:
    patch = PolygonPatch(polygon, fc='#cccccc', ec='k', alpha=0.5, zorder=2)
    ax.add_patch(patch)

ax.set_xlim(west, east)
ax.set_ylim(south, north)
ax.axis('off')
plt.show()

In [None]:
# Read only the 'municipalities' layer of data/ch.json
ch_json_path = path.join('data', 'ch.json')
my_gpd = gpd.read_file(ch_json_path, layer='municipalities')

In [6]:
# Scratch: float('NaN') builds a not-a-number float
a = float('NaN')

In [7]:
# Display the value of `a`
a

nan