In [None]:
# Our observations suggest that the latitude/longitude columns are more accurate,
# so the place-level coordinates are only used to fill their gaps.
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection, which does not update the frame under pandas Copy-on-Write.
df['latitude'] = df['latitude'].fillna(df['placeLatitude'])
df['longitude'] = df['longitude'].fillna(df['placeLongitude'])

# Change the string in 'createdAt' column to datetime format.
# Unparseable values become NaT (errors='coerce'); converting before the
# dropna below lets those rows be removed together with the other gaps.
df['createdAt'] = pd.to_datetime(
    df['createdAt'],
    format='%Y-%m-%d %H:%M:%S',
    errors='coerce'
)

# Drop rows with NaN/NaT values in important columns
df = df.dropna(
    subset=['id', 'userId', 'createdAt', 'longitude', 'latitude'],
    how='any'
)

# Remove duplicated tweets with the same id (the first occurrence is kept)
df = df.drop_duplicates(subset='id')

# Remove unnecessary columns; the copy makes the frame independent so the
# columns added below are not written to a slice of the original frame.
df = df[['id', 'userId', 'createdAt', 'longitude', 'latitude']].copy()

# Add some columns for further analysis
df['day'] = df['createdAt'].dt.day
df['month'] = df['createdAt'].dt.month
df['year'] = df['createdAt'].dt.year

# Number of tweets each user posted on each calendar day
daily_user = ['userId', 'day', 'month', 'year']
df['daily_tweets'] = df.groupby(by=daily_user)['userId'].transform('count')

# Pack (latitude, longitude, createdAt) into a single column so that a
# per-group transform on that column can see all three values at once.
df['new'] = list(zip(df['latitude'], df['longitude'], df['createdAt']))

# Remove rows corresponding to people who have less than a threshold value in one day
df = df[df['daily_tweets'] >= threshold_tweets].reset_index(drop=True)

# Function for removing noisy tweets
def _speed(start_point, end_point, start_time, end_time):
    """Return the speed in metres per second between two fixes.

    The result is ``inf`` when no time elapsed between the two fixes.
    """
    # NOTE(review): `vincenty` comes from the surrounding file; newer geopy
    # releases reportedly dropped it in favour of `geodesic` - confirm the
    # installed version.
    distance = vincenty(end_point, start_point).meters
    seconds = (end_time - start_time).total_seconds()
    return distance / seconds if seconds else float('inf')


def data_denoising(sub_df, crt_speed=60):
    """Flag tweets whose reported location is noisy.

    A location is considered noisy when reaching it from the neighbouring
    tweets of the same group would require moving faster than ``crt_speed``.

    Parameters
    ----------
    sub_df : pandas.Series
        One group of tweets. Every element is a tuple
        ``(latitude, longitude, timestamp)`` optionally followed by a fourth
        field (a per-tweet label). Presumably ordered by time - TODO confirm
        against the caller; the rows are not sorted here.
    crt_speed : float, optional
        Critical speed in metres per second (distance is taken in metres and
        elapsed time in seconds).

    Returns
    -------
    sequence
        One entry per input row: NaN for rows judged noisy, otherwise the
        row's fourth tuple field, or ``True`` when the tuples carry only
        three fields. Empty input yields an empty list.
    """
    nan = float('NaN')
    records = sub_df.tolist()
    if not records:
        return []

    columns = list(zip(*records))
    lat, lng, tw_time = columns[0], columns[1], columns[2]
    # The fourth field is optional; without it every row is marked with a
    # non-null placeholder so that callers can still filter on not-null.
    states = columns[3] if len(columns) > 3 else (True,) * len(records)
    points = list(zip(lat, lng))

    if len(points) == 2:
        # With only two tweets it is impossible to tell which one is wrong,
        # so both are rejected when the implied speed is too high.
        v = _speed(points[0], points[1], tw_time[0], tw_time[1])
        return [nan, nan] if v > crt_speed else states

    denoised = list(states)
    last_window = len(points) - 3
    # Slide a window of three consecutive tweets (a, b, c) over the group.
    for index in range(len(points) - 2):
        v1 = _speed(points[index], points[index + 1],
                    tw_time[index], tw_time[index + 1])      # a -> b
        v2 = _speed(points[index + 1], points[index + 2],
                    tw_time[index + 1], tw_time[index + 2])  # b -> c
        v3 = _speed(points[index], points[index + 2],
                    tw_time[index], tw_time[index + 2])      # a -> c

        # Identical timestamps make the speeds meaningless: drop the group.
        if np.isinf(v1) or np.isinf(v2) or np.isinf(v3):
            return [nan] * len(denoised)

        first_hop_fast = v1 > crt_speed
        if first_hop_fast and v2 > crt_speed:
            if v3 <= crt_speed:
                # a -> c is plausible, so only the middle tweet is off.
                denoised[index + 1] = nan
            else:
                denoised[index] = nan
                denoised[index + 1] = nan
                if index == last_window:
                    # No later window will revisit the final tweet.
                    denoised[index + 2] = nan
        elif first_hop_fast and v2 <= crt_speed:
            # b -> c is plausible, so the first tweet is the outlier.
            denoised[index] = nan

    return denoised

# Remove noisy tweets with the above function.
# The per-row verdict is kept in a mask instead of a 'state' column: df_poly
# also has a 'state' column, and a column name present on both sides of the
# spatial join below gets suffixed, so df['state'] would no longer exist.
denoised = df.groupby(by=daily_user)['new'].transform(data_denoising)
df = df[denoised.notnull()].reset_index(drop=True)

# Remove the new column
del df['new']

# Load geofiles
def _load_states(filename, country, name_column='name'):
    """Read one boundary file from DIR_GEO and normalise its columns.

    The returned frame has exactly the columns 'geometry', 'name' (taken from
    ``name_column``) and 'country' (the constant code ``country``).
    """
    gdf = gpd.read_file(path.join(DIR_GEO, filename))
    # rename() returns a new frame, so adding 'country' below does not write
    # to a slice of the frame that was read from disk.
    gdf = gdf[['geometry', name_column]].rename(columns={name_column: 'name'})
    gdf['country'] = country
    return gdf


# Modify dataframes for merging
ch_gdf = _load_states('ch-cantons.json', 'CH')
fr_gdf = _load_states('france-states.geojson', 'FR')
it_gdf = _load_states('italy-states.json', 'IT')
de_gdf = _load_states('germany-states.geojson', 'DE', name_column='NAME_1')
at_gdf = _load_states('austria-states.geojson', 'AT')
li_gdf = _load_states('liechtenstein.geojson', 'LI', name_column='NAME')

# Concatenate the dataframes
df_poly = pd.concat([ch_gdf, fr_gdf, it_gdf, de_gdf, at_gdf, li_gdf], ignore_index=True)
df_poly = df_poly.rename(columns={'name': 'state'})

# Convert our dataframe to a geopandas dataframe.
# The points are built from the two coordinate columns directly; this avoids
# a Python-level call per row through DataFrame.apply and also behaves
# sensibly when the frame is empty.
df = gpd.GeoDataFrame(df)
df['geometry'] = [
    Point(lng, lat) for lng, lat in zip(df['longitude'], df['latitude'])
]
df.crs = df_poly.crs

# Offline locating using spatial indexing in geopandas
print('Start spatial merging process...')
t = time()
df = gpd.tools.sjoin(df, df_poly, how="left")
elapsed = time() - t
print('Elapsed time is ' + str(round(elapsed, 4)) + ' seconds.')

# Drop duplicate tweets. A point lying on a boundary can be matched to two
# different polygons, which yields one joined row per match.
df = df.drop_duplicates(subset='id')
df = df.reset_index(drop=True)

# Find unlocated tweets (no polygon matched, so the joined 'state' is null)
null_index = df['state'].isnull()

# Initialization for online locating
# NOTE(review): recent geopy releases reportedly require an explicit
# user_agent for Nominatim - confirm against the installed version.
geolocator = Nominatim()
# Cache of online results, keyed by the "lat,lng" lookup string
locations = {}

# Online locating function for missing data
def online_locating(data, max_retries=5):
    """Find the state and country of a location using an online API.

    Results are cached in the module-level ``locations`` dict, keyed by the
    "lat,lng" string, so every distinct coordinate is requested only once.

    Parameters
    ----------
    data : row-like object exposing ``latitude`` and ``longitude`` attributes.
    max_retries : int, optional
        Maximum number of requests made for one coordinate when the service
        times out.

    Returns
    -------
    pandas.Series
        With the entries 'country' (upper-cased country code) and 'state'
        (the state, falling back to the country name). Either one is NaN
        when the response does not provide it.

    Raises
    ------
    TimeOut
        Re-raised when every attempt timed out.
        NOTE(review): ``TimeOut`` is resolved from the surrounding file;
        presumably it is geopy's timeout exception - confirm.
    """
    nan = float('NaN')
    # Failures expected while digging through a missing or partial response
    missing = (AttributeError, KeyError, TypeError)

    lat = str(data.latitude)
    lng = str(data.longitude)
    lookup = ','.join([lat, lng])

    if lookup not in locations:
        # Retry in a bounded loop. The previous recursive retry threw its
        # result away and then overwrote the cached answer with NaN.
        attempts_left = max_retries
        while True:
            try:
                location = geolocator.reverse(lookup, language='en')
                break
            except TimeOut:
                attempts_left -= 1
                if attempts_left <= 0:
                    raise
                sleep(1)  # wait before asking again

        try:
            address = location.raw['address']
        except missing:
            # No result, or a result without address details
            address = {}

        try:
            country = address['country_code'].upper()
        except missing:
            country = nan

        try:
            state = address['state']
        except missing:
            try:
                state = address['country']
            except missing:
                state = nan

        locations[lookup] = {'country': country, 'state': state}
        sleep(1)  # sleep for 1 sec (required by Nominatim usage policy)

    cached = locations[lookup]
    return pd.Series({'country': cached['country'],
                      'state': cached['state']})

# Query the online API for every tweet the spatial join could not place and
# write the returned country/state back into those rows
unlocated_rows = df[null_index]
df.loc[null_index, ['country', 'state']] = unlocated_rows.apply(
    online_locating, axis=1)

# Function to modify the online API results
def modify_dataframe(row):
    """Align one online-located row with the names used in ``df_poly``.

    * Country not present in ``df_poly``: 'state' is replaced by the country
      value and 'index_right' is left untouched.
    * Country present: 'state' is mapped to the closest state name of that
      country in ``df_poly`` (after translating a few known English names)
      and 'index_right' is set to the index label of that polygon.
    * Country present but 'state' is not a string (e.g. NaN): the row is
      returned unchanged instead of raising.

    Parameters
    ----------
    row : row-like mapping with the keys 'country', 'state', 'index_right'.

    Returns
    -------
    The same ``row`` object, modified in place.
    """
    sub_gdf = df_poly[df_poly['country'] == row['country']]
    if sub_gdf.empty:
        # Country outside of the polygon table
        row['state'] = row['country']
        return row

    state = row['state']
    if not isinstance(state, str):
        # Nothing to match against; the substring test below would raise.
        return row

    # English names returned by the API -> names used in the geofiles
    aliases = {
        'Great East': 'Alsace-Champagne-Ardenne-Lorraine',
        'Grisons': 'Graubünden',
        'Aosta Valley': "valle d'aosta",
    }
    if 'Bavaria' in state:
        state = 'Bayern'
    else:
        state = aliases.get(state, state)

    # With a cutoff of 0 the closest candidate is always returned, and an
    # exact name matches itself. Running the translated names through the
    # same lookup guarantees the polygon selection below finds a row even
    # if the spelling in the geofile differs slightly.
    states = sub_gdf.state.values
    state = get_close_matches(state, states, 1, 0)[0]

    row['state'] = state
    row['index_right'] = sub_gdf[sub_gdf['state'] == state].index.values[0]
    return row

# Normalise the rows that were located online with modify_dataframe
df.loc[null_index, :] = df[null_index].apply(modify_dataframe, axis=1)

# Remove tweets from countries which are not in our list (they have no
# 'index_right' value) and renumber the remaining rows
has_polygon = df['index_right'].notnull()
df = df[has_polygon].reset_index(drop=True)

# The index_right column is not needed any more
del df['index_right']


def find_path(sub_df, cmt_num = 10):
    """Build a 'A->B->C' string describing the changes of value in a sequence.

    Consecutive entries of ``sub_df`` are compared pairwise. For every pair
    that differs, the first entry of the pair is appended to the path, and
    the second entry of the last differing pair closes the path.

    Returns NaN when no pair differs or when more than ``cmt_num`` pairs
    differ.
    """
    values = sub_df.values
    previous, following = values[:-1], values[1:]
    changed = previous != following
    n_changes = sum(changed)

    if not changed.any() or n_changes > cmt_num:
        return float('NaN')

    departures = '->'.join(previous[changed])
    final_stop = following[changed][-1]
    return '->'.join([departures, final_stop])