In [None]:
# Import libraries
import csv
import pickle
from glob import glob
from os import path
from time import time

import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

In [None]:
# Settings: input folders (relative to the notebook) and the filtering threshold
DATA_ROOT = 'data'
DIR_DATA = path.join(DATA_ROOT, 'twitter data')   # raw tweet dump, its chunks and schema.txt
DIR_GEO = path.join(DATA_ROOT, 'geofiles')        # region polygons of the covered countries
# Minimum number of tweets a user must post in one day to be kept.
# NOTE(review): this name is reassigned to 10 in the filtering cell further down,
# so the value set here is not the one that is actually applied.
threshold_tweets = 2

In [None]:
# We need a file splitter: cut the raw dump into chunks named twex.<n>.tsv.
# A new chunk is started every `splitLen` lines, but only on a line that contains
# exactly 19 tab characters; otherwise the current chunk keeps growing until such
# a line shows up. Presumably a 19-tab line is the start of a complete 20-column
# record, so multi-line records are not cut in half (NOTE(review): confirm
# against schema.txt).
splitLen = 10**4       # 1e4 lines per file same as the sample file

dest = None
try:
    with open(path.join(DIR_DATA, 'twex.tsv'), 'r', encoding='utf8') as input_f:
        count = 0      # lines counted towards the current chunk
        at = 0         # index of the next chunk file
        for line in input_f:
            if count % splitLen == 0:
                if dest is not None and line.count('\t') != 19:
                    # Not a safe place to split: cancel the increment below so
                    # that the next line is checked for a boundary again.
                    count -= 1
                else:
                    # The very first line always opens a chunk (dest is None),
                    # even if it is malformed, so there is a file to write to.
                    if dest is not None:
                        dest.close()
                    dest = open(path.join(DIR_DATA, 'twex.' + str(at) + '.tsv'),
                                'w',
                                newline='\n',
                                encoding='utf8')
                    at += 1
            dest.write(line)
            count += 1
finally:
    # Close the last chunk even if reading/writing failed; nothing was opened
    # when the input file is empty.
    if dest is not None:
        dest.close()

In [None]:
# Read the tsv files: every *.tsv in the data folder except the unsplit dump itself.
# glob() returns paths in arbitrary order, so sort them (lexicographically) to
# get the same row order on every run.
raw_dump = path.join(DIR_DATA, 'twex.tsv')
all_files = sorted(
    file_name
    for file_name in glob(path.join(DIR_DATA, '*.tsv'))
    if file_name != raw_dump
)
if not all_files:
    # pd.concat would otherwise fail with an unhelpful "No objects to concatenate".
    raise FileNotFoundError(
        'No split .tsv files found in ' + DIR_DATA + '; run the file splitter first.'
    )

# Lazy generator: each chunk is only parsed when pd.concat consumes it.
# The files have no header row; column names are assigned from the schema later.
df_from_each_file = (pd.read_csv(
    file_name,
    sep="\t",
    encoding='utf-8',
    escapechar='\\',
    na_values='N',
    quoting=csv.QUOTE_NONE,
    header=None
    )
    for file_name in all_files)
print('Reading twex.tsv file...')
df = pd.concat(df_from_each_file, ignore_index=True)
print('is done!')

In [None]:
# Read the schema file (whitespace-separated, no header row)
print('Reading schema.txt file...')
schema = pd.read_csv(
    path.join(DIR_DATA, 'schema.txt'),
    sep=r"\s+",        # raw string: "\s" is an invalid escape in a normal literal
    header=None
)
print('is done!')

# Rename the dataframe columns using the second schema column (index 1)
df.columns = schema[1]

In [None]:
# Our observations suggest that considering latitude/longitude columns is more accurate,
# so the place coordinates are only used where the exact ones are missing.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection, which does not reliably modify the parent frame.
df['latitude'] = df['latitude'].fillna(df['placeLatitude'])
df['longitude'] = df['longitude'].fillna(df['placeLongitude'])

# Just keep the important columns; copy so the assignments below operate on an
# independent frame rather than on a slice of the full one.
df = df[['id', 'userId', 'createdAt', 'longitude', 'latitude', 'text']].copy()

# Change the string in 'createdAt' column to datetime format (unparsable -> NaT)
df['createdAt'] = pd.to_datetime(
    df['createdAt'],
    format='%Y-%m-%d %H:%M:%S',
    errors='coerce'
)

# Change the possible strings to numbers (unparsable -> NaN), one vectorised
# conversion per column instead of one call per cell.
numeric_columns = ['id', 'userId', 'longitude', 'latitude']
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

# NaN cannot be cast to an integer, so rows without a valid id/userId have to be
# dropped before the cast. int64 is requested explicitly because plain `int`
# can be 32 bits wide on some platforms.
# NOTE(review): ids that passed through float64 lose precision above 2**53 —
# confirm the ids in this data set stay below that.
df = (
    df.dropna(subset=['id', 'userId'])
      .astype({'id': 'int64', 'userId': 'int64'})
)

In [None]:
# Keep only the rows in which every essential field is present and renumber
# the index afterwards.
# Tweets sharing the same id are deliberately left in place: removing them with
# df.drop_duplicates(subset='id') is too time consuming.
required_columns = ['id', 'userId', 'createdAt', 'longitude', 'latitude']
df = df.dropna(subset=required_columns, how='any').reset_index(drop=True)

In [None]:
# Add some columns for further analysis: calendar parts of the timestamp, taken
# with the vectorised .dt accessor instead of a Python lambda per row.
df['day'] = df['createdAt'].dt.day
df['month'] = df['createdAt'].dt.month
df['year'] = df['createdAt'].dt.year

# Grouping key identifying one user on one calendar day (reused by later cells)
daily_user = ['userId', 'year', 'month', 'day']
# Number of tweets the user posted on that day, repeated on each of their rows
df['daily_tweets'] = df.groupby(by=daily_user)['userId'].transform('count')

In [None]:
# Keep only the tweets of users who posted at least `threshold_tweets` tweets
# on the day in question.
# NOTE(review): this assignment overrides the value 2 from the Settings cell —
# decide which one is intended and keep a single definition.
threshold_tweets = 10
enough_tweets = df['daily_tweets'] >= threshold_tweets
df = df.loc[enough_tweets].reset_index(drop=True)

In [None]:
# Remove noisy tweets with `data_denoising`.
# NOTE(review): `data_denoising` is not defined in any cell visible here; it has
# to be defined or imported before this cell for "Restart & Run All" to work.
# Pack latitude, longitude and timestamp of every tweet into one temporary
# column so that the per-user-per-day transform receives them together.
df['new'] = tuple(zip(df['latitude'], df['longitude'], df['createdAt']))
# Presumably yields one boolean per tweet (True = keep), because the result is
# used as a row mask right below — TODO confirm against data_denoising.
not_noisy = df.groupby(by=daily_user)['new'].transform(lambda x: data_denoising(x))
df = df[not_noisy].reset_index(drop=True)
# Remove the generated column
del df['new']

In [None]:
def load_regions(file_name, country, name_column='name'):
    """Load one geofile and reduce it to the columns needed for merging.

    Parameters
    ----------
    file_name : str
        File inside DIR_GEO holding the region polygons.
    country : str
        Country code stored in the 'country' column of every row.
    name_column : str, default 'name'
        Column of the file that holds the region name; it is renamed to 'name'
        so that all countries share the same layout.

    Returns
    -------
    GeoDataFrame with the columns 'geometry', 'name' and 'country'.
    """
    regions = gpd.read_file(path.join(DIR_GEO, file_name))
    regions = regions[['geometry', name_column]].rename(columns={name_column: 'name'})
    # assign() works on a copy, so nothing is written into a slice of the
    # frame that was read from disk.
    return regions.assign(country=country)


# Load geofiles and modify the dataframes for merging
ch_gdf = load_regions('ch-cantons.json', 'CH')
fr_gdf = load_regions('france-states.geojson', 'FR')
it_gdf = load_regions('italy-states.json', 'IT')
de_gdf = load_regions('germany-states.geojson', 'DE', name_column='NAME_1')
at_gdf = load_regions('austria-states.geojson', 'AT')
li_gdf = load_regions('liechtenstein.geojson', 'LI', name_column='NAME')

# Concatenate the dataframes
df_poly = pd.concat([ch_gdf, fr_gdf, it_gdf, de_gdf, at_gdf, li_gdf], ignore_index=True)
df_poly = df_poly.rename(columns={'name': 'state'})

In [None]:
# Convert our dataframe to a geopandas dataframe.
# points_from_xy builds all point geometries in one vectorised call (x is the
# longitude, y the latitude) instead of creating a Point per row with apply().
# The polygons' CRS is reused so that both frames can be spatially joined.
df = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(df['longitude'], df['latitude']),
    crs=df_poly.crs
)

In [None]:
# Offline locating: attach the matching polygon's columns to every tweet with a
# spatial join. how="left" keeps all tweets, including those without a match.
print('Start spatial merging process...')
t_start = time()
df = gpd.sjoin(df, df_poly, how="left")
elapsed = time() - t_start
message = ''.join(['Elapsed time is ', str(round(elapsed, 2)), ' seconds.'])
print(message)

In [None]:
# Keep only the tweets that were matched to a region and discard the helper
# column 'index_right' left behind by the spatial join.
# A tweet on a boundary might be located in two different cantons; such
# duplicates are intentionally not dropped here (no drop_duplicates on 'id').
located = df['state'].notnull()
df = df[located].drop('index_right', axis=1).reset_index(drop=True)

In [None]:
def find_path(sub_df, cmt_num=10):
    """Summarise a sequence of states as a transition string such as 'A->B->A'.

    Consecutive repetitions of the same state are collapsed, so only the
    changes from one state to the next remain.

    Parameters
    ----------
    sub_df : pandas.Series or array-like of str
        States in the order in which they were visited.
    cmt_num : int, default 10
        Maximum number of state changes allowed in the sequence.

    Returns
    -------
    str or float
        The visited states joined by '->', or NaN when the state never
        changes (including empty and single-element input) or changes more
        than `cmt_num` times.
    """
    states = np.asarray(sub_df)
    origin = states[:-1]
    destination = states[1:]
    # True wherever two consecutive entries differ, i.e. at every move
    changed = origin != destination
    n_changes = int(changed.sum())
    if n_changes == 0 or n_changes > cmt_num:
        return float('NaN')
    # Every departure state, followed by the state reached by the last move
    stops = list(origin[changed]) + [destination[changed][-1]]
    return '->'.join(stops)

In [None]:
# Build one movement path per user and day.
# find_path compares consecutive rows, so the tweets must be in chronological
# order; the frame is assembled from several files and never sorted by time.
# The sort is stable (mergesort), so tweets with equal timestamps keep their
# previous relative order.
daily_patterns = (
    df.sort_values('createdAt', kind='mergesort')
      .groupby(by=daily_user)['state']
      .apply(find_path)
)
# Days without a valid path (find_path returned NaN) are discarded
daily_patterns = daily_patterns.dropna()

In [None]:
# Persist the daily patterns as a pickle inside the 'data' folder
output_path = path.join('data', 'daily_patterns.pkl')
with open(output_path, 'wb') as out_file:
    pickle.dump(daily_patterns, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Number of distinct (user, day) combinations among the located tweets
df.groupby(by=daily_user).ngroups

In [None]:
# Show the resulting daily patterns as this cell's output
daily_patterns