In [1]:
# Import libraries
import csv
import pickle
from os import path
from glob import glob
from time import sleep, time
from difflib import get_close_matches
import numpy as np
import pandas as pd
import geopandas as gpd
import multiprocessing
from joblib import Parallel, delayed
from shapely.geometry import Point

In [8]:
# Settings
# Folder holding the input files, located one level above the working directory.
DIR_DATA = path.join(path.pardir, 'data', 'sample data')

In [9]:
# Read the tsv files
# The file has no header row (header=None); column names are taken from
# schema.txt further down.
print('Reading sample.tsv file...')
df = pd.read_csv(
    path.join(DIR_DATA, 'sample.tsv'),
    sep="\t",
    encoding='utf-8',
    escapechar='\\',         # a backslash escapes the character that follows it
    na_values='N',           # fields equal to 'N' are read as missing values
                             # NOTE(review): presumably the '\N' null marker of
                             # the dump after escape handling - confirm.
    quoting=csv.QUOTE_NONE,  # quote characters are treated as ordinary text
    header=None
)
print('is done!')

# Read the schema file
# Whitespace-separated, no header row. The separator is a raw string so that
# the regex escape `\s` is not parsed as an (invalid) Python string escape.
print('Reading schema.txt file...')
schema = pd.read_csv(
    path.join(DIR_DATA, 'schema.txt'),
    sep=r"\s+",
    header=None
)
print('is done!')

# Rename the dataframe columns
# The second field of each schema line is used as the column name. A plain list
# is assigned so that the Series name (1) does not end up as `df.columns.name`.
df.columns = schema[1].tolist()

Reading sample.tsv file...
is done!
Reading schema.txt file...
is done!


In [10]:
# Our observations suggest that the latitude/longitude columns are more accurate,
# so they are kept wherever present and the place* columns only fill the gaps.
# The result is assigned back rather than using fillna(inplace=True) on a column
# selection, which may act on a temporary object and leave `df` unchanged.
df['latitude'] = df['latitude'].fillna(df['placeLatitude'])
df['longitude'] = df['longitude'].fillna(df['placeLongitude'])

# Just keep the important columns
# .copy() makes the selection an independent frame, so the column assignments
# below do not raise SettingWithCopyWarning.
df = df[['id', 'userId', 'createdAt', 'longitude', 'latitude', 'text']].copy()

# Change the string in 'createdAt' column to datetime format
# Values that do not match the format become NaT (errors='coerce').
df['createdAt'] = pd.to_datetime(
    df['createdAt'],
    format='%Y-%m-%d %H:%M:%S',
    errors='coerce'
)

# Change the possible strings to numbers
# Vectorized conversion of each column; unparseable values become NaN.
for column in ('id', 'userId', 'longitude', 'latitude'):
    df[column] = pd.to_numeric(df[column], errors='coerce')

# NaN cannot be cast to an integer, so rows without a usable id/userId are
# removed before the cast instead of letting astype() raise on them.
# int64 is requested explicitly so the width does not depend on the platform's
# default integer type.
# NOTE(review): a column containing coerced NaNs is float64 before the cast, so
# very large ids in such a column may already have lost precision - confirm
# whether that matters for this data.
df = (
    df
    .dropna(subset=['id', 'userId'])
    .astype({'id': 'int64', 'userId': 'int64'})
)

In [12]:
# Final row filtering, done as one chain:
#   1. discard rows with a missing value in any of the important columns,
#   2. keep only the first tweet for each id (this step is slow on large data),
#   3. renumber the index from zero.
df = (
    df
    .dropna(
        how='any',
        subset=['id', 'userId', 'createdAt', 'longitude', 'latitude', 'text'],
    )
    .drop_duplicates(subset='id')
    .reset_index(drop=True)
)