In [1]:
# Import libraries
import csv
import pickle
from os import path
from glob import glob
import pandas as pd
import numpy as np

In [2]:
# Settings
# Minimum number of tweets a user must have on a given day for that day's
# rows to be kept by the filtering step.
threshold_tweets = 10
# Folder that holds twex.tsv, schema.txt and every file this notebook writes.
DIR_DATA = path.join('data', 'twitter data')

The TSV file is too large to load in one go, so we first write a file splitter that cuts it into smaller chunk files.

In [3]:
# We need a file splitter
def tsv_splitter(file_path, lines_per_file=10**4, tabs_per_record=19):
    """Split a large TSV file into numbered chunk files next to the original.

    Chunk names are built by inserting the chunk index in front of the file
    extension, e.g. ``twex.tsv`` -> ``twex.0.tsv``, ``twex.1.tsv``, ...

    A new chunk is only started on a line that contains exactly
    ``tabs_per_record`` tab characters. Any other line met at a chunk boundary
    is appended to the current chunk, so a chunk may hold more than
    ``lines_per_file`` lines.

    Parameters
    ----------
    file_path : str
        Path of the TSV file to split; it is read as UTF-8.
    lines_per_file : int, optional
        Number of lines after which a new chunk may start (default 1e4 lines
        per file, same as the sample file).
    tabs_per_record : int, optional
        Number of tab characters a line must contain to be allowed to open a
        new chunk (default 19).

    Raises
    ------
    ValueError
        If ``lines_per_file`` is smaller than 1.
    """
    if lines_per_file < 1:
        raise ValueError('lines_per_file must be at least 1')

    # splitext only looks at the last path component, so a dot in a directory
    # name cannot end up being treated as the extension separator, and a file
    # without extension gets '<name>.<index>' instead of its own name.
    root, ext = path.splitext(file_path)

    count = 0
    at = 0
    dest = None
    with open(file_path, 'r', encoding='utf8') as input_f:
        try:
            for line in input_f:
                if count % lines_per_file == 0:
                    # The very first line always opens chunk 0, even when it
                    # does not look like a full record: there is no earlier
                    # chunk it could be appended to.
                    if dest is None or line.count('\t') == tabs_per_record:
                        if dest is not None:
                            dest.close()
                        dest = open('{}.{}{}'.format(root, at, ext),
                                    'w',
                                    newline='\n',
                                    encoding='utf8')
                        at += 1
                    else:
                        # Stay on the boundary: this line goes to the current
                        # chunk and the next line is checked again.
                        count -= 1
                dest.write(line)
                count += 1
        finally:
            # Nothing to close for an empty input file.
            if dest is not None:
                dest.close()

In [4]:
# Cut the big twex.tsv into chunk files written next to it in DIR_DATA
tsv_splitter(file_path=path.join(DIR_DATA, 'twex.tsv'))

In [5]:
# Read the split tsv files
all_files = glob(path.join(DIR_DATA, '*.tsv'))
# The unsplit original matches the pattern as well; it must not be read again.
if path.join(DIR_DATA, 'twex.tsv') in all_files:
    all_files.remove(path.join(DIR_DATA, 'twex.tsv'))

# Fail early with a clear message instead of letting pd.concat complain about
# having nothing to concatenate.
if not all_files:
    raise FileNotFoundError(
        'No split .tsv files found in {!r}'.format(DIR_DATA)
    )

# glob() returns its matches in arbitrary order. Sorting by (length, name)
# makes the row order reproducible and reads chunks named
# <stem>.<index>.tsv in ascending index order (0, 1, ..., 9, 10, ...).
all_files.sort(key=lambda file_name: (len(file_name), file_name))

# Lazy generator: each chunk is only parsed when pd.concat consumes it.
df_from_each_file = (pd.read_csv(
    file_name,
    sep="\t",
    encoding='utf-8',
    escapechar='\\',
    na_values='N',
    quoting=csv.QUOTE_NONE,
    header=None
    )
    for file_name in all_files)
print('Reading twex.tsv file...')
df = pd.concat(df_from_each_file, ignore_index=True)
print('is done!')

Reading twex.tsv file...
is done!


In [6]:
# Read the schema file (whitespace-separated, no header row)
print('Reading schema.txt file...')
schema = pd.read_csv(
    path.join(DIR_DATA, 'schema.txt'),
    # Raw string: "\s" is not a valid escape sequence in a normal string
    # literal and triggers a warning on recent Python versions.
    sep=r"\s+",
    header=None
)
print('is done!')

# Rename the dataframe columns: column 1 of the schema holds the names
df.columns = schema[1]

Reading schema.txt file...
is done!


In [7]:
# Our observations suggest that the latitude/longitude columns are more
# accurate, so the place coordinates are only used to fill their gaps.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form does not update df when pandas
# runs with Copy-on-Write.
df['latitude'] = df['latitude'].fillna(df['placeLatitude'])
df['longitude'] = df['longitude'].fillna(df['placeLongitude'])

# Just keep the important columns
df = df[['id', 'userId', 'createdAt', 'longitude', 'latitude', 'text']]

# Change the string in 'createdAt' column to datetime format
# (values that do not match the format become NaT)
df['createdAt'] = pd.to_datetime(
    df['createdAt'],
    format='%Y-%m-%d %H:%M:%S',
    errors='coerce'
)

# Change the possible strings to numbers; unparsable values become NaN.
# pd.to_numeric is applied to whole columns rather than element by element.
# NOTE(review): if any id fails to parse, the column presumably becomes
# float64, and ids above 2**53 would then lose precision before the later
# int64 cast -- confirm against the data.
df['id'] = pd.to_numeric(df['id'], errors='coerce')
df['userId'] = pd.to_numeric(df['userId'], errors='coerce')
df['longitude'] = pd.to_numeric(df['longitude'], errors='coerce')
df['latitude'] = pd.to_numeric(df['latitude'], errors='coerce')

In [8]:
# One pipeline over the coerced frame:
#   1. drop every row that has a NaN/NaT in one of the important columns,
#   2. store id and userId as 64-bit integers,
#   3. keep only the first row per tweet id (it is too time consuming!),
#   4. renumber the index from zero.
df = (
    df
    .dropna(how='any', subset=['id', 'userId', 'createdAt', 'longitude', 'latitude'])
    .astype({'id': np.int64, 'userId': np.int64})
    .drop_duplicates(subset='id')
    .reset_index(drop=True)
)

In [9]:
# Add some columns for further analysis.
# The .dt accessor extracts the calendar parts for the whole column at once
# instead of calling a Python lambda for every row.
df['day'] = df['createdAt'].dt.day
df['month'] = df['createdAt'].dt.month
df['year'] = df['createdAt'].dt.year

# Number of rows each user has on each calendar day, repeated on every row
# of that (user, day) group.
daily_user = ['userId', 'year', 'month', 'day']
df['daily_tweets'] = df.groupby(by=daily_user)['userId'].transform('count')

In [10]:
# Keep only the rows of (user, day) groups that reach at least
# `threshold_tweets` tweets, then renumber the index from zero.
df = df.loc[df['daily_tweets'].ge(threshold_tweets)].reset_index(drop=True)

In [11]:
# Persist the cleaned dataframe as clean_data.pkl inside DIR_DATA,
# using the highest pickle protocol available.
with open(path.join(DIR_DATA, 'clean_data.pkl'), 'wb') as out_file:
    pickle.dump(df, out_file, protocol=pickle.HIGHEST_PROTOCOL)