In [1]:
import os
import sys
import socket
import re
import numpy as np
import string
import warnings
from timeit import default_timer as timer
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,dayofweek,to_timestamp,size,isnan
import pyspark.sql.functions as F
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType

# Config

In [3]:
# Country whose timelines are processed in this run. The value is also used as a
# path component: input files are read from <path_to_data>/<country_name>/ and the
# extract is written to <path_to_data>/extract/<country_name>.
country_name = "mexico"
print('Country:', country_name)

Country: mexico


In [4]:
# Decide once whether this is the local development machine (hostname contains
# 'samuel'); every other host is treated as the cluster.
hostname = socket.gethostname()
running_locally = 'samuel' in hostname.lower()

# Reuse `spark` when the kernel already defines it; only build a session otherwise.
try:
    spark
except NameError:
    if running_locally:
        print('Create Local SparkSession')
        session_builder = SparkSession.builder.config("spark.driver.host", "localhost")
    else:
        print('Create Cluster SparkSession')
        session_builder = SparkSession.builder
    spark = session_builder.appName("extract-timelines").getOrCreate()

# ignoreCorruptFiles
spark.conf.set("spark.sql.files.ignoreCorruptFiles", "true")

# Root directory of the timelines data: relative path locally, absolute path on the cluster.
print('Hostname:', hostname)
path_to_data = '../../data/timelines/' if running_locally else '/user/spf248/twitter/data/timelines/'

Hostname: Samuels-MacBook-Pro.local


# Import Data

In [5]:
print('Import:')
start = timer()

# Read every bzip2-compressed JSON file under <path_to_data>/<country_name>/.
# `path_to_data` is the host-specific root selected in the session cell; it is
# deliberately NOT reassigned here, so a local run reads (and later writes) under
# the local data directory instead of always using the cluster path.
input_glob = os.path.join(path_to_data, country_name, '*.json.bz2')

# multiLine is required: reading these files without it raised an error.
df = (
    spark.read
    .option('compression', 'bzip2')
    .option("multiLine", "true")
    .option("encoding", "UTF-8")
    .json(input_glob)
)

end = timer()
print('Computing Time:', round(end - start), 'sec')

Import:
Computing Time: 8 sec


In [6]:
# Mark the raw DataFrame for caching; as the last expression of the cell, the call's
# return value (the DataFrame repr with its full schema) is echoed as the cell output.
# NOTE(review): this caches every field of the raw tweets although the next cell keeps
# only eight columns -- consider caching after the select instead; confirm with the author.
df.cache()

DataFrame[contributors: string, coordinates: struct<coordinates:array<double>,type:string>, created_at: string, display_text_range: array<bigint>, entities: struct<hashtags:array<struct<indices:array<bigint>,text:string>>,media:array<struct<display_url:string,expanded_url:string,id:bigint,id_str:string,indices:array<bigint>,media_url:string,media_url_https:string,sizes:struct<large:struct<h:bigint,resize:string,w:bigint>,medium:struct<h:bigint,resize:string,w:bigint>,small:struct<h:bigint,resize:string,w:bigint>,thumb:struct<h:bigint,resize:string,w:bigint>>,source_status_id:bigint,source_status_id_str:string,source_user_id:bigint,source_user_id_str:string,type:string,url:string>>,symbols:array<struct<indices:array<bigint>,text:string>>,urls:array<struct<display_url:string,expanded_url:string,indices:array<bigint>,url:string>>,user_mentions:array<struct<id:bigint,id_str:string,indices:array<bigint>,name:string,screen_name:string>>>, extended_entities: struct<media:array<struct<addition

# Select Fields

In [8]:
# (field in the raw tweet JSON, column name in the extract), in output column order.
field_to_column = [
    ('id_str', 'tweet_id'),
    ('created_at', 'created_at'),
    ('full_text', 'text'),
    ('lang', 'tweet_lang'),
    ('user.id_str', 'user_id'),
    ('user.location', 'user_location'),
    ('coordinates.coordinates', 'tweet_coordinates'),
    ('place.id', 'place_id'),
]

# Select and rename in a single pass; aliasing avoids the transient duplicate
# 'id_str' column names that a plain select of nested fields would produce.
df = df.select(*[F.col(source).alias(target) for source, target in field_to_column])

print("DROP DUPLICATE IDS")
# Keep a single row per tweet id.
df = df.dropDuplicates(['tweet_id'])

print("CLEAN TIME")
# Parse the created_at string into a timestamp.
# NOTE(review): pattern letters such as 'EEE' and 'ZZZZZ' are handled differently by
# the Spark 2 and Spark 3 datetime parsers -- confirm against the Spark version in use.
created_at_format = "EEE MMM dd HH:mm:ss ZZZZZ yyyy"
df = df.withColumn('created_at', to_timestamp('created_at', created_at_format))

print('CLEAN COORDINATES')
# Split the coordinates array: element 0 becomes the longitude, element 1 the latitude.
coordinates = F.col('tweet_coordinates')
df = (
    df.withColumn('tweet_longitude', coordinates.getItem(0))
    .withColumn('tweet_latitude', coordinates.getItem(1))
    .drop('tweet_coordinates')
)

DROP DUPLICATE IDS
CLEAN TIME
CLEAN COORDINATES


In [10]:
print('SAVE TO PARQUET')
start = timer()

# The extract is written to <path_to_data>/extract/<country_name>; any existing
# output at that location is replaced ("overwrite" mode).
output_dir = os.path.join(path_to_data, 'extract', country_name)
df.write.parquet(output_dir, mode="overwrite")

end = timer()
print('DONE IN', round(end - start), 'SEC')

SAVE TO PARQUET
DONE IN 15 SEC
