In [None]:
import os
import sys
import socket
import re
import numpy as np
import string
import warnings
from timeit import default_timer as timer
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,dayofweek,to_timestamp,size,isnan,when,count,col,count,lit,sum
import pyspark.sql.functions as F
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType
from py4j.java_gateway import java_import
from functools import reduce
from pyspark.sql import DataFrame

# Config

In [None]:
# Country whose timelines are processed. The value is used as a sub-directory
# name, both when listing the input files and when writing the output chunks.
country_code = "US"
print('Country:', country_code)

In [None]:
# Resolve the hostname once: a machine whose name contains 'samuel' is treated
# as the local development box, anything else as the cluster.
hostname = socket.gethostname()
is_local = 'samuel' in hostname.lower()

try:
    # Reuse a session that already exists in this kernel, if any.
    spark
except NameError:
    if is_local:
        print('Create Local SparkSession')
        spark = (
            SparkSession.builder
            .config("spark.driver.host", "localhost")
            .appName("extract-timelines")
            .getOrCreate()
        )
    else:
        print('Create Cluster SparkSession')
        spark = SparkSession.builder.appName("extract-timelines").getOrCreate()

# IgnoreCorruptFiles
spark.conf.set("spark.sql.files.ignoreCorruptFiles", "true")

print('Hostname:', hostname)
# Root directory of the timelines data: relative path locally, absolute path on the cluster.
path_to_data = '../../data/timelines/' if is_local else '/user/spf248/twitter/data/timelines/'

In [None]:
print('List files to be processed...')

# Ask the Hadoop FileSystem configured for this Spark session for the contents
# of the country's input directory.
hadoop_fs_module = spark._jvm.org.apache.hadoop.fs
fs = hadoop_fs_module.FileSystem.get(spark._jsc.hadoopConfiguration())
list_status = fs.listStatus(hadoop_fs_module.Path(os.path.join(path_to_data, country_code)))

# Keep only the 'json.bz2' files, strip the 'hdfs://dumbo' prefix, and sort so
# that the seeded shuffle below always starts from the same ordering.
paths = sorted(
    uri.replace('hdfs://dumbo', '')
    for uri in (status.getPath().toString() for status in list_status)
    if 'json.bz2' in uri
)

# Fixed seed: the shuffled order (and therefore the chunk contents) is reproducible.
np.random.seed(0)
paths = np.random.permutation(paths)
print('# Files:', len(paths))

In [None]:
# Number of groups the shuffled file list is split into. np.array_split accepts
# uneven splits; when there are fewer files than chunks, some chunks are empty.
n_chunks=10
print('# Chunks:', n_chunks)
paths_chunks=np.array_split(paths, n_chunks)

# Offset added to each chunk index when extract_chunk names its output directory.
n_existing=0
print('# Existing Chunks:',n_existing)

# Process Data

In [None]:
def extract_chunk(i_chunk,paths_chunk):
    """Convert one chunk of compressed tweet JSON files into a parquet dataset.

    Reads the given bzip2-compressed, multi-line JSON files, keeps a fixed set
    of tweet/user/place fields under flat column names, parses ``created_at``
    into a timestamp, splits the coordinates array into longitude (item 0) and
    latitude (item 1), and writes the result in overwrite mode to
    ``<path_to_data>/chunks/<country_code>/<i_chunk + n_existing>``.

    Relies on the notebook globals ``spark``, ``path_to_data``,
    ``country_code`` and ``n_existing``.

    Parameters
    ----------
    i_chunk : int
        Zero-based index of the chunk; combined with ``n_existing`` to name
        the output directory.
    paths_chunk : sequence of str
        Paths of the JSON files that make up this chunk.

    Returns
    -------
    None
        The function only has side effects. An empty ``paths_chunk`` is
        skipped with a message and nothing is read or written.
    """
    input_paths = list(paths_chunk)
    if not input_paths:
        # np.array_split yields empty chunks when there are fewer files than
        # chunks; handing an empty list to the JSON reader would only fail
        # with an unhelpful Spark error, so skip explicitly instead.
        print('SKIP EMPTY CHUNK', i_chunk)
        return

    df = (
        spark.read
        .option("compression", "bzip2")
        .option("multiLine", "true")
        .option("encoding", "UTF-8")
        .json(input_paths)
    )

    # (source field, output column) pairs. Aliasing each field directly keeps
    # the mapping explicit instead of depending on positional renaming.
    field_to_column = [
        ('id_str', 'tweet_id'),
        ('created_at', 'created_at'),
        ('full_text', 'text'),
        ('lang', 'tweet_lang'),
        ('user.id_str', 'user_id'),
        ('user.location', 'user_location'),
        ('coordinates.coordinates', 'tweet_coordinates'),
        ('place.id', 'place_id'),
    ]
    df = df.select(*[F.col(source).alias(column) for source, column in field_to_column])

    # NOTE(review): the meaning of these pattern letters depends on the Spark
    # version's datetime parser - confirm against the cluster before upgrading.
    df = df.withColumn('created_at', to_timestamp('created_at', "EEE MMM dd HH:mm:ss ZZZZZ yyyy"))
    df = df.withColumn('tweet_longitude', F.col('tweet_coordinates').getItem(0))
    df = df.withColumn('tweet_latitude', F.col('tweet_coordinates').getItem(1))
    df = df.drop('tweet_coordinates')

    output_dir = os.path.join(path_to_data, 'chunks', country_code, str(i_chunk + n_existing))
    df.write.mode("overwrite").parquet(output_dir)

In [None]:
# Process every chunk in turn. A failing chunk is reported and skipped so the
# remaining chunks still run.
for i_chunk,paths_chunk in enumerate(paths_chunks):

    print('EXTRACT CHUNK', i_chunk)
    start = timer()

    try:
        extract_chunk(i_chunk,paths_chunk)
    except Exception as error:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt / SystemExit can still stop a long run, and show
        # what went wrong instead of discarding it.
        print('ERROR WITH CHUNK', i_chunk)
        print(type(error).__name__ + ':', error)
    else:
        end = timer()
        print('TIME:', round(end - start), 'SEC')

    print()

In [5]:
# Difference between two hard-coded timestamps, divided by 1000*3600, i.e.
# expressed in hours if the literals are in milliseconds.
# NOTE(review): presumably the job's start/end epoch times copied in by hand - confirm.
print('Computing Time:',round((1579882418733-1579876096267)/(1000*3600),2))

Computing Time: 1.76
