In [1]:
import os
import sys
import socket
import re
import numpy as np
import string
import warnings
from timeit import default_timer as timer
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,dayofweek,to_timestamp,size,isnan
import pyspark.sql.functions as F
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType
from pyspark.sql import Window
from pyspark.ml.feature import RegexTokenizer

In [12]:
# Hand-written runtime note; presumably the duration of an earlier full run -- TODO confirm.
print('1690 sec')

1690 sec


# Config

In [3]:
# Work out once whether this is the local development machine or the cluster;
# both the session settings and the data location depend on it.
hostname = socket.gethostname()
on_local_machine = 'samuel' in hostname.lower()

try:
    spark  # reuse a session that already exists in this kernel
except NameError:
    builder = SparkSession.builder
    if on_local_machine:
        print('Create Local SparkSession')
        builder = builder.config("spark.driver.host", "localhost")
    else:
        print('Create Cluster SparkSession')
    spark = builder.appName("extract-data-from-geolocated-tweets").getOrCreate()

print('Hostname:', hostname)
# Relative path on the local machine, absolute path on the cluster.
path_to_tweets = (
    '../data/tweets/tweets-with-geocoordinates-or-place/'
    if on_local_machine
    else '/user/spf248/twitter/parsed/tweets/tweets-with-geocoordinates-or-place/'
)

Hostname: Samuels-MacBook-Pro.local


# Import Data

In [17]:
print('Import:')
start = timer()

# Reader settings shared by both reads below.
# multiLine is required: reading these files without it raised an error.
json_options = {
    'compression': 'bzip2',
    'multiLine': 'true',
    'encoding': 'UTF-8',
    'mode': 'FAILFAST',
}
sample_file = path_to_tweets + 'decahose/tweets-with-geocoordinates-or-place-from-decahose-partition-9-block-95.json.bz2'
all_files = path_to_tweets + 'decahose/tweets-with-geocoordinates-or-place-from-decahose-partition-*-block-*.json.bz2'

# Take the schema from a single block ...
df = spark.read.options(**json_options).json(sample_file)
schema = df.schema

# ... and apply it explicitly when reading every partition/block.
df = spark.read.options(**json_options).schema(schema).json(all_files)

end = timer()
print('Computing Time:', round(end - start), 'sec')

Import:
Computing Time: 2 sec


In [18]:
# Number of rows in the loaded DataFrame.
df.count()

7582

# Select Fields

In [9]:
# (source field, output column) pairs: each selected field is renamed right
# where it is picked, so the mapping cannot drift out of positional alignment.
field_to_column = [
    ('id_str', 'tweet_id'),
    ('created_at', 'created_at'),
    ('text', 'text'),
    ('extended_tweet.full_text', 'full_text'),
    ('truncated', 'truncated'),
    ('lang', 'tweet_lang'),
    ('user.id_str', 'user_id'),
    ('user.location', 'user_location'),
    ('coordinates.coordinates', 'tweet_coordinates'),
    ('place.id', 'place_id'),
]
df = df.select([F.col(source).alias(target) for source, target in field_to_column])

print("DROP DUPLICATE IDS")
# Keep a single row per tweet id.
df = df.dropDuplicates(['tweet_id'])

print("GET FULL TEXT")
# Use `full_text` for rows flagged `truncated`, and `text` for every other row
# (including rows where `truncated` is null).
# This is a native column expression rather than a Python UDF, so rows are not
# shipped to Python workers one at a time.
# If a truncated row has no `full_text`, fall back to `text` instead of
# writing a null.
use_full_text = F.col('truncated') & F.col('full_text').isNotNull()
df = df.withColumn(
    'text',
    F.when(use_full_text, F.col('full_text')).otherwise(F.col('text')),
).drop('truncated', 'full_text')

print("CLEAN TIME")
# Parse the created_at strings into timestamps.
# NOTE(review): Spark 3's default datetime parser does not accept 'E' in parse
# patterns -- confirm the Spark version in use, or that
# spark.sql.legacy.timeParserPolicy is set to LEGACY.
timestamp_format = "EEE MMM dd HH:mm:ss ZZZZZ yyyy"
df = df.withColumn('created_at', F.to_timestamp(F.col('created_at'), timestamp_format))

print('CLEAN COORDINATES')
# Element 0 of the coordinates array becomes the longitude column and
# element 1 the latitude column; the array itself is then dropped.
lon_lat = F.col('tweet_coordinates')
df = (
    df
    .withColumn('tweet_longitude', lon_lat[0])
    .withColumn('tweet_latitude', lon_lat[1])
    .drop('tweet_coordinates')
)

DROP DUPLICATE IDS
GET FULL TEXT
CLEAN TIME
CLEAN COORDINATES


In [13]:
# Preview the extracted columns (DataFrame.show() prints the first 20 rows by default).
df.show()

+------------------+-------------------+--------------------+----------+------------------+--------------------+----------------+---------------+--------------+
|          tweet_id|         created_at|                text|tweet_lang|           user_id|       user_location|        place_id|tweet_longitude|tweet_latitude|
+------------------+-------------------+--------------------+----------+------------------+--------------------+----------------+---------------+--------------+
|812406766864658433|2016-12-23 16:17:21|Maquilladora Aino...|        es|         476810112|     Senija//Benissa|21b8315555fd34c4|           null|          null|
|812409837086736384|2016-12-23 16:29:33|amem selena gomez...|        en|        2291154534|                null|012d81605af0416c|           null|          null|
|812411569346932737|2016-12-23 16:36:26|Entre erros e ace...|        pt|         164508100|      Rio de Janeiro|5a4ba2b4cd9a0b2c|           null|          null|
|812413133813874688|2016-12-23 16:

In [11]:
print('SAVE TO PARQUET')
start = timer()

# Write the extract under path_to_tweets, replacing any previous output.
output_path = path_to_tweets + 'extract'
df.write.parquet(output_path, mode='overwrite')

end = timer()
print('DONE IN', round(end - start), 'SEC')

SAVE TO PARQUET
DONE IN 18 SEC


In [2]:
# Shape check (rows, columns), left commented out; value recorded from an earlier run:
# print((df.count(), len(df.columns)))
# (1419941194, 9)

(1419941194, 9)   

In [3]:
# Count of rows with no tweet latitude, left commented out; value recorded from an earlier run:
# df.where(col("tweet_latitude").isNull()).count()
# 622733742