In [16]:
import os
import sys
import socket
import re
import numpy as np
import string
import warnings
from timeit import default_timer as timer
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,dayofweek,to_timestamp,size,isnan
import pyspark.sql.functions as F
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType

# Config

In [6]:
# Country whose identified locations are kept: compared with the
# `country_short` column of the locations table and appended to the
# name of the parquet output.
country_code = 'US'

In [5]:
# Evaluate the "local machine" test once; it drives both the session
# settings and the data locations below.
hostname = socket.gethostname()
running_locally = 'samuel' in hostname.lower()

# Reuse `spark` when the kernel already defines it; build a session otherwise.
try:
    spark
except NameError:
    app_name = "extract-data-from-geolocated-tweets"
    if running_locally:
        print('Create Local SparkSession')
        # The local session sets spark.driver.host to localhost.
        spark = (
            SparkSession.builder
            .config("spark.driver.host", "localhost")
            .appName(app_name)
            .getOrCreate()
        )
    else:
        print('Create Cluster SparkSession')
        spark = SparkSession.builder.appName(app_name).getOrCreate()

print('Hostname:', hostname)

if not running_locally:
    # Cluster: absolute paths.
    path_to_tweets = '/user/spf248/twitter/data/decahose/parsed/tweets/tweets-with-identified-location/'
    path_to_locations = '/user/spf248/twitter/data/decahose/parsed/locations/'
else:
    # Local: relative paths.
    path_to_tweets = '../data/decahose/parsed/tweets/tweets-with-identified-location/'
    path_to_locations = '../data/decahose/parsed/locations/'

Hostname: Samuels-MBP


# Import Dataset

In [3]:
print('Import')
start = timer()

# CSV reader settings shared by both reads below.
# multiLine allows for \n inside a field.
csv_options = {
    'compression': 'bzip2',
    'header': 'true',
    'multiLine': 'true',
    'mode': 'DROPMALFORMED',
    'escape': '"',
}

# NOTE(review): the recorded output of this cell is an AnalysisException
# ("Path does not exist") for this sample file on the local machine --
# confirm the file is available before running.
sample_file = path_to_tweets + 'tweets-with-identified-location-from-decahose-partition-9-block-9.csv.bz2'
all_files = path_to_tweets + 'tweets-with-identified-location-from-decahose-partition-*-block-*.csv.bz2'

# Read one file only to obtain its schema, then apply that schema to
# every partition/block file matched by the glob.
schema = spark.read.options(**csv_options).csv(sample_file).schema
df = spark.read.options(**csv_options).schema(schema).csv(all_files)

end = timer()
print('Computing Time:', round(end - start), 'sec')

Import


AnalysisException: 'Path does not exist: file:/Users/samuelfraiberger/Dropbox/Work/Projects/twitter/data/decahose/parsed/tweets/tweets-with-identified-location/tweets-with-identified-location-from-decahose-partition-9-block-9.csv.bz2;'

In [13]:
print('Import Identified Locations')

locations_csv = path_to_locations + 'account-locations-identified.csv'
all_locations = (
    spark.read
    .option('header', 'true')
    .option("multiLine", "true")
    .csv(locations_csv)
)

# Keep the locations whose country_short equals country_code, and expose
# the LOCATION column under the name used by the tweets table.
identified_locations = (
    all_locations
    .filter(all_locations.country_short == country_code)
    .select(col('LOCATION').alias('user_location'))
)

Import Identified Locations


# Process Data

In [None]:
# (source column, output column) pairs, listed in output order.
column_map = [
    ('id_str', 'tweet_id'),
    ('created_at', 'created_at'),
    ('text', 'text'),
    ('lang', 'tweet_lang'),
    ('user_id_str', 'user_id'),
    ('user_location', 'user_location'),
    ('place_id', 'place_id'),
    ('tweet_longitude', 'tweet_longitude'),
    ('tweet_latitude', 'tweet_latitude'),
]

print('Drop Index')
df = df.drop('_c0')

print('Select Rows in Identified Locations')
# The inner join keeps only rows whose user_location is in identified_locations.
df = df.join(identified_locations, on='user_location', how='inner')

print("DROP DUPLICATE IDS")
df = df.dropDuplicates(['id_str'])

print("CLEAN TIME")
# Parse the created_at strings into timestamps with this pattern.
created_at_format = "EEE MMM dd HH:mm:ss ZZZZZ yyyy"
df = df.withColumn('created_at', to_timestamp('created_at', created_at_format))

print("REORDER COLUMNS")
df = df.select(*[source_name for source_name, _output_name in column_map])

print("RENAME COLUMNS")
df = df.toDF(*[output_name for _source_name, output_name in column_map])

In [None]:
print('SAVE TO PARQUET')
start = timer()

# The output path goes one level up ("../") from path_to_tweets and ends
# with the country code; any existing output there is overwritten.
output_path = path_to_tweets + '../tweets-with-identified-location-' + country_code
df.write.parquet(output_path, mode="overwrite")

end = timer()
print('DONE IN', round(end - start), 'SEC')

In [1]:
# Recorded result for country_code = "US":
# >>> df.count()
# 3808797523