In [1]:
import os
import sys
import socket
import re
import numpy as np
import string
import warnings
from timeit import default_timer as timer
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,dayofweek,to_timestamp,size,isnan
import pyspark.sql.functions as F
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType

In [None]:
# Reuse `spark` when the name is already defined in the kernel namespace;
# otherwise build a session here.
try:
    spark
except NameError:
    session_builder = SparkSession.builder
    if 'samuel' in socket.gethostname().lower():
        # Hostname contains 'samuel': pin the driver host to localhost.
        print('Create Local SparkSession')
        session_builder = session_builder.config("spark.driver.host", "localhost")
    else:
        print('Create Cluster SparkSession')
    spark = session_builder.appName("extract-data-from-geolocated-tweets").getOrCreate()

In [6]:
# Choose the data directories for the machine this kernel runs on: a hostname
# containing 'samuel' (case-insensitive) selects the local relative paths,
# any other hostname selects the cluster paths.
hostname = socket.gethostname()
print('Hostname:', hostname)
running_locally = 'samuel' in hostname.lower()
path_to_tweets = (
    '../../data/tweets/tweets-with-geocoordinates-or-place/'
    if running_locally
    else '/user/spf248/twitter/data/tweets/tweets-with-geocoordinates-or-place/'
)
path_to_locations = (
    '../../data/locations/profiles/'
    if running_locally
    else '/user/spf248/twitter/data/locations/profiles/'
)

Hostname: Samuels-MacBook-Pro.local


In [28]:
# Read the 'geocoded' parquet dataset under path_to_tweets and mark it for caching.
geocoded_path = path_to_tweets + 'geocoded'
df = spark.read.parquet(geocoded_path)
# Kept as the cell's last expression so the DataFrame schema is displayed.
df.cache()

DataFrame[tweet_id: string, created_at: timestamp, text: string, tweet_lang: string, user_id: string, user_location: string, place_id: string, tweet_longitude: double, tweet_latitude: double, rg_name: string, rg_admin1: string, rg_admin2: string, rg_cc: string]

In [29]:
# Read the account-locations CSV (header row, multi-line fields allowed) and
# keep only the location string plus its country code, renamed to `cc`.
locations_csv = path_to_locations + 'account-locations.csv'
locations = (
    spark.read
    .option('header', 'true')
    .option('multiLine', 'true')
    .csv(locations_csv)
    .selectExpr('user_location as user_location', 'country_short as cc')
)

In [30]:
# Row count and number of distinct user_id values in the loaded tweets.
tweet_count = df.count()
print('# Tweets:', tweet_count)
user_count = df.select('user_id').distinct().count()
print('# Users:', user_count)

# Tweets: 414
# Users: 401


In [31]:
print('SELECT TWEETS WITH IDENTIFIED LOCATION')
# Inner join: only rows whose user_location also appears in `locations` survive,
# and the columns of `locations` are attached to them.
df = df.join(other=locations, on='user_location', how='inner')

SELECT TWEETS WITH IDENTIFIED LOCATION


In [32]:
# Row count and number of distinct user_id values after the location join.
tweet_count = df.count()
print('# Tweets:', tweet_count)
user_count = df.select('user_id').distinct().count()
print('# Users:', user_count)

# Tweets: 233
# Users: 226


In [38]:
print('SELECT TWEETS IF GEOLOCATED IN A USER"S COUNTRY')
# Keep rows where rg_cc equals cc, then drop the helper column `cc`.
same_country = F.col('rg_cc') == F.col('cc')
df = df.where(same_country).drop('cc')

SELECT TWEETS IF GEOLOCATED IN A USER"S COUNTRY


In [39]:
# Row count and number of distinct user_id values after the country filter.
tweet_count = df.count()
print('# Tweets:', tweet_count)
user_count = df.select('user_id').distinct().count()
print('# Users:', user_count)

# Tweets: 211
# Users: 206


In [40]:
# Preview the result; the arguments spell out show()'s defaults
# (first 20 rows, long cell values truncated).
df.show(n=20, truncate=True)

+------------------+------------------+-------------------+--------------------+----------+------------------+----------------+---------------+--------------+--------------------+--------------------+--------------------+-----+
|     user_location|          tweet_id|         created_at|                text|tweet_lang|           user_id|        place_id|tweet_longitude|tweet_latitude|             rg_name|           rg_admin1|           rg_admin2|rg_cc|
+------------------+------------------+-------------------+--------------------+----------+------------------+----------------+---------------+--------------+--------------------+--------------------+--------------------+-----+
|            London|812409988060684288|2016-12-23 16:30:09|#londonlife #desi...|        pt|          53126594|5de8cffc145c486b|        -0.1345|       51.5207|              London|             England|      Greater London|   GB|
|            Brasil|812407198865391616|2016-12-23 16:19:04|#reuniaodefechame...|       u

In [37]:
print('SAVE')
# Write the current `df` as parquet under path_to_tweets, replacing any existing
# 'identified' output, and report the wall-clock time rounded to whole seconds.
output_path = path_to_tweets + 'identified'
start = timer()
df.write.mode("overwrite").parquet(output_path)
end = timer()

elapsed_sec = round(end - start)
print('DONE IN', elapsed_sec, 'SEC')

SAVE
DONE IN 1 SEC
