In [None]:
import os
import sys
import socket
import re
import numpy as np
import string
import warnings
from timeit import default_timer as timer
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,dayofweek,to_timestamp,size,isnan,lit
import pyspark.sql.functions as F
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType

In [None]:
# Scratch check: difference between two large integers, scaled from
# milliseconds to hours (1000 ms * 3600 s = 3600000 ms per hour).
# NOTE(review): the operands look like epoch-millisecond timestamps — confirm their origin.
(1576605423886 - 1576601639442) / 3600000

# Config

In [None]:
# Country to extract: compared against the `country_short` column of the
# locations table and used as the sub-folder name of the Parquet output.
country_code = "FR"

In [None]:
# Reuse `spark` when the kernel already defines it; otherwise build a session.
try:
    spark
except NameError:
    session_builder = SparkSession.builder.appName(
        "get-tweets-with-identified-location-extract")
    running_locally = 'samuel' in socket.gethostname().lower()
    if running_locally:
        print('Create Local SparkSession')
        # Local runs additionally pin the driver host to localhost.
        session_builder = session_builder.config("spark.driver.host", "localhost")
    else:
        print('Create Cluster SparkSession')
    spark = session_builder.getOrCreate()

# Applied whether the session was reused or freshly created.
spark.conf.set('spark.sql.session.timeZone', 'UTC')

In [None]:
# Pick the data directories from the host name: a name containing 'samuel'
# selects the local relative paths, anything else the cluster paths.
hostname = socket.gethostname()
print('Hostname:', hostname)
if 'samuel' not in hostname.lower():
    # Cluster
    path_to_tweets = '/user/spf248/twitter/data/tweets/tweets-with-identified-location/'
    path_to_locations = '/user/spf248/twitter/data/locations/profiles/'
else:
    # Local
    path_to_tweets = '../data/tweets/tweets-with-identified-location/'
    path_to_locations = '../data/locations/profiles/'

# Import Dataset

In [None]:
print('Import')
start = timer()

# Reader options shared by both passes below.
# multiLine allows for \n inside a field.
csv_options = {
    'compression': 'bzip2',
    'header': 'true',
    'multiLine': 'true',
    'mode': 'FAILFAST',
    'escape': '"',
    'encoding': 'UTF-8',
}
decahose_dir = os.path.join(path_to_tweets, 'decahose')

# Pass 1: read a single block; only its column schema is kept.
schema_sample_file = os.path.join(
    decahose_dir,
    'tweets-with-identified-location-from-decahose-partition-9-block-9.csv.bz2')
schema = spark.read.options(**csv_options).csv(schema_sample_file).schema

# Pass 2: read every partition/block file, applying that schema explicitly.
all_blocks_glob = os.path.join(
    decahose_dir,
    'tweets-with-identified-location-from-decahose-partition-*-block-*.csv.bz2')
df = spark.read.options(**csv_options).schema(schema).csv(all_blocks_glob)

end = timer()
print('Computing Time:', round(end - start), 'sec')

In [None]:
print('Import Identified Locations')

# Build the path with os.path.join, like the tweet paths in this notebook,
# so the read does not depend on `path_to_locations` ending with a slash.
locations_file = os.path.join(path_to_locations, 'account-locations.csv')

locations = spark.read.option(
'header','true').option(
"multiLine", "true").csv(
locations_file)

# Keep only the rows for the configured country, then only the
# `user_location` column (the key used to join against the tweets).
locations = locations.where(locations.country_short == country_code)
locations = locations.select('user_location')

# Process Data

In [None]:
# (source column, output column) pairs, listed in the final column order.
column_renames = [
    ('id_str', 'tweet_id'),
    ('created_at', 'created_at'),
    ('text', 'text'),
    ('lang', 'tweet_lang'),
    ('user_id_str', 'user_id'),
    ('user_location', 'user_location'),
    ('place_id', 'place_id'),
    ('tweet_longitude', 'tweet_longitude'),
    ('tweet_latitude', 'tweet_latitude'),
]

print('Drop Index')
df = df.drop('_c0')

print('Select Rows in Identified Locations')
# Inner join: only tweets whose user_location appears in `locations` survive.
df = df.join(locations, on=['user_location'], how='inner')

print("DROP DUPLICATE IDS")
df = df.dropDuplicates(['id_str'])

print("CLEAN TIME")
# NOTE(review): the pattern looks like Twitter's `created_at` layout — confirm
# it parses as intended under the Spark version in use.
created_at_ts = to_timestamp('created_at', "EEE MMM dd HH:mm:ss ZZZZZ yyyy")
df = df.withColumn('created_at', created_at_ts)

print("REORDER COLUMNS")
df = df.select(*[source for source, _ in column_renames])

print("RENAME COLUMNS")
df = df.toDF(*[target for _, target in column_renames])

In [None]:
# Save the processed tweets as Parquet under <path_to_tweets>/<country_code>,
# overwriting any output already at that location.
df.write.parquet(os.path.join(path_to_tweets, country_code), mode="overwrite")

# Split By Month and Year

In [1]:
# def month_year_iter( start_month, start_year, end_month, end_year ):
#     ym_start= 12*start_year + start_month - 1
#     ym_end= 12*end_year + end_month - 1
#     for ym in range( ym_start, ym_end ):
#         y, m = divmod( ym, 12 )
#         yield y, m+1

In [None]:
# print("EXTRACT YEAR AND MONTH")
# df=df.withColumn('year',year('created_at').cast("string"))
# df=df.withColumn('month',month('created_at').cast("string"))

In [None]:
# print('SAVE TO PARQUET')
# start = timer()

# dates=list(month_year_iter(1,2012,1,2020))

# for i in range(len(dates)-1):
    
#     df.filter((df["year"]==dates[i][0])&(df["month"]==dates[i][1])).drop('year','month').write.mode(
#     "overwrite").parquet(os.path.join(path_to_tweets,country_code,str(dates[i][0])+'-'+str(dates[i][1])))

# end = timer()
# print('DONE IN', round(end - start), 'SEC')

country_code='US'

df.count()=3,808,797,523 