In [2]:
from pyspark.sql import SparkSession
import pandas

In [3]:
# Build (or reuse) a local Spark session that uses every available core.
# The session time zone is pinned to UTC so timestamp parsing/rendering does not
# depend on the machine's local zone. NOTE: the key must be spelled exactly
# 'spark.sql.session.timeZone' -- Spark silently accepts unknown config keys,
# so a typo here would leave the time zone at its default without any error.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Test")
    .config('spark.sql.session.timeZone', 'UTC')
    .getOrCreate()
)

# Bare expression so the notebook renders the session summary.
spark

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/12/09 23:05:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Load the raw data and register it as the temporary SQL view "tweets"

# Method 1: DataFrameReader API (comma-delimited file with a header row)
(
    spark.read
    .option("header", True)
    .option("delimiter", ",")
    .csv("../data/reCOVery/recovery-social-media-data.txt")
    .createOrReplaceTempView("tweets")
)

# Method 2: pure Spark SQL alternative (kept for reference, not executed)
# spark.sql("""
# CREATE OR REPLACE TEMPORARY VIEW tweets
# USING CSV
# OPTIONS (
#     path './data/full_dataset_clean.tsv',
#     header true,
#     delimiter '\t'
# )
# """)

In [6]:
# Total number of rows in the "tweets" view, shown as a one-cell pandas frame.
row_count_sql = """
SELECT count(*)
FROM tweets
"""
spark.sql(row_count_sql).toPandas()

Unnamed: 0,count(1)
0,140820


In [5]:
# Add a dense, 0-based index column to fake "pagination"
#
# monotonically_increasing_id() alone is NOT suitable as a page index: it is
# only guaranteed to be unique and increasing, not consecutive (the partition
# id is encoded in the upper bits), so `idx BETWEEN a AND b` would skip rows as
# soon as the data is read into more than one partition.
# row_number() over that ordering yields consecutive values 1..N; subtracting 1
# makes the index start at 0, which is what the paging cell expects.
# The un-partitioned window pulls all rows into a single partition; that is
# acceptable for a dataset of this size.

spark.sql("""
SELECT *, row_number() OVER (ORDER BY monotonically_increasing_id()) - 1 AS idx
FROM tweets
""").createOrReplaceTempView("tweets2")

In [7]:
# Walk through tweets2 in pages of 100 tweet ids, since 100 is the max amount
# the twitter api tweets endpoint takes per request.
# `start`/`end` are the inclusive idx bounds of the current page.

start, end = 0, 99
# Only the first two pages are printed as a demo; use `start < 140820` for the full data set.
while start < 200:
    page = spark.sql(f"""
        SELECT tweet_id, idx
        FROM tweets2
        WHERE idx BETWEEN {start} AND {end}
    """).toPandas()
    print(page['tweet_id'])
    # Slide the window forward by one full page.
    start, end = end + 1, end + 100

0     1252387836239593472
1     1223121049325228034
2     1223121502838521861
3     1223141036354162689
4     1223148934538854400
             ...         
95    1222184504539975680
96    1222184407466799105
97    1222184314307252225
98    1222163772061581312
99    1222163936302129158
Name: tweet_id, Length: 100, dtype: object
0     1222164889462022144
1     1222165280048271360
2     1222165392795164672
3     1222165620738928640
4     1222166947820793856
             ...         
95    1225014628339666945
96    1225014885823606785
97    1225034168184131584
98    1225034300070006786
99    1225045769113063424
Name: tweet_id, Length: 100, dtype: object


In [8]:
# Example of left-padding numbers with zeros to a fixed length (9 characters) for filenames

for i in range(10):
    padded = f"{i:09d}"
    print(padded)

000000000
000000001
000000002
000000003
000000004
000000005
000000006
000000007
000000008
000000009
