This notebook shows how to transform tweets stored in line-oriented JSON files into data that's useful for topic modeling!

In [1]:
# this is big data
# use Spark
from pyspark.sql import SparkSession
from pyspark.sql.functions import element_at, col, coalesce, length, max as max_, concat_ws

# Obtain the Spark session used by every cell below; getOrCreate() hands back
# an already-running session if there is one instead of starting a second.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [3]:
# Load every file under compressed_tweets/ as line-oriented JSON.
# The tweet files are about 6 GB each and are stored as .gz archives,
# which Spark can read directly.
df = spark.read.format("json").load("compressed_tweets/*")

In [5]:
# Collapse the per-tweet arrays of mentions, hashtags and URLs into single
# space-separated strings so the result converts cleanly to pandas later.
# (Per the original author: pyarrow, the pandas conversion engine, could not
# handle array columns coming from Spark at the time of writing.)
df = (
    df
    .withColumn("mentions", concat_ws(" ", col("entities.user_mentions.screen_name")))
    .withColumn("hashtags", concat_ws(" ", col("entities.hashtags.text")))
    .withColumn("urls", concat_ws(" ", col("entities.urls.display_url")))
)

In [6]:
# Keep only tweets whose language code is one the author can read:
# English, Turkish or French.
df = df.filter(col("lang").isin(["en", "tr", "fr"]))

In [7]:
# Narrow the frame down to the handful of fields needed downstream.
# A full tweet object carries far more than this; the complete field list is at
# https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/intro-to-tweet-json
df = df.select(
    # core tweet fields
    "created_at",
    "full_text",
    "lang",
    # engagement counters
    "favorite_count",
    "retweet_count",
    # nested fields pulled out of the `place` struct
    "place.country",
    "place.place_type",
    "place.full_name",
    # space-joined entity strings built in the earlier cell
    "mentions",
    "urls",
    "hashtags",
)

In [20]:
# write json
# I had best results with pandas-spark intercompatibility using JSONs
# Spark writes a *directory* named parsed_tweets3 containing part files.
# The default save mode raises an error when that directory already exists,
# which makes this cell fail on every re-run; "overwrite" keeps the notebook
# re-runnable. NOTE: this replaces any previous contents of parsed_tweets3.
df.write.mode("overwrite").json("parsed_tweets3")

In [26]:
# test reading with pandas
# spark writes line-oriented JSONs - one complete JSON object per line
from pathlib import Path

import pandas as pd

# The part-file names embed an identifier generated at write time, so a
# hard-coded file name stops working as soon as the write cell is re-run.
# Discover the part files instead and read the first one as a smoke test.
part_files = sorted(Path("parsed_tweets3").glob("part-*.json"))
if not part_files:
    raise FileNotFoundError(
        "no part-*.json files found in parsed_tweets3 - run the write cell first"
    )
test_df = pd.read_json(part_files[0], lines=True)