# Read parquet

In [None]:
%pyspark
# Locations of the three parquet datasets (training / validation / test).
training_parquet_path = "/PATH/training_df"
validation_parquet_path = "/PATH/val_df"
test_parquet_path = "/PATH/test_df"

# Load each split through the SQL context, in training -> validation -> test order.
training_df, validation_df, test_df = (
    sqc.read.parquet(parquet_path)
    for parquet_path in (
        training_parquet_path,
        validation_parquet_path,
        test_parquet_path,
    )
)

# Flow of engagements in the first 24 hours

In [None]:
%pyspark
# Predicate that holds for rows where all four engagement timestamps are null,
# i.e. rows that carry no engagement at all.
none_engagments = (
    F.col("reply_engagement_timestamp").isNull()
    & F.col("retweet_engagement_timestamp").isNull()
    & F.col("retweet_with_comment_engagement_timestamp").isNull()
    & F.col("like_engagement_timestamp").isNull()
)

# (timestamp column, engagement label) pairs, listed in the order in which
# they are packed into the exploded array.
_engagement_sources = [
    ("like_engagement_timestamp", "like"),
    ("reply_engagement_timestamp", "reply"),
    ("retweet_engagement_timestamp", "retweet"),
    ("retweet_with_comment_engagement_timestamp", "rtWithcmt"),
]

# For every engagement type build a [timestamp, label] pair, or null when that
# engagement did not happen on the row.
_tagged_engagements = [
    F.when(F.col(ts_column).isNotNull(), F.array(F.col(ts_column), F.lit(label)))
    for ts_column, label in _engagement_sources
]

# One output row per (user, engagement): explode the per-type pairs, discard
# the null entries, then split each pair back into timestamp and type.
_engagement_events = (
    training_df
    .filter(~none_engagments)
    .select(F.col("engagee_user_id"), F.explode(F.array(*_tagged_engagements)))
    .filter(F.col("col").isNotNull())
    .select(
        F.col("engagee_user_id"),
        F.col("col")[0].alias("engage_timestamp"),
        F.col("col")[1].alias("engage_type"),
    )
)

# Derive calendar fields from the engagement timestamp and keep only the
# events whose day-of-month is 6.
# NOTE(review): the `+ 0` presumably forces the value back to a numeric type
# before the timestamp conversion -- confirm against the Spark version in use.
engagee_activity = (
    _engagement_events
    .withColumn("time", F.to_timestamp(F.col("engage_timestamp") + 0))
    .withColumn("day", F.dayofmonth(F.col("time")))
    .withColumn("hour", F.hour(F.col("time")))
    .withColumn("minute", F.minute(F.col("time")))
    .filter(F.col("day") == 6)
)

# (output column name, engage_type value) for every counter shown in the chart.
_counter_specs = [
    ("like", "like"),
    ("retweet", "retweet"),
    ("reply", "reply"),
    ("rtWithCmt", "rtWithcmt"),
]

# Count the engagements of each type per (day, hour) bucket: every row
# contributes 1 to the counter matching its engage_type and 0 to the others.
_hourly_engagements = engagee_activity.groupBy("day", "hour").agg(*[
    F.sum(F.when(F.col("engage_type") == engage_type, 1).otherwise(0)).alias(counter_name)
    for counter_name, engage_type in _counter_specs
])

z.show(_hourly_engagements)

# Engager Follower Distribution

In [None]:
%pyspark
# One row per distinct engager, restricted to the engager profile columns.
users = (
    training_df
    .dropDuplicates(["engager_user_id"])
    .select(
        "engager_user_id",
        "engager_follower_count",
        "engager_following_count",
        "engager_is_verified",
        "engager_account_creation_time",
    )
)

# Histogram of the follower counts using 1,000,000 buckets.
follower_counts = users.select("engager_follower_count").rdd.flatMap(lambda row: row)
bins, counts = follower_counts.histogram(1000000)

# Log-log plot of frequency against follower count; bins[:-1] pairs every
# count with a single bin edge.
fig, ax = plt.subplots()
ax.plot(bins[:-1], counts, '--', label='degrees', linewidth=2)
# Optional highlight of a sub-range of the bins:
# ax.plot(bins[1000:1130], counts[1000:1130], '-', linewidth=4)
ax.set_xscale("log")
ax.set_yscale("log")
ax.set_xlabel("Number of follower")
ax.set_ylabel("freq.")
ax.set_title("Distribution of engager users based on follower count")
# Uncomment to write the figure to disk:
# plt.savefig("/nas_ssd_social_media_analytics/ali_twitter/figs/followers.png")
plt.show()

# Engager following distribution

In [None]:
%pyspark
# One row per distinct engager, restricted to the engager profile columns.
users = (
    training_df
    .dropDuplicates(["engager_user_id"])
    .select(
        "engager_user_id",
        "engager_follower_count",
        "engager_following_count",
        "engager_is_verified",
        "engager_account_creation_time",
    )
)

# Histogram of the following counts using 1,000,000 buckets.
following_counts = users.select("engager_following_count").rdd.flatMap(lambda row: row)
bins, counts = following_counts.histogram(1000000)

# Log-log plot of frequency against following count; bins[:-1] pairs every
# count with a single bin edge.
fig, ax = plt.subplots()
ax.plot(bins[:-1], counts, '--', label='degrees', linewidth=2)
# Overlay buckets 1000..1129 with a thicker solid line to make them stand out.
ax.plot(bins[1000:1130], counts[1000:1130], '-', linewidth=4)
ax.set_xscale("log")
ax.set_yscale("log")
ax.set_xlabel("Number of following")
ax.set_ylabel("freq.")
ax.set_title("Distribution of engager users based on following count")
# Uncomment to write the figure to disk:
# plt.savefig("/PATH/following.png")
plt.show()

# Hashtag Duration

In [None]:
%pyspark
import matplotlib.ticker as ticker

# One row per distinct tweet.
tweets = training_df.dropDuplicates(["tweet_id"])

# One row per (hashtag, tweet timestamp) for the tweets that have hashtags.
hashtagActivity = (
    tweets
    .filter(F.col("hashtags").isNotNull())
    .select(F.explode("hashtags").alias("hashtag"), "timestamp")
)

# Per hashtag: first timestamp, span between first and last timestamp, and
# the number of occurrences.
_hashtag_stats = hashtagActivity.groupby("hashtag").agg(
    F.min("timestamp").alias("start_timestamp"),
    (F.max("timestamp") - F.min("timestamp")).alias("duration"),
    F.count("timestamp").alias("cnt"),
)

# Histogram of the hashtag durations using 1000 buckets.
_durations = _hashtag_stats.select("duration").rdd.flatMap(lambda row: row)
bins, counts = _durations.histogram(1000)

fig, ax = plt.subplots()
ax.plot(bins[:-1], counts, '--', linewidth=2)
ax.set_yscale("log")
# ax.set_xscale("log")

# 24*60*60 is the number of seconds in a day; dividing by it relabels the
# x ticks in whole days.
# NOTE(review): assumes `timestamp` is expressed in seconds -- confirm.
scale_x = 24*60*60
scale_y = 1e3  # not used by the visible code in this cell


def _format_tick_as_days(x, pos):
    """Return the tick value *x* converted to a whole number of days."""
    return '{0:g}'.format(int(x/scale_x))


ticks_x = ticker.FuncFormatter(_format_tick_as_days)
ax.xaxis.set_major_formatter(ticks_x)

ax.set_xlabel("Duration (days)")
ax.set_ylabel("freq. (log)")
ax.set_title("hashtag duration")
# Uncomment to write the figure to disk:
# plt.savefig("/PATH/hashtags.png")
plt.show()