In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, split, lower, count, to_date, date_format, to_timestamp, avg, stddev
from pyspark.sql.types import StructType, StructField, StringType
import pyspark.pandas as ps



In [2]:
# Initialize Spark session
spark = SparkSession.builder.appName("WordCountPerDay").getOrCreate()

file_path = "Datasets/archive/Bitcoin_tweets.csv"
# Load CSV (modify options if needed)
inputDF = spark.read.csv(
    file_path, 
    header=True,  # Use the first row as column names
    inferSchema=True,  # Infer data types
    multiLine=True,  # Handle newlines within fields
    escape='"',  # Escape character for double quotes
    quote='"',  # Define the quote character

    # Different depending on the document
    sep=",",  # Specify the correct delimiter


    mode="PERMISSIVE"  # Handle malformed rows gracefully
)
inputDF.printSchema()

root
 |-- user_name: string (nullable = true)
 |-- user_location: string (nullable = true)
 |-- user_description: string (nullable = true)
 |-- user_created: string (nullable = true)
 |-- user_followers: string (nullable = true)
 |-- user_friends: string (nullable = true)
 |-- user_favourites: string (nullable = true)
 |-- user_verified: string (nullable = true)
 |-- date: string (nullable = true)
 |-- text: string (nullable = true)
 |-- hashtags: string (nullable = true)
 |-- source: string (nullable = true)
 |-- is_retweet: boolean (nullable = true)



In [3]:
date_checkDF = inputDF.withColumn("date", date_format(to_timestamp(col("date")), "yyyy-MM-dd")) \
    .filter(col("date").isNotNull())
# print(date_checkDF.count(), '/', inputDF.count())

In [4]:
daily_tweet_countDF = date_checkDF.groupBy("date").agg(count("*").alias("total_tweets"))

# Tokenize text: split into words, normalize to lowercase
wordsDF = date_checkDF.withColumn("word", explode(split(lower(col("text")), "\\s+")))

# Count occurrences per day
word_countsDF = wordsDF.groupBy("date", "word").agg(count("*").alias("word_count"))

In [5]:
# # Show results
# word_countsDF.show()
# daily_tweet_countDF.show()

In [6]:
normalized_word_countsDF = word_countsDF.join(daily_tweet_countDF, "date").withColumn(
    "normalized_count", col("word_count") / col("total_tweets")
)

In [7]:
# normalized_word_countsDF.show()

In [8]:
word_statsDF = normalized_word_countsDF.groupBy("word").agg(
    avg("normalized_count").alias("avg_normalized"),
    stddev("normalized_count").alias("stddev_normalized")
)
# word_statsDF.show()

In [9]:
# Convert Spark DataFrame to Pandas-on-Spark DataFrame
psdf = word_statsDF.to_pandas_on_spark()

# Set "word" as the index and select the columns to plot
df = psdf.set_index("word")[["avg_normalized", "stddev_normalized"]].sort_values("avg_normalized", ascending=False)

# Sort by 'avg_normalized' and select top 20 words
top_20 = df.head(20)



In [10]:
top_20.plot.bar()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [11]:
total = df['avg_normalized'].iloc[:100].sum()
total_rest = df['avg_normalized'].iloc[100:].sum()

In [12]:
print("First 100: ", total)
print("The rest: ", total_rest)
print("Ratio: ", total/(total_rest+total))

First 100:  9.836212564490106
The rest:  382.81295186915105
Ratio:  0.025050893916144967
