In [2]:
#Example usage of the preprocess_and_filter_bots function 
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType
import csv
import pandas as pd
from pyspark.sql.functions import count
from preprocessing_func import preprocess_and_filter_bots

# Initialize Spark Session
spark = SparkSession.builder \
    .appName("RedditSentimentAnalysis") \
    .master("spark://192.168.2.46:7077") \
    .config("spark.dynamicAllocation.enabled", True) \
    .config("spark.shuffle.service.enabled", False) \
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s") \
    .config("spark.executor.cores", 2) \
    .config("spark.cores.max", 4) \
    .config("spark.driver.port", 9999) \
    .config("spark.blockManager.port", 10005) \
    .getOrCreate()

print("✅ Spark NLP Session Started")

# Define file paths
reddit_json_path = "hdfs://192.168.2.46:9000/data/corpus-webis-tldr-17.json"
botlist_csv_path = "/home/ubuntu/botlist.csv"

# Run preprocessing and bot filtering
df_cleaned = preprocess_and_filter_bots(spark, reddit_json_path, botlist_csv_path)

# Show the first few cleaned rows
df_cleaned.select("author", "normalizedBody").show(10, truncate=False)


✅ Spark NLP Session Started
 Loading Reddit JSON data from HDFS...
 Loading bot list from CSV...


FileNotFoundError: [Errno 2] No such file or directory: '/home/ubuntu/botlist.csv'

In [7]:
#Example of Sentiment Anlyser 
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType
import csv
import pandas as pd
from pyspark.sql.functions import count

# Initialize Spark Session
spark = SparkSession.builder \
    .appName("RedditSentimentAnalysis") \
    .master("spark://192.168.2.46:7077") \
    .config("spark.dynamicAllocation.enabled", True) \
    .config("spark.shuffle.service.enabled", False) \
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s") \
    .config("spark.executor.cores", 2) \
    .config("spark.cores.max", 4) \
    .config("spark.driver.port", 9999) \
    .config("spark.blockManager.port", 10005) \
    .getOrCreate()

print("✅ Spark NLP Session Started")

# Load Reddit comments from HDFS
df = spark.read.json("hdfs://192.168.2.46:9000/data/corpus-webis-tldr-17.json")

# Load bot list
bot_df = pd.read_csv("/home/ubuntu/botlist.csv")
bot_list = bot_df["AAbot"].dropna().unique().tolist()
print("Sample bot names:", bot_list[:10])  # Show the first 10 bot usernames

# Remove bot-generated comments
df_filtered = df.filter(~col("author").isin(bot_list))

# Load positive and negative words from CSV files
def load_words_from_csv(filepath):
    words = set()
    with open(filepath, "r") as file:
        reader = csv.reader(file)
        next(reader)  # Skip header
        for row in reader:
            if row:  # Ensure row is not empty
                words.add(row[0].strip().lower())
    return words

# File paths for sentiment words
positive_words_file = "/home/ubuntu/positive_words.csv"
negative_words_file = "/home/ubuntu/negative_words.csv"

# Load words
positive_words = load_words_from_csv(positive_words_file)
negative_words = load_words_from_csv(negative_words_file)

print(f"Loaded {len(positive_words)} positive words and {len(negative_words)} negative words.")

# Define sentiment analysis function
def analyze_sentiment(text):
    if not text:
        return "neutral"
    words = set(text.lower().split())
    pos_count = len(words & positive_words)
    neg_count = len(words & negative_words)
    
    if pos_count > neg_count:
        return "positive"
    elif neg_count > pos_count:
        return "negative"
    return "neutral"

# Register UDF
sentiment_udf = udf(analyze_sentiment, StringType())

# Apply sentiment analysis
df_result = df_filtered.withColumn("sentiment", sentiment_udf(col("normalizedBody")))

# Show results
df_result.select("author", "normalizedBody", "sentiment").show(10, truncate=False)

def count_sentiments(df):
    """
    Function to count the number of positive, negative, and neutral sentiments in a DataFrame.
    
    Args:
        df (DataFrame): The Spark DataFrame containing the sentiment column.
    
    Returns:
        dict: A dictionary with counts of each sentiment category.
    """
    sentiment_counts = df.groupBy("sentiment").agg(count("*").alias("count")).collect()
    
    sentiment_dict = {"positive": 0, "negative": 0, "neutral": 0}
    
    for row in sentiment_counts:
        sentiment_dict[row["sentiment"]] = row["count"]
    
    return sentiment_dict

# Call function on the resulting DataFrame
sentiment_summary = count_sentiments(df_result)

# Print results
print("Sentiment Summary:", sentiment_summary)

# Save results back to HDFS
#df_result.write.mode("overwrite").json("hdfs://192.168.2.46:9000/output/reddit_sentiment.json")

print("✅ Sentiment analysis completed and saved to HDFS")


✅ Spark NLP Session Started


                                                                                

Sample bot names: ['ADHDbot', 'ALTcointip', 'AVR_Modbot', 'A_random_gif', 'AltCodeBot', 'Antiracism_Bot', 'ApiContraption', 'AssHatBot', 'AtheismModBot', 'AutoInsult']
Loaded 27 positive words and 27 negative words.


                                                                                

+------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

Sentiment Summary: {'positive': 1111835, 'negative': 505324, 'neutral': 1855185}
✅ Sentiment analysis completed and saved to HDFS


In [None]:
# Stop Spark
spark.stop()