<a href="https://colab.research.google.com/github/sanjaysathish2000/Cloud-Assignment/blob/main/Spam_vs_Ham.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install pyspark

In [None]:
from pyspark.sql import SparkSession

# --- Session ---------------------------------------------------------------
# Reuse the active Spark session if there is one; otherwise start a new one
# under the application name "Spam vs Ham".
session_builder = SparkSession.builder.appName("Spam vs Ham")
spark = session_builder.getOrCreate()

# --- Input -----------------------------------------------------------------
# Location of the raw e-mail dataset.
file_path = "/content/spam_email_dataset.csv"

# CSV options: the first line holds the column names, and Spark is asked to
# infer each column's type from the data. Adjust these to the file's format.
read_options = {"header": True, "inferSchema": True}
df = spark.read.csv(file_path, **read_options)

# --- Quick look ------------------------------------------------------------
# Print the resulting schema, then the first five rows.
df.printSchema()
df.show(5)



In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.sql.functions import col, lower, when
from pyspark.sql import functions as F
from pyspark.sql.functions import lit, explode
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType, DoubleType
from math import log
from collections import Counter

# Keep only the columns this analysis uses. The backticks quote the column
# name that contains a space.
selected_columns = ["Sender", "Subject", "`Spam Indicator`"]
df = df.select(*selected_columns)

df.show(5)

# Report how many nulls each of the three columns contains
# (one Spark job per column).
null_check_columns = ["Subject", "Sender", "`Spam Indicator`"]
for column in null_check_columns:
    null_count = df.filter(col(column).isNull()).count()
    print(f"Number of null values in {column}: {null_count}")

# Convert "Subject" to lowercase
df = df.withColumn("Subject", lower(col("Subject")))

# Treat a missing label as ham (0).
# withColumn() uses its first argument as the literal column name, so the
# target must be written WITHOUT backticks. With backticks Spark adds a new
# column whose name contains the backtick characters and leaves the real
# "Spam Indicator" column (nulls included) untouched.
df = df.withColumn(
    "Spam Indicator",
    when(col("`Spam Indicator`").isNull(), 0).otherwise(col("`Spam Indicator`")),
)

# Split each subject into tokens.
tokenizer = Tokenizer(inputCol="Subject", outputCol="words")
df = tokenizer.transform(df)

# Drop stop words from the token list; the result lands in "filtered".
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
df = remover.transform(df)

# Keep the original columns plus the cleaned token list; the intermediate
# "words" column is dropped here.
selected_columns = ["Subject", "Sender", "`Spam Indicator`", "filtered"]
df = df.select(*selected_columns)

# Show the updated DataFrame
df.show(5)


In [None]:
def _collect_distinct_tokens(frame):
    """Return the distinct entries of ``frame``'s "filtered" arrays as a list on the driver."""
    token_rdd = frame.select("filtered").rdd.flatMap(lambda record: record["filtered"])
    return token_rdd.distinct().collect()


# Partition the data by label: 0 marks ham, 1 marks spam.
ham_df = df.filter("`Spam Indicator` == 0")
spam_df = df.filter("`Spam Indicator` == 1")

# Vocabulary of each class (every word listed once).
ham_words = _collect_distinct_tokens(ham_df)
spam_words = _collect_distinct_tokens(spam_df)

# Print both vocabularies.
print("Unique words in ham emails:")
print(ham_words)

print("\nUnique words in spam emails:")
print(spam_words)

# Preview the first five rows of each class.
ham_df.show(5)
spam_df.show(5)


In [None]:
# Bring every filtered token from the ham e-mails to the driver
# (duplicates are kept so they can be counted).
ham_words = ham_df.select("filtered").rdd.flatMap(lambda x: x["filtered"]).collect()

# Tally how often each token occurs.
word_counts_ham = Counter(ham_words)

# Turn the tallies into a two-column DataFrame. The schema is spelled out
# because Spark cannot infer column types from an empty list, which is what
# the Counter yields when ham_df contains no tokens.
word_counts_ham_df = spark.createDataFrame(
    list(word_counts_ham.items()),
    schema="word STRING, `count` BIGINT",
)

# Show the top 10 ham words
print("Top 10 Ham Words:")
word_counts_ham_df.orderBy(F.desc("count")).show(10)

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.functions import explode, col

# The ten most frequent ham words, as a plain Python list.
ham_counts_ranked = word_counts_ham_df.orderBy(F.desc("count"))
top_ham_words = [
    record["word"]
    for record in ham_counts_ranked.limit(10).select("word").collect()
]

# One row per (Sender, Subject, word) for every token kept in a ham subject.
ham_df_exploded = ham_df.select(
    "Sender",
    "Subject",
    explode("filtered").alias("word"),
)

# Count top-word occurrences per (Sender, Subject) pair and keep the ten
# pairs with the highest counts.
is_top_ham_word = col("word").isin(top_ham_words)
ham_hits_per_subject = (
    ham_df_exploded.filter(is_top_ham_word).groupBy("Sender", "Subject").count()
)
top_ham_senders = ham_hits_per_subject.orderBy(F.desc("count")).limit(10)

# Print the ranking without truncating long subjects.
print("Top 10 ham Senders and Subjects:")
top_ham_senders.show(truncate=False)


In [None]:
# Bring every filtered token from the spam e-mails to the driver
# (duplicates are kept so they can be counted).
spam_words = spam_df.select("filtered").rdd.flatMap(lambda x: x["filtered"]).collect()

# Tally how often each token occurs.
word_counts_spam = Counter(spam_words)

# Turn the tallies into a two-column DataFrame. The schema is spelled out
# because Spark cannot infer column types from an empty list, which is what
# the Counter yields when spam_df contains no tokens.
word_counts_spam_df = spark.createDataFrame(
    list(word_counts_spam.items()),
    schema="word STRING, `count` BIGINT",
)

# Show the top 10 spam words
print("Top 10 Spam Words:")
word_counts_spam_df.orderBy(F.desc("count")).show(10)

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.functions import explode, col

# The ten most frequent spam words, as a plain Python list.
spam_counts_ranked = word_counts_spam_df.orderBy(F.desc("count"))
top_spam_words = [
    record["word"]
    for record in spam_counts_ranked.limit(10).select("word").collect()
]

# One row per (Sender, Subject, word) for every token kept in a spam subject.
spam_df_exploded = spam_df.select(
    "Sender",
    "Subject",
    explode("filtered").alias("word"),
)

# Count top-word occurrences per (Sender, Subject) pair and keep the ten
# pairs with the highest counts.
is_top_spam_word = col("word").isin(top_spam_words)
spam_hits_per_subject = (
    spam_df_exploded.filter(is_top_spam_word).groupBy("Sender", "Subject").count()
)
top_spam_senders = spam_hits_per_subject.orderBy(F.desc("count")).limit(10)

# Print the ranking without truncating long subjects.
print("Top 10 Spam Senders and Subjects:")
top_spam_senders.show(truncate=False)


In [None]:
# Start from the (Sender, Subject) pairs ranked as top ham senders.
df = top_ham_senders.select("Sender", "Subject")

# Split each subject on whitespace into a list of words.
tokenizer_udf = udf(lambda text: text.split(), ArrayType(StringType()))
df = df.withColumn("words", tokenizer_udf(df["Subject"]))

# Term frequency (TF): for each distinct word, its occurrences divided by the
# number of words in the subject. The lambda returns a dict of floats, so the
# UDF is declared as map<string,double> to match the value it produces.
calculate_tf_udf = udf(
    lambda word_list: {word: word_list.count(word) / len(word_list) for word in set(word_list)},
    "map<string,double>",
)
# df is evaluated by several separate actions (the collect and count below,
# plus the RDD jobs built on it), so cache it rather than re-running the
# upstream ranking each time.
df = df.withColumn("tf", calculate_tf_udf(df["words"])).cache()

# Distinct words across all subjects (list order is not significant).
unique_words = df.selectExpr("explode(words) as word").distinct().rdd.flatMap(lambda x: x).collect()

# Inverse document frequency (IDF): log(N / df(word)), where N is the number
# of subjects and df(word) the number of subjects containing the word.
# set() makes each word count at most once per subject.
total_documents = df.count()
document_frequency = (
    df.select("Sender", "words")
    .rdd.flatMap(lambda x: [(word, 1) for word in set(x[1])])
    .reduceByKey(lambda x, y: x + y)
)
idf_values = document_frequency.map(lambda x: (x[0], log(total_documents / x[1])))

# Ship the word -> IDF lookup to the executors as a broadcast variable.
idf_broadcast = spark.sparkContext.broadcast(dict(idf_values.collect()))

# Calculate TF-IDF for each document
def calculate_tfidf(row):
    """Score one ``(sender, words)`` record with count-weighted IDF.

    Args:
        row: A two-item record: the sender and the list of words of one
            subject line.

    Returns:
        A ``(sender, words, scores)`` tuple. ``scores`` maps each distinct
        word to its raw occurrence count in ``words`` multiplied by its IDF
        from ``idf_broadcast``; words missing from the IDF table score 0.0.
    """
    user_name, words = row
    idf_lookup = idf_broadcast.value
    # Counter tallies all words in a single pass, so repeated words are
    # neither recounted nor rescored as they are with list.count per token.
    tfidf_values = {
        word: occurrences * idf_lookup.get(word, 0.0)
        for word, occurrences in Counter(words).items()
    }
    return user_name, words, tfidf_values

# Score every (Sender, words) record.
tfidf_data = df.select("Sender", "words").rdd.map(calculate_tfidf)

# Build the result with an explicit schema: with column names only, Spark
# infers the types from the RDD's contents and raises when the RDD is empty.
tfidf_df = spark.createDataFrame(
    tfidf_data,
    schema="Sender STRING, words ARRAY<STRING>, tfidf MAP<STRING, DOUBLE>",
)
# Display the result without truncating the word lists and score maps.
tfidf_df.show(10, truncate = False)


In [None]:
# Start from the (Sender, Subject) pairs ranked as top spam senders.
df = top_spam_senders.select("Sender", "Subject")

# Split each subject on whitespace into a list of words.
tokenizer_udf = udf(lambda text: text.split(), ArrayType(StringType()))
df = df.withColumn("words", tokenizer_udf(df["Subject"]))

# Term frequency (TF): for each distinct word, its occurrences divided by the
# number of words in the subject. The lambda returns a dict of floats, so the
# UDF is declared as map<string,double> to match the value it produces.
calculate_tf_udf = udf(
    lambda word_list: {word: word_list.count(word) / len(word_list) for word in set(word_list)},
    "map<string,double>",
)
# df is evaluated by several separate actions (the collect and count below,
# plus the RDD jobs built on it), so cache it rather than re-running the
# upstream ranking each time.
df = df.withColumn("tf", calculate_tf_udf(df["words"])).cache()

# Distinct words across all subjects (list order is not significant).
unique_words = df.selectExpr("explode(words) as word").distinct().rdd.flatMap(lambda x: x).collect()

# Inverse document frequency (IDF): log(N / df(word)), where N is the number
# of subjects and df(word) the number of subjects containing the word.
# set() makes each word count at most once per subject.
total_documents = df.count()
document_frequency = (
    df.select("Sender", "words")
    .rdd.flatMap(lambda x: [(word, 1) for word in set(x[1])])
    .reduceByKey(lambda x, y: x + y)
)
idf_values = document_frequency.map(lambda x: (x[0], log(total_documents / x[1])))

# Ship the word -> IDF lookup to the executors as a broadcast variable.
idf_broadcast = spark.sparkContext.broadcast(dict(idf_values.collect()))

# Calculate TF-IDF for each document
def calculate_tfidf(row):
    """Score one ``(sender, words)`` record with count-weighted IDF.

    Args:
        row: A two-item record: the sender and the list of words of one
            subject line.

    Returns:
        A ``(sender, words, scores)`` tuple. ``scores`` maps each distinct
        word to its raw occurrence count in ``words`` multiplied by its IDF
        from ``idf_broadcast``; words missing from the IDF table score 0.0.
    """
    user_name, words = row
    idf_lookup = idf_broadcast.value
    # Counter tallies all words in a single pass, so repeated words are
    # neither recounted nor rescored as they are with list.count per token.
    tfidf_values = {
        word: occurrences * idf_lookup.get(word, 0.0)
        for word, occurrences in Counter(words).items()
    }
    return user_name, words, tfidf_values

# Score every (Sender, words) record.
tfidf_data = df.select("Sender", "words").rdd.map(calculate_tfidf)

# Build the result with an explicit schema: with column names only, Spark
# infers the types from the RDD's contents and raises when the RDD is empty.
tfidf_df = spark.createDataFrame(
    tfidf_data,
    schema="Sender STRING, words ARRAY<STRING>, tfidf MAP<STRING, DOUBLE>",
)
# Display the result without truncating the word lists and score maps.
tfidf_df.show(10, truncate = False)

