<a href="https://colab.research.google.com/github/sridhartroy/AIML/blob/main/Big_Data_A2_Part2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark==3.5.6 spacy


Collecting pyspark==3.5.6
  Downloading pyspark-3.5.6.tar.gz (317.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.4/317.4 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.6-py2.py3-none-any.whl size=317895798 sha256=bb00f8658bbb9e11f2f50761a116d7191111c442ecbe8dc6ae0ad8d66641f6fc
  Stored in directory: /root/.cache/pip/wheels/64/62/f3/ec15656ea4ada0523cae62a1827fe7beb55d3c8c87174aad4a
Successfully built pyspark
Installing collected packages: pyspark
  Attempting uninstall: pyspark
    Found existing installation: pyspark 3.5.1
    Uninstalling pyspark-3.5.1:
      Successfully uninstalled pyspark-3.5.1
Successfully installed pyspark-3.5.6


In [None]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("NER-WordCount")
    .getOrCreate()
)

# SparkContext handle; the RDD API used below goes through it.
sc = spark.sparkContext


In [None]:
# Download dataset
!wget -q https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
!unzip -q smsspamcollection.zip

# Read as RDD
data_rdd = sc.textFile("SMSSpamCollection")

# Parse label and text
def parse_line(line):
    """Split one raw dataset line into a ``(label, text)`` tuple.

    The line is cut at its first tab only, so tabs inside the message body
    are preserved. Surrounding whitespace is stripped from both parts.

    Raises:
        ValueError: if the line contains no tab character.
    """
    fields = line.split("\t", 1)
    # Unpacking fails with ValueError when there was no tab to split on.
    raw_label, raw_text = fields
    return raw_label.strip(), raw_text.strip()

# Convert every raw line into a (label, text) pair, then report how many
# records were parsed (count() is the action that triggers the job).
data_rdd = data_rdd.map(parse_line)
record_total = data_rdd.count()
print("Total records:", record_total)


Total records: 5574


In [None]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Module-level resources shared by preprocess_text().

# A token is a run of ASCII letters and/or apostrophes.
token_pattern = re.compile(r"[a-zA-Z']+")

# NLTK's English stop-word list, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Porter stemmer used to reduce each kept token to its stem.
stemmer = PorterStemmer()

def preprocess_text(text):
    """Turn one message into a list of stemmed tokens.

    Steps, in order: lower-case the text, extract tokens with
    ``token_pattern``, drop stop words and tokens of two characters or
    fewer, and stem whatever remains with ``stemmer``.

    Args:
        text: the raw message string.

    Returns:
        A list of stemmed tokens in their original order (duplicates kept).
    """
    stems = []
    for token in token_pattern.findall(text.lower()):
        # Skip very short tokens and stop words before the stemming step.
        if len(token) <= 2 or token in stop_words:
            continue
        stems.append(stemmer.stem(token))
    return stems

# Transform dataset: (label, text) -> (label, tokens).
# The label (key) is left untouched; only the message (value) is tokenised.
# NOTE: data_rdd is reassigned from itself, so re-running only this cell would
# feed token lists back into preprocess_text; re-run from the loading cell instead.
data_rdd = data_rdd.mapValues(preprocess_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Random 80/20 split; the fixed seed keeps the partition reproducible.
train_rdd, test_rdd = data_rdd.randomSplit([0.8, 0.2], seed=42)

# Each split feeds several actions (the counts below, the training statistics
# and the final evaluation). Without caching, every one of those actions
# re-reads the file and re-runs tokenising/stemming from scratch.
train_rdd = train_rdd.cache()
test_rdd = test_rdd.cache()

print(f"Train size: {train_rdd.count()}, Test size: {test_rdd.count()}")


Train size: 4458, Test size: 1116


In [None]:
from collections import Counter
import math

# Number of training documents per class label.
label_ones = train_rdd.map(lambda record: (record[0], 1))
class_doc_counts = label_ones.reduceByKey(lambda left, right: left + right).collectAsMap()
total_docs = sum(class_doc_counts.values())

# Log prior of each class: log(documents in the class / all training documents).
class_priors = {
    label: math.log(n_docs / total_docs)
    for label, n_docs in class_doc_counts.items()
}

# Word counts per class
def word_count_mapper(record):
    """Emit per-document word frequencies keyed by ``(label, word)``.

    Args:
        record: a ``(label, tokens)`` pair, where ``tokens`` is an iterable
            of hashable tokens.

    Yields:
        ``((label, word), frequency)`` once for each distinct word in the
        document.
    """
    label, tokens = record
    frequencies = Counter(tokens)
    yield from (((label, word), freq) for word, freq in frequencies.items())

# Sum the per-document frequencies into one count per (class, word) pair.
per_doc_pairs = train_rdd.flatMap(word_count_mapper)
word_counts = per_doc_pairs.reduceByKey(lambda left, right: left + right)

# Total words per class: drop the word from the key and sum what is left.
class_word_totals = (
    word_counts
    .map(lambda pair: (pair[0][0], pair[1]))
    .reduceByKey(lambda left, right: left + right)
    .collectAsMap()
)

# Vocabulary: every distinct word seen in training, regardless of class.
vocab = (
    word_counts
    .map(lambda pair: pair[0][1])
    .distinct()
    .collect()
)
V = len(vocab)
print("Vocab size:", V)

# Additive smoothing strength (1.0 = add-one smoothing).
alpha = 1.0

# Smoothed denominator per class: words counted in the class + alpha * vocabulary size.
smoothed_totals = {
    cls: total + alpha * V
    for cls, total in class_word_totals.items()
}

# Log-probability table: class -> word -> log((count + alpha) / denominator).
class_word_probs = {}
for (cls, word), count in word_counts.collect():
    per_class = class_word_probs.setdefault(cls, {})
    per_class[word] = math.log((count + alpha) / smoothed_totals[cls])

# Log-probability assigned to a word never seen with the class (count = 0).
default_logprob = {
    cls: math.log(alpha / denom)
    for cls, denom in smoothed_totals.items()
}


Vocab size: 5529


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

def predict(tokens):
    """Return the class label with the highest log-score for a token list.

    For every class in ``class_priors`` the score starts at the class log
    prior and adds, per distinct token, ``frequency * log P(token | class)``.
    Tokens missing from the class table use ``default_logprob[cls]``.

    Args:
        tokens: iterable of preprocessed tokens for one message.

    Returns:
        The label of the best-scoring class; on a tie, the class that comes
        first in ``class_priors`` wins.
    """
    bag = Counter(tokens)
    scores = {}
    for cls, log_prior in class_priors.items():
        word_table = class_word_probs.get(cls, {})
        # Accumulate term by term (not via sum()) so float rounding is unchanged.
        running = log_prior
        for word, freq in bag.items():
            running += freq * word_table.get(word, default_logprob[cls])
        scores[cls] = running
    best_cls, _ = max(scores.items(), key=lambda item: item[1])
    return best_cls

# Bring the held-out split to the driver and classify each message locally.
test_records = test_rdd.collect()
y_true = [label for label, _ in test_records]
y_pred = [predict(tokens) for _, tokens in test_records]

# Report accuracy, the confusion matrix, and the priors converted back from log space.
print("Accuracy:", accuracy_score(y_true, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
raw_priors = {cls: math.exp(logp) for cls, logp in class_priors.items()}
print("Class priors (raw):", raw_priors)


Accuracy: 0.9740143369175627
Confusion Matrix:
 [[965  24]
 [  5 122]]
Class priors (raw): {'ham': 0.860924181247196, 'spam': 0.13907581875280395}
