In [1]:
# Predict whether an email is spam or not using SVM / Naive Bayes

In [2]:
from pyspark.mllib.classification import   SVMWithSGD, NaiveBayes
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.regression import LabeledPoint


In [3]:
  
# Load both corpora as RDDs of lines; each line holds the text of one email.
spam, ham = (
    sc.textFile(path)
    for path in ("/FileStore/tables/spam", "/FileStore/tables/ham")
)

In [4]:
# Tokenize: turn every email into a list of whitespace-separated words.
spam_words = spam.map(lambda text: text.split())
ham_words = ham.map(lambda text: text.split())

# Show the first tokenized email of each class as a sanity check.
for tokenized in (spam_words, ham_words):
    print(tokenized.take(1))

In [5]:
# Hash each word list into a term-frequency feature vector with 200 buckets.
tf = HashingTF(200)
spam_features, ham_features = tf.transform(spam_words), tf.transform(ham_words)

# Show the first feature vector of each class as a sanity check.
for vectors in (spam_features, ham_features):
    print(vectors.take(1))

In [6]:
# Attach class labels to the feature vectors:
# 1 marks a positive (spam) example, 0 marks a negative (ham) example.
spam_samples = spam_features.map(lambda vec: LabeledPoint(1, vec))
ham_samples = ham_features.map(lambda vec: LabeledPoint(0, vec))

# Show the first labeled example of each class as a sanity check.
for labeled in (spam_samples, ham_samples):
    print(labeled.take(1))

In [7]:
# Split the combined data set 80/20 into training and test sets.
# A fixed seed is passed so that re-running the notebook on the same input
# reproduces the same split, keeping the accuracies reported below comparable
# between runs (an unseeded randomSplit draws a new split every time).
SPLIT_SEED = 42
samples = spam_samples.union(ham_samples)
training_data, test_data = samples.randomSplit([0.8, 0.2], seed=SPLIT_SEED)
# Both subsets are read more than once (training, then scoring each model),
# so mark them for caching.
training_data.cache()
test_data.cache()

In [8]:
def score(model, data=None):
    """Return the fraction of examples that ``model`` labels correctly.

    Args:
        model: A trained classifier whose ``predict`` accepts an RDD of
            feature vectors and returns an RDD of predicted labels.
        data: Optional RDD of labeled examples (objects exposing ``label``
            and ``features``) to evaluate on. When omitted, the
            notebook-level ``test_data`` split is used, which matches the
            original single-argument behavior.

    Returns:
        The accuracy as a float: correct predictions / total examples.

    Raises:
        ZeroDivisionError: If the evaluation set contains no examples.
    """
    if data is None:
        # Fall back to the held-out split created earlier in the notebook.
        data = test_data
    predictions = model.predict(data.map(lambda point: point.features))
    # Labels and predictions both derive from ``data`` through ``map``, so
    # they line up element-for-element and can be zipped.
    labels_and_preds = data.map(lambda point: point.label).zip(predictions)
    correct = labels_and_preds.filter(lambda pair: pair[0] == pair[1]).count()
    # float() keeps this a true division under Python 2 as well.
    return correct / float(data.count())

In [9]:
# Fit an SVM (trained with SGD) on the training split,
# then display its accuracy as computed by score().
algo = SVMWithSGD()
model = algo.train(data=training_data)
score(model)

In [10]:
# Fit a Naive Bayes classifier on the same training split.
# Note: this rebinds `algo` and `model`, replacing the SVM from the previous cell.
algo = NaiveBayes()
model = algo.train(data=training_data)

In [11]:
# Display the accuracy of whatever `model` currently refers to
# (the Naive Bayes model when the cells are run in order).
score(model)