In [0]:
# Initializing the Spark session (getOrCreate reuses an active session if one exists)
from pyspark.sql import SparkSession
session_builder = SparkSession.builder.appName('spoilers')
spark = session_builder.getOrCreate()

In [0]:
# Read the tab-separated SMSSpamCollection file into a DataFrame
sms_path = 'dbfs:/FileStore/shared_uploads/shhivramcss@gmail.com/SMSSpamCollection'
df = spark.read.csv(sms_path, sep='\t')

In [0]:
# Preview the first five rows as loaded
df.show(n=5)

In [0]:
# Renaming the auto-generated columns: '_c0' -> 'index', '_c1' -> 'text'
for old_name, new_name in (('_c0', 'index'), ('_c1', 'text')):
    df = df.withColumnRenamed(old_name, new_name)

In [0]:
# Preview the first five rows with the new column names
df.show(n=5)

In [0]:
# Data Preprocessing
from pyspark.ml.feature import (StringIndexer, Tokenizer,
                                StopWordsRemover, HashingTF, IDF)

In [0]:
# Encode the string column 'index' as a numeric 'labels' column
sidex = StringIndexer(inputCol='index', outputCol='labels')
sidex_model = sidex.fit(df)
df = sidex_model.transform(df)

In [0]:
# Ham is 0
# Spam in 1
df = df.drop('index')
df.show(2)

In [0]:
# Checking the average message length of spam and ham.
# The 'index' column has already been dropped by this point, so grouping on it
# raises an AnalysisException; group on the numeric 'labels' column instead.
from pyspark.sql.functions import length, format_number
df = df.withColumn('length', length(df['text']))
df.show(2)
# mean('length') yields a column named 'avg(length)'
length_grouped = df.groupBy('labels').mean('length')
length_grouped.select('labels', format_number('avg(length)', 2).alias('Length')).show()


In [0]:
# Splitting each message in 'text' into word tokens stored in 'tokens'
tokenizer = Tokenizer(outputCol='tokens', inputCol='text')
tokenizer_df = tokenizer.transform(dataset=df)
tokenizer_df.show(n=2)

In [0]:
# Filter stop words out of 'tokens', writing the result to 'cleaned_corpus'
spw = StopWordsRemover(outputCol='cleaned_corpus', inputCol='tokens')
clean_corpus_df = spw.transform(dataset=tokenizer_df)
clean_corpus_df.show(n=2)

In [0]:
# Hash the cleaned tokens into a term-frequency vector column 'Term_freq'
htf = HashingTF(outputCol='Term_freq', inputCol='cleaned_corpus')
term_freq_Df = htf.transform(dataset=clean_corpus_df)
term_freq_Df.show(n=2)

In [0]:
# Fit inverse document frequency on the term frequencies and apply it,
# producing the 'Inverse_doc_feq' feature column
idf = IDF(outputCol='Inverse_doc_feq', inputCol='Term_freq')
model = idf.fit(dataset=term_freq_Df)
results = model.transform(dataset=term_freq_Df)
results.show(n=1)

In [0]:
# Keep only the feature vector and the label for modelling
model_columns = ['Inverse_doc_feq', 'labels']
f_df = results.select(model_columns)

In [0]:
# Train/test split on the dataset (70% / 30%).
# A fixed seed makes the random split repeatable, so the metrics reported
# for the different models are computed on the same rows from run to run.
train, test = f_df.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Invoking the naive bayes algorithm
from pyspark.ml.classification import NaiveBayes

In [0]:
# Naive Bayes classifier over the TF-IDF features
nb_params = {'featuresCol': 'Inverse_doc_feq', 'labelCol': 'labels'}
nb_clf = NaiveBayes(**nb_params)

In [0]:
# Fit the Naive Bayes classifier on the training split
trained_naive_base_model = nb_clf.fit(dataset=train)

In [0]:
# Score the test split with the fitted Naive Bayes model
preds = trained_naive_base_model.transform(dataset=test)

In [0]:
# Evaluate the Naive Bayes predictions on the test split.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
SPAM_LABEL = 1.0  # label the per-label metrics below are reported for
# Accuracy is not a per-label metric, so metricLabel is not passed.
eval_acc = MulticlassClassificationEvaluator(labelCol='labels', metricName='accuracy')
eval_pres = MulticlassClassificationEvaluator(labelCol='labels', metricName='precisionByLabel', metricLabel=SPAM_LABEL)
# metricLabel defaults to 0.0; it must be set explicitly, otherwise the value
# printed as "Recall by label: 1" is actually the recall of label 0.
eval_recall = MulticlassClassificationEvaluator(labelCol='labels', metricName='recallByLabel', metricLabel=SPAM_LABEL)
# No metricName given: the evaluator's default metric (f1) is used.
eval_f1 = MulticlassClassificationEvaluator(labelCol='labels')
print("Precision by label: 1", eval_pres.evaluate(preds))
print("Recall by label: 1", eval_recall.evaluate(preds))
print("Accuracy of the model", eval_acc.evaluate(preds))
print("f1 score of the model", eval_f1.evaluate(preds))

In [0]:
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier, LogisticRegression

In [0]:
# Decision Tree Model: fit on the training split, then score the test split
dt_clf = DecisionTreeClassifier(labelCol='labels', featuresCol='Inverse_doc_feq')
dt_clf_model = dt_clf.fit(dataset=train)
dt_preds = dt_clf_model.transform(dataset=test)

In [0]:
# Evaluate the Decision Tree predictions on the test split.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
SPAM_LABEL = 1.0  # label the per-label metrics below are reported for
# Accuracy is not a per-label metric, so metricLabel is not passed.
eval_acc = MulticlassClassificationEvaluator(labelCol='labels', metricName='accuracy')
eval_pres = MulticlassClassificationEvaluator(labelCol='labels', metricName='precisionByLabel', metricLabel=SPAM_LABEL)
# metricLabel defaults to 0.0; it must be set explicitly, otherwise the value
# printed as "Recall by label: 1" is actually the recall of label 0.
eval_recall = MulticlassClassificationEvaluator(labelCol='labels', metricName='recallByLabel', metricLabel=SPAM_LABEL)
# No metricName given: the evaluator's default metric (f1) is used.
eval_f1 = MulticlassClassificationEvaluator(labelCol='labels')
print("Precision by label: 1", eval_pres.evaluate(dt_preds))
print("Recall by label: 1", eval_recall.evaluate(dt_preds))
print("Accuracy of the model", eval_acc.evaluate(dt_preds))
print("f1 score of the model", eval_f1.evaluate(dt_preds))

In [0]:
# Random Forest Model: fit on the training split, then score the test split
rf_clf = RandomForestClassifier(labelCol='labels', featuresCol='Inverse_doc_feq')
rf_clf_model = rf_clf.fit(dataset=train)
rf_clf_preds = rf_clf_model.transform(dataset=test)

In [0]:
# Evaluate the Random Forest predictions on the test split.
SPAM_LABEL = 1.0  # label the per-label metrics below are reported for
# Accuracy is not a per-label metric, so metricLabel is not passed.
eval_acc = MulticlassClassificationEvaluator(labelCol='labels', metricName='accuracy')
eval_pres = MulticlassClassificationEvaluator(labelCol='labels', metricName='precisionByLabel', metricLabel=SPAM_LABEL)
# metricLabel defaults to 0.0; it must be set explicitly, otherwise the value
# printed as "Recall by label: 1" is actually the recall of label 0.
eval_recall = MulticlassClassificationEvaluator(labelCol='labels', metricName='recallByLabel', metricLabel=SPAM_LABEL)
# No metricName given: the evaluator's default metric (f1) is used.
eval_f1 = MulticlassClassificationEvaluator(labelCol='labels')
print("Precision by label: 1", eval_pres.evaluate(rf_clf_preds))
print("Recall by label: 1", eval_recall.evaluate(rf_clf_preds))
print("Accuracy of the model", eval_acc.evaluate(rf_clf_preds))
print("f1 score of the model", eval_f1.evaluate(rf_clf_preds))

In [0]:
# Gradient Boosting Model: fit on the training split, then score the test split
gb_clf = GBTClassifier(labelCol='labels', featuresCol='Inverse_doc_feq')
gb_clf_model = gb_clf.fit(dataset=train)
gb_clf_preds = gb_clf_model.transform(dataset=test)

In [0]:
# Evaluate the Gradient Boosting predictions on the test split.
SPAM_LABEL = 1.0  # label the per-label metrics below are reported for
# Accuracy is not a per-label metric, so metricLabel is not passed.
eval_acc = MulticlassClassificationEvaluator(labelCol='labels', metricName='accuracy')
eval_pres = MulticlassClassificationEvaluator(labelCol='labels', metricName='precisionByLabel', metricLabel=SPAM_LABEL)
# metricLabel defaults to 0.0; it must be set explicitly, otherwise the value
# printed as "Recall by label: 1" is actually the recall of label 0.
eval_recall = MulticlassClassificationEvaluator(labelCol='labels', metricName='recallByLabel', metricLabel=SPAM_LABEL)
# No metricName given: the evaluator's default metric (f1) is used.
eval_f1 = MulticlassClassificationEvaluator(labelCol='labels')
print("Precision by label: 1", eval_pres.evaluate(gb_clf_preds))
print("Recall by label: 1", eval_recall.evaluate(gb_clf_preds))
print("Accuracy of the model", eval_acc.evaluate(gb_clf_preds))
print("f1 score of the model", eval_f1.evaluate(gb_clf_preds))

In [0]:
# Logistic Regression Model: fit on the training split, then score the test split
lr_clf = LogisticRegression(labelCol='labels', featuresCol='Inverse_doc_feq')
lr_clf_model = lr_clf.fit(dataset=train)
lr_clf_preds = lr_clf_model.transform(dataset=test)

In [0]:
# Evaluate the Logistic Regression predictions on the test split.
SPAM_LABEL = 1.0  # label the per-label metrics below are reported for
# Accuracy is not a per-label metric, so metricLabel is not passed.
eval_acc = MulticlassClassificationEvaluator(labelCol='labels', metricName='accuracy')
eval_pres = MulticlassClassificationEvaluator(labelCol='labels', metricName='precisionByLabel', metricLabel=SPAM_LABEL)
# metricLabel defaults to 0.0; it must be set explicitly, otherwise the value
# printed as "Recall by label: 1" is actually the recall of label 0.
eval_recall = MulticlassClassificationEvaluator(labelCol='labels', metricName='recallByLabel', metricLabel=SPAM_LABEL)
# No metricName given: the evaluator's default metric (f1) is used.
eval_f1 = MulticlassClassificationEvaluator(labelCol='labels')
print("Precision by label: 1", eval_pres.evaluate(lr_clf_preds))
print("Recall by label: 1", eval_recall.evaluate(lr_clf_preds))
print("Accuracy of the model", eval_acc.evaluate(lr_clf_preds))
print("f1 score of the model", eval_f1.evaluate(lr_clf_preds))

In [0]:
# The Logistic Regression model has the highest accuracy and precision