In [None]:
import numpy as np
import pandas as pd 
import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

In [None]:
!pip install pyspark

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) a Spark session with a 'local[*]' master, named 'test'.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('test')
    .getOrCreate()
)

# Show which Spark version this session runs
# (it may differ from the one shown in the presentation).
print(spark.version)

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Read the SMS dataset as CSV, taking the column names from the header row,
# then display the resulting schema.
sms = (
    spark.read
    .option("header", True)
    .csv("../input/spam-text-message-classification")
)
sms.printSchema()

In [None]:
from pyspark.ml.feature import StringIndexer

# Fit an indexer that maps the string 'Category' column to a numeric 'label'
# column; `tag` ends up holding the fitted model.
tag = StringIndexer().setInputCol('Category').setOutputCol('label').fit(sms)

# Append the 'label' column to the data.
sms_one_hot = tag.transform(sms)

In [None]:
from pyspark.sql.functions import monotonically_increasing_id

# Remove the original string 'Category' column and attach an 'id' column
# generated by monotonically_increasing_id(), then preview the result.
sms_one_hot = (
    sms_one_hot
    .drop('Category')
    .withColumn("id", monotonically_increasing_id())
)
sms_one_hot.show()

In [None]:
from pyspark.sql.functions import regexp_replace
from pyspark.ml.feature import Tokenizer

# Clean the 'Message' text and split it into tokens.
#
# The column is referenced by name so that every step resolves 'Message'
# against the frame being transformed, instead of borrowing a Column object
# from the separate `sms` DataFrame.
wrangled = (
    sms_one_hot
    # Replace the listed punctuation characters with a space.
    .withColumn('Message', regexp_replace('Message', '[_():;,.!?\\-]', ' '))
    # Replace digits with a space.
    .withColumn('Message', regexp_replace('Message', '[0-9]', ' '))
    # Collapse runs of spaces into a single space.
    .withColumn('Message', regexp_replace('Message', ' +', ' '))
)

# Split the cleaned text into a 'words' array column.
wrangled = Tokenizer(inputCol='Message', outputCol='words').transform(wrangled)
wrangled.show(4, truncate=False)

In [None]:
from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF

# Filter the 'words' column through StopWordsRemover into a 'terms' column.
stopwords = StopWordsRemover(inputCol='words', outputCol='terms')
wrangled = stopwords.transform(wrangled)

# Hash the remaining terms into a 1024-feature 'hash' vector column.
hasher = HashingTF(inputCol='terms', outputCol='hash', numFeatures=1024)
wrangled = hasher.transform(wrangled)

# Fit IDF on the hashed vectors and write the result to 'features'.
# NOTE(review): the IDF model is fitted on the full dataset, before the
# train/test split performed later in the notebook -- confirm this is intended.
idf_model = IDF(inputCol='hash', outputCol='features').fit(wrangled)
tf_idf = idf_model.transform(wrangled)

tf_idf.select('terms', 'features').show(4, truncate=False)

In [None]:
from pyspark.ml.classification import LogisticRegression

# Split the data 80/20 into training and testing sets (fixed seed).
sms_train, sms_test = tf_idf.randomSplit([0.8, 0.2], seed=13)

# Train a logistic regression classifier (regParam=0.2) on the training set.
estimator = LogisticRegression(regParam=0.2)
logistic = estimator.fit(sms_train)

# Apply the fitted model to the testing set.
prediction = logistic.transform(sms_test)

# Confusion matrix: row counts for each (label, prediction) pair.
confusion = prediction.groupBy('label', 'prediction').count()
confusion.show()

In [None]:
# Tally the confusion-matrix cells with a single aggregation instead of four
# separate filter-and-count actions, each of which would re-run the pipeline
# behind `prediction`.
confusion_rows = prediction.groupBy('label', 'prediction').count().collect()

TN = TP = FN = FP = 0
for row in confusion_rows:
    # Index by field name: `row.count` would resolve to the tuple method.
    actual, predicted, n_rows = row['label'], row['prediction'], row['count']
    if actual is None or predicted is None:
        # A SQL comparison against NULL never satisfies a filter, so such
        # rows are left out of every cell.
        continue
    if predicted == 0:
        if actual == predicted:
            TN += n_rows
        else:
            FN += n_rows
    elif predicted == 1:
        if actual == predicted:
            TP += n_rows
        else:
            FP += n_rows

# Accuracy measures the proportion of correct predictions.
total = TN + TP + FN + FP
# Guard against an empty prediction set rather than raising ZeroDivisionError.
accuracy = (TN + TP) / total if total else float('nan')
print(accuracy)