### Set the SPARK_HOME and PYLIB environment variables and add the PySpark libraries to `sys.path`

In [1]:
import glob
import os
import sys

# Point this kernel at the cluster's Spark 2 client install and its bundled Python libs.
os.environ["SPARK_HOME"] = "/usr/hdp/current/spark2-client"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"

# The py4j archive name embeds a version number that changes between Spark builds,
# so look it up instead of hard-coding it. If nothing is found, fall back to the
# 0.10.4 name this notebook was originally written against.
_py4j_found = sorted(glob.glob(os.environ["PYLIB"] + "/py4j-*-src.zip"))
_py4j_zip = _py4j_found[-1] if _py4j_found else os.environ["PYLIB"] + "/py4j-0.10.4-src.zip"

# Put py4j and then pyspark at the front of sys.path (pyspark ends up first).
# Any earlier entry is removed first so re-running the cell does not add duplicates.
for _archive in (_py4j_zip, os.environ["PYLIB"] + "/pyspark.zip"):
    if _archive in sys.path:
        sys.path.remove(_archive)
    sys.path.insert(0, _archive)

### Initializing Spark

Build __SparkConf__ object 

    Contains information about your application.  


Create __SparkContext__ object 
    
    Tells Spark how to access a cluster. 
    

Create __SparkSession__ object

    The entry point to programming Spark with the Dataset and DataFrame API.

    Used to create DataFrame, register DataFrame as tables and execute SQL over tables etc.

In [2]:
from pyspark.conf import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Application settings: a display name and a single-threaded local master.
conf = SparkConf().setAppName("TM App Model Building").setMaster('local')

# Use getOrCreate() so this cell can be re-run in a live kernel: constructing a
# second SparkContext directly raises an error while one is already active.
sc = SparkContext.getOrCreate(conf=conf)

# Build (or reuse) the SparkSession on top of the same configuration.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

### Loading the dependent libraries

In [3]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

### Problem Statement

#### Description:
The SMS Spam Collection v.1 (hereafter the corpus) is a set of tagged SMS messages that have been collected for SMS spam research. It contains one set of 5,574 SMS messages in English, each tagged according to whether it is ham (legitimate) or spam.

### Reading the data and creating a dataframe

In [4]:
# Load the tab-separated training file from the local filesystem. The file has no
# header row, and column types are inferred by Spark.
train_path = 'file:///home/thomasj/Batch52/SparkStructuredStreamingKafka/SMSSpamCollectionTrain'
data = spark.read.csv(train_path, sep='\t', header=False, inferSchema=True)

In [5]:
# Quick look at the first two rows as loaded (long values are truncated by default).
data.show(n=2)

+---+--------------------+
|_c0|                 _c1|
+---+--------------------+
|ham|Go until jurong p...|
|ham|Ok lar... Joking ...|
+---+--------------------+
only showing top 2 rows



#### Rename Columns

In [6]:
# Replace the auto-generated column names with descriptive ones, one rename at a time.
for old_name, new_name in (('_c0', 'messageType'), ('_c1', 'message')):
    data = data.withColumnRenamed(old_name, new_name)

### Understanding Data

#### Print Schema

In [7]:
# Print each column's name, data type and nullability.
data.printSchema()

root
 |-- messageType: string (nullable = true)
 |-- message: string (nullable = true)



#### Total number of Columns and Records

In [8]:
# Column count comes from the schema alone and needs no Spark job.
n_columns = len(data.columns)
print("No. of Columns = {}".format(n_columns))

# count() triggers a Spark job over the data.
n_records = data.count()
print('No. of Records = {}'.format(n_records))

No. of Columns = 2
No. of Records = 5574


In [9]:
# Show the first two rows again, this time without truncating the message text.
data.show(n=2, truncate=False)

+-----------+---------------------------------------------------------------------------------------------------------------+
|messageType|message                                                                                                        |
+-----------+---------------------------------------------------------------------------------------------------------------+
|ham        |Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...|
|ham        |Ok lar... Joking wif u oni...                                                                                  |
+-----------+---------------------------------------------------------------------------------------------------------------+
only showing top 2 rows



### Data Preprocessing

Check for null values in each column

In [10]:
# isnan() only makes sense for floating-point columns. Applying it to a string column
# forces an implicit string-to-double cast, so a message whose text is literally "NaN"
# would be counted as missing. Restrict the NaN test to float/double columns.
float_columns = {name for name, dtype in data.dtypes if dtype in ("float", "double")}


def missing_condition(c):
    """Return the Column condition that is true where column `c` is missing.

    A value is missing when it is null or, for float/double columns only, NaN.
    """
    is_missing = col(c).isNull()
    if c in float_columns:
        is_missing = is_missing | isnan(c)
    return is_missing


# when() yields a value only on rows matching the condition, so count() tallies
# the missing rows per column; each result keeps its source column name.
data.select([count(when(missing_condition(c), c)).alias(c) for c in data.columns]).show()

+-----------+-------+
|messageType|message|
+-----------+-------+
|          0|      0|
+-----------+-------+



### Tokenizer

Tokenization is the process of taking text (such as a sentence) and breaking it into individual terms (usually words). Let's tokenize the messages and create a list of words of each message.

In [11]:
from pyspark.ml.feature import Tokenizer

# Tokenizer stage: reads the "message" column and writes the token list to "words".
tokenizer = (
    Tokenizer()
    .setInputCol("message")
    .setOutputCol("words")
)

###  CountVectorizer

CountVectorizer converts the list of tokens above to vectors of token counts. 

In [12]:
from pyspark.ml.feature import CountVectorizer

# CountVectorizer stage: turns the "words" token lists into count vectors in "rawFeatures".
countVectorizer = (
    CountVectorizer()
    .setInputCol("words")
    .setOutputCol("rawFeatures")
)

### Inverse Document Frequency

IDF down-weights features that appear frequently in a corpus. This generally improves performance when using text as features, since the most frequent (and hence less informative) words get down-weighted.

In [13]:
from pyspark.ml.feature import  IDF

# IDF stage: rescales the raw count vectors and writes the result to "features".
idf = (
    IDF()
    .setInputCol("rawFeatures")
    .setOutputCol("features")
)

### String Indexer

In [14]:
from pyspark.ml.feature import StringIndexer

# StringIndexer stage: encodes the "messageType" strings as numeric indices in "label".
indexer_Label = (
    StringIndexer()
    .setInputCol("messageType")
    .setOutputCol("label")
)

### Create list of preprocessing Pipeline Stages

In [15]:
# Ordered preprocessing stages: text -> tokens -> counts -> TF-IDF features, plus label encoding.
preprocessing_Stages = [
    tokenizer,
    countVectorizer,
    idf,
    indexer_Label,
]

### Build Logistic Regression Classification Model

In [16]:
from pyspark.ml.classification import LogisticRegression
# Logistic regression estimator wired to the "label" and "features" columns.
lr = (
    LogisticRegression()
    .setLabelCol("label")
    .setFeaturesCol("features")
)

In [17]:
from pyspark.ml import Pipeline

In [18]:
# Full pipeline: every preprocessing stage followed by the classifier as the last stage.
all_stages = list(preprocessing_Stages)
all_stages.append(lr)
lr_Pipeline = Pipeline(stages=all_stages)

# Fit every stage on the training DataFrame; the result is a fitted PipelineModel.
lr_Pipeline_model = lr_Pipeline.fit(data)

### Save the Pipeline Model

In [21]:
# Persist the fitted pipeline. A plain save() fails when the target path already
# exists, which breaks every re-run of the notebook after the first, so overwrite
# any previously saved model instead.
lr_Pipeline_model.write().overwrite().save("/user/thomasj/StructuredStreamingKafka/Model")