# TO DO:

- make an ensemble
    - combine predictions of both
- facebook vulgar words

# Part I: Preprocessing the Document Edits

## Step 0: Import, Initialization and Loading

IDEA: load all the part files into a single dataframe

In [1]:
import pandas as pd
import numpy as np
import emoji
import re as re
import difflib
import shutil
import sys
import os

import profanity_check as pc
from fuzzywuzzy import fuzz

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

from threading import Thread

class StreamingThread(Thread):
    def __init__(self, ssc):
        Thread.__init__(self)
        self.ssc = ssc
    def run(self):
        ssc.start()
        ssc.awaitTermination()
    def stop(self):
        print('----- Stopping... this may take a few seconds -----')
        self.ssc.stop(stopSparkContext=False, stopGraceFully=True)
        
# from pyspark import SparkContext
# SparkContext.setSystemProperty('spark.executor.memory', '2g')

from pyspark.ml import Pipeline
        
from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler,IndexToString

from pyspark.sql.types import IntegerType, StringType, ArrayType, FloatType
from pyspark.streaming import StreamingContext
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, array, col, lit, monotonically_increasing_id

from pyspark.sql.types import StructType
from pyspark.sql.types import StructField

from pyspark.ml.classification import NaiveBayes, NaiveBayesModel, RandomForestClassifier
from pyspark.ml.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.evaluation import MultilabelMetrics
from pyspark.mllib.util import MLUtils

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from nltk.stem.snowball import *

from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import multilabel_confusion_matrix





In [2]:
sc

In [3]:
spark

### Loading the data into a single dataframe

The idea is to load all the saved partfiles into a single dataframe. Next this dataframe can be used to train the model

In [4]:
def load_rdd(base_directory):
    # Get all the directory names of the saved myoutput folders
    foldernames = os.listdir(base_directory)
    
    # Create list of full directorie names
    full_directories = []
    
    for i in range(len(foldernames)):
        
        if foldernames[i] == '.DS_Store':
            continue
        
        directory_temp = base_directory + "/" + foldernames[i]
        full_directories.append(directory_temp)
    
    print("Number of directories included: ", len(full_directories))
    
    df = spark.read.format('json').load(path=full_directories)
    return df


In [5]:
# base_directory = r'/Users/Simon/Documents/GitHub/adana_task3/Spark_Cleaned/myoutput-1586797640000/part-00003'
# base_directory = r'/Users/Simon/Documents/GitHub/adana_task3/Spark_Cleaned/myoutput-1586797640000'
# base_directory = r'/Users/Simon/Documents/GitHub/adana_task3/Spark_Data_Limited'
base_directory = r'/Users/Simon/Documents/GitHub/adana_task3/Spark_Cleaned'

df = load_rdd(base_directory)
print('total nr of instances =  ',len(df.toPandas()))
# df.show()
    

Number of directories included:  1120
total nr of instances =   2703


### Filtering the loaded data 

The goal of this step is to obtain a more balanced dataset

In [6]:
def filter_data(dataframe): #input a spark dataframe
    
    df = dataframe.toPandas()
    
    df_unsafe = df[df['label'] == 'unsafe']
    df_vandal = df[df['label'] == 'vandal']
    
    max_label = len(df_unsafe) + len(df_vandal)
    
    df_safe = df[df['label'] == 'safe'].sample(n = max_label,random_state=42)
    
    df = df_safe.append(df_unsafe)
    df = df.append(df_vandal)
    df = df.sample(frac=1).reset_index(drop=True)
    df = sqlContext.createDataFrame(df)
    
    return df

In [7]:
df_selection = filter_data(df)
print('total nr of instances =  ',len(df_selection.toPandas()))
df_selection.show()

## To run faster we only take a selection of the data:
# df_selection = df.toPandas().tail(2500)
# df_selection = sqlContext.createDataFrame(df_selection)
df_selection = df_selection.repartition(200)
df_selection.rdd.getNumPartitions()

total nr of instances =   692
+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|             comment| label|           name_user|            text_new|            text_old|          title_page|            url_page|
+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Fix placename lin...|  safe|              Certes|{{About|the India...|{{Short descripti...|           Rajasthan|//en.wikipedia.or...|
|→‎Geography and g...|  safe|            Bneu2013|{{Other uses|Copp...|{{Other uses|Copp...|Copper Basin (Ten...|//en.wikipedia.or...|
|→‎Biography:that ...|  safe|        Lisapollison|{{short descripti...|{{short descripti...|     Marijohn Wilkin|//en.wikipedia.or...|
|Update date forma...|  safe|     Rich Farmbrough|{{short descripti...|{{short descripti...|          Petr Málek|//en.wikipedia.or...|
|    (→‎Plot summary)|uns

200

### Checking the frequency of each label

In [8]:
df_selection.groupBy("label") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+------+-----+
| label|count|
+------+-----+
|  safe|  346|
|unsafe|  314|
|vandal|   32|
+------+-----+



## Step 1a: Tokenization & Normalization

The regexTokenizer is used because of its extra functionality compared to the standard Tokenizer built into spark. Also useful is that the tokens are normalized (decapitalized). 

In [9]:
def tokenize(dataframe):
    df = dataframe
    
    rt_old = RegexTokenizer(inputCol="text_old", outputCol="words_old", toLowercase=True, pattern=("\\W"))
    countTokens = udf(lambda words: len(words), IntegerType())

    # regexTokenized_old = rt_old.transform(df)
    # df_step1a = regexTokenized_old.withColumn("tokens_old", countTokens(col("words_old")))

    regexTokenized_old = rt_old.transform(df)
    df = regexTokenized_old.withColumn("tokens_old", countTokens(col("words_old")))

    #########################################################################################

    rt_new = RegexTokenizer(inputCol="text_new", outputCol="words_new", toLowercase=True, pattern=("\\W"))
    # regexTokenized_new = rt_new.transform(df_step1a)
    # df_step1b = regexTokenized_new.withColumn("tokens_new", countTokens(col("words_new")))

    regexTokenized_new = rt_new.transform(df)
    df = regexTokenized_new.withColumn("tokens_new", countTokens(col("words_new")))

    # df_step1b.show(truncate=False)
    
    print("Step 1a Done")
    return df


## Step 1b: Delta Generator

In this crucial step the difference between input and output text is determined. The difference is found using the unified_diff function accesible in through the difflib python library. The function takes two lists of strings as inputs and computes the deleted and inserted (replaced) words. This difference is used to later classify the text edit.

In [10]:
def text_difference(text_old,text_new):

    new_words = []
    deleted_words = []

    for line in difflib.unified_diff(text_old, text_new, fromfile='before.txt', tofile='after.txt'):
    #     sys.stdout.write(line)
        if "-" in line and " " not in line:
            new_line = line.replace("-", "")
            deleted_words.append(new_line)
        elif "+" in line and " " not in line:
            new_line = line.replace("+", "")
            new_words.append(new_line)

    # print("Deleted words: ", deleted_words)
    # print("Inserted words: ", new_words)

    edited_words = deleted_words + new_words
    
    ## Need to built in a protection mechanism for some of the edits which are massive
    ## Some pranksters copy the same sentence millions of times which breaks the vectorizer
    
    threshold_editsize = 300
    massive_edit = 0.0
    
    if len(edited_words) >= threshold_editsize:
        
        massive_edit = 1.0
        
        if len(set(edited_words)) >= threshold_editsize:
            edited_words = ['massive', 'edit']
        
        else:
            edited_words = list(set(edited_words))
            
    
    
    return edited_words,massive_edit

# text_old = df_step1b.select("words_old").collect()[0][0]
# text_new = df_step1b.select("words_new").collect()[0][0]
# edited_words = text_difference(text_old,text_new)


### User Defined Function

This code calculated the difference between the input and output text. This is accomplished by defining a UDF and a seperate function arrayUdf(). The udf is called on two columns *'words_old'* and *'words_new'*. Next a lambda function is defined to iterate over each row of the two input columns. Within the udf is refered to another function arrayUdf() which requires two inputs: the two tokenized lists of words which will be used to compute the difference. The arrayUdf() function acts as an itermediary to call on a different function: text_difference(). The text_difference() function uses the unified_diff generator from the difflib package to return the deltas between two lists of strings.

Through experimentation with the unified_diff generator, we found that it was much easier to first tokenize the input and output text and then compute the difference between the two tokenized lists of words. This in contrast to passing the two texts (*'text_old'* and *'text_new'*) of the rdd's as input directly and then tokenizing this *'difference_text'*. Although the latter method might create less computational overhead due to less tokenization, the former method proves to be much more reliable to determine which words have been deleted and which words are new.

In [11]:
def diff_text(dataframe):
    df = dataframe
    
    def arrayUdf1(text_old,text_new):
        edited_words = text_difference(text_old,text_new)[0]
        return edited_words
    
    def arrayUdf2(text_old,text_new):
        massive_edit = text_difference(text_old,text_new)[1]
        return massive_edit

    #calling udf function
    callArrayUdf1 = udf(lambda row: arrayUdf1(row[0],row[1]), ArrayType(StringType()))
    callArrayUdf2 = udf(lambda row: arrayUdf2(row[0],row[1]), FloatType())

    #registering udf function
    spark.udf.register("callArrayUdf1",callArrayUdf1)
    spark.udf.register("callArrayUdf2",callArrayUdf2)

    #results of udf function
    df = df.withColumn("diff_text", callArrayUdf1(struct('words_old','words_new')))
    df = df.withColumn("massive_edit", callArrayUdf2(struct('words_old','words_new')))
    
    print("Step 1b Done")
    return df

In [12]:
# Cross validate the change by checking the difference in number of tokens created

# tokens_old = df_step1b.select("tokens_old").collect()[0][0]
# tokens_new = df_step1b.select("tokens_new").collect()[0][0]

# diff_tokens = tokens_new - tokens_old
# print("The difference in number of tokens for input and output text = ", diff_tokens)

## Step 2: Stop Word Removal

In [13]:
def stop_words_removal(dataframe):

    df = dataframe
    
    locale = sc._jvm.java.util.Locale
    locale.setDefault(locale.forLanguageTag("en-US"))

    stopwords = StopWordsRemover.loadDefaultStopWords("english")
    extra_stopwords = ["http","https","ref","www","com","org","url","web"]
    stopwords = stopwords + extra_stopwords
    # print(stopwords)

    remover = StopWordsRemover(inputCol="diff_text", outputCol="words_clean",stopWords=stopwords)
    stopwords = remover.getStopWords()


    # df_step2 = remover.transform(df_step1c)

    # df_step2.select("words_clean").show(truncate=False)

    df = remover.transform(df)

    # (inputCol="words", outputCol="filtered",stopWords=StopWordsRemover.loadDefaultStopWords("english"))
    # remover.transform(df_tokenized).show(truncate=False)
    
    print("Step 2 Done")
    return df

## Step 3: Stemming

The chosen algorithm for stemming is the snowball stemming algorithm (a variant of the Porter algorithm). The snowball stemmer was chosen because it is slightly more aggresive at stemming the tokenized words than the standard Porter algorithm while still being less aggresive than the Lancaster algorithm. It is a nice 'middle ground' between the two stemming variants.

In [14]:
def stemming(dataframe):
    
    df = dataframe
    
    # stemmer = SnowballStemmer('english')
    stemmer = PorterStemmer()

    stemmer_udf = udf(lambda tokens: [stemmer.stem(token) for token in tokens], ArrayType(StringType()))

    # df_step3 = df_step2.withColumn("words_stemmed", stemmer_udf("words_clean"))

    df = df.withColumn("words_stemmed", stemmer_udf("words_clean"))

    # df_step3.select("words_stemmed").show(truncate=False)
    
    print("Step 3 Done")
    return df

## Step 4: Feature Vectorization (TF-IDF)

In [15]:
def vectorization(dataframe):
    
    df = dataframe
    
    tf = HashingTF(inputCol="words_stemmed", outputCol="tf")#, numFeatures=20)

    # df_step4a = tf.transform(df_step3)
    df = tf.transform(df)


    idf = IDF(inputCol="tf", outputCol="tf_idf")
    # idfModel = idf.fit(df_step4a)
    # df_step4b = idfModel.transform(df_step4a)

    idfModel = idf.fit(df)
    df = idfModel.transform(df)

    # df_step4a.show(truncate=False)
    # df_step4b.select("words_stemmed","tf_idf").show(truncate=False)
    
    print("Step 4 Done")
    return df

## Step 5: String Indexer

In this final step the labels (*Safe, Unsafe and Vandal*) are encoded to label indices. The most frequent label gets index 0 while the least frequent label gets the last index depending on the number of indices. In this case the least frequent label gets index 2.

In our data 0 corresponds to *Safe*, 1 corresponds to *Unsafe*, and 2 corresponds to *Vandal*.

In [16]:
def string_indexer(dataframe):
    
    df = dataframe
    
    label_indexer = StringIndexer(inputCol = "label", outputCol = "label_index")
    
#     indexToLabel = label_indexer.labels
    
    # df_step5a = label_indexer.fit(df_step4b).transform(df_step4b)
    # df_step5b = df_step5a.select("tf_idf","label_index")

    #  # Renaming the columns
    # df_final = df_step5b.withColumnRenamed("tf_idf","features")
    # df_final = df_final.withColumnRenamed("label_index","label")

    df = label_indexer.fit(df).transform(df)
    

    # # Renaming the columns
    df_label = df.select("tf_idf","label_index")
    df_label = df_label.withColumnRenamed("label_index","actual_label")

    # df_final.show()
    
    print("Step 5 Done")
    return df , df_label #, indexToLabel

## Step 6: Assembly of the preprocessing steps

In [17]:
def preprocessing(dataframe):
    
    
    df = dataframe
    
    df = tokenize(df)
    df = diff_text(df)
    df = stop_words_removal(df)
    df = stemming(df)
    df = vectorization(df)
    df , df_label = string_indexer(df)
    
    return df,df_label



##### Preprocessing the training data #####
df_final , df_label = preprocessing(df_selection)



Step 1a Done
Step 1b Done
Step 2 Done
Step 3 Done
Step 4 Done
Step 5 Done


# Part II: Ancillary Features



In [18]:
# old_text = ["car", "train", "boat"]
# new_text = ["car", "train", "boat", "biiike", "pl4n3"]
# diff_text = ["biiike", "pl4n3", "pl4n3"]

# word = 'biiike'

In [19]:
def maxRepeat(diff_text):
    
    h = len(diff_text)
    count = 0
    
    for i in range(0, h):
        l = len(diff_text[i])
        
        #Find the maximum repeating character
                   
        for j in range(0, l):
            cur_count = 1
            for k in range(j + 1, l):
                if(diff_text[i][j] != diff_text[i][k]):
                    break
                cur_count += 1
                                
                #update result if required
                if cur_count > count:
                    count = cur_count
                            
    return count

# print(maxRepeat(diff_text))



In [20]:
#empty revision
##checks if edit is is empty or non empty

def empty(text_list):
    if len(text_list) == 0 or text_list[0] == "empty":
        empty = 1
    else:
        empty = 0

    return empty

In [21]:
#ratio between old text and new text; if  > 1 new text is longer than old text
#i would consider it vandal if there is a significant deviation from 1

def size_ratio(old_text_list, new_text_list):
    len_old_text = len(old_text_list)
    len_new_text = len(new_text_list)
    
    if len_old_text == 0:
        ratio = 0.0
    else:
        ratio = round(len_new_text / len_old_text,3)
    
    return ratio

In [22]:
#counts the alphanumberic strings in the diffrence list eg (dkfdj125kd,...) the strings with numbers and letters
## Since these strings are likely to be vandal
## Absolute count or ratio better?

def alphanumeric_count(difference_list):
    alpha_num = 0
    for element in difference_list:
        if element.isdigit():
            continue
        elif element.isalpha():
            continue
        else:
            alpha_num += 1
    
    return alpha_num

In [23]:
#ratio of vulgar words in the edit

def vulgar(old_text_list,difference_list):
    if len(old_text_list) == 0:
        old_text_list = ['empty']
    
    if len(difference_list) == 0:
        difference_list = ['empty']
    
    vulgar_list_edit = pc.predict_prob(difference_list)
    vulgar_list_old = pc.predict_prob(old_text_list)
    count1 = 0
    count2 = 0
    
    for i in vulgar_list_edit:
        count1 += i
        
    for k in vulgar_list_old:
        count2 += k
        
    ratio1 = count1 #/ len(difference_list)
    
    if count2 == 0:
        ratio2 = count1
    
    else:
        ratio2 = round(count1 / count2,3)
        
    return float(ratio2)

In [24]:
#Gives a similarity metric between original and new text
#How less similar the more suspicous

def similarity(old_text_list, new_text_list):
    old = ''.join(old_text_list)
    new = ''.join(new_text_list)
    ratio = round(fuzz.token_set_ratio(old, new)/100,3)
    
    return ratio

In [25]:
def compute_ancillary_features(dataframe):
    df = dataframe
    
    ## UDF for computing longest repeated character
    cafUdf1 = udf(lambda row: maxRepeat(row[2]), IntegerType())
    spark.udf.register("cafUdf1", cafUdf1)
    df = df.withColumn("longest_repeated_char", cafUdf1(struct('words_old','words_new','diff_text')))
    
    ## UDF for checking if edit is empty or not
    cafUdf2 = udf(lambda row: empty(row[2]), IntegerType())
    spark.udf.register("cafUdf2", cafUdf2)
    df = df.withColumn("empty_edit", cafUdf2(struct('words_old','words_new','diff_text')))
    
    ## UDF to determine size ratio between input and output text
    cafUdf3 = udf(lambda row: size_ratio(row[0],row[1]), FloatType())
    spark.udf.register("cafUdf3", cafUdf3)
    df = df.withColumn("size_ratio", cafUdf3(struct('words_old','words_new','diff_text')))
    
    ## UDF to determine number of alphanumeric words in an edit
    cafUdf4 = udf(lambda row: alphanumeric_count(row[2]), IntegerType())
    spark.udf.register("cafUdf4", cafUdf4)
    df = df.withColumn("alpha_count", cafUdf4(struct('words_old','words_new','diff_text')))
    
    ## UDF to determine the ratio of vulgar words in the text
    cafUdf5 = udf(lambda row: vulgar(row[0],row[2]), FloatType())
    spark.udf.register("cafUdf5", cafUdf5)
    df = df.withColumn("vulgar_ratio", cafUdf5(struct('words_old','words_new','diff_text')))
    
    ## UDF to determine the ratio of vulgar words in the text
    cafUdf6 = udf(lambda row: similarity(row[0],row[1]), FloatType())
    spark.udf.register("cafUdf6", cafUdf6)
    df = df.withColumn("similarity", cafUdf6(struct('words_old','words_new','diff_text')))
    
    print("Ancillary Features Done")
    return df

In [26]:
df_ancillary = compute_ancillary_features(df_final)

# pd_df_ancillary = df_ancillary.toPandas()
# pd_df_ancillary = pd_df_ancillary[pd_df_ancillary['empty_edit'] == 0]
# df_ancillary = sqlContext.createDataFrame(pd_df_ancillary)

df_ancillary = df_ancillary.select('diff_text','massive_edit','longest_repeated_char','empty_edit','size_ratio','alpha_count',\
                                   'vulgar_ratio','similarity','label_index')
df_ancillary.show()


Ancillary Features Done
+--------------------+------------+--------------------+---------------------+----------+----------+-----------+------------+----------+-----------+
|           diff_text|massive_edit|              tf_idf|longest_repeated_char|empty_edit|size_ratio|alpha_count|vulgar_ratio|similarity|label_index|
+--------------------+------------+--------------------+---------------------+----------+----------+-----------+------------+----------+-----------+
|                  []|         0.0|      (262144,[],[])|                    0|         1|       1.0|          0|       0.002|       1.0|        0.0|
|[i, realy, was, f...|         0.0|(262144,[125372,1...|                    0|         0|     1.002|          0|       0.002|       1.0|        1.0|
|[entered, public,...|         0.0|(262144,[3268,605...|                    3|         0|     0.994|          0|       0.093|      0.94|        0.0|
|[ford, 2011, ford...|         0.0|(262144,[31587,49...|                    2|    

In [53]:
df_temp = df_ancillary.toPandas()
df_temp = df_temp[df_temp['label_index']==2.0]
df_temp = df_temp[['diff_text','massive_edit','longest_repeated_char','vulgar_ratio','label_index']]
df_temp.head(32)

Unnamed: 0,diff_text,massive_edit,longest_repeated_char,vulgar_ratio,label_index
73,"[haemo, nowiki, lytic]",0.0,0,0.002,2.0
82,"[and, there, is, an, actual, child, kyrie, ele...",0.0,2,0.021,2.0
104,"[big, tucker, is, awesome, mom]",0.0,0,0.003,2.0
113,"[wiktionary, card, card, or, the, card, may, r...",0.0,2,0.151,2.0
122,"[jgomora, is, the, most, awesome, show, ever]",0.0,0,0.084,2.0
125,"[any, of, the, people, who, smell, also, see, ...",0.0,2,0.2,2.0
155,"[hey, hey, peopl, este, h, here]",0.0,0,0.018,2.0
193,"[booty, juice]",0.0,2,0.001,2.0
216,"[natalia, aka, beanbear, loves, her, brother, ...",0.0,0,0.026,2.0
251,"[abby, lee, miller, is, famous, for, hit, danc...",0.0,2,0.006,2.0


# Part III: Building Models

## Multinomial Naive Bayes Classifier

In [28]:
# rdd_final = df_final.rdd
(training_data_nb, test_data_nb) = df_label.randomSplit([0.7, 0.3], seed = 42)


In [29]:
# Show the frequency of each label in the test set
# training_data_nb.groupBy("actual_label") \
#     .count() \
#     .orderBy(col("count").desc()) \
#     .show()

In [30]:
# Show the frequency of each label in the test set
# test_data_nb.groupBy("actual_label") \
#     .count() \
#     .orderBy(col("count").desc()) \
#     .show()

In [31]:
# from pyspark.ml.classification import LogisticRegression

# lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
# lrModel = lr.fit(training_data)
# predictions = lrModel.transform(test_data)

In [32]:
nb = NaiveBayes(smoothing=1.0, modelType="multinomial",featuresCol='tf_idf', labelCol='actual_label',thresholds = [0.99,0.99,0.05])
model_nb = nb.fit(training_data_nb)
predictions_nb = model_nb.transform(test_data_nb)

# # Convert indexed labels back to original labels.
# label_converter = IndexToString(inputCol="prediction", outputCol="predictedLabel",labels=indexToLabel)
# predictions_nb = label_converter.fit(predictions_nb).transform(predictions_nb)

In [33]:
# predictions.filter(predictions['prediction'] == 1).select('label','probability','prediction').show(truncate=False)

In [34]:
# predictions.filter(predictions['prediction'] == 2).select('label','probability','prediction').show(truncate=False)

In [35]:
# predictions.filter(predictions['label'] == 1).select('label','probability','prediction').show(truncate=False)

In [36]:
# predictions.filter(predictions['label'] == 2).select('label','probability','prediction').show(truncate=False)

### Saving the Model
Use pickle package to save a model to a file and load a model from a file.


In [37]:
# output_dir = r'/Users/Simon/Documents/GitHub/adana_task3'
# shutil.rmtree(output_dir, ignore_errors=True)
# model_nb.save(sc, output_dir)
# sameModel = nb.load(sc, output_dir)

## Random Forest Classisfier

In [38]:
def vector_assembler(dataframe):
    
    df_ancillary = dataframe
    assembler = VectorAssembler(inputCols=['massive_edit','longest_repeated_char','empty_edit','size_ratio','alpha_count','vulgar_ratio','similarity'],\
                                outputCol='features')

    df_ancillary = assembler.transform(df_ancillary)
    df_ancillary_vector = df_ancillary.select('features','label_index')
    
    return df_ancillary_vector


+--------------------+-----------+
|            features|label_index|
+--------------------+-----------+
|[0.0,0.0,1.0,1.0,...|        0.0|
|(7,[3,5,6],[1.001...|        1.0|
|[0.0,3.0,0.0,0.99...|        0.0|
|[0.0,2.0,0.0,1.0,...|        1.0|
|(7,[1,3,6],[2.0,1...|        1.0|
|[1.0,3.0,0.0,0.96...|        1.0|
|[0.0,2.0,0.0,1.00...|        0.0|
|(7,[2,3,6],[1.0,1...|        0.0|
|[0.0,2.0,0.0,1.00...|        1.0|
|[0.0,2.0,0.0,1.00...|        1.0|
|(7,[3,5,6],[1.0,0...|        0.0|
|(7,[3,5,6],[1.0,0...|        0.0|
|[0.0,2.0,0.0,1.02...|        1.0|
|[0.0,2.0,0.0,1.02...|        1.0|
|[0.0,2.0,0.0,1.0,...|        0.0|
|[0.0,2.0,0.0,1.00...|        0.0|
|[0.0,2.0,0.0,1.00...|        1.0|
|[0.0,2.0,0.0,1.00...|        0.0|
|(7,[3,5,6],[1.001...|        0.0|
|[0.0,2.0,0.0,0.98...|        1.0|
+--------------------+-----------+
only showing top 20 rows



In [39]:
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = df_ancillary_vector.randomSplit([0.7, 0.3], seed = 42)

# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="label_index", featuresCol="features", numTrees=30)

# Convert indexed labels back to original labels.
# labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
#                                labels=labelIndexer.labels)

# Chain indexers and forest in a Pipeline
# pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf, labelConverter])

# Train model.  This also runs the indexers.
model_rf = rf.fit(trainingData)

# Make predictions.
predictions_rf = model_rf.transform(testData)
predictions_rf.show()

+--------------------+-----------+--------------------+--------------------+----------+
|            features|label_index|       rawPrediction|         probability|prediction|
+--------------------+-----------+--------------------+--------------------+----------+
|[0.0,0.0,1.0,1.0,...|        0.0|[24.5447292525366...|[0.81815764175122...|       0.0|
|[0.0,3.0,0.0,0.99...|        0.0|[8.66329371538427...|[0.28877645717947...|       1.0|
|[1.0,3.0,0.0,0.96...|        1.0|[10.1696412642093...|[0.33898804214031...|       1.0|
|(7,[3,5,6],[1.0,0...|        0.0|[17.2172226493089...|[0.57390742164363...|       0.0|
|[0.0,2.0,0.0,1.0,...|        0.0|[8.73413083712336...|[0.29113769457077...|       1.0|
|[0.0,2.0,0.0,1.00...|        1.0|[10.5573386590045...|[0.35191128863348...|       1.0|
|(7,[1,3,6],[2.0,1...|        0.0|[8.73413083712336...|[0.29113769457077...|       1.0|
|[0.0,2.0,0.0,0.98...|        1.0|[11.5818915382587...|[0.38606305127529...|       1.0|
|(7,[3,5,6],[1.289...|        0.

# Part IV - A: Evaluation of the Naive Bayes Model

## Accuracy


In [40]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol='actual_label')
accuracy_nb = evaluator.evaluate(predictions_nb)
print(accuracy_nb)
# Result = 0.7410821256831497
# Result = 0.8053634745632435
# Result = 0.8838408650684599
# Result = 0.911560434636496

0.48244730584541506


## Precision and Recall

For the precision and recall measures we fall back on Sklearn packages because their implementation is much more straight forward as compared to the mllib packages. We are mostly interested in the unsafe and vandal isntances!

**Binary Classification:**

- Precision = TruePositives / (TruePositives + FalsePositives)
- Recall = TruePositives / (TruePositives + FalseNegatives)

**Multilabel Classification:**

In an imbalanced classification problem with more than two classes, precision is calculated as the sum of true positives across all classes divided by the sum of true positives and false positives across all classes.

- Precision = Sum c in C TruePositives_c / Sum c in C (TruePositives_c + FalsePositives_c)


In an imbalanced classification problem with more than two classes, recall is calculated as the sum of true positives across all classes divided by the sum of true positives and false negatives across all classes.

- Recall = Sum c in C TruePositives_c / Sum c in C (TruePositives_c + FalseNegatives_c)


In [41]:
pd_predictions_nb = predictions_nb.toPandas()
# print(pd_predictions_nb[pd_predictions_nb.prediction == 2.0].count())


y_true = pd_predictions_nb['actual_label'].to_list()
y_pred = pd_predictions_nb['prediction'].to_list()

# precision_recall_fscore_support(y_true, y_pred, average='micro')
# precision_recall_fscore_support(y_true, y_pred, average='macro')
precision_recall_fscore_support(y_true, y_pred, average='weighted')
# precision_recall_fscore_support(y_true, y_pred, average=None, labels=[0.0, 1.0, 2.0])
# pd_predictions_nb.head()

(0.6208214103838786, 0.4642857142857143, 0.482447305845415, None)

## Confusion Matrices and Manual Calculations

In [42]:
cm_arr = multilabel_confusion_matrix(y_true, y_pred,labels=[0.0, 1.0, 2.0])
cm_arr_safe = cm_arr[0,:,:]
cm_arr_unsafe = cm_arr[1,:,:]
cm_arr_vandal = cm_arr[2,:,:]

cm_safe = pd.DataFrame({'Pred_Other': cm_arr_safe[:,0], 'Pred_0': cm_arr_safe[:,1]})
cm_safe = cm_safe.rename(index={0: 'Act_Other', 1: 'Act_0'})

cm_unsafe = pd.DataFrame({'Pred_Other': cm_arr_unsafe[:,0], 'Pred_1': cm_arr_unsafe[:,1]})
cm_unsafe = cm_unsafe.rename(index={0: 'Act_Other', 1: 'Act_1'})

cm_vandal = pd.DataFrame({'Pred_Other': cm_arr_vandal[:,0], 'Pred_2': cm_arr_vandal[:,1]})
cm_vandal = cm_vandal.rename(index={0: 'Act_Other', 1: 'Act_2'})

cm_vandal.head()

Unnamed: 0,Pred_Other,Pred_2
Act_Other,170,44
Act_2,10,0


In [43]:
precision_safe = cm_arr_safe[1,1]/(cm_arr_safe[1,1] + cm_arr_safe[0,1])
precision_unsafe = cm_arr_unsafe[1,1]/(cm_arr_unsafe[1,1] + cm_arr_unsafe[0,1])
precision_vandal = cm_arr_vandal[1,1]/(cm_arr_vandal[1,1] + cm_arr_vandal[0,1])

recall_safe = cm_arr_safe[1,1]/(cm_arr_safe[1,1] + cm_arr_safe[1,0])
recall_unsafe = cm_arr_unsafe[1,1]/(cm_arr_unsafe[1,1] + cm_arr_unsafe[1,0])
recall_vandal = cm_arr_vandal[1,1]/(cm_arr_vandal[1,1] + cm_arr_vandal[1,0])

print("Precision for Label 0: ", precision_safe)
print("Precision for Label 1: ", precision_unsafe)
print("Precision for Label 2: ", precision_vandal)

print("Recall for Label 0: ", recall_safe)
print("Recall for Label 1: ", recall_unsafe)
print("Recall for Label 2: ", recall_vandal)

tp_tot = cm_arr_safe[1,1]+cm_arr_unsafe[1,1]+cm_arr_vandal[1,1]
fp_tot = cm_arr_safe[0,1]+cm_arr_unsafe[0,1]+cm_arr_vandal[0,1]
fn_tot = cm_arr_safe[1,0]+cm_arr_unsafe[1,0]+cm_arr_vandal[1,0]

precision_tot = (tp_tot / (tp_tot + fp_tot))
recall_tot = ( tp_tot / (tp_tot + fn_tot ))

print("Total Precision: ", precision_tot)
print("Total Recall: ", recall_tot)
              

Precision for Label 0:  0.7674418604651163
Precision for Label 1:  0.5182481751824818
Precision for Label 2:  0.0
Recall for Label 0:  0.2920353982300885
Recall for Label 1:  0.7029702970297029
Recall for Label 2:  0.0
Total Precision:  0.4642857142857143
Total Recall:  0.4642857142857143


# Part IV - B: Evaluation of the Random Forest Model

In [44]:
evaluator2 = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol='label_index')
accuracy_rf = evaluator2.evaluate(predictions_rf)
print(accuracy_rf)

0.6005569643385751


In [45]:
pd_predictions_rf = predictions_rf.toPandas()
# print(pd_predictions_nb[pd_predictions_nb.prediction == 2.0].count())


y_true = pd_predictions_rf['label_index'].to_list()
y_pred = pd_predictions_rf['prediction'].to_list()

# precision_recall_fscore_support(y_true, y_pred, average='micro')
# precision_recall_fscore_support(y_true, y_pred, average='macro')
precision_recall_fscore_support(y_true, y_pred, average='weighted')
# precision_recall_fscore_support(y_true, y_pred, average=None, labels=[0.0, 1.0, 2.0])
# pd_predictions_nb.head()

(0.5858528919776506, 0.6160714285714286, 0.6005569643385751, None)

In [46]:
cm_arr = multilabel_confusion_matrix(y_true, y_pred,labels=[0.0, 1.0, 2.0])
cm_arr_safe = cm_arr[0,:,:]
cm_arr_unsafe = cm_arr[1,:,:]
cm_arr_vandal = cm_arr[2,:,:]

cm_safe = pd.DataFrame({'Pred_Other': cm_arr_safe[:,0], 'Pred_0': cm_arr_safe[:,1]})
cm_safe = cm_safe.rename(index={0: 'Act_Other', 1: 'Act_0'})

cm_unsafe = pd.DataFrame({'Pred_Other': cm_arr_unsafe[:,0], 'Pred_1': cm_arr_unsafe[:,1]})
cm_unsafe = cm_unsafe.rename(index={0: 'Act_Other', 1: 'Act_1'})

cm_vandal = pd.DataFrame({'Pred_Other': cm_arr_vandal[:,0], 'Pred_2': cm_arr_vandal[:,1]})
cm_vandal = cm_vandal.rename(index={0: 'Act_Other', 1: 'Act_2'})

cm_vandal.head()

Unnamed: 0,Pred_Other,Pred_2
Act_Other,211,1
Act_2,12,0


In [47]:
precision_safe = cm_arr_safe[1,1]/(cm_arr_safe[1,1] + cm_arr_safe[0,1])
precision_unsafe = cm_arr_unsafe[1,1]/(cm_arr_unsafe[1,1] + cm_arr_unsafe[0,1])
precision_vandal = cm_arr_vandal[1,1]/(cm_arr_vandal[1,1] + cm_arr_vandal[0,1])

recall_safe = cm_arr_safe[1,1]/(cm_arr_safe[1,1] + cm_arr_safe[1,0])
recall_unsafe = cm_arr_unsafe[1,1]/(cm_arr_unsafe[1,1] + cm_arr_unsafe[1,0])
recall_vandal = cm_arr_vandal[1,1]/(cm_arr_vandal[1,1] + cm_arr_vandal[1,0])

print("Precision for Label 0: ", precision_safe)
print("Precision for Label 1: ", precision_unsafe)
print("Precision for Label 2: ", precision_vandal)

print("Recall for Label 0: ", recall_safe)
print("Recall for Label 1: ", recall_unsafe)
print("Recall for Label 2: ", recall_vandal)

tp_tot = cm_arr_safe[1,1]+cm_arr_unsafe[1,1]+cm_arr_vandal[1,1]
fp_tot = cm_arr_safe[0,1]+cm_arr_unsafe[0,1]+cm_arr_vandal[0,1]
fn_tot = cm_arr_safe[1,0]+cm_arr_unsafe[1,0]+cm_arr_vandal[1,0]

precision_tot = (tp_tot / (tp_tot + fp_tot))
recall_tot = ( tp_tot / (tp_tot + fn_tot ))

print("Total Precision: ", precision_tot)
print("Total Recall: ", recall_tot)

Precision for Label 0:  0.6052631578947368
Precision for Label 1:  0.6330275229357798
Precision for Label 2:  0.0
Recall for Label 0:  0.6448598130841121
Recall for Label 1:  0.6571428571428571
Recall for Label 2:  0.0
Total Precision:  0.6160714285714286
Total Recall:  0.6160714285714286


# Part V: Running the Models in a Streaming Setup

In [48]:
globals()['models_loaded'] = False

def process(time, rdd):
    if rdd.isEmpty():
        return
    
    print("===================== %s =====================" % str(time))
    
    ## Convert to data frame
    df_pred = spark.read.json(rdd)
    print("Incoming Dataframe: ")
    df_pred.show()

    
    ## Preprocessing the incoming dataframe 
    df_pred_final , df_pred_label = preprocessing(df_pred)
    print("Preprocessed Dataframe: ")
    df_pred_final.show()
    
    ## Computing ancillary features
    df_pred_ancillary = compute_ancillary_features(df_final)
    df_ancillary_vector = vector_assembler(df_pred_ancillary)
    
    ## Load in the model if not yet loaded:
    if not globals()['models_loaded']:
        # load in your models here
        globals()['my_model_nb'] = model_nb
        globals()['my_model_rf'] = model_rf
        globals()['models_loaded'] = True #Update the control to notify model is loaded
        
    # Predict using the loaded model: 
    df_result_nb = globals()['my_model_nb'].transform(df_pred_label)
    df_result_rf = globals()['my_model_rf'].transform(df_ancillary_vector)
    
    print("Predicted Result for Naive Bayes Classifier: ")
    df_result_nb.show()
    
    print("Predicted Result for Random Forest Classifier: ")
    df_result_rf.show()

    

In [49]:
# ssc = StreamingContext(sc, 10)

In [50]:
# lines = ssc.socketTextStream("seppe.net", 7778)
# lines.foreachRDD(process)

In [51]:
# ssc_t = StreamingThread(ssc)
# ssc_t.start()

In [52]:
# ssc_t.stop()