# Part I: Preprocessing the Document Edits

## Step 0: Import, Initialization and Loading

IDEA: load all the part files into a single dataframe

In [1]:
import pandas as pd
import numpy as np
import difflib
import sys
import os

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

from threading import Thread

class StreamingThread(Thread):
    def __init__(self, ssc):
        Thread.__init__(self)
        self.ssc = ssc
    def run(self):
        ssc.start()
        ssc.awaitTermination()
    def stop(self):
        print('----- Stopping... this may take a few seconds -----')
        self.ssc.stop(stopSparkContext=False, stopGraceFully=True)

from pyspark.ml import Pipeline
        
from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover, HashingTF, IDF, OneHotEncoder, StringIndexer, VectorAssembler

from pyspark.sql.types import IntegerType, StringType, ArrayType
from pyspark.streaming import StreamingContext
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, array, col, lit, monotonically_increasing_id

from pyspark.sql.types import StructType
from pyspark.sql.types import StructField

from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
from pyspark.ml.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

from nltk.stem.snowball import *



In [2]:
sc

In [3]:
spark

### Loading the data into a single dataframe

The idea is to load all the saved partfiles into a single dataframe. Next this dataframe can be used to train the model

In [4]:
def load_rdd(base_directory):
    # Get all the directory names of the saved myoutput folders
    foldernames = os.listdir(base_directory)
    
    # Create a dataframe from the rdds in the first folder of the list
    first_directory = base_directory + "/" + foldernames[0]
    rdd = sc.textFile(first_directory)
    df = spark.read.json(rdd)
    
    # Remove this folder from the list to prevent it from being added twice
    foldernames.remove(foldernames[0])
    
    for i in range(len(foldernames)):
        
        if foldernames[i] == '.DS_Store':
            continue
        
        directory = base_directory + "/" + foldernames[i]
        rdd = sc.textFile(directory)
        df_temp = spark.read.json(rdd)
        df = df.union(df_temp)

    return df

# base_directory = r'/Users/Simon/Documents/GitHub/adana_task3/Spark_Cleaned/myoutput-1586797640000/part-00003'
# base_directory = r'/Users/Simon/Documents/GitHub/adana_task3/Spark_Cleaned/myoutput-1586797640000'
base_directory = r'/Users/Simon/Documents/GitHub/adana_task3/Spark_Data_Limited'
# base_directory = r'/Users/Simon/Documents/GitHub/adana_task3/Spark_Cleaned'

df = load_rdd(base_directory)    
df.show()

# df.select('text_new').collect()

+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|             comment| label|           name_user|            text_new|            text_old|          title_page|            url_page|
+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                    |  safe|            SnapSnap|{{Use dmy dates|d...|{{Use dmy dates|d...| Love Wedding Repeat|//en.wikipedia.or...|
|                    |  safe|           Altin0000|{{short descripti...|{{short descripti...|2020 coronavirus ...|//en.wikipedia.or...|
|        (→‎Episodes)|unsafe|2001:48f8:3028:d7...|{{Use American En...|{{Use American En...|SpongeBob SquareP...|//en.wikipedia.or...|
|                    |  safe|         Maister1921|{{Infobox volleyb...|{{Infobox volleyb...|VERVA Warszawa OR...|//en.wikipedia.or...|
|          (A9sjwisn)|unsafe|      114.124.241.64|{{Pea

## Step 1a: Tokenization & Normalization

The regexTokenizer is used because of its extra functionality compared to the standard Tokenizer built into spark. Also useful is that the tokens are normalized (decapitalized). 

In [5]:
rt_old = RegexTokenizer(inputCol="text_old", outputCol="words_old", toLowercase=True, pattern=("\\W"))

countTokens = udf(lambda words: len(words), IntegerType())

regexTokenized_old = rt_old.transform(df)
df_step1a = regexTokenized_old.withColumn("tokens_old", countTokens(col("words_old")))

#########################################################################################

rt_new = RegexTokenizer(inputCol="text_new", outputCol="words_new", toLowercase=True, pattern=("\\W"))
regexTokenized_new = rt_new.transform(df_step1a)
df_step1b = regexTokenized_new.withColumn("tokens_new", countTokens(col("words_new")))

# df_step1b.show(truncate=False)


In [6]:
# df_step1b.show()

## Step 1b: Delta Generator

In this crucial step the difference between input and output text is determined. The difference is found using the unified_diff function accesible in through the difflib python library. The function takes two lists of strings as inputs and computes the deleted and inserted (replaced) words. This difference is used to later classify the text edit.

In [7]:
def text_difference(text_old,text_new):
#     text_old = df_step1b.select("words_old").collect()[0][0]
#     text_new = df_step1b.select("words_new").collect()[0][0]

    # print(text_old)

    new_words = []
    deleted_words = []

    for line in difflib.unified_diff(text_old, text_new, fromfile='before.txt', tofile='after.txt'):
    #     sys.stdout.write(line)

        if "-" in line and " " not in line:
            new_line = line.replace("-", "")
            deleted_words.append(new_line)
        elif "+" in line and " " not in line:
            new_line = line.replace("+", "")
            new_words.append(new_line)

    #     print(line)


    # print("Deleted words: ", deleted_words)
    # print("Inserted words: ", new_words)

    edited_words = deleted_words + new_words
    print(edited_words)
    
    return edited_words

# text_old = df_step1b.select("words_old").collect()[0][0]
# text_new = df_step1b.select("words_new").collect()[0][0]
# edited_words = text_difference(text_old,text_new)


In [8]:
# def arrayUdf():
#     return edited_words

# # countTokens = udf(lambda words: len(words), IntegerType())
# callArrayUdf = udf(larrayUdf, ArrayType(StringType()))

# #calling udf function
# df_step1c = df_step1b.withColumn("diff_text", callArrayUdf())


# df_step1c.select("diff_text").show(truncate=False)

### User Defined Function

This code calculated the difference between the input and output text. This is accomplished by defining a UDF and a seperate function arrayUdf(). The udf is called on two columns *'words_old'* and *'words_new'*. Next a lambda function is defined to iterate over each row of the two input columns. Within the udf is refered to another function arrayUdf() which requires two inputs: the two tokenized lists of words which will be used to compute the difference. The arrayUdf() function acts as an itermediary to call on a different function: text_difference(). The text_difference() function uses the unified_diff generator from the difflib package to return the deltas between two lists of strings.

Through experimentation with the unified_diff generator, we found that it was much easier to first tokenize the input and output text and then compute the difference between the two tokenized lists of words. This in contrast to passing the two texts (*'text_old'* and *'text_new'*) of the rdd's as input directly and then tokenizing this *'difference_text'*. Although the latter method might create less computational overhead due to less tokenization, the former method proves to be much more reliable to determine which words have been deleted and which words are new.

In [9]:
def arrayUdf(text_old,text_new):
    edited_words = text_difference(text_old,text_new)
    return edited_words

# countTokens = udf(lambda words: len(words), IntegerType())
callArrayUdf = udf(lambda row: arrayUdf(row[0],row[1]), ArrayType(StringType()))

spark.udf.register("callArrayUdf",callArrayUdf)
#calling udf function


df_step1c = df_step1b.withColumn("diff_text", callArrayUdf(struct('words_old','words_new')))


# df_step1c.select("diff_text").show(truncate=False)

In [10]:
# Cross validate the change by checking the difference in number of tokens created

# tokens_old = df_step1b.select("tokens_old").collect()[0][0]
# tokens_new = df_step1b.select("tokens_new").collect()[0][0]

# diff_tokens = tokens_new - tokens_old
# print("The difference in number of tokens for input and output text = ", diff_tokens)

## Step 2: Stop Word Removal

In [11]:
locale = sc._jvm.java.util.Locale
locale.setDefault(locale.forLanguageTag("en-US"))

stopwords = StopWordsRemover.loadDefaultStopWords("english")
extra_stopwords = ["http","https","ref","www","com","org","url","web"]
stopwords = stopwords + extra_stopwords
print(stopwords)

remover = StopWordsRemover(inputCol="diff_text", outputCol="words_clean",stopWords=stopwords)
stopwords = remover.getStopWords()
df_step2 = remover.transform(df_step1c)
# df_step2.select("words_clean").show(truncate=False)




# (inputCol="words", outputCol="filtered",stopWords=StopWordsRemover.loadDefaultStopWords("english"))
# remover.transform(df_tokenized).show(truncate=False)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'no

## Step 3: Stemming

The chosen algorithm for stemming is the snowball stemming algorithm (a variant of the Porter algorithm). The snowball stemmer was chosen because it is slightly more aggresive at stemming the tokenized words than the standard Porter algorithm while still being less aggresive than the Lancaster algorithm. It is a nice 'middle ground' between the two stemming variants.

In [12]:
# stemmer = SnowballStemmer('english')
stemmer = PorterStemmer()

stemmer_udf = udf(lambda tokens: [stemmer.stem(token) for token in tokens], ArrayType(StringType()))

df_step3 = df_step2.withColumn("words_stemmed", stemmer_udf("words_clean"))

# df_step3.select("words_stemmed").show(truncate=False)

## Step 4: Feature Vectorization (TF-IDF)

In [13]:
tf = HashingTF(inputCol="words_stemmed", outputCol="tf")#, numFeatures=20)

df_step4a = tf.transform(df_step3)



idf = IDF(inputCol="tf", outputCol="tf_idf")
idfModel = idf.fit(df_step4a)
df_step4b = idfModel.transform(df_step4a)

# df_step4a.show(truncate=False)
# df_step4b.select("words_stemmed","tf_idf").show(truncate=False)

## Step 5: String Indexer

In this final step the labels (*Safe, Unsafe and Vandal*) are encoded to label indices. The most frequent label gets index 0 while the least frequent label gets the last index depending on the number of indices. In this case the least frequent label gets index 2.

In [14]:
label_indexer = StringIndexer(inputCol = "label", outputCol = "label_index")

df_step5a = label_indexer.fit(df_step4b).transform(df_step4b)

df_step5b = df_step5a.select("tf_idf","label_index")

df_final = df_step5b.withColumnRenamed("tf_idf","features")
df_final = df_final.withColumnRenamed("label_index","label")


df_final.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|      (262144,[],[])|  0.0|
|(262144,[131,1040...|  0.0|
|(262144,[21823,82...|  1.0|
|(262144,[12060,21...|  0.0|
|(262144,[29143,35...|  1.0|
|(262144,[8215,127...|  0.0|
|(262144,[21180,25...|  0.0|
|(262144,[13471,13...|  0.0|
|(262144,[65897,13...|  0.0|
|      (262144,[],[])|  0.0|
|(262144,[109320],...|  0.0|
|(262144,[53159,12...|  0.0|
|(262144,[215,2723...|  0.0|
|(262144,[14898,36...|  0.0|
|(262144,[78349,14...|  0.0|
|(262144,[14898,36...|  0.0|
|(262144,[78349,14...|  0.0|
|(262144,[14898,21...|  0.0|
|(262144,[97,3096,...|  1.0|
|(262144,[220567,2...|  0.0|
+--------------------+-----+
only showing top 20 rows



# Part II: Multinomial Naive Bayes Classifier

In [15]:
# rdd_final = df_final.rdd
(training_data, test_data) = df_final.randomSplit([0.7, 0.3], seed = 42)


In [16]:
# from pyspark.ml.classification import LogisticRegression

# lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
# lrModel = lr.fit(training_data)
# predictions = lrModel.transform(test_data)

In [17]:
from pyspark.ml.classification import NaiveBayes

nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
model = nb.fit(training_data)
predictions = model.transform(test_data)



In [18]:
predictions.filter(predictions['prediction'] == 1).select('label','probability','prediction').show(truncate=False)

+-----+------------------------------------------------------------------+----------+
|label|probability                                                       |prediction|
+-----+------------------------------------------------------------------+----------+
|1.0  |[6.965922696119666E-9,0.9999999915451795,1.4888979822627257E-9]   |1.0       |
|0.0  |[9.015248300964145E-11,0.9999999999098474,5.386297692386849E-18]  |1.0       |
|0.0  |[1.2127528644099106E-8,0.9999999878724714,0.0]                    |1.0       |
|0.0  |[3.182778185138302E-4,0.9995359695618484,1.457526196376846E-4]    |1.0       |
|0.0  |[7.252809679245608E-118,1.0,0.0]                                  |1.0       |
|0.0  |[1.0285580293569733E-11,0.9999999999897144,1.6691147036706362E-39]|1.0       |
|0.0  |[2.243052649551445E-16,0.9999999999999998,1.1055806318743884E-30] |1.0       |
|0.0  |[2.12936918635517E-4,0.9997761462855461,1.0916795818429364E-5]    |1.0       |
|0.0  |[2.5998388706170613E-7,0.999999740016113,7.2586

In [19]:
predictions.filter(predictions['prediction'] == 2).select('label','probability','prediction').show(truncate=False)

+-----+-------------------------------------------------------------+----------+
|label|probability                                                  |prediction|
+-----+-------------------------------------------------------------+----------+
|0.0  |[0.10640016820806811,0.004932696899015449,0.8886671348929164]|2.0       |
+-----+-------------------------------------------------------------+----------+



In [22]:
predictions.filter(predictions['label'] == 1).select('label','probability','prediction').show(truncate=False)

+-----+------------------------------------------------------------------+----------+
|label|probability                                                       |prediction|
+-----+------------------------------------------------------------------+----------+
|1.0  |[6.965922696119666E-9,0.9999999915451795,1.4888979822627257E-9]   |1.0       |
|1.0  |[0.9720357165274826,0.027964222363647574,6.110886968682242E-8]    |0.0       |
|1.0  |[1.0,3.697009398233063E-20,1.7862898835E-313]                     |0.0       |
|1.0  |[0.9999956998551663,2.9465301587481647E-6,1.3536146749849673E-6]  |0.0       |
|1.0  |[2.0168634205501722E-38,1.0,7.782761536574368E-61]                |1.0       |
|1.0  |[3.8149309925146783E-4,0.9996185069007486,6.23E-322]              |1.0       |
|1.0  |[6.961177125345584E-14,0.9999999999999303,3.5146952127662255E-19] |1.0       |
|1.0  |[1.0488800199938614E-12,0.9999999999988511,9.986795023689449E-14] |1.0       |
|1.0  |[0.9708454040498471,0.029154595950152823,1.8112

In [23]:
predictions.filter(predictions['label'] == 2).select('label','probability','prediction').show(truncate=False)

+-----+-----------+----------+
|label|probability|prediction|
+-----+-----------+----------+
+-----+-----------+----------+



In [20]:
# globals()['models_loaded'] = False

# # the predict function will be registered as a udf!
# # we use a df with a diff column
# def predict(df):
#     if any([x in df.diff.lower() for x in ['bad', 'lol', 'joke']]):
#         return 'vandal'
#     else:
#         return 'safe'

# predict_udf = udf(predict, StringType()) # user-defined-function (pyspark)

# def process(time, rdd):
#     if rdd.isEmpty():
#         return
    
#     print("========= %s =========" % str(time))
    
#     # Convert to data frame
#     print("Show rdd")
#     rdd.show()
#     print()
#     df = spark.read.json(rdd)
#     print("Show df")
#     df.show()
    
#     # Tip: making a diff will probably help a lot as a feature in any model:
#     diff = make_diff(df.first().text_old, df.first().text_new)
#     df_withdiff = df.withColumn("diff", lit(diff))
#     print("Show df_withdiff")
#     print(lit(diff))
#     df_withdiff.select('diff').show()

    
#     # Utilize our predict function. Implementation of the udf!!!
#     df_withpreds = df_withdiff.withColumn("pred", predict_udf(
#         struct([df_withdiff[x] for x in df_withdiff.columns])
#     ))
#     print("Show df_withpreds")
#     df_withpreds.show()
    
#     # Normally, you wouldn't use a UDF (User Defined Function) Python function to predict (you can)
#     # But an MLlib model you've built and saved with Spark
#     # In this case, you need to prevent loading your model in every call to "process" as follows:
    
#     # Load in the model if not yet loaded:
#     if not globals()['models_loaded']:
#         # load in your models here
#         globals()['my_model'] = '***' # Replace '***' with:    [...].load('my_logistic_regression')
#         globals()['models_loaded'] = True
        
#     # And then predict using the loaded model: 
#     # df_result = globals()['my_model'].transform(df)
#     # df_result.show()

In [21]:
label: string
comment: string
text_old: string
text_new: string
user: string

1. token:



words_old: [string,string,string]
words_new: [strings]
    
list1 = [words_old]
list2 = [words_new]
list3 = [diff_words]

list3 = ['yes','no','hello','booooobs']

def longest_char(list3):
    
    ...
    
    return longest_char = l, aantal = 2
    
    

SyntaxError: invalid syntax (<ipython-input-21-788a2010daf9>, line 7)