# Preprocessing the Document Edits
## Step 0: Import, Initialization and Loading

IDEA: load all the part files into a single dataframe

In [1]:
import pandas as pd
import numpy as np
import difflib
import sys

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

from threading import Thread

class StreamingThread(Thread):
    def __init__(self, ssc):
        Thread.__init__(self)
        self.ssc = ssc
    def run(self):
        ssc.start()
        ssc.awaitTermination()
    def stop(self):
        print('----- Stopping... this may take a few seconds -----')
        self.ssc.stop(stopSparkContext=False, stopGraceFully=True)

from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover, HashingTF, IDF

from pyspark.sql.types import IntegerType, StringType, ArrayType
from pyspark.streaming import StreamingContext
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, array, col, lit

from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.types import StringType

from nltk.stem.snowball import SnowballStemmer

In [2]:
sc

In [3]:
spark

### Loading the data into a single dataframe

The idea is to load all the saved partfiles into a single dataframe. Next this dataframe can be used to train the model

In [4]:
directory = r'/Users/Simon/Documents/GitHub/adana_task3/Spark_Cleaned/myoutput-1586797640000/part-00000'
rdd = sc.textFile(directory)
df = spark.read.json(rdd)

# df.select('text_new').collect()

## Step 1a: Tokenization & Normalization

The regexTokenizer is used because of its extra functionality compared to the standard Tokenizer built into spark. Also useful is that the tokens are normalized (decapitalized). 

In [5]:
rt_old = RegexTokenizer(inputCol="text_old", outputCol="words_old", toLowercase=True, pattern=("\\W"))

countTokens = udf(lambda words: len(words), IntegerType())

regexTokenized_old = rt_old.transform(df)
df_step1a = regexTokenized_old.withColumn("tokens_old", countTokens(col("words_old")))

#########################################################################################

rt_new = RegexTokenizer(inputCol="text_new", outputCol="words_new", toLowercase=True, pattern=("\\W"))
regexTokenized_new = rt_new.transform(df_step1a)
df_step1b = regexTokenized_new.withColumn("tokens_new", countTokens(col("words_new")))

df_step1b.show(truncate=False)


+----------------------------------+-----+----------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## Step 1b: Delta Generator

In this crucial step the difference between input and output text is determined. The difference is found using the unified_diff function accesible in through the difflib python library. The function takes two lists of strings as inputs and computes the deleted and inserted (replaced) words. This difference is used to later classify the text edit.

In [7]:
text_old = df_step1b.select("words_old").collect()[0][0]
text_new = df_step1b.select("words_new").collect()[0][0]

# print(text_old)

new_words = []
deleted_words = []

for line in difflib.unified_diff(text_old, text_new, fromfile='before.txt', tofile='after.txt'):
#     sys.stdout.write(line)

    if "-" in line and " " not in line:
        new_line = line.replace("-", "")
        deleted_words.append(new_line)
    elif "+" in line and " " not in line:
        new_line = line.replace("+", "")
        new_words.append(new_line)
    
#     print(line)

# print("Deleted words: ", deleted_words)
# print("Inserted words: ", new_words)

edited_words = deleted_words + new_words

print(edited_words)


# def ListToLit(word):
    
# udfListToLit = udf(lambda word: len(words), IntegerType())
# udfListToLit = udf(ListToLit, ArrayType())

# df_step1c = df_step1b.withColumn("words_edit", col(["test"]))
# df_step1c = df_step1b.withColumn("words_edit", lit(edited_words))

# df_step1c.select("words_edit").show()



['authority', 'control']


In [8]:
# Cross validate the change by checking the difference in number of tokens created

tokens_old = df_step1b.select("tokens_old").collect()[0][0]
tokens_new = df_step1b.select("tokens_new").collect()[0][0]

diff_tokens = tokens_new - tokens_old
print("The difference in number of tokens for input and output text = ", diff_tokens)

The difference in number of tokens for input and output text =  2


## Step 2: Stop Word Removal

In [12]:
locale = sc._jvm.java.util.Locale
locale.setDefault(locale.forLanguageTag("en-US"))

stopwords = StopWordsRemover.loadDefaultStopWords("english")
extra_stopwords = ["http","https","ref","www","com","org","url","web"]
stopwords = stopwords + extra_stopwords
print(stopwords)

remover = StopWordsRemover(inputCol="words", outputCol="words_clean",stopWords=stopwords)
stopwords = remover.getStopWords()
df_step2 = remover.transform(df_step1c)
df_step2.show(truncate=False)




# (inputCol="words", outputCol="filtered",stopWords=StopWordsRemover.loadDefaultStopWords("english"))
# remover.transform(df_tokenized).show(truncate=False)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'no

NameError: name 'df_step1c' is not defined

## Step 3: Stemming

The chosen algorithm for stemming is the snowball stemming algorithm (a variant of the Porter algorithm). The snowball stemmer was chosen because it is slightly more aggresive at stemming the tokenized words than the standard Porter algorithm while still being less aggresive than the Lancaster algorithm. It is a nice 'middle ground' between the two stemming variants.

In [10]:
stemmer = SnowballStemmer('english')

stemmer_udf = udf(lambda tokens: [stemmer.stem(token) for token in tokens], ArrayType(StringType()))

df_step3 = df_step2.withColumn("words_stemmed", stemmer_udf("words_clean"))

df_step3.select("words_stemmed").show(truncate=False)

NameError: name 'df_step2' is not defined

## Step 4: Feature Vectorization

In [11]:
tf = HashingTF(inputCol="words_stemmed", outputCol="tf") #, numFeatures=20)

df_step4a = tf.transform(df_step3)



idf = IDF(inputCol="tf", outputCol="tf_idf")
idfModel = idf.fit(df_step4a)
df_step4b = idfModel.transform(df_step4a)

# df_step4a.show(truncate=False)
df_step4b.show(truncate=False)

NameError: name 'df_step3' is not defined