In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
from threading import Thread

class StreamingThread(Thread):
    def __init__(self, ssc):
        Thread.__init__(self)
        self.ssc = ssc
    def run(self):
        ssc.start()
        ssc.awaitTermination()
    def stop(self):
        print('----- Stopping... this may take a few seconds -----')
        self.ssc.stop(stopSparkContext=False, stopGraceFully=True)

In [3]:
sc

In [4]:
spark

In [5]:
from pyspark.streaming import StreamingContext
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, array, col, lit
from pyspark.sql.types import StringType

In [6]:
from difflib import unified_diff

def make_diff(old, new):
    return '\n'.join([ l for l in unified_diff(old.split('\n'), new.split('\n')) if l.startswith('+') or l.startswith('-') ])

In [7]:
globals()['models_loaded'] = False

# the predict function will be registered as a udf!
# we use a df with a diff column
def predict(df):
    if any([x in df.diff.lower() for x in ['bad', 'lol', 'joke']]):
        return 'vandal'
    else:
        return 'safe'

predict_udf = udf(predict, StringType()) # user-defined-function (pyspark)

def process(time, rdd):
    if rdd.isEmpty():
        return
    
    print("========= %s =========" % str(time))
    
    # Convert to data frame
    print("Show rdd")
    rdd.show()
    print()
    df = spark.read.json(rdd)
    print("Show df")
    df.show()
    
    # Tip: making a diff will probably help a lot as a feature in any model:
    diff = make_diff(df.first().text_old, df.first().text_new)
    df_withdiff = df.withColumn("diff", lit(diff))
    print("Show df_withdiff")
    print(lit(diff))
    df_withdiff.select('diff').show()

    
    # Utilize our predict function. Implementation of the udf!!!
    df_withpreds = df_withdiff.withColumn("pred", predict_udf(
        struct([df_withdiff[x] for x in df_withdiff.columns])
    ))
    print("Show df_withpreds")
    df_withpreds.show()
    
    # Normally, you wouldn't use a UDF (User Defined Function) Python function to predict (you can)
    # But an MLlib model you've built and saved with Spark
    # In this case, you need to prevent loading your model in every call to "process" as follows:
    
    # Load in the model if not yet loaded:
    if not globals()['models_loaded']:
        # load in your models here
        globals()['my_model'] = '***' # Replace '***' with:    [...].load('my_logistic_regression')
        globals()['models_loaded'] = True
        
    # And then predict using the loaded model: 
    # df_result = globals()['my_model'].transform(df)
    # df_result.show()

In [8]:
ssc = StreamingContext(sc, 10)

In [9]:
lines = ssc.socketTextStream("seppe.net", 7778)
lines.foreachRDD(process)

In [10]:
ssc_t = StreamingThread(ssc)
ssc_t.start()

Show rdd
BlockRDD[1] at socketTextStream at NativeMethodAccessorImpl.java:0

Show df
+-------------+------+----------+--------------------+--------------------+--------------------+--------------------+
|      comment| label| name_user|            text_new|            text_old|          title_page|            url_page|
+-------------+------+----------+--------------------+--------------------+--------------------+--------------------+
|(→‎Reception)|unsafe|2e524a3b5d|{{Infobox film
| ...|{{Infobox film
| ...|Home from the Sea...|//en.wikipedia.or...|
+-------------+------+----------+--------------------+--------------------+--------------------+--------------------+

Show df_withdiff
+--------------------+
|                diff|
+--------------------+
|--- 

+++ 

-The ...|
+--------------------+

Show df_withpreds
+-------------+------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+----+
|      comment| label| name_u

In [11]:
ssc_t.stop()

----- Stopping... this may take a few seconds -----
Show rdd
BlockRDD[63] at socketTextStream at NativeMethodAccessorImpl.java:0

Show df
+--------------------+-----+-------------+--------------------+--------------------+--------------------+--------------------+
|             comment|label|    name_user|            text_new|            text_old|          title_page|            url_page|
+--------------------+-----+-------------+--------------------+--------------------+--------------------+--------------------+
|Disambiguated:Geo...| safe|         Rodw|{{Infobox Politic...|{{Infobox Politic...|Robert Johnson (g...|//en.wikipedia.or...|
|→‎Early life:ref ...| safe|   Headhitter|{{pp-semi-sock|sm...|{{pp-semi-sock|sm...|     Yisrael Kristal|//en.wikipedia.or...|
|→‎External links:...| safe|Crouch, Swale|{{Use dmy dates|d...|{{Use dmy dates|d...| Chickering, Suffolk|//en.wikipedia.or...|
+--------------------+-----+-------------+--------------------+--------------------+----------------