# -1) imports

In [33]:
import findspark
findspark.init()
# Spark:
from pyspark     import SparkConf
from pyspark     import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml  import PipelineModel
from pyspark.sql.types         import FloatType, IntegerType
from pyspark.sql.types         import StructType, StructField
from pyspark.ml.feature        import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassificationModel
# Python :
from datetime import datetime
import math
from pyspark.streaming import StreamingContext


# 0) configuration spark context

In [39]:
listen_to_ip   = "54.37.12.49"
listen_to_port = 12010

current_date   = datetime.now().strftime("%Y-%m-%d %H:%M")
user           = "romain - exemple saprk streaming"
appName        = "%s le %s"%(user, current_date)
ip_fares       = "54.37.12.49"
master         = "spark://54.37.12.49:7077"
executor_cores = 2 
nb_cores_max   = 2
parallelism    = 4 * executor_cores
memory_by_node = "200M"
nb_executor    = nb_cores_max//executor_cores
dico_conf      = { "spark.app.name"             : appName        ,
                   "spark.master"               : master         ,
                   "spark.cores.max"            : nb_cores_max   , 
                   "spark.executor.cores"       : executor_cores ,
                   "spark.default.parallelism"  : parallelism    , 
                  #"spark.python.worker.memory" : memory_by_node ,
                   "spark.python.executor.memory" : memory_by_node }
conf = SparkConf()
for k,v in dico_conf.items():
    conf = conf.set(k,v)
sc    = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()

# A) fonction important le modèle de prédiction pour chaque timestamp

In [42]:
def prevoir(time, rdd):
    sep       = ";"
    chemin = "hdfs://54.37.12.49:8020/cart_model_for_cycliste_v5"
    model  = DecisionTreeClassificationModel.load(chemin)
    # travail sur la string
    rdd2      = rdd.map( lambda data   : data.split(sep))
    rdd3      = rdd2.map(lambda tableau: [float(x) for x in tableau])
    # création df
    cols      = ['sportif', 'age', 'nb_km', 'vitesse', 'attente']
    fields    = [StructField(x, FloatType(), True) for x in cols]
    schema    = StructType(fields) 
    df        = spark.createDataFrame(rdd3, schema=schema ) 
    # preparation d'une colonne de features
    schema    = StructType(fields)
    assembler = VectorAssembler()
    assembler = assembler.setInputCols(cols)
    assembler = assembler.setOutputCol("features")
    df2       = assembler.transform(df)
    model.transform(df2).show()
    

# 1) Création d'un streaming context

In [None]:
ssc     = StreamingContext(sc, 3)
dstream = ssc.socketTextStream(listen_to_ip, listen_to_port)

# 2) application de la fonction de prédiction pour chaque batch de données

In [44]:
dstream.foreachRDD(prevoir)

In [45]:
dstream.pprint()

# 3) activer l'écoute sur la socket

In [46]:
ssc.start()

+-------+---+-----+-------+-------+--------+-------------+-----------+----------+
|sportif|age|nb_km|vitesse|attente|features|rawPrediction|probability|prediction|
+-------+---+-----+-------+-------+--------+-------------+-----------+----------+
+-------+---+-----+-------+-------+--------+-------------+-----------+----------+

-------------------------------------------
Time: 2018-02-27 04:43:48
-------------------------------------------

+-------+----+---------+---------+-------+--------------------+-------------+--------------------+----------+
|sportif| age|    nb_km|  vitesse|attente|            features|rawPrediction|         probability|prediction|
+-------+----+---------+---------+-------+--------------------+-------------+--------------------+----------+
|    0.0|15.0|3.0076275|27.610607|    0.5|[0.0,15.0,3.00762...|  [200.0,5.0]|[0.97560975609756...|       0.0|
+-------+----+---------+---------+-------+--------------------+-------------+--------------------+----------+

-----

In [47]:
ssc.stop()

+-------+----+---------+---------+-------+--------------------+-------------+--------------------+----------+
|sportif| age|    nb_km|  vitesse|attente|            features|rawPrediction|         probability|prediction|
+-------+----+---------+---------+-------+--------------------+-------------+--------------------+----------+
|    2.0|52.0|14.499323|7.1012588|    0.5|[2.0,52.0,14.4993...|[137.0,546.0]|[0.20058565153733...|       1.0|
|   -0.5|22.0|7.4904847|28.455948|    0.5|[-0.5,22.0,7.4904...|  [200.0,5.0]|[0.97560975609756...|       0.0|
|    6.0|64.0|10.608381|25.632332|    0.5|[6.0,64.0,10.6083...| [186.0,22.0]|[0.89423076923076...|       0.0|
+-------+----+---------+---------+-------+--------------------+-------------+--------------------+----------+



In [6]:
def prevoir(time, rdd):
    
    print(time)
    print(rdd)
    print("prevoir %s + show" %rdd.first())


In [7]:
def prevoir3(time, rdd):
    sep       = ";"
    str_recue = '0.0;0.1;0.2;0.3;0.4;0.5'
    rdd       = sc.parallelize([str_recue])
    chemin = "hdfs://54.37.12.49:8020/cart_model_for_cycliste_v4"
    model  = DecisionTreeClassificationModel.load(chemin)
    # travail sur la string
    rdd2      = rdd.map( lambda data   : data.split(sep))
    rdd3      = rdd2.map(lambda tableau: [float(x) for x in tableau])
    # création df
    cols      = ["c1", "c2", "c3", "c4", "c5", "c6"]
    fields    = [StructField(x, FloatType(), True) for x in cols]
    schema    = StructType(fields) 
    df        = spark.createDataFrame(rdd3, schema=schema ) 
    # preparation d'une colonne de features
    schema    = StructType(fields)
    assembler = VectorAssembler()
    assembler = assembler.setInputCols(cols)
    assembler = assembler.setOutputCol("features")
    df2       = assembler.transform(df)
    model.transform(df2).show()

In [24]:
if False:
    ssc.stop(False)
    print(ssc)

In [22]:
ssc.stop()