conda create -n pyspark python=3.6.8 pip wheel pandas matplotlib ipykernel

In [1]:
import os
import sys

os.environ["PYSPARK_SUBMIT_ARGS"]='pyspark-shell --master yarn ' # local[0]'
os.environ["PYSPARK_PYTHON"]='/usr/bin/python3'
#PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON
os.environ["PYSPARK_DRIVER_PYTHON"]='/usr/bin/python3'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.3.2.3.1.0.0-78
      /_/

Using Python version 3.6.8 (default, Dec 30 2018 01:22:34)
SparkSession available as 'spark'.


In [2]:
from pyspark.sql.types import *

In [3]:
train_schema = StructType(
   fields = [
      StructField("uid", StringType(), True),
      StructField("gender_age", StringType(), True),
      StructField("visits", ArrayType(
          StructType([
               StructField("timestamp", LongType(), True),
               StructField("url", StringType(), True)
               ])
      ),True)
   ])

In [4]:
test_schema = StructType(
   fields = [
      StructField("uid", StringType(), True),
      StructField("visits", ArrayType(
          StructType([
               StructField("timestamp", LongType(), True),
               StructField("url", StringType(), True)
               ])
      ),True)
])

https://stackoverflow.com/questions/41399399/serialize-a-custom-transformer-using-python-to-be-used-within-a-pyspark-ml-pipel

https://www.slideshare.net/SparkSummit/building-custom-ml-pipelinestages-for-feature-selection-with-marc-kaminski

In [70]:
from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param, Params
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable

class Url2DomainTransformer(Transformer, DefaultParamsReadable, DefaultParamsWritable):

    def __init__(self, inputCol=None, outputCol=None):
        super(Url2DomainTransformer, self).__init__()
        self.inputCol = inputCol
        self.outputCol = outputCol
    
     
    def _transform(self, dataset):
        import re
        from urllib.parse import urlparse
        from urllib.request import urlretrieve, unquote
        from pyspark.sql import functions as F 
        
        def url2domain(url):
            url = re.sub('(http(s)*://)+', 'http://', url)
            parsed_url = urlparse(unquote(url.strip()))
            if parsed_url.scheme not in ['http','https']: return None
            netloc = re.search("(?:www\.)?(.*)", parsed_url.netloc).group(1)
            if netloc is not None: return str(netloc.encode('utf8')).strip()
            return None    
        
        url2domain_udf = F.udf(lambda xx: [ url2domain(x) for x in xx],
                   ArrayType(StringType()))
        dataset = dataset.withColumn(self.outputCol,url2domain_udf(self.inputCol))
        return dataset

    

In [80]:
class SelectFields(Transformer, DefaultParamsReadable, DefaultParamsWritable):

    @keyword_only
    def __init__(self, selectFields=None):
        super(SelectFields, self).__init__()
        self.selectFields = selectFields
        
    def _transform(self, dataset):
        return dataset.select(self.selectFields)



In [62]:
label_strings = ['M:25-34',
 'F:25-34',
 'M:35-44',
 'F:35-44',
 'F:18-24',
 'F:45-54',
 'M:45-54',
 'M:18-24',
 'F:>=55',
 'M:>=55']

In [6]:
training = "lab04/lab04_train_merged_labels.json"
df_train = spark.read.json(training, train_schema)

In [7]:
df_train.select("visits.url").show()

+--------------------+
|                 url|
+--------------------+
|[http://zebra-zoy...|
|[http://sweetradi...|
|[http://ru.orifla...|
|[http://translate...|
|[https://mail.ram...|
|[https://cfire.ma...|
|[http://www.msn.c...|
|[http://www.gazpr...|
|[http://lifenews....|
|[https://www.goog...|
|[http://muz4in.ne...|
|[http://kosmetist...|
|[http://android.m...|
|[http://tsn.ua/po...|
|[http://www.jobin...|
|[http://www.abc-p...|
|[http://easygames...|
|[http://www.ratan...|
|[http://sam-zdrav...|
|[http://www.msn.c...|
+--------------------+
only showing top 20 rows



In [31]:
from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasOutputCols, Param, Params
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.sql.functions import lit # for the dummy _transform

class SetValueTransformer(
    Transformer, HasOutputCols, DefaultParamsReadable, DefaultParamsWritable,
):
    value = Param(
        Params._dummy(),
        "value",
        "value to fill",
    )

    @keyword_only
    def __init__(self, outputCols=None, value=0.0):
        super(SetValueTransformer, self).__init__()
        self._setDefault(value=0.0)
        kwargs = self._input_kwargs
        self._set(**kwargs)

    @keyword_only
    def setParams(self, outputCols=None, value=0.0):
        """
        setParams(self, outputCols=None, value=0.0)
        Sets params for this SetValueTransformer.
        """
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setValue(self, value):
        """
        Sets the value of :py:attr:`value`.
        """
        return self._set(value=value)

    def getValue(self):
        """
        Gets the value of :py:attr:`value` or its default value.
        """
        return self.getOrDefault(self.value)

    def _transform(self, dataset):
        
        dataset = dataset.withColumn(self.getOutputCols()[0], lit(self.getValue()))
        return dataset

In [72]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import StringIndexer, IndexToString
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier

cv = CountVectorizer(inputCol="urls", outputCol="features")

indexer = StringIndexer(inputCol="gender_age", outputCol="label")

url2domain_transformer = Url2DomainTransformer(outputCol="urls",inputCol="visits.url")
select_transformer = SelectFields(selectFields=["uid","gender_age","urls"])

transformer = SetValueTransformer(outputCols=["a"])

lr = LogisticRegression(labelCol='label', probabilityCol='lr_probability', predictionCol='lr_prediction', rawPredictionCol='lr_rawPrediction')

#rf = RandomForestClassifier(labelCol='label', probabilityCol='rf_probability', predictionCol='rf_prediction', rawPredictionCol='rf_rawPrediction')

lr_label_converter = IndexToString(inputCol="lr_prediction", outputCol="lr_gender_age", labels=label_strings)

#rf_label_converter = IndexToString(inputCol="rf_prediction", outputCol="rf_gender_age", labels=label_strings)

pipeline = Pipeline(stages =  [ url2domain_transformer, select_transformer, cv, indexer, lr, lr_label_converter])



In [73]:
url2domain_transformer.transform(df_train).show(2)

+--------------------+----------+--------------------+--------------------+
|                 uid|gender_age|              visits|                urls|
+--------------------+----------+--------------------+--------------------+
|d50192e5-c44e-4ae...|   F:18-24|[[1419688144068, ...|[b'zebra-zoya.ru'...|
|d502331d-621e-472...|   M:25-34|[[1419717886224, ...|[b'sweetrading.ru...|
+--------------------+----------+--------------------+--------------------+
only showing top 2 rows



In [33]:
transformer.transform(df_train).show(3)

+--------------------+----------+--------------------+---+
|                 uid|gender_age|              visits|  a|
+--------------------+----------+--------------------+---+
|d50192e5-c44e-4ae...|   F:18-24|[[1419688144068, ...|0.0|
|d502331d-621e-472...|   M:25-34|[[1419717886224, ...|0.0|
|d50237ea-747e-48a...|   F:25-34|[[1418840296062, ...|0.0|
+--------------------+----------+--------------------+---+
only showing top 3 rows



In [11]:
select_transformer.selectFields

['uid', 'gender_age', 'urls']

In [12]:
df_t0 = url2domain_transformer.transform(df_train)

In [13]:
select_transformer.transform(df_t0).show(1)

+--------------------+----------+--------------------+
|                 uid|gender_age|                urls|
+--------------------+----------+--------------------+
|d50192e5-c44e-4ae...|   F:18-24|[b'zebra-zoya.ru'...|
+--------------------+----------+--------------------+
only showing top 1 row



In [74]:
model = pipeline.fit(df_train)

In [76]:
model.save("tst_custom_transformer_model")

In [78]:
from pyspark.ml import PipelineModel

In [79]:
model_reloaded =  PipelineModel.load("tst_custom_transformer_model")

TypeError: __init__() missing 1 required positional argument: 'selectFields'

In [16]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

lr_evaluator = BinaryClassificationEvaluator(
        rawPredictionCol='lr_rawPrediction', labelCol='label', metricName='areaUnderROC'
    )

In [17]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

#regParam for L2 regularization, https://craftappmobile.com/l1-vs-l2-regularization/
#elasticNetParam https://en.wikipedia.org/wiki/Elastic_net_regularization, https://www.quora.com/What-is-elastic-net-regularization-in-machine-learning

grid = ParamGridBuilder() \
    .addGrid(LogisticRegression.regParam, [0.1, 0.01]) \
    .addGrid(LogisticRegression.elasticNetParam, [0.1, 0.01]) \
    .build()



In [32]:
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=grid,
    evaluator=lr_evaluator,
    numFolds=3,
    parallelism=2
)

In [33]:
cvModel = crossval.fit(df_train)

In [34]:
cvModel.write.('lab04s_model')

AttributeError: 'Url2DomainTransformer' object has no attribute '_to_java'

Проверю модель на урлах, открытых в моём браузере

In [4]:
data = {
  "uid": "bd7a30e1-a25d-4cbf-a03f-61748cbe540e",
  "visits": [
    {
      "url": "https://mail.google.com/mail/u/0/#inbox",
      "timestamp": 1419775945781
    }
   ,  
   {
      "url": "https://lk-de.newprolab.com/",
      "timestamp": 1419775945781
    }
   ,  
   {
      "url": "https://yandex.ru/pogoda/moscow/maps/temperature?via=mmapwb&le_TemperatureBalloons=0&le_WindParticles=1&ll=25.976425_49.047348&z=4",
      "timestamp": 1419775945781
    }
  ,  
   {
      "url": "https://translate.yandex.ru/?lang=en-ru&text=derivation",
      "timestamp": 1419775945781
    }
 ,  
   {
      "url": "https://web.whatsapp.com/",
      "timestamp": 1419775945781
    }
,  
   {
      "url": "https://app.slack.com/client/TNG296ABE/CPPRL95HU/thread/CP73F91ST-1571040655.075700",
      "timestamp": 1419775945781
    }
,  
   {
      "url": "https://github.com/newprolab/content_dataengineer5/blob/master/labs/de_lab_04.md",
      "timestamp": 1419775945781
    }
  ]
}

In [5]:
rdd = sc.parallelize([data])

In [6]:
df_test = spark.read.json(rdd)

In [7]:
df_test.show()

+--------------------+--------------------+
|                 uid|              visits|
+--------------------+--------------------+
|bd7a30e1-a25d-4cb...|[[1419775945781, ...|
+--------------------+--------------------+



In [15]:
df_test = df_test.withColumn("urls",url2domain_udf(df_test["visits"].getField("url"))) 

In [16]:
df_test = df_test.select(["uid", "urls"])

In [17]:
model_reloaded.transform(df_test).show(1,False)

+------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+----------+
|uid                                 |urls                                                                                                                                    |features                              |rawPrediction                                                                                                                                 