conda create -n pyspark python=2.7 pip wheel pandas matplotlib ipykernel

In [1]:
import os
import sys

os.environ["PYSPARK_SUBMIT_ARGS"]='pyspark-shell --master yarn ' # local[0]'
#os.environ["PYSPARK_PYTHON"]='/usr/bin/python3'
#PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON
#os.environ["PYSPARK_DRIVER_PYTHON"]='/usr/bin/python3'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.3.2.3.1.0.0-78
      /_/

Using Python version 2.7.17 (default, Oct 21 2019 19:04:46)
SparkSession available as 'spark'.


In [5]:
from pyspark.sql.types import *

In [6]:
train_schema = StructType(
   fields = [
      StructField("uid", StringType(), True),
      StructField("gender_age", StringType(), True),
      StructField("visits", ArrayType(
          StructType([
               StructField("timestamp", LongType(), True),
               StructField("url", StringType(), True)
               ])
      ),True)
   ])

In [7]:
test_schema = StructType(
   fields = [
      StructField("uid", StringType(), True),
      StructField("visits", ArrayType(
          StructType([
               StructField("timestamp", LongType(), True),
               StructField("url", StringType(), True)
               ])
      ),True)
])

https://stackoverflow.com/questions/41399399/serialize-a-custom-transformer-using-python-to-be-used-within-a-pyspark-ml-pipel

https://www.slideshare.net/SparkSummit/building-custom-ml-pipelinestages-for-feature-selection-with-marc-kaminski

https://stackoverflow.com/questions/42140980/spark-ml-pipelines-unseen-label-exception-when-classifying-new-examples?rq=1

In [8]:
from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param, Params
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable

class Url2DomainTransformer(Transformer, DefaultParamsReadable, DefaultParamsWritable):

    def __init__(self, inputCol="visits.url", outputCol="urls"):
        super(Url2DomainTransformer, self).__init__()
        self.inputCol = inputCol
        self.outputCol = outputCol


    def _transform(self, dataset):
        import re
        from urlparse import urlparse
        from urllib import urlretrieve, unquote
        from pyspark.sql import functions as F

        def url2domain(url):
            url = re.sub('(http(s)*://)+', 'http://', url)
            parsed_url = urlparse(unquote(url.strip()))
            if parsed_url.scheme not in ['http','https']: return None
            netloc = re.search("(?:www\.)?(.*)", parsed_url.netloc).group(1)
            if netloc is not None: return str(netloc.encode('utf8')).strip()
            return None

        url2domain_udf = F.udf(lambda xx: [ url2domain(x) for x in xx],
                   ArrayType(StringType()))
        dataset = dataset.withColumn(self.outputCol,url2domain_udf(self.inputCol))
        return dataset
    

In [9]:
class Timestamp2Weekday(Transformer, DefaultParamsReadable, DefaultParamsWritable):

    def __init__(self, inputCol="visits.timestamp", outputCol="weekdays"):
        super(Timestamp2Weekday, self).__init__()
        self.inputCol = inputCol
        self.outputCol = outputCol
        
    def _transform(self, dataset):
        from pyspark.sql import functions as F
        from datetime import datetime

        def unixtime2weekday(ts):
            dt = datetime.utcfromtimestamp(ts/1000)
            return "wd" + str(dt.weekday())

        unixtime2weekday_udf = F.udf(lambda xx: [ unixtime2weekday(x) for x in xx],
                   ArrayType(StringType()))
        dataset = dataset.withColumn(self.outputCol,unixtime2weekday_udf(self.inputCol))
        
        return dataset
        

In [10]:
class Timestamp2Hour(Transformer, DefaultParamsReadable, DefaultParamsWritable):

    def __init__(self, inputCol="visits.timestamp", outputCol="hours"):
        super(Timestamp2Hour, self).__init__()
        self.inputCol = inputCol
        self.outputCol = outputCol
        
    def _transform(self, dataset):
        from pyspark.sql import functions as F
        from datetime import datetime

        def unixtime2hour(ts):
            dt = datetime.utcfromtimestamp(ts/1000)
            return "h" + str(dt.hour)

        unixtime2hour_udf = F.udf(lambda xx: [ unixtime2hour(x) for x in xx],
                   ArrayType(StringType()))
        dataset = dataset.withColumn(self.outputCol,unixtime2hour_udf(self.inputCol))
        return dataset

In [67]:
class ConcatFeatures(Transformer, DefaultParamsReadable, DefaultParamsWritable):

    def __init__(self, inputCol='"urls", "weekdays","hours"', outputCol="f"):
        super(ConcatFeatures, self).__init__()
        self.inputCol = inputCol
        self.outputCol = outputCol
        
    def _transform(self, dataset):
        from pyspark.sql import functions as F
        from itertools import chain
        def concat(type):
            def concat_(*args):
                return list(chain.from_iterable((arg if arg else [] for arg in args)))
            return F.udf(concat_, ArrayType(type))
        concat_string_arrays = concat(StringType())
        dataset = dataset.withColumn(self.outputCol,concat_string_arrays("urls", "weekdays","hours"))
        return dataset

In [62]:
class SelectFields(Transformer, DefaultParamsReadable, DefaultParamsWritable):

    #@keyword_only
    def __init__(self, selectFields=["uid","gender_age","f"]):
        super(SelectFields, self).__init__()
        self.selectFields = selectFields
        
    def _transform(self, dataset):
        return dataset.select(self.selectFields)



In [53]:
label_strings = ['M:25-34',
 'F:25-34',
 'M:35-44',
 'F:35-44',
 'F:18-24',
 'F:45-54',
 'M:45-54',
 'M:18-24',
 'F:>=55',
 'M:>=55']

In [54]:
training = "lab04/lab04_train_merged_labels.json"
df_train = spark.read.json(training, train_schema)

In [55]:
df_train.select("visits.url").show()

+--------------------+
|                 url|
+--------------------+
|[http://zebra-zoy...|
|[http://sweetradi...|
|[http://ru.orifla...|
|[http://translate...|
|[https://mail.ram...|
|[https://cfire.ma...|
|[http://www.msn.c...|
|[http://www.gazpr...|
|[http://lifenews....|
|[https://www.goog...|
|[http://muz4in.ne...|
|[http://kosmetist...|
|[http://android.m...|
|[http://tsn.ua/po...|
|[http://www.jobin...|
|[http://www.abc-p...|
|[http://easygames...|
|[http://www.ratan...|
|[http://sam-zdrav...|
|[http://www.msn.c...|
+--------------------+
only showing top 20 rows



In [15]:
#indexer.fit(df_train.limit(10)).labels
df_train_cut = df_train.limit(10)
#labels = indexer.fit(df_train_cut).labels


In [63]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import CountVectorizer
#from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer, IndexToString
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier

In [68]:
url2domain_transformer = Url2DomainTransformer(outputCol="urls",inputCol="visits.url")
timestamp2weekday_trasformer = Timestamp2Weekday(outputCol="weekdays",inputCol="visits.timestamp")
timestamp2hour_trasformer = Timestamp2Hour(outputCol="hours",inputCol="visits.timestamp")
concatfeatures_transformer = ConcatFeatures(inputCol= '"urls","weekdays","hours"')
select_transformer = SelectFields(selectFields=["uid","gender_age","f"])
indexer = StringIndexer(inputCol="gender_age", outputCol="label", handleInvalid="keep")
pipeline_transform = Pipeline(stages =  [url2domain_transformer, timestamp2weekday_trasformer, timestamp2hour_trasformer, concatfeatures_transformer, select_transformer, indexer])

In [69]:
pipeline_transform.fit(df_train).transform(df_train).show(20)

+--------------------+----------+--------------------+-----+
|                 uid|gender_age|                   f|label|
+--------------------+----------+--------------------+-----+
|d50192e5-c44e-4ae...|   F:18-24|[zebra-zoya.ru, n...|  4.0|
|d502331d-621e-472...|   M:25-34|[sweetrading.ru, ...|  0.0|
|d50237ea-747e-48a...|   F:25-34|[ru.oriflame.com,...|  1.0|
|d502f29f-d57a-46b...|   F:25-34|[translate-tattoo...|  1.0|
|d503c3b2-a0c2-4f4...|    M:>=55|[mail.rambler.ru,...|  9.0|
|d5090ddf-5648-487...|   F:25-34|[cfire.mail.ru, p...|  1.0|
|d50bcef8-16ff-4e8...|   F:25-34|[msn.com, msn.com...|  1.0|
|d50e23dc-0cbd-488...|   F:18-24|[gazprom.ru, re-s...|  4.0|
|d50fdabb-4208-441...|   F:45-54|[lifenews.ru, lif...|  5.0|
|d511b480-23a6-482...|   F:18-24|[google.ru, films...|  4.0|
|d51294ed-1b95-4e4...|   F:25-34|[muz4in.net, smac...|  1.0|
|d512e295-6a85-491...|   F:25-34|[kosmetista.ru, k...|  1.0|
|d51441ea-9dda-454...|   M:25-34|[android.mobile-r...|  0.0|
|d51822d4-105b-457...|  

In [47]:
from itertools import chain
def concat(type):
    def concat_(*args):
        return list(chain.from_iterable((arg if arg else [] for arg in args)))
    return F.udf(concat_, ArrayType(type))

In [None]:
F.c

In [50]:
concat_string_arrays = concat(StringType())
tst.withColumn("f",concat_string_arrays("urls", "weekdays","hours")).show(truncate=False)


+------------------------------------+----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [72]:
cv = CountVectorizer(inputCol="f", outputCol="features")

#labels = indexer.fit(df_train_cut).labels
#df_train_cut=indexer.fit(df_train_cut).transform(df_train_cut)

#transformer = SetValueTransformer(outputCols=["a"])


lr = LogisticRegression(labelCol='label', probabilityCol='lr_probability', predictionCol='lr_prediction', rawPredictionCol='lr_rawPrediction')

#rf = RandomForestClassifier(labelCol='label', probabilityCol='rf_probability', predictionCol='rf_prediction', rawPredictionCol='rf_rawPrediction')

#lr_label_converter = IndexToString(inputCol="lr_prediction", outputCol="lr_gender_age", labels=label_strings)

#rf_label_converter = IndexToString(inputCol="rf_prediction", outputCol="rf_gender_age", labels=label_strings)
lr_label_converter = IndexToString(inputCol="lr_prediction", outputCol="lr_gender_age", labels=label_strings)

pipeline = Pipeline(stages =  [url2domain_transformer,timestamp2weekday_trasformer,timestamp2hour_trasformer,
                               concatfeatures_transformer,select_transformer, indexer,
                               cv, lr, lr_label_converter])



In [73]:
model_trasform = pipeline.fit(df_train)
model_trasform.write().overwrite().save("tst_custom_transformer_model")

In [74]:
from pyspark.ml import PipelineModel
model_transform_reloaded =  PipelineModel.load("tst_custom_transformer_model")

In [75]:
df_train_cut_transformed = model_transform_reloaded.transform(df_train)

In [76]:
df_train_cut_transformed.show()

+--------------------+----------+--------------------+-----+--------------------+--------------------+--------------------+-------------+-------------+
|                 uid|gender_age|                   f|label|            features|    lr_rawPrediction|      lr_probability|lr_prediction|lr_gender_age|
+--------------------+----------+--------------------+-----+--------------------+--------------------+--------------------+-------------+-------------+
|d50192e5-c44e-4ae...|   F:18-24|[zebra-zoya.ru, n...|  4.0|(111612,[3,5,10,1...|[-38.207154538841...|[3.45295271526290...|          4.0|      F:18-24|
|d502331d-621e-472...|   M:25-34|[sweetrading.ru, ...|  0.0|(111612,[0,1,2,3,...|[194.922438022165...|[1.0,4.0353291665...|          0.0|      M:25-34|
|d50237ea-747e-48a...|   F:25-34|[ru.oriflame.com,...|  1.0|(111612,[1,2,3,4,...|[-171.56202381797...|[3.61152224844157...|          1.0|      F:25-34|
|d502f29f-d57a-46b...|   F:25-34|[translate-tattoo...|  1.0|(111612,[0,2,3,5,...|[30.331

In [32]:
model = pipeline.fit(df_train)

In [33]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

lr_evaluator = BinaryClassificationEvaluator(
        rawPredictionCol='lr_rawPrediction', labelCol='label', metricName='areaUnderROC'
    )

In [34]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

#regParam for L2 regularization, https://craftappmobile.com/l1-vs-l2-regularization/
#elasticNetParam https://en.wikipedia.org/wiki/Elastic_net_regularization, https://www.quora.com/What-is-elastic-net-regularization-in-machine-learning

grid = ParamGridBuilder() \
    .addGrid(LogisticRegression.regParam, [0.1, 0.01]) \
    .addGrid(LogisticRegression.elasticNetParam, [0.1, 0.01]) \
    .build()


In [35]:
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=grid,
    evaluator=lr_evaluator,
    numFolds=3,
    parallelism=2
)

In [36]:
df_train.show(15)

+--------------------+----------+--------------------+
|                 uid|gender_age|              visits|
+--------------------+----------+--------------------+
|d50192e5-c44e-4ae...|   F:18-24|[[1419688144068, ...|
|d502331d-621e-472...|   M:25-34|[[1419717886224, ...|
|d50237ea-747e-48a...|   F:25-34|[[1418840296062, ...|
|d502f29f-d57a-46b...|   F:25-34|[[1418217864467, ...|
|d503c3b2-a0c2-4f4...|    M:>=55|[[1427272415001, ...|
|d5090ddf-5648-487...|   F:25-34|[[1419777541435, ...|
|d50bcef8-16ff-4e8...|   F:25-34|[[1426704753001, ...|
|d50e23dc-0cbd-488...|   F:18-24|[[1419613709992, ...|
|d50fdabb-4208-441...|   F:45-54|[[1427203859001, ...|
|d511b480-23a6-482...|   F:18-24|[[1427237735001, ...|
|d51294ed-1b95-4e4...|   F:25-34|[[1419755762980, ...|
|d512e295-6a85-491...|   F:25-34|[[1417901723401, ...|
|d51441ea-9dda-454...|   M:25-34|[[1427261052000, ...|
|d51822d4-105b-457...|   F:25-34|[[1427130952000, ...|
|d5183db2-c8e5-413...|   F:35-44|[[1427266797000, ...|
+---------

In [12]:
cvModel = crossval.fit(df_train)

NameError: name 'crossval' is not defined

In [180]:
print(indexer.explainParams())

handleInvalid: how to handle invalid data (unseen or NULL values) in features and label column of string type. Options are 'skip' (filter out rows with invalid data), error (throw an error), or 'keep' (put invalid data in a special additional bucket, at index numLabels). (default: error)
inputCol: input column name. (current: gender_age)
outputCol: output column name. (default: StringIndexer_41b393718a28418251d5__output, current: label)
stringOrderType: How to order labels of string column. The first label after ordering is assigned an index of 0. Supported options: frequencyDesc, frequencyAsc, alphabetDesc, alphabetAsc. (default: frequencyDesc)


In [198]:
cvModel.bestModel.write().overwrite().save("tst_cv_bestmodel_model")

In [77]:
from pyspark.ml import PipelineModel
bestmodel_reloaded =  PipelineModel.load("lab04s_bestmodel")

AttributeError: 'module' object has no attribute 'ConcatFeatures'

In [4]:
from pyspark.ml.feature import VectorSizeHint

In [3]:
bestmodel_reloaded.stages

[StringIndexer_470598b09e6dabf347c4,
 CountVectorizer_47c487b8be77ca0239fe,
 CountVectorizer_49c7829b120dc11bd6da,
 CountVectorizer_4e719c91cf7d96e802a3,
 VectorAssembler_45bca706b93e383ff22a,
 LogisticRegression_4043a2a8dbad4d0df63f,
 IndexToString_46559e6e59c0bf8ea7a7]

In [27]:
bestmodel_reloaded.transform(df_train.limit(10)).show()

AnalysisException: u"cannot resolve '`weekdays`' given input columns: [uid, gender_age, visits, urls];;\n'Project [uid#0, gender_age#1, urls#503, 'weekdays, 'hours]\n+- Project [uid#0, gender_age#1, visits#2, <lambda>(visits#2.url) AS urls#503]\n   +- GlobalLimit 10\n      +- LocalLimit 10\n         +- Relation[uid#0,gender_age#1,visits#2] json\n"

In [19]:
data = {
  "uid": "bd7a30e1-a25d-4cbf-a03f-61748cbe540e",
  "visits": [
    {
      "url": "https://mail.google.com/mail/u/0/#inbox",
      "timestamp": 1419775945781
    }
   ,  
   {
      "url": "https://lk-de.newprolab.com/",
      "timestamp": 1419775945781
    }
   ,  
   {
      "url": "https://yandex.ru/pogoda/moscow/maps/temperature?via=mmapwb&le_TemperatureBalloons=0&le_WindParticles=1&ll=25.976425_49.047348&z=4",
      "timestamp": 1419775945781
    }
  ,  
   {
      "url": "https://translate.yandex.ru/?lang=en-ru&text=derivation",
      "timestamp": 1419775945781
    }
 ,  
   {
      "url": "https://web.whatsapp.com/",
      "timestamp": 1419775945781
    }
,  
   {
      "url": "https://app.slack.com/client/TNG296ABE/CPPRL95HU/thread/CP73F91ST-1571040655.075700",
      "timestamp": 1419775945781
    }
,  
   {
      "url": "https://github.com/newprolab/content_dataengineer5/blob/master/labs/de_lab_04.md",
      "timestamp": 1419775945781
    }
  ]
}

In [20]:
rdd = sc.parallelize([data])

In [21]:
df_test = spark.read.json(rdd)

In [28]:
bestmodel_reloaded.params

[]

In [12]:
from pyspark.sql import functions as F
from datetime import datetime

In [27]:
ts = 1427261052000/1000
dt = datetime.utcfromtimestamp(ts)
dt.hour
dt.weekday()

2

In [19]:
from pyspark.sql import functions as F
from datetime import datetime

def unixtime2hour(ts):
    dt = datetime.utcfromtimestamp(ts/1000)
    return dt.hour

unixtime2hour_udf = F.udf(lambda xx: [ unixtime2hour(x) for x in xx],
                   ArrayType(StringType()))
        

In [30]:
from pyspark.sql import functions as F
from datetime import datetime

def unixtime2weekday(ts):
    dt = datetime.utcfromtimestamp(ts/1000)
    return dt.weekday()

unixtime2weekday_udf = F.udf(lambda xx: [ unixtime2weekday(x) for x in xx],
                   ArrayType(StringType()))
        

In [40]:
import collections
a = [0]
counter=collections.Counter(a)
print(counter)
# Counter({1: 4, 2: 4, 3: 2, 5: 2, 4: 1})
print(counter.values())
# [4, 4, 2, 1, 2]
print(counter.keys())
# [1, 2, 3, 4, 5]
print(counter.most_common(3))

Counter({0: 1})
[1]
[0]
[(0, 1)]


In [1]:
df_train.withColumn("hours",unixtime2weekday_udf("visits.timestamp"))\
                    .select("gender_age","hours").filter("gender_age='M:18-24'").show(20,False)

NameError: name 'df_train' is not defined

In [36]:
df_train.withColumn("ts",F.from_unixtime(df_train["visits.timestamp"])).show(10)

AnalysisException: u"cannot resolve 'from_unixtime(`visits`.`timestamp`, 'yyyy-MM-dd HH:mm:ss')' due to data type mismatch: argument 1 requires bigint type, however, '`visits`.`timestamp`' is of array<bigint> type.;;\n'Project [uid#0, gender_age#1, visits#2, from_unixtime(visits#2.timestamp, yyyy-MM-dd HH:mm:ss, Some(Etc/UTC)) AS ts#387]\n+- Relation[uid#0,gender_age#1,visits#2] json\n"

In [49]:
from datetime import datetime
ts = 1427261052000/1000

# if you encounter a "year is out of range" error the timestamp
# may be in milliseconds, try `ts /= 1000` in that case
dt = datetime.utcfromtimestamp(ts)
dt.hour


5

In [42]:
df_train.show()

+--------------------+----------+--------------------+
|                 uid|gender_age|              visits|
+--------------------+----------+--------------------+
|d50192e5-c44e-4ae...|   F:18-24|[[1419688144068, ...|
|d502331d-621e-472...|   M:25-34|[[1419717886224, ...|
|d50237ea-747e-48a...|   F:25-34|[[1418840296062, ...|
|d502f29f-d57a-46b...|   F:25-34|[[1418217864467, ...|
|d503c3b2-a0c2-4f4...|    M:>=55|[[1427272415001, ...|
|d5090ddf-5648-487...|   F:25-34|[[1419777541435, ...|
|d50bcef8-16ff-4e8...|   F:25-34|[[1426704753001, ...|
|d50e23dc-0cbd-488...|   F:18-24|[[1419613709992, ...|
|d50fdabb-4208-441...|   F:45-54|[[1427203859001, ...|
|d511b480-23a6-482...|   F:18-24|[[1427237735001, ...|
|d51294ed-1b95-4e4...|   F:25-34|[[1419755762980, ...|
|d512e295-6a85-491...|   F:25-34|[[1417901723401, ...|
|d51441ea-9dda-454...|   M:25-34|[[1427261052000, ...|
|d51822d4-105b-457...|   F:25-34|[[1427130952000, ...|
|d5183db2-c8e5-413...|   F:35-44|[[1427266797000, ...|
|d51974e3-

In [24]:
bestmodel_reloaded.transform(df_test.withColumn("gender_age",lit(""))).show(1,False)

+------------------------------------+----------+-------------------------------------------------------------------------------------------------------------------+-----+--------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------+-------------+
|uid                                 |gender_age|urls                                                                                                               |label|features                              |lr_rawPrediction                                                                                       

In [174]:
StringIndexer().explainParams()

"handleInvalid: how to handle invalid data (unseen or NULL values) in features and label column of string type. Options are 'skip' (filter out rows with invalid data), error (throw an error), or 'keep' (put invalid data in a special additional bucket, at index numLabels). (default: error)\ninputCol: input column name. (undefined)\noutputCol: output column name. (default: StringIndexer_478e9078f8887914014c__output)\nstringOrderType: How to order labels of string column. The first label after ordering is assigned an index of 0. Supported options: frequencyDesc, frequencyAsc, alphabetDesc, alphabetAsc. (default: frequencyDesc)"

In [115]:
df_train_cut = df_train.limit(10)

In [83]:
df_train_cut.show()

+--------------------+----------+--------------------+-----+
|                 uid|gender_age|              visits|label|
+--------------------+----------+--------------------+-----+
|dd387df7-f50e-4b7...|   F:25-34|[[1419606827095, ...|  1.0|
|dd3aa566-00da-491...|   M:45-54|[[1426958952000, ...|  3.0|
|dd3adce0-6207-479...|   F:25-34|[[1414428818001, ...|  1.0|
|dd3bebdf-69ea-4fd...|   M:35-44|[[1422207817000, ...|  4.0|
|dd41e32e-202a-424...|   M:25-34|[[1418813830629, ...|  0.0|
|dd45f6b7-675c-414...|   F:18-24|[[1419921965314, ...|  2.0|
|dd472fa9-d1ee-4a5...|   M:25-34|[[1418674371440, ...|  0.0|
|dd474635-14fd-483...|   F:25-34|[[1418405925940, ...|  1.0|
|dd489bce-c115-463...|   M:25-34|[[1427219718001, ...|  0.0|
|dd4b6f79-3d65-4f5...|   M:25-34|[[1427186182000, ...|  0.0|
+--------------------+----------+--------------------+-----+



In [88]:
select_transformer.selectFields

['uid', 'gender_age', 'urls']

In [89]:
df_train_cut

DataFrame[uid: string, gender_age: string, visits: array<struct<timestamp:bigint,url:string>>, label: double]

In [87]:
df_train_cut

DataFrame[uid: string, gender_age: string, visits: array<struct<timestamp:bigint,url:string>>, label: double]

In [112]:
cvModel.bestModel.write().overwrite().save('lab04s_model')

In [53]:
pipeline.explainParams()

'stages: a list of pipeline stages (current: [Url2DomainTransformer_447b82347803217117ef, SelectFields_47998934796a145d7cf0, CountVectorizer_461ea09a378ed3faa2e4, StringIndexer_4b4aad7b8abe1f399a9c, LogisticRegression_44ce84cc5481de82497c, IndexToString_41cfb7bd197702f8c352])'

In [52]:
pipeline.explainParam(712)

ValueError: Cannot resolve 712 as a param.

In [33]:
cvModel = crossval.fit(df_train)

In [34]:
cvModel.write.('lab04s_model')

AttributeError: 'Url2DomainTransformer' object has no attribute '_to_java'

Проверю модель на урлах, открытых в моём браузере

In [4]:
data = {
  "uid": "bd7a30e1-a25d-4cbf-a03f-61748cbe540e",
  "visits": [
    {
      "url": "https://mail.google.com/mail/u/0/#inbox",
      "timestamp": 1419775945781
    }
   ,  
   {
      "url": "https://lk-de.newprolab.com/",
      "timestamp": 1419775945781
    }
   ,  
   {
      "url": "https://yandex.ru/pogoda/moscow/maps/temperature?via=mmapwb&le_TemperatureBalloons=0&le_WindParticles=1&ll=25.976425_49.047348&z=4",
      "timestamp": 1419775945781
    }
  ,  
   {
      "url": "https://translate.yandex.ru/?lang=en-ru&text=derivation",
      "timestamp": 1419775945781
    }
 ,  
   {
      "url": "https://web.whatsapp.com/",
      "timestamp": 1419775945781
    }
,  
   {
      "url": "https://app.slack.com/client/TNG296ABE/CPPRL95HU/thread/CP73F91ST-1571040655.075700",
      "timestamp": 1419775945781
    }
,  
   {
      "url": "https://github.com/newprolab/content_dataengineer5/blob/master/labs/de_lab_04.md",
      "timestamp": 1419775945781
    }
  ]
}

In [5]:
rdd = sc.parallelize([data])

In [6]:
df_test = spark.read.json(rdd)

In [7]:
df_test.show()

+--------------------+--------------------+
|                 uid|              visits|
+--------------------+--------------------+
|bd7a30e1-a25d-4cb...|[[1419775945781, ...|
+--------------------+--------------------+



In [15]:
df_test = df_test.withColumn("urls",url2domain_udf(df_test["visits"].getField("url"))) 

In [16]:
df_test = df_test.select(["uid", "urls"])

In [17]:
model_reloaded.transform(df_test).show(1,False)

+------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+----------+
|uid                                 |urls                                                                                                                                    |features                              |rawPrediction                                                                                                                                 