conda create -n pyspark python=3.6.8 pip wheel pandas matplotlib ipykernel

In [1]:
import os
import sys

os.environ["PYSPARK_SUBMIT_ARGS"]='pyspark-shell --master local[0]'
os.environ["PYSPARK_PYTHON"]='/usr/bin/python3'
#PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON
os.environ["PYSPARK_DRIVER_PYTHON"]='/usr/bin/python3'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.3.2.3.1.0.0-78
      /_/

Using Python version 3.6.8 (default, Dec 30 2018 01:22:34)
SparkSession available as 'spark'.


In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import StringIndexer, IndexToString
from pyspark.ml.classification import LogisticRegression

cv = CountVectorizer(inputCol="urls", outputCol="features")

indexer = StringIndexer(inputCol="gender_age", outputCol="label")

lr = LogisticRegression()

pipeline = Pipeline(stages=[cv, indexer, lr])

In [3]:
training = "lab04/lab04_train_merged_labels.json"
#schema = "STRUCT: $uid"
df_train = spark.read.json(training)

In [4]:
df_train.schema

StructType(List(StructField(gender_age,StringType,true),StructField(uid,StringType,true),StructField(visits,ArrayType(StructType(List(StructField(timestamp,LongType,true),StructField(url,StringType,true))),true),true)))

In [5]:
ts_col=df_train["visits"].getField("timestamp")

In [6]:
df_train.select("visits").show(5,False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

import re
from urlparse import urlparse
from urllib import urlretrieve, unquote

def url2domain(url):
    #url = url.decode('utf-8')
    url = re.sub('(http(s)*://)+', 'http://', url)
    parsed_url = urlparse(unquote(url.strip()))
    if parsed_url.scheme not in ['http','https']: return None
    netloc = re.search("(?:www\.)?(.*)", parsed_url.netloc).group(1)
    if netloc is not None: 
        #return str(netloc.encode('utf8')).strip()
        return str(netloc.encode('utf8')).strip()
    return None

In [7]:
import re
from urllib.parse import urlparse
from urllib.request import urlretrieve, unquote

def url2domain(url):
    url = re.sub('(http(s)*://)+', 'http://', url)
    parsed_url = urlparse(unquote(url.strip()))
    if parsed_url.scheme not in ['http','https']: return None
    netloc = re.search("(?:www\.)?(.*)", parsed_url.netloc).group(1)
    if netloc is not None: return str(netloc.encode('utf8')).strip()
    return None

In [8]:
url0 = "http://www.zebra-zoya.ru/русский_сайт/200028-chehol-organayzer-dlja-macbook-11-grid-it.html?utm_campaign=397720794&utm_content=397729344&utm_medium=cpc&utm_source=begun"

In [9]:
url2domain(url0)

"b'zebra-zoya.ru'"

In [10]:
urls = df_train["visits"].getField("url")

In [11]:
from pyspark.sql import functions as F 

In [12]:
from pyspark.sql.types import ArrayType
from pyspark.sql.types import StringType

In [13]:
url2domain_udf = F.udf(lambda xx: [ url2domain(x) for x in xx],
                   ArrayType(StringType()))

In [14]:
df_train = df_train.withColumn("urls",url2domain_udf(urls))

In [15]:
df_train = df_train.select(["uid", "urls", "gender_age"])

In [16]:
l = [["1","2","2"],["1","2","3"]]

In [18]:
da = dict.fromkeys(l)

TypeError: unhashable type: 'list'

In [19]:
df = df_train

In [20]:
empty_cols = []

for c in df.columns:
    print('=' * 3, c, '=' * 3)
    df.select(c).describe().show()
    
    p = df.select(c).describe().toPandas()
    try:
        if float(p[p['summary'] == 'mean'][c].tolist()[0]) == .0 and \
            float(p[p['summary'] == 'stddev'][c].tolist()[0])== .0:
            empty_cols.append(c)
    except: continue

=== uid ===
+-------+--------------------+
|summary|                 uid|
+-------+--------------------+
|  count|               36138|
|   mean|                null|
| stddev|                null|
|    min|03001878-d923-488...|
|    max|ffc8d1e1-c2ef-47a...|
+-------+--------------------+

=== urls ===
+-------+
|summary|
+-------+
|  count|
|   mean|
| stddev|
|    min|
|    max|
+-------+

=== gender_age ===
+-------+----------+
|summary|gender_age|
+-------+----------+
|  count|     36138|
|   mean|      null|
| stddev|      null|
|    min|   F:18-24|
|    max|    M:>=55|
+-------+----------+



In [21]:
df.count()

36138

In [22]:
model = pipeline.fit(df)

In [24]:
model.save("lab04_model")

In [26]:
from pyspark.ml import PipelineModel

In [28]:
model_reloaded =  PipelineModel.load("lab04_model")