conda create -n pyspark python=3.6.8 pip wheel pandas matplotlib ipykernel

In [1]:
import os
import sys

os.environ["PYSPARK_SUBMIT_ARGS"]='pyspark-shell --master local[0]'
os.environ["PYSPARK_PYTHON"]='/usr/bin/python3'
#PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON
os.environ["PYSPARK_DRIVER_PYTHON"]='/usr/bin/python3'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.3.2.3.1.0.0-78
      /_/

Using Python version 3.6.8 (default, Dec 30 2018 01:22:34)
SparkSession available as 'spark'.


In [2]:
training = "lab04/lab04_train_merged_labels.json"
#schema = "STRUCT: $uid"
df_train = spark.read.json(training)

In [3]:
df_train.schema

StructType(List(StructField(gender_age,StringType,true),StructField(uid,StringType,true),StructField(visits,ArrayType(StructType(List(StructField(timestamp,LongType,true),StructField(url,StringType,true))),true),true)))

In [4]:
ts_col=df_train["visits"].getField("timestamp")

In [5]:
df_train.select("visits").show(5,False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

import re
from urlparse import urlparse
from urllib import urlretrieve, unquote

def url2domain(url):
    #url = url.decode('utf-8')
    url = re.sub('(http(s)*://)+', 'http://', url)
    parsed_url = urlparse(unquote(url.strip()))
    if parsed_url.scheme not in ['http','https']: return None
    netloc = re.search("(?:www\.)?(.*)", parsed_url.netloc).group(1)
    if netloc is not None: 
        #return str(netloc.encode('utf8')).strip()
        return str(netloc.encode('utf8')).strip()
    return None

In [6]:
import re
from urllib.parse import urlparse
from urllib.request import urlretrieve, unquote

def url2domain(url):
    url = re.sub('(http(s)*://)+', 'http://', url)
    parsed_url = urlparse(unquote(url.strip()))
    if parsed_url.scheme not in ['http','https']: return None
    netloc = re.search("(?:www\.)?(.*)", parsed_url.netloc).group(1)
    if netloc is not None: return str(netloc.encode('utf8')).strip()
    return None

In [7]:
url0 = "http://www.zebra-zoya.ru/русский_сайт/200028-chehol-organayzer-dlja-macbook-11-grid-it.html?utm_campaign=397720794&utm_content=397729344&utm_medium=cpc&utm_source=begun"

In [8]:
url2domain(url0)

"b'zebra-zoya.ru'"

In [9]:
urls = df_train["visits"].getField("url")

In [10]:
from pyspark.sql import functions as F 

In [11]:
from pyspark.sql.types import ArrayType
from pyspark.sql.types import StringType

In [12]:
url2domain_udf = F.udf(lambda xx: [ url2domain(x) for x in xx],
                   ArrayType(StringType()))

In [13]:
df_train = df_train.withColumn("urls",url2domain_udf(urls))

In [14]:
df_train = df_train.select(["uid", "urls", "gender_age"])

In [15]:
l = [["1","2","2"],["1","2","3"]]

In [16]:
df = df_train

In [17]:
empty_cols = []

for c in df.columns:
    print('=' * 3, c, '=' * 3)
    df.select(c).describe().show()
    
    p = df.select(c).describe().toPandas()
    try:
        if float(p[p['summary'] == 'mean'][c].tolist()[0]) == .0 and \
            float(p[p['summary'] == 'stddev'][c].tolist()[0])== .0:
            empty_cols.append(c)
    except: continue

=== uid ===
+-------+--------------------+
|summary|                 uid|
+-------+--------------------+
|  count|               36138|
|   mean|                null|
| stddev|                null|
|    min|03001878-d923-488...|
|    max|ffc8d1e1-c2ef-47a...|
+-------+--------------------+

=== urls ===
+-------+
|summary|
+-------+
|  count|
|   mean|
| stddev|
|    min|
|    max|
+-------+

=== gender_age ===
+-------+----------+
|summary|gender_age|
+-------+----------+
|  count|     36138|
|   mean|      null|
| stddev|      null|
|    min|   F:18-24|
|    max|    M:>=55|
+-------+----------+



In [18]:
df.count()

36138

In [19]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import StringIndexer, IndexToString
from pyspark.ml.classification import LogisticRegression

cv = CountVectorizer(inputCol="urls", outputCol="features")

indexer = StringIndexer(inputCol="gender_age", outputCol="label")

lr = LogisticRegression()

early_stages = [cv, indexer, lr]
#pipeline = Pipeline(stages=[cv, indexer, lr])

In [20]:
labels = indexer.fit(df).labels

In [21]:
labels

['M:25-34',
 'F:25-34',
 'M:35-44',
 'F:35-44',
 'F:18-24',
 'F:45-54',
 'M:45-54',
 'M:18-24',
 'F:>=55',
 'M:>=55']

In [22]:
label_converter = IndexToString(inputCol="prediction", outputCol="gender_age", labels=labels)

In [23]:
pipeline = Pipeline(stages = early_stages + [label_converter])

In [24]:
model = pipeline.fit(df)

In [21]:
#data = model.transform(df)

In [25]:
#data.select("gender_age","features","label").show()

+----------+--------------------+-----+
|gender_age|            features|label|
+----------+--------------------+-----+
|   F:18-24|(111581,[110,5629...|  4.0|
|   M:25-34|(111581,[142,151,...|  0.0|
|   F:25-34|(111581,[20,210,4...|  1.0|
|   F:25-34|(111581,[3454,903...|  1.0|
|    M:>=55|(111581,[5,69,90,...|  9.0|
|   F:25-34|(111581,[3755,849...|  1.0|
|   F:25-34|(111581,[0,7,81,1...|  1.0|
|   F:18-24|(111581,[25,33,22...|  4.0|
|   F:45-54|(111581,[42,633,3...|  5.0|
|   F:18-24|(111581,[33,43,49...|  4.0|
|   F:25-34|(111581,[0,16,18,...|  1.0|
|   F:25-34|(111581,[63,8590]...|  1.0|
|   M:25-34|(111581,[65,247,4...|  0.0|
|   F:25-34|(111581,[1014,140...|  1.0|
|   F:35-44|(111581,[24,45,29...|  3.0|
|   M:35-44|(111581,[842],[1.0])|  2.0|
|   M:25-34|(111581,[15369],[...|  0.0|
|   F:18-24|(111581,[7,103,53...|  4.0|
|   F:35-44|(111581,[106910],...|  3.0|
|    F:>=55|(111581,[190],[2.0])|  8.0|
+----------+--------------------+-----+
only showing top 20 rows



In [58]:
df.show(1)

+--------------------+--------------------+----------+
|                 uid|                urls|gender_age|
+--------------------+--------------------+----------+
|d50192e5-c44e-4ae...|[b'zebra-zoya.ru'...|   F:18-24|
+--------------------+--------------------+----------+
only showing top 1 row



In [25]:
model.save("lab04_model")

In [27]:
from pyspark.ml import PipelineModel

In [28]:
model_reloaded =  PipelineModel.load("lab04_model")

In [29]:
model_reloaded.extractParamMap()

{}

In [30]:
data = {
  "uid": "bd7a30e1-a25d-4cbf-a03f-61748cbe540e",
  "visits": [
    {
      "url": "https://mail.google.com/mail/u/0/#inbox",
      "timestamp": 1419775945781
    }
   ,  
   {
      "url": "https://lk-de.newprolab.com/",
      "timestamp": 1419775945781
    }
   ,  
   {
      "url": "https://yandex.ru/pogoda/moscow/maps/temperature?via=mmapwb&le_TemperatureBalloons=0&le_WindParticles=1&ll=25.976425_49.047348&z=4",
      "timestamp": 1419775945781
    }
  ,  
   {
      "url": "https://translate.yandex.ru/?lang=en-ru&text=derivation",
      "timestamp": 1419775945781
    }
 ,  
   {
      "url": "https://web.whatsapp.com/",
      "timestamp": 1419775945781
    }
,  
   {
      "url": "https://app.slack.com/client/TNG296ABE/CPPRL95HU/thread/CP73F91ST-1571040655.075700",
      "timestamp": 1419775945781
    }
,  
   {
      "url": "https://github.com/newprolab/content_dataengineer5/blob/master/labs/de_lab_04.md",
      "timestamp": 1419775945781
    }
  ]
}

In [31]:
rdd = sc.parallelize([data])

In [32]:
df_test = spark.read.json(rdd)

In [33]:
df_test.show()

+--------------------+--------------------+
|                 uid|              visits|
+--------------------+--------------------+
|bd7a30e1-a25d-4cb...|[[1419775945781, ...|
+--------------------+--------------------+



In [34]:
df_test = df_test.withColumn("urls",url2domain_udf(df_test["visits"].getField("url"))) 

In [35]:
df_test = df_test.select(["uid", "urls"])

In [36]:
model_reloaded.transform(df_test).show()

+--------------------+--------------------+--------------------+--------------------+--------------------+----------+----------+
|                 uid|                urls|            features|       rawPrediction|         probability|prediction|gender_age|
+--------------------+--------------------+--------------------+--------------------+--------------------+----------+----------+
|bd7a30e1-a25d-4cb...|[b'mail.google.co...|(111581,[7,20934,...|[-2.1159833078154...|[9.20242487670868...|       3.0|   F:35-44|
+--------------------+--------------------+--------------------+--------------------+--------------------+----------+----------+



In [132]:
predict = model_reloaded.transform(df_test)

In [133]:
predict.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|                 uid|                urls|            features|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|bd7a30e1-a25d-4cb...|[b'interfax.ru', ...|(111581,[151],[1.0])|[1.40039842369644...|[0.27740693297982...|       0.0|
+--------------------+--------------------+--------------------+--------------------+--------------------+----------+



In [136]:
df_test.show()

+--------------------+--------------------+----------+
|                 uid|                urls|gender_age|
+--------------------+--------------------+----------+
|bd7a30e1-a25d-4cb...|[b'interfax.ru', ...|          |
+--------------------+--------------------+----------+



In [143]:
c = predict["prediction"]

In [146]:
#indexer = StringIndexer(inputCol="gender_age", outputCol="label")
indexer.

'gender_age'