In [2]:
import os
import sys

# Tell PySpark which Python interpreter to use, where the Spark client is
# installed, and which arguments to launch the shell with.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

# Make the pyspark and py4j packages shipped with the Spark client importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Execute Spark's interactive bootstrap script in this namespace (its banner
# reports that a SparkSession is made available as `spark`).
# The script is read inside a `with` block so the file handle is closed right
# away, and compiled with its real path so tracebacks point at shell.py.
shell_script = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_script) as shell_file:
    shell_source = shell_file.read()
exec(compile(shell_source, shell_script, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.5
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [12]:
from pyspark.sql.types import *
import pyspark.sql.functions as f
from pyspark.ml.recommendation import ALS

In [3]:
# Load the lab's test and train CSV files, taking column names from the
# header row. No schema is supplied here; the id columns are cast to
# integers in a later cell.
test = spark.read.option('header', True).csv('/labs/laba03/lab10_test.csv')
train = spark.read.option('header', True).csv('/labs/laba03/lab10_train.csv')

In [8]:
# Cast the id columns (and the training target) to integers so they can be
# used as ALS user/item/rating columns.
train = train.withColumn('user_id', train['user_id'].cast(IntegerType()))
train = train.withColumn('item_id', train['item_id'].cast(IntegerType()))
# The target must be cast from its own column; it was previously built from
# `item_id`, which replaced every purchase label with the item identifier.
train = train.withColumn('purchase', train['purchase'].cast(IntegerType()))
test = test.withColumn('user_id', test['user_id'].cast(IntegerType()))
test = test.withColumn('item_id', test['item_id'].cast(IntegerType()))

In [11]:
# Per-user and per-item totals of the `purchase` column.
user = (train.groupBy('user_id')
             .sum('purchase')
             .withColumnRenamed('sum(purchase)', 'sum_user'))
item = (train.groupBy('item_id')
             .sum('purchase')
             .withColumnRenamed('sum(purchase)', 'sum_item'))

# Attach both totals to every training row; nulls in numeric columns become 0.
train2 = (
    train
    .join(user, on='user_id', how='left')
    .join(item, on='item_id', how='left')
    .fillna(0)
)

# The rating fed to ALS is the mean of the user total and the item total,
# exposed under the name `purchase`.
mean_total = (f.col('sum_user') + f.col('sum_item')) / 2
train2 = train2.select('user_id', 'item_id', mean_total.alias('purchase'))

In [13]:
#0.910123208718 rank=15, maxIter=10, regParam=0.05, alpha=1.0
als = ALS(coldStartStrategy="nan",  rank=15, maxIter=10, regParam=0.05, alpha=1.0, \
          userCol='user_id', itemCol='item_id', ratingCol='purchase', \
          nonnegative=False, implicitPrefs=True, seed=871)
%time als_model = als.fit(train2)

CPU times: user 16 ms, sys: 4 ms, total: 20 ms
Wall time: 52.1 s


In [14]:
# Score the test set with the fitted ALS model; the model output is read from
# the `prediction` column in the next cell.
predict_test = als_model.transform(test)

In [15]:
@f.pandas_udf(FloatType())
def to_probs(values):
    """Scale a batch of raw ALS predictions by 1/1.02.

    Args:
        values: pandas Series with one batch of the `prediction` column.

    Returns:
        pandas Series of the same length with every value divided by 1.02.
    """
    # NOTE(review): 1.02 is an unexplained constant -- presumably chosen to
    # keep the scaled scores below 1; confirm.
    # Series arithmetic is vectorised, so no per-element Python lambda is needed.
    return values / 1.02

# Add the scaled score as `purchase` and order rows by user, then item.
preds = predict_test.withColumn("purchase", to_probs(f.col("prediction")))
preds = preds.sort(['user_id', 'item_id'], ascending=[True, True])

# Collect to the driver and write the submission file.
preds.select('user_id', 'item_id', 'purchase').toPandas().to_csv('lab03_.csv', index=False)

### Подбор параметров по сетке

In [16]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

# The ALS instance.
# coldStartStrategy must be "drop" while cross-validating: with "nan", any
# user or item that appears only in the validation fold gets a NaN prediction,
# the RMSE of that fold becomes NaN, and the parameter combinations can no
# longer be compared.
# NOTE: the selected bestModel inherits "drop", so calling transform() on data
# containing unseen users/items will omit those rows instead of returning NaN.
als = ALS(userCol='user_id',
          itemCol='item_id',
          ratingCol='purchase',
          nonnegative=False,
          implicitPrefs=True,
          coldStartStrategy="drop",
          seed=42)

# 3 ranks x 2 iteration counts x 3 regularisation values x 5 alphas = 90 candidates.
als_paramgrid = (ParamGridBuilder()
                 .addGrid(als.rank, [6, 10, 15])
                 .addGrid(als.maxIter, [10, 15])
                 .addGrid(als.regParam, [0.1, 0.05, 0.15])
                 .addGrid(als.alpha, [0.5, 1.0, 2.0, 3.0, 5.0])
                 .build())

# The evaluation function for determining the best model
rmse_eval = RegressionEvaluator(labelCol='purchase',
                                predictionCol='prediction',
                                metricName='rmse')

# The cross validation instance
als_cv = CrossValidator(estimator=als,
                        estimatorParamMaps=als_paramgrid,
                        evaluator=rmse_eval,
                        numFolds=2,
                        seed=42)

# Fit the models and find the best one!
# `als_cv` is rebound from the CrossValidator to the fitted result, whose
# `bestModel` attribute is read in the next cell.
als_cv = als_cv.fit(train.dropna())

In [17]:
# Keep the model that cross-validation selected from the parameter grid.
als_best = als_cv.bestModel

In [18]:
# Score the test set with the model selected by cross-validation. The earlier
# hand-tuned `als_model` was used here by mistake, which left `als_best`
# unused and made this second submission a copy of the first one.
predict_test = als_best.transform(test)

In [19]:
@f.pandas_udf(FloatType())
def to_probs(values):
    """Scale a batch of raw ALS predictions by 1/1.02.

    Args:
        values: pandas Series with one batch of the `prediction` column.

    Returns:
        pandas Series of the same length with every value divided by 1.02.
    """
    # NOTE(review): 1.02 is an unexplained constant -- presumably chosen to
    # keep the scaled scores below 1; confirm.
    # Series arithmetic is vectorised, so no per-element Python lambda is needed.
    return values / 1.02

# Add the scaled score as `purchase` and order rows by user, then item.
predAls = predict_test.withColumn("purchase", to_probs(f.col("prediction")))
predAls = predAls.sort(['user_id', 'item_id'], ascending=[True, True])

# Collect to the driver and write the second submission file.
predAls.select('user_id', 'item_id', 'purchase').toPandas().to_csv('lab03_1.csv', index=False)