In [1]:
import os
import uuid

import pandas as pd
from elasticsearch import Elasticsearch, NotFoundError, helpers
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [None]:
!pip freeze|grep 

In [2]:
# Get the active Spark session, or create one configured for client deploy mode.
spark = (
    SparkSession.builder
    .config("spark.submit.deployMode", "client")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/12/08 04:49:54 INFO org.apache.spark.SparkEnv: Registering MapOutputTracker
22/12/08 04:49:54 INFO org.apache.spark.SparkEnv: Registering BlockManagerMaster
22/12/08 04:49:54 INFO org.apache.spark.SparkEnv: Registering BlockManagerMasterHeartbeat
22/12/08 04:49:54 INFO org.apache.spark.SparkEnv: Registering OutputCommitCoordinator


In [3]:
# Keep a handle on the SparkContext that backs the session.
sc = spark.sparkContext

{'MKL_NUM_THREADS': '1', 'OPENBLAS_NUM_THREADS': '1', 'PYTHONHASHSEED': '0'}

In [None]:
# Load the raw review records from JSON and report how many rows were read.
raw_reviews = spark.read.format("json").load('project/Books.json')
raw_reviews.count()

In [None]:
# Display the DataFrame's repr as the cell output.
raw_reviews

In [None]:
# Load the raw product metadata from JSON and report how many rows were read.
raw_metadata = spark.read.format("json").load('project/meta_Books.json')
raw_metadata.count()

In [None]:
# Display the DataFrame's repr as the cell output.
raw_metadata

In [None]:
# Keep only the metadata columns used downstream.
metadata_columns = ['asin', 'title', 'description', 'brand']
metadata = raw_metadata.select(*metadata_columns)

In [None]:
# Register the raw reviews as a temp view so they can be queried with spark.sql.
raw_reviews.createOrReplaceTempView("view_raw_reviews")

In [None]:
# Keep only ratings from reviewers that have at least 5 rows in the raw reviews.
# The window count attaches each reviewer's total row count (c) to every one of
# their rows; the outer query filters on it and projects the three columns that
# are used downstream.
# A triple-quoted string replaces the backslash continuations, which embedded
# the source indentation in the SQL text and would break if any whitespace
# followed a backslash.
reviews_filtered = spark.sql("""
    select asin, reviewerId, overall
    from (
        select *, count(*) over (partition by reviewerId) as c
        from view_raw_reviews
    )
    where c >= 5
""")

In [None]:
# Persist the filtered ratings as CSV with a header row.
# mode("overwrite") makes the cell re-runnable: the default save mode raises
# because the output path already exists after the first run.
reviews_filtered.write.mode("overwrite").option("header", True).csv("project/reviews_filtered")

In [None]:
# Reload the filtered ratings from the CSV checkpoint (first row is the header).
reviews_filtered = spark.read.csv("project/reviews_filtered", header=True)

In [None]:
# Number of ratings that survived the reviewer filter.
reviews_filtered.count()

In [None]:
# Fit an indexer that maps each reviewerId string to a numeric index
# (written to the reviewerIdIdx column).
stringindexer = StringIndexer(inputCol="reviewerId", outputCol="reviewerIdIdx")
model_reviewerId = stringindexer.fit(reviews_filtered)

In [None]:
# Apply the fitted reviewer indexer, adding the reviewerIdIdx column.
reviews_filtered_transformed = model_reviewerId.transform(reviews_filtered)

In [None]:
# Fit an indexer that maps each asin string to a numeric index
# (written to the asinIdx column), using the asins present in the reviews.
stringindexer = StringIndexer(inputCol="asin", outputCol="asinIdx")
model_asin = stringindexer.fit(reviews_filtered_transformed)

In [None]:
# Apply the fitted asin indexer, adding the asinIdx column.
# NOTE(review): this rebinds reviews_filtered_transformed to a frame derived from
# itself, so the cell is presumably not safe to run twice in a row — confirm.
reviews_filtered_transformed = model_asin.transform(reviews_filtered_transformed)

In [None]:
# Index the metadata with the SAME asin indexer that was fitted on the reviews,
# so metadata rows carry the asinIdx values the ratings (and therefore the ALS
# item factors) use. Previously a second StringIndexer was fitted on `metadata`,
# which produced an unrelated asin -> index mapping and also overwrote
# `model_asin`.
# handleInvalid="skip" is passed as a per-call override (model_asin itself is not
# modified) so metadata rows whose asin never occurs in the filtered reviews are
# dropped instead of aborting the transform, which is what the default "error"
# policy would do.
metadata_transformed = model_asin.transform(
    metadata, {model_asin.handleInvalid: "skip"}
)

In [None]:
# Cast the two index columns to int and the rating column to float.
reviews_filtered_transformed = (
    reviews_filtered_transformed
    .withColumn("reviewerIdIdx", col("reviewerIdIdx").cast("int"))
    .withColumn("asinIdx", col("asinIdx").cast("int"))
    .withColumn("overall", col("overall").cast("float"))
)

In [None]:
# Check the column types after the casts.
reviews_filtered_transformed.printSchema()

In [None]:
# Persist the indexed ratings as CSV with a header row.
# mode("overwrite") makes the cell re-runnable: the default save mode raises
# because the output path already exists after the first run.
reviews_filtered_transformed.write.mode("overwrite").option("header", True).csv("project/reviews_filtered_transformed")

In [None]:
# Reload the indexed ratings from the CSV checkpoint, letting Spark infer the
# column types from the data (first row is the header).
reviews_filtered_transformed = (
    spark.read
    .option("inferSchema", True)
    .option("header", True)
    .csv("project/reviews_filtered_transformed")
)

In [None]:
# Check the column types inferred when reloading from CSV.
reviews_filtered_transformed.printSchema()

In [None]:
# Number of rating rows available for training and validation.
reviews_filtered_transformed.count()

In [None]:
# Preview the first 20 rows.
reviews_filtered_transformed.show(20)

In [None]:
# 80/20 train/validation split. A fixed seed is passed so the split is meant to
# be the same on every run; without it each run drew a different split and
# results could not be compared across runs.
training, validation = reviews_filtered_transformed.randomSplit([0.8, 0.2], seed=42)

In [None]:
# Configure the ALS estimator on the indexed user / item / rating columns.
# coldStartStrategy="drop" is passed through unchanged from the original setup.
als = ALS(
    maxIter=10,
    regParam=0.05,
    rank=48,
    userCol="reviewerIdIdx",
    itemCol="asinIdx",
    ratingCol="overall",
    coldStartStrategy="drop",
)

In [None]:
# Train the ALS model on the training split.
model = als.fit(training)

In [None]:
# Persist the trained model. write().overwrite() replaces a model saved by an
# earlier run; plain save() raises when the target path already exists, which
# broke re-running the notebook.
model.write().overwrite().save("project/als_model")

In [None]:
# Reload the persisted ALS model from disk.
model = ALSModel.load("project/als_model")

In [None]:
# Build a DataFrame from the RDD behind the model's item factors.
# NOTE(review): model.itemFactors exposes .rdd, so it is presumably already a
# DataFrame and this round trip may be unnecessary — confirm whether the
# re-creation is intentional before removing it.
itemfactors = spark.createDataFrame(model.itemFactors.rdd)

In [None]:
# Number of rows in the item-factor table.
itemfactors.count()

In [None]:
# Preview the first 50 item-factor rows.
itemfactors.show(50)

In [None]:
# Pull the user factors from the trained model and count them.
# The original cell referenced `userfactors` before it was ever assigned
# (NameError on a fresh kernel) and passed an argument to count(), which takes
# none.
userfactors = model.userFactors
userfactors.count()

In [None]:
# Bring the item factors to the driver as a pandas frame, renaming the factor
# id column to item_id. The previous mapping also renamed "features" to itself,
# which was a no-op and has been dropped.
items_frame = (
    itemfactors
    .select('id', 'features')
    .toPandas()
    .rename(columns={"id": "item_id"})
)

In [None]:
# DataFrame.merge() needs a right-hand frame; the bare call raised TypeError.
# NOTE(review): presumably the intent was to attach product metadata to each
# item-factor row — confirm the desired columns and join type.
# The asinIdx -> asin lookup is taken from the indexed reviews, i.e. the same
# indices the ALS model was trained on, and is then joined to the metadata on
# asin before being collected to pandas.
item_lookup = (
    reviews_filtered_transformed
    .select("asinIdx", "asin")
    .distinct()
    .join(metadata, on="asin", how="left")
    .toPandas()
)
items_with_metadata = items_frame.merge(
    item_lookup, how="left", left_on="item_id", right_on="asinIdx"
)
items_with_metadata.head()

In [None]:
# Connect to Elasticsearch. The endpoint can be overridden with the
# ELASTICSEARCH_URL environment variable; the default is the tunnel URL that was
# previously hard-coded here (ngrok URLs are short-lived, so expect to override).
es_client = Elasticsearch(
    os.environ.get("ELASTICSEARCH_URL", 'https://1054-216-165-95-130.ngrok.io')
)

In [None]:
# Start from a clean index: drop any existing "productlens" index.
index_name = "productlens"
try:
    es_client.indices.delete(index=index_name)
except NotFoundError:
    # The index does not exist yet (e.g. first run) — nothing to delete.
    # Any other failure (connection, auth, ...) is allowed to propagate instead
    # of being silently swallowed as before.
    pass