In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

# Create a Spark session
spark = SparkSession.builder.appName("RecommendationSystem").getOrCreate()

# Define the schema for the dataset
schema = StructType([
    StructField("review_text", StringType(), True),
    StructField("summary", StringType(), True),
    StructField("profile_name", StringType(), True),
    StructField("helpfulness", StringType(), True),
    StructField("time", StringType(), True),
    StructField("score", DoubleType(), True),
])

# Path to your JSON file
json_file_path = "movies.json"

# Load the JSON data into a PySpark DataFrame
recommendation_df = spark.read.json(json_file_path, schema=schema)

# Show the first few rows of the DataFrame
recommendation_df.show()


+-----------+--------------------+--------------------+-----------+----------+-----+
|review_text|             summary|        profile_name|helpfulness|      time|score|
+-----------+--------------------+--------------------+-----------+----------+-----+
|       null|"There Is So Much...|Brian E. Erland "...|        7/7|1182729600|  3.0|
|       null|Worthwhile and Im...|          Grady Harp|        4/4|1181952000|  3.0|
|       null|This movie needed...|Chrissy K. McVay ...|       8/10|1164844800|  5.0|
|       null|distantly based o...|        golgotha.gov|        1/1|1197158400|  3.0|
|       null|"What's going on ...|KerrLines "&#34;M...|        1/1|1188345600|  3.0|
|       null|Pretty pointless ...|abra "a devoted r...|        0/0|1229040000|  2.0|
|       null|This is junk, sta...| Charles R. Williams|       3/11|1164153600|  1.0|
|       null|A  Rock N Roll Hi...|   Anthony Accordino|      64/65|1060473600|  5.0|
|       null|A  MUST-HAVE  vid...|    Joseph P. Aiello|      26/2

In [24]:
from pyspark.ml.feature import StringIndexer
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml import Pipeline

# Create a Spark session
spark = SparkSession.builder.appName("RecommendationModel").getOrCreate()

# Read data from a JSON file
json_file_path = "movies.json"
df = spark.read.json(json_file_path)

# Extract relevant columns for recommendation
df = df.select("user_id", "product_id", "score")

# Convert user_id and product_id to numeric indices
user_indexer = StringIndexer(inputCol="user_id", outputCol="user_index", handleInvalid="keep")
product_indexer = StringIndexer(inputCol="product_id", outputCol="product_index", handleInvalid="keep")

# Create an ALS (Alternating Least Squares) recommendation model
als = ALS(maxIter=5, regParam=0.01, userCol="user_index", itemCol="product_index", ratingCol="score")

# Create a pipeline to execute the indexers and ALS model
pipeline = Pipeline(stages=[user_indexer, product_indexer, als])

# Split the data into training and test sets
(training_data, test_data) = df.randomSplit([0.8, 0.2])

# Train the recommendation model
model = pipeline.fit(training_data)

# Make predictions on the test set
predictions = model.transform(test_data)

# Show the predictions
predictions.select("user_id", "product_id", "prediction").show()

# Stop the Spark session
spark.stop()


+--------------+----------+-----------+
|       user_id|product_id| prediction|
+--------------+----------+-----------+
|A13RM1AWD1C5ZR|B0001G6PZC|-0.24127243|
|A145CT70T2SB51|B0012EM5GK|  5.1345353|
|A10X0JN8KTK89H|6304286961|  2.9998443|
|A103EXN5Q7HX6Z|B0095D5454|        NaN|
|A10X4NX66GHW49|B000063W82|        NaN|
|A10XUPHLWRCGY4|B000063W82|        NaN|
|A11977Q3OXQYHD|B00096S43U|        NaN|
|A11IKZMXWDI763|B0001G6PZC|        NaN|
|A122E0BU0KVYDA|B000063W82|        NaN|
|A1250POFZL8U1B|B0001G6PZC|        NaN|
|A13OMT8D4GPIBV|6304286961|        NaN|
|A149KBE47CBLYD|B00004CQTP|        NaN|
|A14LAZ5I1YD6I1|B0016OLXN2|        NaN|
|A14MNXASHGYPN7|B000063W82|        NaN|
|A14N5L5T089VX3|0800103688|        NaN|
|A12MRCRQV18FEQ|B002OHDRF2| 0.99803513|
|A13TO1ZFAH9SVN|B000063W1R|  7.6159043|
|A141HP4LYPWMSR|B000063W1R|-0.73775494|
|A10DB0H2NZF11E|B002OHDRF2|  4.9901752|
|A13IKSGDYNBNQS|B0001G6PZC| -0.3449365|
+--------------+----------+-----------+
only showing top 20 rows



In [19]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

# Create a Spark session
spark = SparkSession.builder.appName("RecommendationModel").getOrCreate()

# Read data from a JSON file
json_file_path = "movies.json"
df = spark.read.json(json_file_path)

# Extract relevant columns for recommendation
df = df.select("user_id", "product_id", "score")

# Convert user_id and product_id to numeric indices
user_indexer = StringIndexer(inputCol="user_id", outputCol="user_index", handleInvalid="keep")
product_indexer = StringIndexer(inputCol="product_id", outputCol="product_index", handleInvalid="keep")

# Create an ALS (Alternating Least Squares) recommendation model
als = ALS(maxIter=5, regParam=0.01, userCol="user_index", itemCol="product_index", ratingCol="score")

# Create a pipeline to execute the indexers and ALS model
pipeline = Pipeline(stages=[user_indexer, product_indexer, als])

# Split the data into training and test sets
(training_data, test_data) = df.randomSplit([0.8, 0.2])

# Train the recommendation model
model = pipeline.fit(training_data)

# Make predictions on the test set
predictions = model.transform(test_data)

# Show the predictions
predictions.select("user_id", "product_id", "prediction").show()

# Stop the Spark session
spark.stop()




+--------------+----------+----------+
|       user_id|product_id|prediction|
+--------------+----------+----------+
|A137AX2DG2D3QD|B002OHDRF2| 1.9686402|
|A127PCLKOSDS04|B000UGBOT0| 1.1798981|
|A133CUOCQTPKTT|B002OHDRF2| 2.9810488|
|A12X8OWP4N74KP|B0001G6PZC| 1.0251399|
|A100Y8WSLFJN7Q|B000063W1R|       NaN|
|A103EXN5Q7HX6Z|B0095D5454|       NaN|
|A103ZG6ASSR7UT|B004TWOX26|       NaN|
|A1041HQGJDKFG5|B0001G6PZC|       NaN|
|A10GI7HCPT27SZ|0800103688|       NaN|
|A110UAX71H8R0W|0790747324|       NaN|
|A11HMB8Z48EV76|B0001G6PZC|       NaN|
|A11JC53JUZ0TBK|B000063W82|       NaN|
|A11NH0A73VNNA6|6304286961|       NaN|
|A11R16C1ZKEWOI|0800103688|       NaN|
|A11SMJ2SH7OAR7|B006FYGF8Q|       NaN|
|A12FDY8QMUW9V5|B000UGBOT0|       NaN|
|A12Q0LLN5R2XAG|B002OHDRF2|       NaN|
|A12SX8D30RZ6QF|B0001G6PZC|       NaN|
|A12V5I7WHOYXVK|0790747324|       NaN|
|A13547L1PPP0OP|B000063W82|       NaN|
+--------------+----------+----------+
only showing top 20 rows



In [26]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

# Create a Spark session
spark = SparkSession.builder.appName("RecommendationModel").getOrCreate()

# Read data from a JSON file
json_file_path = "movies.json"
df = spark.read.json(json_file_path)

# Extract relevant columns for recommendation
df = df.select("user_id", "product_id", "score")

# Check for missing or NaN values in the 'score' column
df = df.dropna(subset=["score"])

# Convert user_id and product_id to numeric indices
user_indexer = StringIndexer(inputCol="user_id", outputCol="user_index", handleInvalid="keep")
product_indexer = StringIndexer(inputCol="product_id", outputCol="product_index", handleInvalid="keep")

# Create an ALS (Alternating Least Squares) recommendation model
als = ALS(maxIter=5, regParam=0.01, userCol="user_index", itemCol="product_index", ratingCol="score")

# Create a pipeline to execute the indexers and ALS model
pipeline = Pipeline(stages=[user_indexer, product_indexer, als])

# Split the data into training and test sets
(training_data, test_data) = df.randomSplit([0.8, 0.2])

# Train the recommendation model
model = pipeline.fit(training_data)

# Make predictions on the test set
predictions = model.transform(test_data)

# Check for NaN values in the 'prediction' column
predictions = predictions.dropna(subset=["prediction"])

# Evaluate the model using RMSE
evaluator = RegressionEvaluator(metricName="rmse", labelCol="score", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Stop the Spark session
spark.stop()




Root Mean Squared Error (RMSE): 5.178928544801977


In [29]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

# Create a Spark session
spark = SparkSession.builder.appName("RecommendationModel").getOrCreate()

# Read data from a JSON file
json_file_path = "movies.json"
df = spark.read.json(json_file_path)

# Extract relevant columns for recommendation
df = df.select("user_id", "product_id", "score")

# Convert user_id and product_id to numeric indices
user_indexer = StringIndexer(inputCol="user_id", outputCol="user_index", handleInvalid="keep")
product_indexer = StringIndexer(inputCol="product_id", outputCol="product_index", handleInvalid="keep")

# Combine features into a feature vector
feature_columns = ["user_index", "product_index"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Create a linear regression model
lr = LinearRegression(featuresCol="features", labelCol="score", maxIter=10, regParam=0.01)

# Create a pipeline to execute the indexers, assembler, and linear regression model
pipeline = Pipeline(stages=[user_indexer, product_indexer, assembler, lr])

# Split the data into training and test sets
(training_data, test_data) = df.randomSplit([0.8, 0.2])

# Train the recommendation model
model = pipeline.fit(training_data)

# Make predictions on the test set
predictions = model.transform(test_data)

# Evaluate the model
evaluator = RegressionEvaluator(labelCol="score", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Show the predictions
predictions.select("user_id", "product_id", "prediction").show()

# Stop the Spark session
spark.stop()


Root Mean Squared Error (RMSE): 1.2578639101786042
+--------------+----------+------------------+
|       user_id|product_id|        prediction|
+--------------+----------+------------------+
|A103EXN5Q7HX6Z|B0095D5454| 4.280162584897461|
|A1041HQGJDKFG5|B0001G6PZC| 4.305241120296268|
|A1078L9AXZRGT7|B000063W82| 4.306244261712219|
|A10X5D8JIK3QMS|B001GBPZRU| 3.999015879981755|
|A10XHOI86O75LH|B00022VM5S| 3.959491712264729|
|A11ED8O95W2103|B000063W82| 4.091157626453714|
|A11JC53JUZ0TBK|B000063W82| 4.306244261712219|
|A11NH0A73VNNA6|6304286961| 4.278156302065557|
|A11PZ6HSK13L66|B00020HBNC| 3.959826092736713|
|A11R16C1ZKEWOI|0800103688| 4.072405235748979|
|A134K1N1C8VC37|6300147967| 4.221980382772231|
|A13BLSXL78EMRX|0790747324| 4.286515813865159|
|A13D1WTFEMS9VH|B00022VM5I| 4.056087851886209|
|A13OMT8D4GPIBV|6304286961| 4.072181195053247|
|A13RBXRUQ1LVAW|B000UGBOT0|4.2811657263134135|
|A13RM1AWD1C5ZR|B0001G6PZC| 4.305241120296268|
|A13RYD26OASOWN|B002OHDRF2| 4.120936612739646|
|A13TO1ZF