In [0]:
from pyspark.sql import SparkSession,functions as func
from pyspark.sql.types import StructType,StructField,StringType,IntegerType,FloatType,LongType,DoubleType

In [0]:
# Obtain the Spark session for this notebook (getOrCreate reuses an existing one if present).
session_builder = SparkSession.builder.appName("Regression Models")
spark = session_builder.getOrCreate()

In [0]:
# Read the links CSV, treating the first row as the header and letting Spark
# infer the column types, then preview the first rows.
links = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv("dbfs:/FileStore/tables/links.csv")
)
links.show(n=5)

In [0]:
# Print the column names and the types Spark inferred for the links data.
links.printSchema()

In [0]:
# Same reader settings as for the links file: header row present, types inferred.
ratings_reader = spark.read.option("inferSchema", "true").option("header", "true")
ratings = ratings_reader.csv("dbfs:/FileStore/tables/ratings.csv")
ratings.show(n=5)

In [0]:
# Print the column names and the types Spark inferred for the ratings data.
ratings.printSchema()

In [0]:
# Expose both DataFrames to Spark SQL under the view names used by the join query.
ratings.createOrReplaceTempView(name="rating")
links.createOrReplaceTempView(name="link")

In [0]:
# Attach userId and rating to every link row that shares the same movieId.
links_ratings_sql = "select l.*,r.userId,r.rating from link l inner join rating r on l.movieId=r.movieId"
join_df = spark.sql(links_ratings_sql)
join_df.show(n=10)

In [0]:
# Make the links/ratings join result queryable from SQL for the next join.
join_df.createOrReplaceTempView(name="temp_table")

In [0]:
# Read the genome score CSV (header row, inferred types) and preview it.
genome_score = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv("dbfs:/FileStore/tables/genome_score.csv")
)
genome_score.show(n=10)

In [0]:
# Register the genome scores under the view name used by the final join query.
genome_score.createOrReplaceTempView(name="score")

In [0]:
# Add tagId and relevance to every joined row that shares the same movieId.
scores_join_sql = "select t.*,s.tagId,s.relevance from temp_table t inner join score s on t.movieId=s.movieId"
final_df = spark.sql(scores_join_sql)
final_df.show(n=10)

In [0]:
# Number of rows in the fully joined DataFrame (displayed as the cell output).
# NOTE(review): count() is an action, so this evaluates both joins over the whole data.
final_df.count()

In [0]:
# Print the column names and types of the joined DataFrame before building features.
final_df.printSchema()

In [0]:
from pyspark.ml.regression import *
from pyspark.ml.feature import *
from pyspark.ml.linalg import *
from pyspark.ml.evaluation import *

In [0]:
# Combine the input columns into the single vector column that the regressors
# below read as their features.
feature_columns = ["movieId", "imdbId", "tmdbId", "userId", "tagId", "relevance"]
featureAssembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="Independent Variables",
)
output = featureAssembler.transform(final_df)

In [0]:
# Preview the assembled rows, including the new feature vector column.
output.show(n=5)

In [0]:
# Keep only the feature vector and the label column for modelling.
dataset = output.select("Independent Variables", "rating")
dataset.show(n=5)

In [0]:
# Split the modelling data into training and test sets with weights 0.7 / 0.3.
# The weights are sampling probabilities, so the resulting sizes are approximate.
# A fixed seed is passed so that, for the same input data and partitioning, the
# split is repeatable and the RMSE values of the models below stay comparable
# across notebook re-runs.
SPLIT_SEED=42
train_data,test_data=dataset.randomSplit([0.7,0.3],seed=SPLIT_SEED)

In [0]:
# Applying Linear Regression Model.

In [0]:
# Configure a regularised linear regression on the assembled features and fit
# it on the training split.
lr = LinearRegression(
    featuresCol="Independent Variables",
    labelCol="rating",
    maxIter=10,
    regParam=0.5,
)
regressor = lr.fit(train_data)

In [0]:
# Show the fitted coefficient vector followed by the intercept, one per line.
print(regressor.coefficients, regressor.intercept, sep="\n")

In [0]:
# Evaluate the fitted linear model on the test split; the returned summary
# exposes the scored rows through its `predictions` DataFrame.
pred = regressor.evaluate(test_data)
pred.predictions.show(n=5)

In [0]:
# Score the test split with the linear model and report the root mean squared
# error between the "prediction" and "rating" columns.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)
pred_result = regressor.transform(test_data)
rmse = evaluator.evaluate(pred_result)
print(rmse)

In [0]:
#Applying Decision Tree Regressor.

In [0]:
# Fit a decision tree regressor (default hyperparameters) on the training split.
dtr = DecisionTreeRegressor(
    featuresCol="Independent Variables",
    labelCol="rating",
)
model = dtr.fit(train_data)

In [0]:
# Print the per-feature importances of the fitted decision tree.
# toArray must be *called*: without the parentheses the bound method object is
# printed instead of the importance values.
print("Feature Importances:",model.featureImportances.toArray())

In [0]:
# Score the test split with the decision tree and preview the predictions.
predicts = model.transform(test_data)
predicts.show(n=5)

In [0]:
# RMSE of the decision tree predictions against the true ratings.
new_evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)
rmse_value = new_evaluator.evaluate(predicts)
print(rmse_value)

In [0]:
#Applying Random Forest Regressor.

In [0]:
# Fit a random forest regressor (default hyperparameters) on the training split.
rfr = RandomForestRegressor(
    featuresCol="Independent Variables",
    labelCol="rating",
)
new_model = rfr.fit(train_data)

In [0]:
# Print the per-feature importances of the fitted random forest.
# toArray must be *called*: without the parentheses the bound method object is
# printed instead of the importance values.
print("Feature Importances :",new_model.featureImportances.toArray())

In [0]:
# Score the test split with the random forest and preview the predictions.
new_predicts = new_model.transform(test_data)
new_predicts.show(n=5)

In [0]:
# RMSE of the random forest predictions against the true ratings.
evaluator2 = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)
calculated_rmse = evaluator2.evaluate(new_predicts)
print(calculated_rmse)

In [0]:
# Applying Gradient-boosted tree regression.

In [0]:
# Fit a gradient-boosted tree regressor, limited to 10 boosting iterations,
# on the training split.
gbt = GBTRegressor(
    featuresCol="Independent Variables",
    labelCol="rating",
    maxIter=10,
)
gbt_model = gbt.fit(train_data)

In [0]:
# Score the test split with the gradient-boosted model and preview the predictions.
gbt_predicts = gbt_model.transform(test_data)
gbt_predicts.show(n=5)

In [0]:
# RMSE of the gradient-boosted predictions against the true ratings.
gbt_evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)
gbt_rmse = gbt_evaluator.evaluate(gbt_predicts)
print(gbt_rmse)

In [0]:
# Introducing MLflow with Databricks.
# pip install mlflow

In [0]:
import mlflow

In [0]:
# Train and evaluate a linear regression inside a named MLflow run.
# The run name, a model-type tag and the test RMSE are recorded explicitly so
# the run carries a result even when autologging is not active. Only a tag and
# a metric are logged (no params), so nothing here can clash with parameter
# values an autologger may already have written to the same run.
with mlflow.start_run(run_name="linear_regression"):
  lr1=LinearRegression(featuresCol="Independent Variables",labelCol="rating",maxIter=5,regParam=0.1)
  regressor1=lr1.fit(train_data)
  print(regressor1.coefficients)
  print(regressor1.intercept)
  # Summary of the fitted model on the test split; used here to preview predictions.
  pred1=regressor1.evaluate(test_data)
  pred1.predictions.show(5)
  evaluator1=RegressionEvaluator(labelCol="rating",predictionCol="prediction",metricName="rmse")
  pred_result1=regressor1.transform(test_data)
  rmse1=evaluator1.evaluate(pred_result1)
  print(rmse1)
  # Persist the outcome on the MLflow run so runs can be compared in the UI.
  mlflow.set_tag("model_type","LinearRegression")
  mlflow.log_metric("test_rmse",rmse1)

In [0]:
# Train and evaluate a decision tree regressor inside a named MLflow run.
# The run name, a model-type tag and the test RMSE are recorded explicitly so
# the run carries a result even when autologging is not active.
with mlflow.start_run(run_name="decision_tree"):
  dtr1=DecisionTreeRegressor(featuresCol="Independent Variables",labelCol="rating")
  model1=dtr1.fit(train_data)
  # toArray must be called; without the parentheses the bound method is printed
  # instead of the importance values.
  print("Feature Importances:",model1.featureImportances.toArray())
  predicts1=model1.transform(test_data)
  predicts1.show(5)
  new_evaluator1=RegressionEvaluator(labelCol="rating",predictionCol="prediction",metricName="rmse")
  rmse_value1=new_evaluator1.evaluate(predicts1)
  print(rmse_value1)
  # Persist the outcome on the MLflow run so runs can be compared in the UI.
  mlflow.set_tag("model_type","DecisionTreeRegressor")
  mlflow.log_metric("test_rmse",rmse_value1)

In [0]:
# Train and evaluate a random forest regressor inside a named MLflow run.
# The run name, a model-type tag and the test RMSE are recorded explicitly so
# the run carries a result even when autologging is not active.
with mlflow.start_run(run_name="random_forest"):
  rfr1=RandomForestRegressor(featuresCol="Independent Variables",labelCol="rating")
  new_model1=rfr1.fit(train_data)
  # toArray must be called; without the parentheses the bound method is printed
  # instead of the importance values.
  print("Feature Importances :",new_model1.featureImportances.toArray())
  new_predicts1=new_model1.transform(test_data)
  new_predicts1.show(5)
  new_evaluator2=RegressionEvaluator(labelCol="rating",predictionCol="prediction",metricName="rmse")
  calculated_rmse1=new_evaluator2.evaluate(new_predicts1)
  print(calculated_rmse1)
  # Persist the outcome on the MLflow run so runs can be compared in the UI.
  mlflow.set_tag("model_type","RandomForestRegressor")
  mlflow.log_metric("test_rmse",calculated_rmse1)

In [0]:
# Train and evaluate a gradient-boosted tree regressor inside a named MLflow run.
# The run name, a model-type tag and the test RMSE are recorded explicitly so
# the run carries a result even when autologging is not active.
with mlflow.start_run(run_name="gradient_boosted_trees"):
  gbt1=GBTRegressor(featuresCol="Independent Variables",labelCol="rating",maxIter=10)
  gbt_model1=gbt1.fit(train_data)
  gbt_predicts1=gbt_model1.transform(test_data)
  gbt_predicts1.show(5)
  gbt_evaluator1=RegressionEvaluator(labelCol="rating",predictionCol="prediction",metricName="rmse")
  gbt_rmse1=gbt_evaluator1.evaluate(gbt_predicts1)
  print(gbt_rmse1)
  # Persist the outcome on the MLflow run so runs can be compared in the UI.
  mlflow.set_tag("model_type","GBTRegressor")
  mlflow.log_metric("test_rmse",gbt_rmse1)